Data sourced from Kaggle competition [Natural Language Processing with Disaster Tweets](https://www.kaggle.com/c/nlp-getting-started/overview)

In [1]:
# import core libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [40]:
import joblib

In [34]:
# pre-processing

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

import nltk
from textblob import TextBlob
from langdetect import detect

In [3]:
# modelling

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [4]:
# metrics/evaluation

import scikitplot as skplt
from matplotlib.colors import ListedColormap
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [5]:
# loading the Kaggle training set and test set from the local data/ folder
# (paths are relative to the notebook's working directory)

train = pd.read_csv('data/train.csv')
test =  pd.read_csv('data/test.csv')

In [6]:
# combining the train and test sets for the purpose of EDA and Data Cleaning/Feature Engineering

df = pd.concat([train, test], ignore_index=True)

In [7]:
# row/column counts of each frame, as a sanity check on the concatenation
print(f"Training Dataframe Shape: {train.shape}")
print(f"Test Dataframe Shape: {test.shape}")
print(f"Combined Dataframe Shape: {df.shape}")

Training Dataframe Shape: (7613, 5)
Test Dataframe Shape: (3263, 4)
Combined Dataframe Shape: (10876, 5)


In [8]:
# example of tweets

df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1.0
1,4,,,Forest fire near La Ronge Sask. Canada,1.0
2,5,,,All residents asked to 'shelter in place' are ...,1.0
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1.0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10876 entries, 0 to 10875
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        10876 non-null  int64  
 1   keyword   10789 non-null  object 
 2   location  7238 non-null   object 
 3   text      10876 non-null  object 
 4   target    7613 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 425.0+ KB


### Dealing with null values

In [10]:
# null values in the combined (train + test) frame; the 3263 missing targets are the unlabelled test rows

df.isnull().sum()

id             0
keyword       87
location    3638
text           0
target      3263
dtype: int64

### Keyword

In [11]:
df.keyword.unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

In [12]:
# checking whether tweets with a missing keyword skew towards either class
# NOTE(review): 42 of the 61 labelled rows (~69%) are disasters - compare this with the overall
# target rate before concluding that a missing keyword carries no signal

df[df.keyword.isnull()].target.value_counts()

1.0    42
0.0    19
Name: target, dtype: int64

In [13]:
# missing keyword / location values become their own 'unknown' category

df[['keyword', 'location']] = df[['keyword', 'location']].fillna('unknown')

In [14]:
# multi-word keywords contain '%20' between the words (e.g. 'airplane%20accident'); swap it for '_'

df['keyword'] = df['keyword'].str.replace('%20', '_', regex=False)

### Location

In [15]:
# given how messy the location column is, it's unlikely that we'll be able to clean it for modelling purposes

df.location.value_counts().head(20)

unknown            3638
USA                 141
New York            109
United States        65
London               58
Canada               42
Nigeria              40
Worldwide            35
India                35
Los Angeles, CA      34
UK                   33
Kenya                32
Washington, DC       31
Mumbai               28
United Kingdom       26
Australia            25
California           25
Los Angeles          24
Chicago, IL          23
San Francisco        23
Name: location, dtype: int64

# Text

### Language

In [38]:
# checking that all tweets are in English

# lang_series = df.text.apply(lambda x: detect(x))

In [43]:
# saving lang_series as a joblib file

# joblib.dump(lang_series, 'jlib_files/lang_series.jlib')

['jlib_files/lang_series.jlib']

In [45]:
# loading the cached lang_series so that the next cell can use it; previously every line that
# defined lang_series was commented out, which raised a NameError on a fresh kernel.
# If the cache file is missing (e.g. on a fresh checkout) the languages are detected again,
# which is presumably slow - hence the cache.

try:
    lang_series = joblib.load('jlib_files/lang_series.jlib')
except FileNotFoundError:
    lang_series = df.text.apply(detect)

In [46]:
df['language'] = lang_series

In [54]:
df[df.language != 'en'].sample(20)

Unnamed: 0,id,keyword,location,text,target,language
991,1440,body_bagging,302???? 815,@Yankees body bagging mfs,1.0,id
8515,2977,dead,My World,Ross better not be dead! #Emmerdale,,no
10713,10262,war_zone,baltimore maryland,@thelovatoagent omg i feel like i am in a war ...,,nl
4494,6391,hurricane,"Haiku, Maui, Hawaii",HURRICANE GUILLERMO LIVE NOAA TRACKING / LOOPI...,1.0,vi
1627,2350,collapse,Behind The Obama Curtain,Greece's tax revenues collapse as debt crisis ...,1.0,fr
7854,772,avalanche,Score More Goals Buying @,2 TIX 10/3 Frozen Fury XVII: Los Angeles Kings...,,de
4519,6421,hurricane,The Globe,HURRICANE GUILLERMO LIVE NOAA TRACKING / LOOPI...,1.0,vi
10260,8845,smoke,Franklinton - BR - Houston,Smoke sumn,,hr
5143,7334,nuclear_reactor,"Washington, D.C.",Salem 2 nuclear reactor shut down over electri...,1.0,ro
4450,6332,hostage,unknown,@gideonstrumpet Have you been held hostage?,0.0,nl


It seems that the language detector isn't doing a very good job: the tweets sampled above are flagged as non-English (e.g. `id`, `nl`, `vi`) even though they are clearly written in English. Since all of the tweets appear to be in English, we don't have to worry about dealing with other languages.

In [95]:
# dropping language column from dataset
# - `columns=` is used because passing the axis positionally (`df.drop('language', 1)`)
#   is deprecated and no longer accepted by pandas 2.x
# - errors='ignore' keeps the cell safe to re-run once the column has already been dropped

df.drop(columns='language', inplace=True, errors='ignore')

### Hash-tag extraction

In [56]:
df.text[0].split()

['Our',
 'Deeds',
 'are',
 'the',
 'Reason',
 'of',
 'this',
 '#earthquake',
 'May',
 'ALLAH',
 'Forgive',
 'us',
 'all']

In [61]:
def hash_tags(x):
    ht_list = []
    for word in x.split():
        if word[0] == '#':
            ht_list.append(word.lower().replace('#',''))
    return ht_list

In [69]:
hash_tags = df.text.apply(hash_tags)

In [71]:
# number of hash-tags found in each tweet
num_hash_tags = hash_tags.apply(len)

In [75]:
# tally how many times each hash-tag appears across all tweets
ht_dict = {}

for tags in hash_tags:
    for tag in tags:
        ht_dict[tag] = ht_dict.get(tag, 0) + 1

In [85]:
ht_df = pd.DataFrame.from_dict(ht_dict, orient = 'index', columns=["appearances"])

In [91]:
ht_df.shape

(2700, 1)

In [90]:
ht_df.sort_values('appearances',ascending=False).head(20)

Unnamed: 0,appearances
news,92
hot,42
best,41
prebreak,41
hiroshima,33
???,31
??,28
nowplaying,25
earthquake,24
islam,22


It seems as though it's not much use trying to categorise the hash-tags. There are 2700 different hash-tags in the dataset, which is too broad a range. Even the hash-tags that appear the most still have a very small number of appearances, so they wouldn't be of much use as a predictor variable.

For now, I'm just going to use the meta-data of how many hash-tags appear in each tweet. I don't expect this to be a super-useful predictor, but I'm optimistic that it might add some value.

In [92]:
df['hashtags'] = num_hash_tags

In [93]:
df.head()

Unnamed: 0,id,keyword,location,text,target,language,hashtags
0,1,unknown,unknown,Our Deeds are the Reason of this #earthquake M...,1.0,en,1
1,4,unknown,unknown,Forest fire near La Ronge Sask. Canada,1.0,en,0
2,5,unknown,unknown,All residents asked to 'shelter in place' are ...,1.0,en,0
3,6,unknown,unknown,"13,000 people receive #wildfires evacuation or...",1.0,en,1
4,7,unknown,unknown,Just got sent this photo from Ruby #Alaska as ...,1.0,en,2


# CountVectorizer

In [17]:
cvec = CountVectorizer(stop_words='english', strip_accents= 'unicode', lowercase=True, ngram_range=(1,1))

In [18]:
# building the document-term matrix from the text of every row in df
# NOTE(review): df holds both the train and the test rows, so the vocabulary is learned
# from both - confirm this is intended before using the matrix for modelling
cvec_mat = cvec.fit_transform(df.text)

### To-do list:

- create broader categories for the keyword and, potentially, location columns
- use spacy to extract location from location variable

#### Text Pre-processing

- ~~check the language that the tweet is written in~~
- remove digits and lower the text
- expand contractions
- convert to lowercase
- remove punctuation (maybe include meta-data for punctuation instead?)
- tokenize + lemmatize words
- remove stop-words
- hashtag extraction
- how many hash-tags each tweet contains
- does the text contain emojis?

#### EDA

- word clouds for each target variable
- separate the below by each target variable
    - number of characters in each tweet
    - average word length in each sentence
    - most commonly appearing ngrams of various lengths
    - textblob for sentiment analysis
    - use part-of-speech tagging
    - frequency of most common words
    - number of words with a given number of appearances
    
#### Other

- Research the use of LDA and NMF
    
    
Useful articles: 

https://towardsdatascience.com/preprocessing-text-data-using-python-576206753c28

https://towardsdatascience.com/basic-tweet-preprocessing-in-python-efd8360d529e

https://medium.com/spatial-data-science/how-to-extract-locations-from-text-with-natural-language-processing-9b77035b3ea4

