Data sourced from Kaggle competition [Natural Language Processing with Disaster Tweets](https://www.kaggle.com/c/nlp-getting-started/overview)

In [94]:
# import core libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [95]:
import joblib

In [96]:
# pre-processing

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

import nltk
from textblob import TextBlob
from langdetect import detect

import contractions

In [97]:
# modelling

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [98]:
# metrics/evaluation

import scikitplot as skplt
from matplotlib.colors import ListedColormap
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [99]:
# Read the train and test splits from the local data/ directory.

train, test = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))

In [100]:
# Stack train on top of test so that EDA, data cleaning and feature engineering
# are applied to both sets at once; the combined frame gets a new consecutive index.

df = pd.concat(objs=[train, test], axis=0, ignore_index=True)

In [101]:
# Report the (rows, columns) shape of each frame.
shape_reports = [
    ("Training Dataframe Shape: {}", train),
    ("Test Dataframe Shape: {}", test),
    ("Combined Dataframe Shape: {}", df),
]
for template, frame in shape_reports:
    print(template.format(frame.shape))

Training Dataframe Shape: (7613, 5)
Test Dataframe Shape: (3263, 4)
Combined Dataframe Shape: (10876, 5)


In [102]:
# example of tweets: preview the first rows of the combined frame

df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1.0
1,4,,,Forest fire near La Ronge Sask. Canada,1.0
2,5,,,All residents asked to 'shelter in place' are ...,1.0
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1.0


In [103]:
# dtype and non-null count of every column in the combined frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10876 entries, 0 to 10875
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        10876 non-null  int64  
 1   keyword   10789 non-null  object 
 2   location  7238 non-null   object 
 3   text      10876 non-null  object 
 4   target    7613 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 425.0+ KB


### Dealing with null values

In [104]:
# null values per column in the combined (train + test) frame, not just the training set;
# the test set was loaded without a target column, so its rows have a null target here

df.isnull().sum()

id             0
keyword       87
location    3638
text           0
target      3263
dtype: int64

### Keyword

In [105]:
# distinct values found in the keyword column
df.keyword.unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

In [106]:
# Checking whether a missing keyword carries any signal: count the target classes
# among the labelled rows whose keyword is null.
# NOTE(review): compare these counts with the overall class balance before concluding
# that the null keywords are irrelevant.

df.loc[df['keyword'].isna(), 'target'].value_counts()

1.0    42
0.0    19
Name: target, dtype: int64

In [107]:
# Treat a missing keyword or location as its own 'unknown' category.

missing_labels = {'keyword': 'unknown', 'location': 'unknown'}
df = df.fillna(value=missing_labels)

In [108]:
# Cleaning the keyword column: swap every '%20' sequence for an underscore.

df['keyword'] = df['keyword'].str.replace('%20', '_', regex=True)

### Location

In [109]:
# Given how messy the location column is, it's unlikely that we'll be able to
# clean it for modelling purposes; show the 20 most frequent values.

df['location'].value_counts().head(20)

unknown            3638
USA                 141
New York            109
United States        65
London               58
Canada               42
Nigeria              40
India                35
Worldwide            35
Los Angeles, CA      34
UK                   33
Kenya                32
Washington, DC       31
Mumbai               28
United Kingdom       26
Australia            25
California           25
Los Angeles          24
Chicago, IL          23
San Francisco        23
Name: location, dtype: int64

# Text

### Language

In [110]:
# checking that all tweets are in English

# lang_series = df.text.apply(lambda x: detect(x))

In [111]:
# saving lang_series as a joblib file

# joblib.dump(lang_series, 'jlib_files/lang_series.jlib')

In [112]:
# Load the cached per-tweet language labels. If the cache file does not exist yet
# (e.g. on a fresh checkout), compute the labels with langdetect and write the cache,
# so the notebook still runs from top to bottom.
# NOTE(review): joblib files are pickles - only load a cache file you produced yourself.
# NOTE(review): langdetect results may differ between runs unless its detector is
# seeded - confirm if the exact labels matter.
# The 'jlib_files' directory must already exist for the dump to succeed.

LANG_CACHE_PATH = 'jlib_files/lang_series.jlib'

try:
    lang_series = joblib.load(LANG_CACHE_PATH)
except FileNotFoundError:
    lang_series = df.text.apply(detect)
    joblib.dump(lang_series, LANG_CACHE_PATH)

In [113]:
# Attach the detected language of each tweet as a temporary column.
df = df.assign(language=lang_series)

In [114]:
# Spot-check 20 tweets whose detected language is not English.
# random_state pins the sample so the same rows are shown on every run.
df[df.language != 'en'].sample(20, random_state=42)

Unnamed: 0,id,keyword,location,text,target,language
7011,10050,twister,instagram: bribriony,Drunk twister is so hard ????,0.0,de
6703,9602,thunder,unknown,L B #Oklahoma #Thunder DURANT NBA ADIDAS OKLAH...,0.0,de
10320,9015,stretcher,unknown,Mind salivation stretcher beds: KEGm,,da
6522,9329,survive,EveryWhere,:: Survive??,0.0,hr
2347,3377,demolition,unknown,@czallstarwes more like demolition derby ??,0.0,da
401,578,arson,"North-East Region, Singapore",@sayn_ae angel or arson,0.0,cy
9082,4873,explode,unknown,Some guys explode ??,,fr
7755,449,armageddon,1996???????????,UNIVERSAL ORDER OF ARMAGEDDON http://t.co/3tY4mGm,,de
8722,3668,destroy,unknown,People can't destroy you unless you let them.,,fr
3904,5554,flattened,"Keighley, England",Imagine getting flattened by Kurt Zouma,0.0,no


It seems that the language detector function isn't doing a very good job of picking up some of the tweets' language. Regardless, it seems that all of the tweets are in English so we don't have to worry about dealing with other languages.

In [115]:
# dropping language column from dataset
# `columns=` replaces the positional axis argument (`df.drop('language', 1, ...)`),
# which is deprecated and no longer accepted by pandas 2.x.

df = df.drop(columns='language')

### Hash-tag extraction

In [116]:
# Whitespace-split the first tweet to see how hash-tags show up as tokens.
df.loc[0, 'text'].split()

['Our',
 'Deeds',
 'are',
 'the',
 'Reason',
 'of',
 'this',
 '#earthquake',
 'May',
 'ALLAH',
 'Forgive',
 'us',
 'all']

In [117]:
def hash_tags(x):
    """Return the lower-cased hash-tags found in the tweet text ``x``.

    The text is split on whitespace; every token whose first character is '#'
    is lower-cased and has all of its '#' characters removed. Tags are returned
    in order of appearance, duplicates included; an empty list means no token
    started with '#'.
    """
    return [token.lower().replace('#', '') for token in x.split() if token.startswith('#')]

In [118]:
# Extract the list of hash-tags for every tweet.
# NOTE(review): this rebinds the name `hash_tags` from the extraction function to the
# resulting Series, so re-running this cell without re-running the function definition
# first will fail. The following cells use `hash_tags` as the Series.
hash_tags = df.text.apply(hash_tags)

In [119]:
# Number of hash-tags in each tweet.
num_hash_tags = hash_tags.apply(len)

In [120]:
# Tally how many times each hash-tag appears across all tweets.
ht_dict = {}

for tweet_tags in hash_tags:
    for tag in tweet_tags:
        ht_dict[tag] = ht_dict.get(tag, 0) + 1

In [121]:
# One row per hash-tag (as the index), with its count in the "appearances" column.
ht_df = pd.DataFrame.from_dict(
    data=ht_dict,
    orient='index',
    columns=["appearances"],
)

In [122]:
# (number of distinct hash-tags, number of columns)
ht_df.shape

(2700, 1)

In [123]:
# The 20 most frequent hash-tags.
ht_by_frequency = ht_df.sort_values(by='appearances', ascending=False)
ht_by_frequency.head(20)

Unnamed: 0,appearances
news,92
hot,42
best,41
prebreak,41
hiroshima,33
???,31
??,28
nowplaying,25
earthquake,24
islam,22


It seems as though it's not much use trying to categorise the hash-tags. There are 2700 different hash-tags in the dataset, which is too broad a range. Even the hash-tags that appear most often have a very small number of appearances, so they wouldn't be of much use as a predictor variable.

For now, I'm just going to use the meta-data of how many hash-tags appear in each tweet. I don't expect this to be a super-useful predictor, but I'm optimistic that it might add some value.

In [124]:
# Keep only the per-tweet hash-tag count as a feature.
df = df.assign(hashtags=num_hash_tags)

In [125]:
# preview the frame with the new hashtags column
df.head()

Unnamed: 0,id,keyword,location,text,target,hashtags
0,1,unknown,unknown,Our Deeds are the Reason of this #earthquake M...,1.0,1
1,4,unknown,unknown,Forest fire near La Ronge Sask. Canada,1.0,0
2,5,unknown,unknown,All residents asked to 'shelter in place' are ...,1.0,0
3,6,unknown,unknown,"13,000 people receive #wildfires evacuation or...",1.0,1
4,7,unknown,unknown,Just got sent this photo from Ruby #Alaska as ...,1.0,2


### Creating new features: length of tweet, number of words and average word length

### SHOULD WE TAKE OUT URLS BEFORE DOING THIS SECTION?

In [126]:
import string

In [127]:
# Number of characters in each raw tweet.
df['tweet_characters'] = df['text'].apply(len)

In [130]:
def word_counter(tweet):
    """Count the whitespace-separated words left in ``tweet`` once every
    character in ``string.punctuation`` has been removed."""
    strip_punct = str.maketrans('', '', string.punctuation)
    return len(tweet.translate(strip_punct).split())

In [131]:
# Number of words in each tweet, ignoring punctuation.
df['tweet_words'] = df['text'].map(word_counter)

In [146]:
# Scratch example: the punctuation-free words of a sample tweet fragment.
sample_fragment = 'Our Deeds are the Reason of this #earthquake '
words = sample_fragment.translate(str.maketrans('', '', string.punctuation)).split()

In [149]:
# Average word length of the sample words: total characters divided by word count.
sum(len(word) for word in words) / len(words)

4.5

In [150]:
def ave_word_length(tweet):
    """Return the average length of the words in ``tweet``.

    Every character in ``string.punctuation`` is removed first, then the text is
    split on whitespace.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    float
        Total characters in the remaining words divided by the number of words,
        or 0.0 when no words remain (empty or punctuation-only text).
    """
    no_punct = ''.join([x for x in tweet if x not in string.punctuation])
    word_lst = no_punct.split()
    # A punctuation-only or empty tweet leaves no words; return 0.0 instead of
    # raising ZeroDivisionError.
    if not word_lst:
        return 0.0
    return sum(map(len, word_lst)) / len(word_lst)

In [151]:
# Average word length of each tweet, ignoring punctuation.
df['tweet_av_word_length'] = df['text'].map(ave_word_length)

### Expanding contractions

In [132]:
def _expand_contractions(tweet):
    """Apply ``contractions.fix`` to each whitespace-separated word of ``tweet``
    and re-join the results with single spaces."""
    return ' '.join(contractions.fix(word) for word in tweet.split())


df['text_no_contr'] = df['text'].apply(_expand_contractions)

### Tokenizing Tweets

In [133]:
# Tokenize the contraction-expanded text with NLTK's word tokenizer.
df['tokenized'] = df['text_no_contr'].map(nltk.word_tokenize)

In [134]:
# preview the frame with the engineered text columns
df.head()

Unnamed: 0,id,keyword,location,text,target,hashtags,tweet_characters,tweet_words,text_no_contr,tokenized
0,1,unknown,unknown,Our Deeds are the Reason of this #earthquake M...,1.0,1,69,13,Our Deeds are the Reason of this #earthquake M...,"[Our, Deeds, are, the, Reason, of, this, #, ea..."
1,4,unknown,unknown,Forest fire near La Ronge Sask. Canada,1.0,0,38,7,Forest fire near La Ronge Sask. Canada,"[Forest, fire, near, La, Ronge, Sask, ., Canada]"
2,5,unknown,unknown,All residents asked to 'shelter in place' are ...,1.0,0,133,22,All residents asked to 'shelter in place' are ...,"[All, residents, asked, to, 'shelter, in, plac..."
3,6,unknown,unknown,"13,000 people receive #wildfires evacuation or...",1.0,1,65,8,"13,000 people receive #wildfires evacuation or...","[13,000, people, receive, #, wildfires, evacua..."
4,7,unknown,unknown,Just got sent this photo from Ruby #Alaska as ...,1.0,2,88,16,Just got sent this photo from Ruby #Alaska as ...,"[Just, got, sent, this, photo, from, Ruby, #, ..."


### Removing Punctuation

In [135]:
# the characters that word_counter and ave_word_length strip from each tweet
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### Experimenting with tweet-preprocessor package

In [136]:
import preprocessor as p

In [137]:
# Run tweet-preprocessor's tokenize over the raw text; the next cell counts the
# "$URL$", "$HASHTAG$", "$SMILEY$" and "$MENTION$" placeholders in its output.
df['tweet_tokenized'] = df['text'].apply(p.tokenize)

In [140]:
# For each kind of tweet meta-data, add a "tweet_<feature>" column holding how many
# whitespace-separated tokens of the tokenized tweet equal the "$<FEATURE>$" placeholder.

for feature in ['url', 'hashtag', 'smiley', 'mention']:
    placeholder = "$"+feature.upper()+"$"
    df["tweet_"+feature] = [
        tokenized_tweet.split().count(placeholder)
        for tokenized_tweet in df.tweet_tokenized
    ]

### To-do list:

- create broader categories for the keyword and, potentially, location columns
- use more visualizations through the data cleaning process (to start with: countvectorize before any data cleaning has started to show the words that appear the most frequently)


#### Text Pre-processing

- ~~check the language that the tweet is written in~~
- remove digits and lower the text
- ~~expand contractions~~
- convert to lowercase
- remove punctuation (maybe include meta-data for punctuation instead?)
- tokenize words
- lemmatize words
- remove stop-words
- ~~hashtag extraction~~

- does the text contain emojis?

#### Feature Engineering

- meta-data
    - ~~how many hash-tags each tweet contains~~
    - ~~no. of emojis~~
    - ~~number of words~~
    - ~~number of characters~~
- sentiment analysis (textblob)
- average word length
- use spacy to extract location from location variable

#### EDA

- word clouds for each target variable
- separate the below by each target variable
    - number of characters in each tweet
    - average word length in each sentence
    - most commonly appearing ngrams of various lengths
    - textblob for sentiment analysis
    - use speech tagging
    - frequency of most common words
    - number of words with a given number of appearances
    
#### Other

- Research the use of LDA and NMF
    
    
Useful articles: 

https://towardsdatascience.com/preprocessing-text-data-using-python-576206753c28

https://towardsdatascience.com/basic-tweet-preprocessing-in-python-efd8360d529e

https://medium.com/spatial-data-science/how-to-extract-locations-from-text-with-natural-language-processing-9b77035b3ea4

