Data sourced from Kaggle competition [Natural Language Processing with Disaster Tweets](https://www.kaggle.com/c/nlp-getting-started/overview)

In [25]:
# import core libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [26]:
# pre-processing

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [27]:
# modelling

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [28]:
# metrics/evaluation

import scikitplot as skplt
from matplotlib.colors import ListedColormap
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [29]:
# instantiating the train and test sets

train = pd.read_csv('data/train.csv')
test =  pd.read_csv('data/test.csv')

In [30]:
# combining the train and test sets for the purpose of EDA and Data Cleaning/Feature Engineering

df = pd.concat([train, test], ignore_index=True)

In [31]:
print("Training Dataframe Shape: {}".format(str(train.shape)))
print("Test Dataframe Shape: {}".format(str(test.shape)))
print("Combined Dataframe Shape: {}".format(str(df.shape)))

Training Dataframe Shape: (7613, 5)
Test Dataframe Shape: (3263, 4)
Combined Dataframe Shape: (10876, 5)


In [32]:
# example of tweets

df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1.0
1,4,,,Forest fire near La Ronge Sask. Canada,1.0
2,5,,,All residents asked to 'shelter in place' are ...,1.0
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1.0


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10876 entries, 0 to 10875
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        10876 non-null  int64  
 1   keyword   10789 non-null  object 
 2   location  7238 non-null   object 
 3   text      10876 non-null  object 
 4   target    7613 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 425.0+ KB


### Dealing with null values

In [34]:
# null values in the training set

df.isnull().sum()

id             0
keyword       87
location    3638
text           0
target      3263
dtype: int64

### Keyword

In [35]:
df.keyword.unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

In [36]:
# trying to understand whether the null values in keyword have any relevance - they don't

df[df.keyword.isnull()].target.value_counts()

1.0    42
0.0    19
Name: target, dtype: int64

In [41]:
# creating a new category for the null keyword and location values

df.fillna({'keyword': 'unknown', 'location': 'unknown'},inplace=True)

In [48]:
# cleaning the keyword column

df.replace({'keyword': '%20'}, {'keyword': '_'}, inplace=True, regex=True)

### Location

In [58]:
# given how messy and the location column is, it's unlikely that we'll be able to clean it for modelling purposes

df.location.value_counts().head(20)

unknown            3638
USA                 141
New York            109
United States        65
London               58
Canada               42
Nigeria              40
Worldwide            35
India                35
Los Angeles, CA      34
UK                   33
Kenya                32
Washington, DC       31
Mumbai               28
United Kingdom       26
California           25
Australia            25
Los Angeles          24
Chicago, IL          23
San Francisco        23
Name: location, dtype: int64

### Text

In [78]:
df.text[100]

'.@NorwayMFA #Bahrain police had previously died in a road accident they were not killed by explosion https://t.co/gFJfgTodad'

# Countvectorizer

In [61]:
cvec = CountVectorizer(stop_words='english', strip_accents= 'unicode', lowercase=True, ngram_range=(1,1))

In [67]:
cvec_mat = cvec.fit_transform(df.text)

In [69]:
cvec.get_feature_names()

['00',
 '000',
 '0000',
 '000sijjl3t',
 '007npen6lg',
 '00am',
 '00cy9vxeff',
 '00end',
 '00pm',
 '01',
 '01hux8y9gi',
 '02',
 '0215',
 '02elqlopfk',
 '02pm',
 '03',
 '030',
 '032',
 '033',
 '034',
 '039',
 '03bjm4orow',
 '03l7nwqdje',
 '04',
 '05',
 '0518',
 '05qooc9cbr',
 '05th',
 '06',
 '060',
 '061',
 '06jst',
 '07',
 '073izwx0lb',
 '07nkmo7vas',
 '08',
 '0840728',
 '0853',
 '087809233445',
 '0880',
 '08lngclzsj',
 '09',
 '0992',
 '0abgfglh7x',
 '0aiv5khzjv',
 '0ajisa5531',
 '0ap7montjf',
 '0aqbzmrvlq',
 '0bkmuhysfx',
 '0blkwcupzq',
 '0btniwagt1',
 '0bvk5tub4j',
 '0bznfdg0ar',
 '0c1y8g7e9p',
 '0cr74m1uxm',
 '0cucmzcmao',
 '0cvxs2e1er',
 '0cxm5tkz8y',
 '0dqjeretxu',
 '0drqlrsgy5',
 '0dxvz7fdh3',
 '0erisq25kt',
 '0f8xa4ih1u',
 '0fdsc3f2iw',
 '0fekgyby5f',
 '0fs9ksv5xk',
 '0ghk693egj',
 '0gidg9u45j',
 '0gknpy4lua',
 '0gt63uqgcu',
 '0h7oua1pns',
 '0ipavkjgdc',
 '0iw6drf5x9',
 '0iyuntxduv',
 '0jfnvaxfph',
 '0jmkdtcymj',
 '0kccg1bt06',
 '0keh2treny',
 '0kjcwg6pn9',
 '0kjjdaojhi',
 '0krw1

### To-do list:

- create broader categories for the keyword and, potentially, location columns
- use spacy to extract location from location variable

#### Text Pre-processing

- check the language that the tweet is written in
- remove digits and lower the text
- expand contractions
- convert to lowercase
- remove punctuation (maybe include meta-data for punctuation instead?)
- tokenize + lemmatize words
- remove stop-words
- hashtag extraction
- does the text contain emojis?

#### EDA

- word clouds for each target variable
- seperate the below by each target variable
    - number of characters in each tweet
    - average word length in each sentence
    - most commonly appearing ngrams of various lenghts
    - textblob for sentiment analysis
    - use speech tagging
    - frequency of most common words
    - number of words with a given number of appearances
    
#### Other

- Research the use of LDA and NMF
    
    
Useful articles: 

https://towardsdatascience.com/preprocessing-text-data-using-python-576206753c28
https://towardsdatascience.com/basic-tweet-preprocessing-in-python-efd8360d529e

