# Disaster Tweet Detector (NLP): Preprocessing and Modeling

## Importing Libraries

In [35]:
import os
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

## Importing Data

In [2]:
# Directory holding the cleaned tweets CSV. It defaults to the original author's
# local folder; set the TWEETS_DATA_DIR environment variable to run this notebook
# on another machine without editing the code.
DATA_DIR = Path(os.environ.get('TWEETS_DATA_DIR', 'C:/Users/justi/SpringboardDS/Capstone III/data'))

# Load the cleaned tweets and preview the first rows.
tweets_c = pd.read_csv(DATA_DIR / 'cleaned_tweets.csv')
tweets_c.head()

Unnamed: 0.1,Unnamed: 0,id,keyword,location,text,target,text_cleaned
0,0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1,communal violence bhainsa telangana stones pel...
1,1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1,telangana section imposed bhainsa january clas...
2,2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1,arsonist sets cars ablaze dealership
3,3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1,arsonist sets cars ablaze dealership
4,4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0,lord jesus love brings freedom pardon fill hol...


In [3]:
# Remove every column that is not used for modeling.
# NOTE: re-running this cell on the already-trimmed frame raises a KeyError,
# because the columns are gone by then.
tweets_c = tweets_c.drop(
    ['Unnamed: 0', 'id', 'keyword', 'location', 'text'],
    axis=1,
)
tweets_c.head()

Unnamed: 0,target,text_cleaned
0,1,communal violence bhainsa telangana stones pel...
1,1,telangana section imposed bhainsa january clas...
2,1,arsonist sets cars ablaze dealership
3,1,arsonist sets cars ablaze dealership
4,0,lord jesus love brings freedom pardon fill hol...


## Preprocessing

In [4]:
# Show any row that still has a missing value in at least one column.
tweets_c.loc[tweets_c.isna().any(axis='columns')]

Unnamed: 0,target,text_cleaned
1500,0,


In [5]:
# Discard every row that contains a missing value.
tweets_c = tweets_c.dropna(axis='index', how='any')

In [6]:
# Features are the cleaned tweet text; labels are the 0/1 target column.
X = tweets_c['text_cleaned']
y = tweets_c['target']

# Hold out 20% of the tweets for testing, with a fixed seed so the split is repeatable.
# NOTE(review): the split is not stratified on y; with imbalanced classes the
# class ratio may differ slightly between train and test -- consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [7]:
# Bag-of-words features, with the vocabulary learned from the training tweets only.
cv = CountVectorizer(
    ngram_range=(1,2),  # unigrams and bigrams
    min_df=5,           # ignore terms that appear in fewer than 5 documents
    max_df=1.0,         # no upper document-frequency cutoff (nothing is ignored)
    binary=False,       # keep term counts rather than presence flags
)
cv_train_features = cv.fit_transform(X_train)

In [9]:
# TF-IDF features, with vocabulary and IDF weights learned from the training tweets only.
tv = TfidfVectorizer(
    ngram_range=(1,2),  # unigrams and bigrams
    min_df=5,           # ignore terms that appear in fewer than 5 documents
    max_df=1.0,         # no upper document-frequency cutoff
    use_idf=True,       # apply inverse-document-frequency reweighting
    sublinear_tf=True,  # use 1 + log(tf) in place of the raw term frequency
)
tv_train_features = tv.fit_transform(X_train)

In [11]:
# Encode the held-out tweets with the already-fitted vectorizers.
# Only transform() is used here, so nothing is re-learned from the test set.
tv_test_features = tv.transform(X_test)
cv_test_features = cv.transform(X_test)

In [19]:
# Sanity-check the vectorized matrices. Each row is a tweet and each column is one
# vocabulary term (a unigram or bigram that passed the min_df=5 filter on the
# training tweets). BOW cells hold per-tweet counts; TFIDF cells hold weights.
bow_train_shape, bow_test_shape = cv_train_features.shape, cv_test_features.shape
tfidf_train_shape, tfidf_test_shape = tv_train_features.shape, tv_test_features.shape

print('BOW model:> Train features shape:', bow_train_shape, ' Test features shape:', bow_test_shape)
print('TFIDF model:> Train features shape:', tfidf_train_shape, ' Test features shape:', tfidf_test_shape)

BOW model:> Train features shape: (9095, 3647)  Test features shape: (2274, 3647)
TFIDF model:> Train features shape: (9095, 3647)  Test features shape: (2274, 3647)


## Modeling

In [24]:
# First model: logistic regression on the count-vectorized (BOW) features.
# TODO: decide whether a grid search over the hyperparameters is worthwhile for text data.
LR = LogisticRegression(
    penalty='l2',
    C=1,
    solver='lbfgs',
    max_iter=500,
    random_state=42,
)

# fit() returns the estimator itself, so training and prediction can be chained.
LR_bow_predictions = LR.fit(cv_train_features, y_train).predict(cv_test_features)

In [34]:
# Per-class metrics, then the confusion matrix (rows = actual class, columns = predicted class).
labels = ['Not Disaster', 'Disaster']
lr_bow_report = classification_report(y_test, LR_bow_predictions)
print(lr_bow_report)

lr_bow_confusion = confusion_matrix(y_test, LR_bow_predictions)
pd.DataFrame(lr_bow_confusion, index=labels, columns=labels)

              precision    recall  f1-score   support

           0       0.91      0.97      0.94      1867
           1       0.78      0.54      0.64       407

    accuracy                           0.89      2274
   macro avg       0.84      0.75      0.79      2274
weighted avg       0.88      0.89      0.88      2274



Unnamed: 0,Not Disaster,Disaster
Not Disaster,1805,62
Disaster,188,219


Since we care more about catching actual disaster tweets than about having high precision within the positive class, we will use recall for the positive (disaster) class as our evaluation metric. Our positive class has a recall of 0.54, which is not great. Let's see how a logistic regression model performs on our TFIDF-vectorized data.

In [38]:
# Same logistic regression settings, this time trained on the TF-IDF features.
# NOTE: this rebinds LR, so the BOW-trained model from the earlier cell is replaced.
LR = LogisticRegression(
    penalty='l2',
    C=1,
    solver='lbfgs',
    max_iter=500,
    random_state=42,
)
LR.fit(tv_train_features, y_train)

# Predict labels for the held-out TF-IDF features.
LR_tfidf_predictions = LR.predict(tv_test_features)

In [39]:
# Per-class metrics, then the confusion matrix (rows = actual class, columns = predicted class).
labels = ['Not Disaster', 'Disaster']
lr_tfidf_report = classification_report(y_test, LR_tfidf_predictions)
print(lr_tfidf_report)

lr_tfidf_confusion = confusion_matrix(y_test, LR_tfidf_predictions)
pd.DataFrame(lr_tfidf_confusion, index=labels, columns=labels)

              precision    recall  f1-score   support

           0       0.88      0.99      0.93      1867
           1       0.87      0.36      0.51       407

    accuracy                           0.88      2274
   macro avg       0.88      0.67      0.72      2274
weighted avg       0.88      0.88      0.85      2274



Unnamed: 0,Not Disaster,Disaster
Not Disaster,1846,21
Disaster,261,146


The recall here is much worse at 0.36. Let's try a random forest model on both the count and tfidf vectorized data.

In [40]:
# Random forest on the count-vectorized (BOW) features.
RF = RandomForestClassifier(
    n_estimators=100,  # number of trees
    n_jobs=-1,         # use all available cores
    random_state=42,   # fixed seed for repeatable results
)

# Train on the training features, then predict the held-out tweets.
RF.fit(cv_train_features, y_train)
RF_bow_predictions = RF.predict(cv_test_features)

In [41]:
# Per-class metrics, then the confusion matrix (rows = actual class, columns = predicted class).
labels = ['Not Disaster', 'Disaster']
rf_bow_report = classification_report(y_test, RF_bow_predictions)
print(rf_bow_report)

rf_bow_confusion = confusion_matrix(y_test, RF_bow_predictions)
pd.DataFrame(rf_bow_confusion, index=labels, columns=labels)

              precision    recall  f1-score   support

           0       0.90      0.96      0.93      1867
           1       0.74      0.49      0.59       407

    accuracy                           0.88      2274
   macro avg       0.82      0.73      0.76      2274
weighted avg       0.87      0.88      0.87      2274



Unnamed: 0,Not Disaster,Disaster
Not Disaster,1795,72
Disaster,207,200


In [42]:
# Random forest with the same settings, trained on the TF-IDF features.
# NOTE: this rebinds RF, so the BOW-trained forest from the earlier cell is replaced.
RF = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)

# fit() returns the estimator itself, so training and prediction can be chained.
RF_tfidf_predictions = RF.fit(tv_train_features, y_train).predict(tv_test_features)

In [43]:
# Per-class metrics, then the confusion matrix (rows = actual class, columns = predicted class).
labels = ['Not Disaster', 'Disaster']
rf_tfidf_report = classification_report(y_test, RF_tfidf_predictions)
print(rf_tfidf_report)

rf_tfidf_confusion = confusion_matrix(y_test, RF_tfidf_predictions)
pd.DataFrame(rf_tfidf_confusion, index=labels, columns=labels)

              precision    recall  f1-score   support

           0       0.90      0.97      0.93      1867
           1       0.80      0.49      0.61       407

    accuracy                           0.89      2274
   macro avg       0.85      0.73      0.77      2274
weighted avg       0.88      0.89      0.87      2274



Unnamed: 0,Not Disaster,Disaster
Not Disaster,1818,49
Disaster,209,198


So far, none of our models performs at the level we would like. Our best model is the logistic regression trained on the count-vectorized data, and even it only reached a recall of 0.54 for the positive (disaster) class.

In [None]:
# Should I try to balance the data for better performance even though it will delete a good amount of data?
# Is a deep learning neural network necessary to try? Are there other models to try that can improve performance (recall)?
# How should I investigate feature importance for text data? Make word clouds for the correctly predicted class tweets? (i.e. TP and TN)