## Models of disaster tweet categories using Sandy-Joplin training data
  -- Several model types tested
  
  
  -- Use best model to predict categories for Hurricane Michael tweets (out-of-event sample)

In [2]:
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report,classification
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from nltk import TweetTokenizer
from nltk.corpus import stopwords
from gensim.test.utils import common_dictionary, common_corpus
from gensim.models import LsiModel
import pandas as pd
import numpy as np
import re



In [92]:
#read pickled dataframe of Sandy-Joplin Tweets
tweets = pd.read_pickle("../data/train3.pkl")

In [93]:
#baseline accuracy
tweets.y.value_counts(1)

0.0    0.430952
2.0    0.185838
4.0    0.112784
1.0    0.098366
3.0    0.081705
5.0    0.075617
6.0    0.014739
Name: y, dtype: float64

In [94]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3121 entries, 204690718 to 223607572
Data columns (total 20 columns):
_golden                  3121 non-null bool
_last_judgment_at        3121 non-null object
_trusted_judgments       3121 non-null int64
_unit_state              3121 non-null object
category                 1776 non-null object
choose_one               1345 non-null object
choose_one:confidence    3121 non-null float64
choose_one_gold          151 non-null object
event                    1000 non-null object
id                       1233 non-null float64
nil                      1 non-null object
predicted                887 non-null object
retweetcount             649 non-null float64
screenname               1221 non-null object
text_no_rt               1888 non-null object
tweet                    3121 non-null object
type                     543 non-null object
user                     1000 non-null object
userid                   1221 non-null float64
y           

In [90]:
def clean_tweets(col):
    # convert text to lower case
    col = col.str.lower()

    # remove URLs
    col = col.apply(lambda x: re.split('https:\/\/.*', str(x))[0])
    col = col.apply(lambda x: re.split('http:\/\/.*', str(x))[0])
    col = col.replace(r'www\S+', '', regex=True)

    #remove "RT" string
    col = col.map(lambda x: x.lstrip('rt'))
    return col

In [95]:
# clean tweet text
tweets['clean_text'] = clean_tweets(tweets['tweet'])

In [96]:
#Name variables 
X = tweets[["clean_text"]]
y = tweets["y"]

#train test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=y)

In [7]:
# Load stopwords from NLTK and add custom words
mystopwords = stopwords.words('english')
mystopwords.extend(['hurricane','tornado','harvey','irma','joplin','sandy','maria',
                    'like','would','get','x200b','https','one','www','com','org','etc','could'])

In [112]:
# Naive Bayes with CVEC
tknzr = TweetTokenizer()
cvec = CountVectorizer(tokenizer=tknzr.tokenize,stop_words=None,
                       max_features=3000,max_df=1.0,min_df=2, 
                       ngram_range=(1,1))

# Fit  CountVectorizer on the training data and transform training data.
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train['clean_text']).todense(),
                            columns = cvec.get_feature_names())
# Transform our testing data with the already-fit CountVectorizer.
X_test_cvec = pd.DataFrame(cvec.transform(X_test['clean_text']).todense(),
                           columns = cvec.get_feature_names())

# instantiate and fit model 
nb = MultinomialNB(alpha=1)
nb.fit(X_train_cvec, y_train)

# Score model on the training set.
print('Naive Bayes model with Count Vectorizer')
print(f'Train data accuracy: {nb.score(X_train_cvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {nb.score(X_test_cvec,y_test):.3f}')

Naive Bayes model with Count Vectorizer
Train data accuracy: 0.824
Test data accuracy: 0.675


In [119]:
# Naive Bayes with Tfidf
tvec = TfidfVectorizer(tokenizer=tknzr.tokenize,stop_words=None,
                       max_features=1000,max_df=1.0,min_df=4, ngram_range=(1,2))

# Fit Vectorizer on the training data and transform training data.
X_train_tvec = pd.DataFrame(tvec.fit_transform(X_train['clean_text']).todense(),
                            columns = tvec.get_feature_names())
# Transform testing data with the already-fit Vectorizer.
X_test_tvec = pd.DataFrame(tvec.transform(X_test['clean_text']).todense(),
                           columns = tvec.get_feature_names())

# instantiate and fit model 
nb = MultinomialNB(alpha=1)
nb.fit(X_train_tvec, y_train)

# Score model on the training set.
print('Naive Bayes model with TF-IDF Vectorizer')
print(f'Train data accuracy: {nb.score(X_train_tvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {nb.score(X_test_tvec,y_test):.3f}')

Naive Bayes model with TF-IDF Vectorizer
Train data accuracy: 0.693
Test data accuracy: 0.634


In [122]:
# SVM model with CVEC
cvec = CountVectorizer(stop_words=mystopwords,max_features=800, ngram_range=(1,1))

# Fit  CountVectorizer on the training data and transform training data.
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train['clean_text']).todense(),
                            columns = cvec.get_feature_names())
# Transform our testing data with the already-fit CountVectorizer.
X_test_cvec = pd.DataFrame(cvec.transform(X_test['clean_text']).todense(),
                           columns = cvec.get_feature_names())
# Instantiate SVM.
# svc = svm.SVC(kernel='poly', C = 1.8, gamma = .05) 
svc = svm.SVC(kernel='rbf', C = 1, gamma = .2)  

# Fit on training data.
svc.fit(X_train_cvec,y_train)

# Score model on the training set.
print('Support Vector Machine model with Count Vectorizer')
print(f'Train data accuracy: {svc.score(X_train_cvec,y_train):.3f}')

# Score model on the testing set.
print(f'Test data accuracy: {svc.score(X_test_cvec,y_test):.3f}')

Support Vector Machine model with Count Vectorizer
Train data accuracy: 0.845
Test data accuracy: 0.609


In [111]:
# SVM model with TVEC
tvec = TfidfVectorizer(tokenizer=tknzr.tokenize,stop_words=mystopwords,
                       max_features=800,max_df=1.0,min_df=4, ngram_range=(1,1))

# Fit Vectorizer on the training data and transform training data.
X_train_tvec = pd.DataFrame(tvec.fit_transform(X_train['clean_text']).todense(),
                            columns = tvec.get_feature_names())
# Transform testing data with the already-fit Vectorizer.
X_test_tvec = pd.DataFrame(tvec.transform(X_test['clean_text']).todense(),
                           columns = tvec.get_feature_names())
# Instantiate SVM.
# svc = svm.SVC(kernel='poly', C = 1.8, gamma = .05) 
svc = svm.SVC(kernel='rbf', C = 1, gamma = .2)  

# Fit on training data.
svc.fit(X_train_tvec,y_train)

# Score model on the training set.
print('Support Vector Machine model with TFIDF Vectorizer')
print(f'Train data accuracy: {svc.score(X_train_tvec,y_train):.3f}')

# Score model on the testing set.
print(f'Test data accuracy: {svc.score(X_test_tvec,y_test):.3f}')

Support Vector Machine model with TFIDF Vectorizer
Train data accuracy: 0.673
Test data accuracy: 0.629


In [142]:
#Logistic Regression with TFIDF Vectorizer 
tknzr = TweetTokenizer()
tvec = TfidfVectorizer(tokenizer=tknzr.tokenize,stop_words=mystopwords,
                       max_features=None,max_df=1.0,min_df=2, ngram_range=(1,1))

# Fit Vectorizer on the training data and transform training data.
X_train_tvec = pd.DataFrame(tvec.fit_transform(X_train['clean_text']).todense(),
                            columns = tvec.get_feature_names())
# Transform testing data with the already-fit Vectorizer.
X_test_tvec = pd.DataFrame(tvec.transform(X_test['clean_text']).todense(),
                           columns = tvec.get_feature_names())
# fit model
lr = LogisticRegression(penalty='l2',C=2,random_state=42,solver='liblinear')
lr_model = lr.fit(X_train_tvec, y_train)
# Score model on the training set.
print('Logistic Regression model with TFIDF Vectorizer and TweetTokenzer')
print(f'Train data accuracy: {lr.score(X_train_tvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {lr.score(X_test_tvec,y_test):.3f}')

Logistic Regression model with TFIDF Vectorizer and TweetTokenzer
Train data accuracy: 0.842
Test data accuracy: 0.672




In [146]:
#Logistic Regression with CountVectorizer 
tknzr = TweetTokenizer()
cvec = CountVectorizer(stop_words=mystopwords, tokenizer=tknzr.tokenize)
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train['clean_text']).todense(), columns = cvec.get_feature_names())
X_test_cvec = pd.DataFrame(cvec.transform(X_test['clean_text']).todense(), columns = cvec.get_feature_names())

# fit model
lr = LogisticRegression(penalty='l2',C=.3,random_state=42,solver='liblinear')
lr_model = lr.fit(X_train_cvec, y_train)
# Score model on the training set.
print('Logistic Regression model with Count Vectorizer and TweetTokenzer')
print(f'Train data accuracy: {lr.score(X_train_cvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {lr.score(X_test_cvec,y_test):.3f}')



Logistic Regression model with Count Vectorizer and TweetTokenzer
Train data accuracy: 0.881
Test data accuracy: 0.676


In [157]:
# random forest
model = RandomForestClassifier(max_depth=20,random_state=42,min_samples_leaf=5,n_estimators=10)
model.fit(X_train_cvec,y_train)
y_pred = model.predict(X_test_cvec)
y_pred_train = model.predict(X_train_cvec)

print(f'Training accuracy is: {model.score(X_train_cvec,y_train):.3f}')
print(f'Testing accuracy is: {model.score(X_test_cvec,y_test):.3f}')

Training accuracy is: 0.508
Testing accuracy is: 0.502


### Analyze Results of Best Model

In [147]:
# from sklearn.metrics import confusion_matrix
# source: https://stackoverflow.com/questions/39770376/scikit-learn-get-accuracy-scores-for-each-class
y_pred = lr.predict(X_test_cvec)

#Get the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# normalize the diagonal entries
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

#The diagonal entries are the accuracies of each class
cm.diagonal()

categories = ['uninformative',"Casualties and damage","Caution and advice", 
        "Informative, other", "Information Source", "Donations of money, goods or services",
             'People missing, found or seen']
cm_df = pd.DataFrame(data=cm, index=categories,columns=None)
cm_df

Unnamed: 0,0,1,2,3,4,5,6
uninformative,0.913947,0.005935,0.041543,0.005935,0.011869,0.020772,0.0
Casualties and damage,0.311688,0.545455,0.090909,0.012987,0.038961,0.0,0.0
Caution and advice,0.275862,0.006897,0.682759,0.013793,0.013793,0.006897,0.0
"Informative, other",0.65625,0.03125,0.0625,0.125,0.109375,0.015625,0.0
Information Source,0.329545,0.079545,0.056818,0.068182,0.431818,0.034091,0.0
"Donations of money, goods or services",0.338983,0.067797,0.016949,0.0,0.084746,0.491525,0.0
"People missing, found or seen",0.0,0.090909,0.0,0.0,0.363636,0.181818,0.363636


## Use best model to predict categories for Hurricane Michael tweets

In [148]:
michael = pd.read_pickle('../data/hurricane_michael.pkl')
michael.shape

(50043, 11)

In [149]:
michael.head()

Unnamed: 0,date,favorites,geo,hashtags,id,mentions,permalink,retweets,text,to,username
0,2018-10-12 23:59:57+00:00,4,,#HurricaneMichael #blessed #UnitedWeStand #tal...,1050898900582838272,@COTNews,https://twitter.com/joeearenas/status/10508989...,0,@COTNews has been working 24/7 to restore serv...,,joeearenas
1,2018-10-12 23:59:53+00:00,1,,,1050898882526371842,,https://twitter.com/LakesideBexley/status/1050...,0,"In the wake of Hurricane Michael, we understan...",,LakesideBexley
2,2018-10-12 23:59:52+00:00,0,,#HurricaneMichael #Florida,1050898882077442048,,https://twitter.com/PRAISETRIUNEGOD/status/105...,0,"Maybe 17 "" #HurricaneMichael Updates: Body Fou...",,PRAISETRIUNEGOD
3,2018-10-12 23:59:49+00:00,0,,,1050898865988222976,,https://twitter.com/aShartee/status/1050898865...,0,In other news praying for those affected by hu...,,aShartee
4,2018-10-12 23:59:43+00:00,0,,#HurricaneMichael #ExcessiveForce,1050898841879236608,,https://twitter.com/MindOfMo/status/1050898841...,0,iSpy 2 or 3 who'd be hard-pressed to RUN in an...,CBSNews,MindOfMo


In [150]:
# clean tweet text
michael['clean_text'] = clean_tweets(michael['text'])

In [151]:
# Transform out-of-event testing data with the already-fit CountVectorizer.
michael_cvec = pd.DataFrame(cvec.transform(michael['clean_text']).todense(),
                           columns = cvec.get_feature_names())


In [152]:
michael['pred'] = lr_model.predict(michael_cvec)

In [153]:
michael.pred.value_counts()

0.0    28921
2.0     8360
1.0     4744
4.0     4174
5.0     2975
3.0      843
6.0       26
Name: pred, dtype: int64

In [154]:
michael.to_pickle('../data/michael_predictions.pkl')

In [None]:
# categories
# 0 'uninformative'
# 1 "Casualties and damage"
# 2 "Caution and advice", 
# 3 "Informative, other"
# 4 "Information Source", 
# 5 "Donations of money, goods or services",
# 6 'People missing, found or seen'

In [155]:
# make separate files for the first 20 records in each of 4 predicted categories 
mask = michael.pred == 1
casualties = michael[mask][0:20]

mask = michael.pred == 2
caution_advice = michael[mask][0:20]

mask = michael.pred == 4
info_source = michael[mask][0:20]

mask = michael.pred == 5
donations = michael[mask][0:20]

cols = ['permalink', 'text']
casualties = casualties[cols]
caution_advice = caution_advice[cols]
info_source = info_source[cols]
donations = donations[cols]

casualties.to_pickle('../data/casualties.pkl')
caution_advice.to_pickle('../data/caution_advice.pkl')
info_source.to_pickle('../data/info_source.pkl')
donations.to_pickle('../data/donations.pkl')