## Models of disaster tweet categories using Hurricanes Harvey, Irma, and Maria training data
  -- Several model types tested
  
  
  -- Use best model to predict categories for Hurricane Michael tweets (out-of-event sample)

In [3]:
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report,classification
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from nltk import TweetTokenizer
from nltk.corpus import stopwords
from gensim.test.utils import common_dictionary, common_corpus
from gensim.models import LsiModel
import pandas as pd
import numpy as np
import re



In [4]:
#read pickled training data for 3 hurricanes
tweets = pd.read_pickle("../data/train.pkl")

In [5]:
#baseline accuracy
tweets.y.value_counts(1)

1    0.491601
2    0.278394
3    0.095827
4    0.075753
5    0.034760
6    0.016799
7    0.005283
8    0.001585
Name: y, dtype: float64

In [6]:
tweets.head()

Unnamed: 0,tweet_id,text_info,text_human,tweet_text,event,y
0,905274232590004225,not_informative,not_relevant_or_cant_judge,"CONGRATS ON HITTING YOIR GOAL GUYS, I'm sure t...",harvey,4
1,901646074527535105,informative,injured_or_dead_people,RT @ajwamood: #ajwamood : Harvey the first maj...,harvey,6
8,901646123080830976,informative,other_relevant_information,RT @yIIeza: When we get back to SCHS after Har...,harvey,1
9,901646127895863296,informative,other_relevant_information,Not always good when your city shows up on a s...,harvey,1
10,901646131628830721,informative,other_relevant_information,RT @MSNBC: Side by side satellite images compa...,harvey,1


In [7]:
def clean_tweets(col):
    # convert text to lower case
    col = col.str.lower()

    # remove URLs
    col = col.apply(lambda x: re.split('https:\/\/.*', str(x))[0])
    col = col.apply(lambda x: re.split('http:\/\/.*', str(x))[0])
    col = col.replace(r'www\S+', '', regex=True)

    #remove "RT" string
    col = col.map(lambda x: x.lstrip('rt'))
    return col

In [8]:
tweets['clean_text'] = clean_tweets(tweets['tweet_text'])

In [9]:
#Name variables 
X = tweets[["clean_text"]]
y = tweets["y"]

#train test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=y)

In [10]:
# Load stopwords from NLTK and add custom words
mystopwords = stopwords.words('english')
mystopwords.extend(['hurricane','tornado','harvey','irma','joplin','sandy','maria',
                    'like','would','get','x200b','https','one','www','com','org','etc','could'])

In [11]:
# Naive Bayes with CVEC
tknzr = TweetTokenizer()
cvec = CountVectorizer(tokenizer=tknzr.tokenize,stop_words=mystopwords,
                       max_features=3000,max_df=1.0,min_df=2, 
                       ngram_range=(1,1))

# Fit  CountVectorizer on the training data and transform training data.
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train['clean_text']).todense(),
                            columns = cvec.get_feature_names())
# Transform our testing data with the already-fit CountVectorizer.
X_test_cvec = pd.DataFrame(cvec.transform(X_test['clean_text']).todense(),
                           columns = cvec.get_feature_names())

# instantiate and fit model 
nb = MultinomialNB(alpha=1)
nb.fit(X_train_cvec, y_train)

# Score model on the training set.
print('Naive Bayes model with Count Vectorizer')
print(f'Train data accuracy: {nb.score(X_train_cvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {nb.score(X_test_cvec,y_test):.3f}')

Naive Bayes model with Count Vectorizer
Train data accuracy: 0.774
Test data accuracy: 0.676


In [24]:
# Naive Bayes Bernouilli with Tfidf
tvec = TfidfVectorizer(tokenizer=tknzr.tokenize,stop_words=None,max_features=3000,max_df=1.0,min_df=2, ngram_range=(1,1))

# Fit our CountVectorizer on the training data and transform training data.
X_train_tvec = pd.DataFrame(tvec.fit_transform(X_train['clean_text']).todense(),
                            columns = tvec.get_feature_names())
# Transform our testing data with the already-fit CountVectorizer.
X_test_tvec = pd.DataFrame(tvec.transform(X_test['clean_text']).todense(),
                           columns = tvec.get_feature_names())

# instantiate and fit model 
nb = MultinomialNB(alpha=1)
nb.fit(X_train_tvec, y_train)

# Score model on the training set.
print('Naive Bayes model with TF-IDF Vectorizer')
print(f'Train data accuracy: {nb.score(X_train_tvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {nb.score(X_test_tvec,y_test):.3f}')

Naive Bayes model with TF-IDF Vectorizer
Train data accuracy: 0.702
Test data accuracy: 0.662


In [12]:
# SVM model with CVEC
cvec = CountVectorizer(stop_words=mystopwords,max_features=800, ngram_range=(1,1))

# Fit  CountVectorizer on the training data and transform training data.
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train['clean_text']).todense(),
                            columns = cvec.get_feature_names())
# Transform our testing data with the already-fit CountVectorizer.
X_test_cvec = pd.DataFrame(cvec.transform(X_test['clean_text']).todense(),
                           columns = cvec.get_feature_names())
# Instantiate SVM.
# svc = svm.SVC(kernel='poly', C = 1.8, gamma = .05) 
svc = svm.SVC(kernel='rbf', C = 2, gamma = .2)  

# Fit on training data.
svc.fit(X_train_cvec,y_train)

# Score model on the training set.
print('Support Vector Machine model with Count Vectorizer')
print(f'Train data accuracy: {svc.score(X_train_cvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {svc.score(X_test_cvec,y_test):.3f}')

Support Vector Machine model with Count Vectorizer
Train data accuracy: 0.884
Test data accuracy: 0.687


In [17]:
#Logistic Regression with CountVectorizer 
tknzr = TweetTokenizer()
cvec = CountVectorizer(stop_words="english", tokenizer=tknzr.tokenize, ngram_range=(1,1))
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train['clean_text']).todense(), columns = cvec.get_feature_names())
X_test_cvec = pd.DataFrame(cvec.transform(X_test['clean_text']).todense(), columns = cvec.get_feature_names())
lr = LogisticRegression(penalty='l2',C=1.5,random_state=42, solver='liblinear', multi_class='ovr')
lr_model = lr.fit(X_train_cvec, y_train)

# Score model on the training set.
print('Logistic Regression model with Count Vectorizer and TweetTokenzer')
print(f'Train data accuracy: {lr.score(X_train_cvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {lr.score(X_test_cvec,y_test):.3f}')

Logistic Regression model with Count Vectorizer and TweetTokenzer
Train data accuracy: 0.957
Test data accuracy: 0.706


In [18]:
# random forest
model = RandomForestClassifier(max_depth=20,random_state=42,min_samples_leaf=5,n_estimators=10)
model.fit(X_train_cvec,y_train)
y_pred = model.predict(X_test_cvec)
y_pred_train = model.predict(X_train_cvec)

print(f'Training R-sq is: {model.score(X_train_cvec,y_train):.3f}')
print(f'Testing R-sq is: {model.score(X_test_cvec,y_test):.3f}')

Training R-sq is: 0.587
Testing R-sq is: 0.573


### Analyze Results of Best Model

In [19]:
# from sklearn.metrics import confusion_matrix
# source: https://stackoverflow.com/questions/39770376/scikit-learn-get-accuracy-scores-for-each-class
y_pred = lr.predict(X_test_cvec)

#Get the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# normalize the diagonal entries
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

#The diagonal entries are the accuracies of each class
cm.diagonal()

categories = ["other_relevant_information","rescue_volunteering_or_donation_effort", 
     "infrastructure_and_utility_damage", "not_relevant_or_cant_judge",
     "affected_individuals", 'injured_or_dead_people', 'vehicle_damage',
     'missing_or_found_people']
cm_df = pd.DataFrame(data=cm, index=categories,columns=None)
cm_df

Unnamed: 0,0,1,2,3,4,5,6,7
other_relevant_information,0.866838,0.079038,0.032646,0.018041,0.003436,0.0,0.0,0.0
rescue_volunteering_or_donation_effort,0.171472,0.798179,0.010622,0.013657,0.00607,0.0,0.0,0.0
infrastructure_and_utility_damage,0.568282,0.039648,0.378855,0.008811,0.004405,0.0,0.0,0.0
not_relevant_or_cant_judge,0.75419,0.100559,0.027933,0.106145,0.011173,0.0,0.0,0.0
affected_individuals,0.597561,0.292683,0.036585,0.012195,0.060976,0.0,0.0,0.0
injured_or_dead_people,0.25,0.125,0.0,0.0,0.05,0.575,0.0,0.0
vehicle_damage,0.75,0.0,0.083333,0.0,0.0,0.0,0.166667,0.0
missing_or_found_people,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Use model to make category predictions for Hurricane Michael tweets

In [20]:
michael = pd.read_pickle('../data/hurricane_michael.pkl')

In [21]:
michael.shape

(50043, 11)

In [22]:
michael.head()

Unnamed: 0,date,favorites,geo,hashtags,id,mentions,permalink,retweets,text,to,username
0,2018-10-12 23:59:57+00:00,4,,#HurricaneMichael #blessed #UnitedWeStand #tal...,1050898900582838272,@COTNews,https://twitter.com/joeearenas/status/10508989...,0,@COTNews has been working 24/7 to restore serv...,,joeearenas
1,2018-10-12 23:59:53+00:00,1,,,1050898882526371842,,https://twitter.com/LakesideBexley/status/1050...,0,"In the wake of Hurricane Michael, we understan...",,LakesideBexley
2,2018-10-12 23:59:52+00:00,0,,#HurricaneMichael #Florida,1050898882077442048,,https://twitter.com/PRAISETRIUNEGOD/status/105...,0,"Maybe 17 "" #HurricaneMichael Updates: Body Fou...",,PRAISETRIUNEGOD
3,2018-10-12 23:59:49+00:00,0,,,1050898865988222976,,https://twitter.com/aShartee/status/1050898865...,0,In other news praying for those affected by hu...,,aShartee
4,2018-10-12 23:59:43+00:00,0,,#HurricaneMichael #ExcessiveForce,1050898841879236608,,https://twitter.com/MindOfMo/status/1050898841...,0,iSpy 2 or 3 who'd be hard-pressed to RUN in an...,CBSNews,MindOfMo


In [23]:
# clean tweets using previously defined function
michael['clean_text'] = clean_tweets(michael['text'])

In [24]:
# Transform out-of-event testing data with the already-fit CountVectorizer.
michael_cvec = pd.DataFrame(cvec.transform(michael['clean_text']).todense(),
                           columns = cvec.get_feature_names())


In [25]:
michael['pred'] = lr_model.predict(michael_cvec)

In [33]:
michael.pred.value_counts(1)

1    0.512759
2    0.213756
3    0.160462
6    0.057371
4    0.037228
5    0.018224
7    0.000200
Name: inform_pred, dtype: float64

In [None]:
#  {"other_relevant_information":1,"rescue_volunteering_or_donation_effort":2, 
#      "infrastructure_and_utility_damage":3, "not_relevant_or_cant_judge":4,
#      "affected_individuals":5, 'injured_or_dead_people':6, 'vehicle_damage':7,
#      'missing_or_found_people':8})

In [39]:
mask = michael.inform_pred==7
for tweet in michael.text[mask]:
    print(tweet)
    print('---------------------------------')

Cars, boats carried away by Hurricane Michael, reports @MikeMagsCBS12 http://bit.ly/2QKPqvz pic.twitter.com/LfEKVgca0z
---------------------------------
@Honda thank you for building such safe and reliable cars. We rode out Hurricane Michael in a Civic, survived without a scratch, and I'm happy to say we're driving it back to Texas right now. This car needs to be in a museum. pic.twitter.com/xU0aqNs5xl
---------------------------------
All those flooded cars in #Florida? @Ford will be making up the $1B Trump has cost them when people buy their new cars. #HurricaneMichael http://fortune.com/2018/10/09/ford-stock-today-layoffs-trump-trade-tariffs/ …
---------------------------------
Rail cars flipped on their sides from the force of #HurricaneMichael in Panama City @winknewspic.twitter.com/SsCbObMNQq
---------------------------------
literally sitting in my car, listening to 90.5, charging my phone... must not be the only one, cause I see 2 other cars sitting too #noelectricity #Hurrican