In [1]:
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report,classification
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from nltk import TweetTokenizer
from nltk.corpus import stopwords
from gensim.test.utils import common_dictionary, common_corpus
from gensim.models import LsiModel
import pandas as pd
import numpy as np
import re



In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pauls\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
#read pickled data
tweets = pd.read_pickle("../data/train_sandy.pkl")

In [7]:
#baseline accuracy
tweets.y.value_counts(1)

0.0    0.457
1.0    0.170
2.0    0.144
3.0    0.125
4.0    0.072
5.0    0.032
Name: y, dtype: float64

In [8]:
# convert text to lower case
tweets['clean_text'] = tweets['tweet'].str.lower()

# remove URLs
tweets['clean_text'] = tweets['clean_text'].apply(lambda x: re.split('https:\/\/.*', str(x))[0])
tweets['clean_text'] = tweets['clean_text'].apply(lambda x: re.split('http:\/\/.*', str(x))[0])
tweets['clean_text'] = tweets['clean_text'].replace(r'www\S+', '', regex=True)

#remove "RT" string
tweets['clean_text'] = tweets['clean_text'].map(lambda x: x.lstrip('rt'))

# #remove remaining punctuation except for "#"
# tweets['clean_text'] = tweets['clean_text'].str.replace("[^a-zA-Z]", " ")

In [9]:
#Name variables 

X = tweets[["clean_text"]]
y = tweets["y"]

#train test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=y)

In [10]:
# Load stopwords from NLTK and add custom words
mystopwords = stopwords.words('english')
mystopwords.extend(['hurricane','tornado','harvey','irma','joplin','sandy','maria',
                    'like','would','get','x200b','https','one','www','com','org','etc','could'])

In [20]:
# Naive Bayes with CVEC
tknzr = TweetTokenizer()
cvec = CountVectorizer(tokenizer=tknzr.tokenize,stop_words=mystopwords,
                       max_features=3000,max_df=1.0,min_df=2, 
                       ngram_range=(1,1))

# Fit  CountVectorizer on the training data and transform training data.
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train['clean_text']).todense(),
                            columns = cvec.get_feature_names())
# Transform our testing data with the already-fit CountVectorizer.
X_test_cvec = pd.DataFrame(cvec.transform(X_test['clean_text']).todense(),
                           columns = cvec.get_feature_names())

# instantiate and fit model 
nb = MultinomialNB(alpha=1)
nb.fit(X_train_cvec, y_train)

# Score model on the training set.
print('Naive Bayes model with Count Vectorizer')
print(f'Train data accuracy: {nb.score(X_train_cvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {nb.score(X_test_cvec,y_test):.3f}')

Naive Bayes model with Count Vectorizer
Train data accuracy: 0.837
Test data accuracy: 0.659


In [21]:
# Naive Bayes Bernouilli with Tfidf
tvec = TfidfVectorizer(tokenizer=tknzr.tokenize,stop_words=None,max_features=3000,max_df=1.0,min_df=2, ngram_range=(1,1))

# Fit our CountVectorizer on the training data and transform training data.
X_train_tvec = pd.DataFrame(tvec.fit_transform(X_train['clean_text']).todense(),
                            columns = tvec.get_feature_names())
# Transform our testing data with the already-fit CountVectorizer.
X_test_tvec = pd.DataFrame(tvec.transform(X_test['clean_text']).todense(),
                           columns = tvec.get_feature_names())

# instantiate and fit model 
nb = MultinomialNB(alpha=1)
nb.fit(X_train_tvec, y_train)

# Score model on the training set.
print('Naive Bayes model with TF-IDF Vectorizer')
print(f'Train data accuracy: {nb.score(X_train_tvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {nb.score(X_test_tvec,y_test):.3f}')

Naive Bayes model with TF-IDF Vectorizer
Train data accuracy: 0.653
Test data accuracy: 0.560


In [23]:
# SVM model with CVEC
cvec = CountVectorizer(stop_words=mystopwords,max_features=800, ngram_range=(1,1))

# Fit  CountVectorizer on the training data and transform training data.
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train['clean_text']).todense(),
                            columns = cvec.get_feature_names())
# Transform our testing data with the already-fit CountVectorizer.
X_test_cvec = pd.DataFrame(cvec.transform(X_test['clean_text']).todense(),
                           columns = cvec.get_feature_names())
# Instantiate SVM.
# svc = svm.SVC(kernel='poly', C = 1.8, gamma = .05) 
svc = svm.SVC(kernel='rbf', C = 1, gamma = .2)  

# Fit on training data.
svc.fit(X_train_cvec,y_train)

# Score model on the training set.
print('Support Vector Machine model with Count Vectorizer')
print(f'Train data accuracy: {svc.score(X_train_cvec,y_train):.3f}')

# Score model on the testing set.
print(f'Test data accuracy: {svc.score(X_test_cvec,y_test):.3f}')

Support Vector Machine model with Count Vectorizer
Train data accuracy: 0.845
Test data accuracy: 0.609


In [38]:
#CountVectorizer 

tknzr = TweetTokenizer()
cvec = CountVectorizer(stop_words="english", tokenizer=None)
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train['clean_text']).todense(), columns = cvec.get_feature_names())
X_test_cvec = pd.DataFrame(cvec.transform(X_test['clean_text']).todense(), columns = cvec.get_feature_names())
lr = LogisticRegression(penalty='l2',C=8,random_state=42,solver='liblinear')
lr_model = lr.fit(X_train_cvec, y_train)
# Score model on the training set.
print('Logistic Regression model with Count Vectorizer and TweetTokenzer')
print(f'Train data accuracy: {lr.score(X_train_cvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {lr.score(X_test_cvec,y_test):.3f}')

Logistic Regression model with Count Vectorizer and TweetTokenzer
Train data accuracy: 0.979
Test data accuracy: 0.660




In [None]:
# Latent Semantic Indexing
tknzr = TweetTokenizer()
cvec = CountVectorizer(stop_words="english", tokenizer=tknzr.tokenize)
model = LsiModel(cvec, id2word=common_dictionary)
# vectorized_corpus = model[cvec]


X_train_cvec = pd.DataFrame(model[X_train['tweet']).todense(), columns = cvec.get_feature_names())
X_test_cvec = pd.DataFrame(cvec.transform(X_test['tweet']).todense(), columns = cvec.get_feature_names())
lr = LogisticRegression()
lr_model = lr.fit(X_train_cvec, y_train)
# Score model on the training set.
print('Logistic Regression model with Count Vectorizer and TweetTokenzer')
print(f'Train data accuracy: {lr.score(X_train_cvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {lr.score(X_test_cvec,y_test):.3f}')

In [24]:
# random forest
model = RandomForestClassifier(max_depth=20,random_state=42,min_samples_leaf=5,n_estimators=10)
model.fit(X_train_cvec,y_train)
y_pred = model.predict(X_test_cvec)
y_pred_train = model.predict(X_train_cvec)

print(f'Training R-sq is: {model.score(X_train_cvec,y_train):.3f}')
print(f'Testing R-sq is: {model.score(X_test_cvec,y_test):.3f}')

Training R-sq is: 0.550
Testing R-sq is: 0.531


### Analyze Results of Best Model

In [61]:
# from sklearn.metrics import confusion_matrix
# source: https://stackoverflow.com/questions/39770376/scikit-learn-get-accuracy-scores-for-each-class
y_pred = lr.predict(X_test_cvec)

#Get the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# normalize the diagonal entries
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

#The diagonal entries are the accuracies of each class
cm.diagonal()

categories = ['uninformative',"Casualties and damage","Caution and advice", 
        "Informative, other", "Information Source", "Donations of money, goods or services"]
cm_df = pd.DataFrame(data=cm, index=categories,columns=None)
cm_df

Unnamed: 0,0,1,2,3,4,5
uninformative,0.847826,0.057971,0.007246,0.043478,0.021739,0.021739
Casualties and damage,0.058824,0.894118,0.023529,0.023529,0.0,0.0
Caution and advice,0.013889,0.055556,0.888889,0.0,0.0,0.041667
"Informative, other",0.095238,0.031746,0.063492,0.809524,0.0,0.0
Information Source,0.0,0.055556,0.0,0.111111,0.833333,0.0
"Donations of money, goods or services",0.0,0.0,0.0,0.0,0.0,1.0


In [8]:
michael = pd.read_pickle('../data/hurricane_michael.pkl')

In [9]:
michael.shape

(50043, 11)

In [17]:
michael.head()

Unnamed: 0,date,favorites,geo,hashtags,id,mentions,permalink,retweets,text,to,username
0,2018-10-12 23:59:57+00:00,4,,#HurricaneMichael #blessed #UnitedWeStand #tal...,1050898900582838272,@COTNews,https://twitter.com/joeearenas/status/10508989...,0,@COTNews has been working 24/7 to restore serv...,,joeearenas
1,2018-10-12 23:59:53+00:00,1,,,1050898882526371842,,https://twitter.com/LakesideBexley/status/1050...,0,"In the wake of Hurricane Michael, we understan...",,LakesideBexley
2,2018-10-12 23:59:52+00:00,0,,#HurricaneMichael #Florida,1050898882077442048,,https://twitter.com/PRAISETRIUNEGOD/status/105...,0,"Maybe 17 "" #HurricaneMichael Updates: Body Fou...",,PRAISETRIUNEGOD
3,2018-10-12 23:59:49+00:00,0,,,1050898865988222976,,https://twitter.com/aShartee/status/1050898865...,0,In other news praying for those affected by hu...,,aShartee
4,2018-10-12 23:59:43+00:00,0,,#HurricaneMichael #ExcessiveForce,1050898841879236608,,https://twitter.com/MindOfMo/status/1050898841...,0,iSpy 2 or 3 who'd be hard-pressed to RUN in an...,CBSNews,MindOfMo


In [38]:
# convert text to lower case
michael['clean_text'] = michael['text'].str.lower()

# remove URLs
michael['clean_text'] = michael['clean_text'].apply(lambda x: re.split('https:\/\/.*', str(x))[0])
michael['clean_text'] = michael['clean_text'].apply(lambda x: re.split('http:\/\/.*', str(x))[0])
michael['clean_text'] = michael['clean_text'].replace(r'www\S+', '', regex=True)

#remove "RT" string
michael['clean_text'] = michael['clean_text'].map(lambda x: x.lstrip('rt'))

# #remove remaining punctuation except for "#"
# michael['clean_text'] = michael['clean_text'].str.replace("[^a-zA-Z]", " ")

In [39]:
# Transform out-of-event testing data with the already-fit CountVectorizer.
michael_cvec = pd.DataFrame(cvec.transform(michael['clean_text']).todense(),
                           columns = cvec.get_feature_names())


In [40]:
michael['pred'] = lr_model.predict(michael_cvec)

In [41]:
michael.to_pickle('../data/michael_predictions.pkl')

In [42]:
# run vectorizer on tweets from ENTIRE original tweet3 data
X_cvec = pd.DataFrame(cvec.transform(tweets['clean_text']).todense(), columns = cvec.get_feature_names())

In [43]:
tweets['pred'] = lr_model.predict(X_cvec)

In [44]:
tweets.to_pickle('../data/train3_pred.pkl')

In [45]:
michael.pred.value_counts()

0.0    32053
2.0     8615
1.0     4859
4.0     2243
3.0     1239
5.0     1034
Name: pred, dtype: int64

In [None]:
# uninformative = 0
# "Casualties and damage":1,"Caution and advice":2, 
#      "Informative, other":3, "Information Source":4, "Donations of money, goods or services":5})

In [46]:
mask = michael.pred == 1
casualties = michael[mask][0:20]

In [47]:
mask = michael.pred == 2
caution_advice = michael[mask][0:20]

In [48]:
mask = michael.pred == 4
info_source = michael[mask][0:20]

In [49]:
mask = michael.pred == 5
donations = michael[mask][0:20]

In [50]:
cols = ['permalink', 'text']

In [51]:
casualties = casualties[cols]

In [52]:
caution_advice = caution_advice[cols]

In [53]:
info_source = info_source[cols]

In [54]:
donations = donations[cols]

In [55]:
casualties.to_pickle('../data/casualties.pkl')

In [56]:
caution_advice.to_pickle('../data/caution_advice.pkl')

In [57]:
info_source.to_pickle('../data/info_source.pkl')

In [58]:
donations.to_pickle('../data/donations.pkl')