In [47]:
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report,classification
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from nltk import TweetTokenizer
from nltk.corpus import stopwords
from gensim.test.utils import common_dictionary, common_corpus
from gensim.models import LsiModel
import pandas as pd
import re

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pauls\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
#read pickled data
tweets = pd.read_pickle("../data/train3.pkl")

In [74]:
#baseline accuracy
distribution = tweets.y.value_counts(1)

In [77]:
list(distribution)

[0.33659132559560173,
 0.20769700671960903,
 0.17593158216249236,
 0.15271838729383017,
 0.08796579108124618,
 0.03909590714722053]

In [48]:
# convert text to lower case
tweets['clean_text'] = tweets['tweet'].str.lower()

# remove URLs

tweets['clean_text'] = tweets['clean_text'].apply(lambda x: re.split('https:\/\/.*', str(x))[0])

# #remove "RT" string
# tweets['clean_text'] = tweets['clean_text'].map(lambda x: x.lstrip('rt'))

# #remove remaining punctuation except for "#"
# tweets['clean_text'] = tweets['clean_text'].str.replace("[^a-zA-Z]", " ")

In [49]:
#Name variables 

X = tweets[["clean_text"]]
y = tweets["y"]

#train test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=y)

In [19]:
# Load stopwords from NLTK and add custom words
mystopwords = stopwords.words('english')
mystopwords.extend(['hurricane','tornado','harvey','irma','joplin','sandy','maria',
                    'like','would','get','x200b','https','one','www','com','org','etc','could'])

In [22]:
# Naive Bayes with CVEC
tknzr = TweetTokenizer()
cvec = CountVectorizer(tokenizer=tknzr.tokenize,stop_words=mystopwords,
                       max_features=3000,max_df=1.0,min_df=2, 
                       ngram_range=(1,1))

# Fit  CountVectorizer on the training data and transform training data.
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train['tweet']).todense(),
                            columns = cvec.get_feature_names())
# Transform our testing data with the already-fit CountVectorizer.
X_test_cvec = pd.DataFrame(cvec.transform(X_test['tweet']).todense(),
                           columns = cvec.get_feature_names())

# instantiate and fit model 
nb = MultinomialNB(alpha=1)
nb.fit(X_train_cvec, y_train)

# Score model on the training set.
print('Naive Bayes model with Count Vectorizer')
print(f'Train data accuracy: {nb.score(X_train_cvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {nb.score(X_test_cvec,y_test):.3f}')

Naive Bayes model with Count Vectorizer
Train data accuracy: 0.924
Test data accuracy: 0.702


In [23]:
# Naive Bayes Bernouilli with Tfidf
tvec = TfidfVectorizer(tokenizer=tknzr.tokenize,stop_words=None,max_features=3000,max_df=1.0,min_df=2, ngram_range=(1,1))

# Fit our CountVectorizer on the training data and transform training data.
X_train_tvec = pd.DataFrame(tvec.fit_transform(X_train['tweet']).todense(),
                            columns = tvec.get_feature_names())
# Transform our testing data with the already-fit CountVectorizer.
X_test_tvec = pd.DataFrame(tvec.transform(X_test['tweet']).todense(),
                           columns = tvec.get_feature_names())

# instantiate and fit model 
nb = MultinomialNB(alpha=1)
nb.fit(X_train_tvec, y_train)

# Score model on the training set.
print('Naive Bayes model with TF-IDF Vectorizer')
print(f'Train data accuracy: {nb.score(X_train_tvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {nb.score(X_test_tvec,y_test):.3f}')

Naive Bayes model with TF-IDF Vectorizer
Train data accuracy: 0.822
Test data accuracy: 0.624


In [42]:
# SVM model with CVEC
cvec = CountVectorizer(stop_words=mystopwords,max_features=800, ngram_range=(1,1))

# Fit  CountVectorizer on the training data and transform training data.
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train['tweet']).todense(),
                            columns = cvec.get_feature_names())
# Transform our testing data with the already-fit CountVectorizer.
X_test_cvec = pd.DataFrame(cvec.transform(X_test['tweet']).todense(),
                           columns = cvec.get_feature_names())
# Instantiate SVM.
# svc = svm.SVC(kernel='poly', C = 1.8, gamma = .05) 
svc = svm.SVC(kernel='rbf', C = 2, gamma = .2)  

# Fit on training data.
svc.fit(X_train_cvec,y_train)

# Score model on the training set.
print('Support Vector Machine model with Count Vectorizer')
print(f'Train data accuracy: {svc.score(X_train_cvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {svc.score(X_test_cvec,y_test):.3f}')

Support Vector Machine model with Count Vectorizer
Train data accuracy: 0.969
Test data accuracy: 0.817


In [68]:
#CountVectorizer 

tknzr = TweetTokenizer()
cvec = CountVectorizer(stop_words="english", tokenizer=tknzr.tokenize)
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train['clean_text']).todense(), columns = cvec.get_feature_names())
X_test_cvec = pd.DataFrame(cvec.transform(X_test['clean_text']).todense(), columns = cvec.get_feature_names())
lr = LogisticRegression(C=1.5,random_state=42)
lr_model = lr.fit(X_train_cvec, y_train)
# Score model on the training set.
print('Logistic Regression model with Count Vectorizer and TweetTokenzer')
print(f'Train data accuracy: {lr.score(X_train_cvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {lr.score(X_test_cvec,y_test):.3f}')



Logistic Regression model with Count Vectorizer and TweetTokenzer
Train data accuracy: 0.989
Test data accuracy: 0.859


In [None]:
# Latent Semantic Indexing
tknzr = TweetTokenizer()
cvec = CountVectorizer(stop_words="english", tokenizer=tknzr.tokenize)
model = LsiModel(cvec, id2word=common_dictionary)
# vectorized_corpus = model[cvec]


X_train_cvec = pd.DataFrame(model[X_train['tweet']).todense(), columns = cvec.get_feature_names())
X_test_cvec = pd.DataFrame(cvec.transform(X_test['tweet']).todense(), columns = cvec.get_feature_names())
lr = LogisticRegression()
lr_model = lr.fit(X_train_cvec, y_train)
# Score model on the training set.
print('Logistic Regression model with Count Vectorizer and TweetTokenzer')
print(f'Train data accuracy: {lr.score(X_train_cvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {lr.score(X_test_cvec,y_test):.3f}')

In [69]:
# random forest
model = RandomForestClassifier(max_depth=20,random_state=42,min_samples_leaf=5,n_estimators=10)
model.fit(X_train_cvec,y_train)
y_pred = model.predict(X_test_cvec)
y_pred_train = model.predict(X_train_cvec)

print(f'Training R-sq is: {model.score(X_train_cvec,y_train):.3f}')
print(f'Testing R-sq is: {model.score(X_test_cvec,y_test):.3f}')

Training R-sq is: 0.502
Testing R-sq is: 0.427


### Analyze Results of Best Model

In [28]:
# print confusion matrix
# y_pred = lr.predict(X_test_cvec)
# cm = confusion_matrix(y_test, y_pred)
# cm_df = pd.DataFrame(cm, columns=['predict neg', 'predict pos'], index=['actual neg', 'actual pos'])
# cm_df

In [29]:
michael = pd.read_pickle('../data/hurricane_michael.pkl')

In [30]:
michael.shape

(50043, 11)

In [17]:
michael.head()

Unnamed: 0,date,favorites,geo,hashtags,id,mentions,permalink,retweets,text,to,username
0,2018-10-12 23:59:57+00:00,4,,#HurricaneMichael #blessed #UnitedWeStand #tal...,1050898900582838272,@COTNews,https://twitter.com/joeearenas/status/10508989...,0,@COTNews has been working 24/7 to restore serv...,,joeearenas
1,2018-10-12 23:59:53+00:00,1,,,1050898882526371842,,https://twitter.com/LakesideBexley/status/1050...,0,"In the wake of Hurricane Michael, we understan...",,LakesideBexley
2,2018-10-12 23:59:52+00:00,0,,#HurricaneMichael #Florida,1050898882077442048,,https://twitter.com/PRAISETRIUNEGOD/status/105...,0,"Maybe 17 "" #HurricaneMichael Updates: Body Fou...",,PRAISETRIUNEGOD
3,2018-10-12 23:59:49+00:00,0,,,1050898865988222976,,https://twitter.com/aShartee/status/1050898865...,0,In other news praying for those affected by hu...,,aShartee
4,2018-10-12 23:59:43+00:00,0,,#HurricaneMichael #ExcessiveForce,1050898841879236608,,https://twitter.com/MindOfMo/status/1050898841...,0,iSpy 2 or 3 who'd be hard-pressed to RUN in an...,CBSNews,MindOfMo


In [51]:
# convert text to lower case
michael['clean_text'] = michael['text'].str.lower()

# remove URLs
import re
michael['clean_text'] = michael['clean_text'].apply(lambda x: re.split('https:\/\/.*', str(x))[0])

# #remove "RT" string
# michael['clean_text'] = michael['clean_text'].map(lambda x: x.lstrip('rt'))

# #remove remaining punctuation except for "#"
# michael['clean_text'] = michael['clean_text'].str.replace("[^a-zA-Z]", " ")

In [52]:
# Transform out-of-event testing data with the already-fit CountVectorizer.
michael_cvec = pd.DataFrame(cvec.transform(michael['clean_text']).todense(),
                           columns = cvec.get_feature_names())


In [53]:
michael['inform_pred'] = lr_model.predict(michael_cvec)

In [54]:
michael.inform_pred.value_counts()

0.0    40534
2.0     3646
1.0     3057
3.0     1313
4.0     1053
5.0      440
Name: inform_pred, dtype: int64

In [26]:
# michael.inform_pred.value_counts()  - before cleaning 200 more tweets were considered informative by model

1    40151
0     9892
Name: inform_pred, dtype: int64

In [None]:
# "Casualties and damage":1,"Caution and advice":2, 
#      "Unknown":3, "Information Source":4, "Donations of money, goods or services":5})

In [36]:
mask = michael.inform_pred==1
for tweet in michael.text[mask]:
    print(tweet)
    print('---------------------------------')

In the wake of Hurricane Michael, we understand the importance of knowing where to find aid. For storm recovery resources, please visit the following link: https://feaweb.org/hurricane-michael-resources-and-relief …https://feaweb.org/hurricane-michael-resources-and-relief …
---------------------------------
iSpy 2 or 3 who'd be hard-pressed to RUN in an emergency. I mention it cuz: 1. POTUS is not honoring #HurricaneMichael, WTF. 2. Suspect overweight/unfit contributes to #ExcessiveForce, including/especially fatally shooting FAST. 3. LEO mostly giving Heir carte blanche (like MBS).https://twitter.com/CBSNews/status/1050891575067377664 …
---------------------------------
iSpy 2 or 3 who'd be hard-pressed to RUN in an emergency. I mention it cuz: 1. POTUS is not honoring #HurricaneMichael, WTF. 2. Suspect overweight/unfit contributes to #ExcessiveForce, including/especially fatally shooting FAST. 3. LEO mostly giving Heir carte blanche (like MBS).
---------------------------------
Arizo

---------------------------------
11 people are confirmed dead in the aftermath of Hurricane Michael. More than a million people in 6 states don't have electricity. pic.twitter.com/ADAq2wgJqD
---------------------------------
There’s no telling how long power restoration will take in the Florida Panhandle. The combination of State, county, and local crews isn’t enough. So, federal help is coming. #HurricaneMichael Full story -> http://goo.gl/FDwNJj
---------------------------------
#AnsweringTheCall #MutualAId #HurricaneMichael 1/2 the transmission is down, 2/3 of distributions circuits without load and 106,000 customers without power. NEPPA members respond. Be safe and THANK YOU. #CommunityPoweredpic.twitter.com/WyRK7RO88N
---------------------------------
My heart is breaking for the #FloridaPanhandle in the aftermath of #HurricaneMichael. Growing up in this area, I never imagined I would see so much destruction. So many places l have visited have been destroyed. It will take a while

---------------------------------
#HurricaneMichael's death toll rises to 11, including 5 in Virginia: CNN https://www.cnn.com/2018/10/12/us/hurricane-michael-wxc/index.html … | More w/ Eco-Search: https://search.ecointernet.org/search/results?q=hurricane%20michael%27s%20death%20toll%20rises%20including%20virginia&w=relevance …
---------------------------------
Witnesses said they saw the wind lift homes into the air. The aftermath of #HurricaneMichael. @nytimes analyze aerial images of 1 Mile of Devastation in Florida. @singhvianjali @dwtkns @kkrebeccalai @TroyEricG @kkrebeccalai @karenyourish https://www.nytimes.com/interactive/2018/10/12/us/mexico-beach-fl-damage-map.html?action=click&module=Spotlight&pgtype=Homepage … #FloridaPanhandle
---------------------------------
#HurricaneMichael left a trail of destruction in Florida — now the rebuilding process begins. Our crews are working around the clock with @GulfPower to get power back on for customers in the Panama City area. #boundl

Good Morning! Since the effects of Hurricane Michael, have you checked on your neighbors? If they don’t have power, it’s a possibility they don’t have food. For those who are still without power, hang in... https://www.facebook.com/100001032722046/posts/1995426380501767/ …
---------------------------------
Hurricane Michael aftermath: Death toll spikes after five storm-related fatalities reported in Virginia - The Washington Post https://apple.news/A9yq0L2J_QSGLmrgveKFO7g …
---------------------------------
Hurricane Michael aftermath: Death toll spikes after five storm-related fatalities reported in Virginia - The Washington Post https://apple.news/A9yq0L2J_QSGLmrgveKFO7g …
---------------------------------
Hurricane Michael aftermath: Death toll spikes after five storm-related fatalities reported in Virginia https://www.washingtonpost.com/news/post-nation/wp/2018/10/12/hurricane-michael-updates-cleanup-and-recovery-in-the-aftermath-of-a-deadly-storm/ …
-------------------------------

Tropical Storm #Michael took its drenching rains to #Georgia and the #Carolinas on Thursday after devastating Florida's #Panhandle, killing at least two people, reducing homes to rubble and ripping up power lines and tree. | #HurricaneMichael https://www.deccanherald.com/international/hurricane-michael-kills-one-697325.html …
---------------------------------
UPDATE 10:00 PM ~235K without power in Georgia — (GPC and EMC) For those in the dark tonight across South GA, and particularly those who have lost much, people across the state are thinking of you and crews are working hard to get you back online. #HurricaneMichael (1 of 2) pic.twitter.com/uLiQTVhLFF
---------------------------------
Watch: Condo destroyed in Crawfordville by Hurricane Michael, reports @MikeMagsCBS12 http://bit.ly/2PstfKk pic.twitter.com/woEkmUaazq
---------------------------------
No class Friday for @CharMeckSchools. 32 schools affected by #HurricaneMichael power outages. Thank you to the @DukeEnergy crews worki

---------------------------------
David Johnson: The Wall Street Journal: Hurricane Michael leaves trail of devastation; hundreds of thousands without power http://dlvr.it/Qn3WDC pic.twitter.com/vzbQW1961G
---------------------------------
Hurricane Michael tears apart Florida towns, 7 dead: Hurricane Michael's violence was visible on Thursday in shattered Florida coastal towns, where rows of homes were ripped from foundations and roofs were… http://dlvr.it/Qn3W2Q #ImpeachTrump #ImpeachKavanaugh #TheResistancepic.twitter.com/kzT99R4Y3w
---------------------------------
Thousands of residences and businesses are without power as remnants of Hurricane Michael near North Carolina. https://trib.al/SwfAndD
---------------------------------
Via @Reuters: Hurricane Michael tears apart Florida towns, 7 dead http://dlvr.it/Qn3Vt9 pic.twitter.com/XlZvTDayio
---------------------------------
If your power is out, keep refrigerator and freezer doors closed. #HurricaneMichael pic.twitter.com/UvrkvZ

---------------------------------
Hurricane Michael took at least 1 life, left families homeless, thousands without power, completely devastated areas in Florida... But sure just continue to circle jerk with Kanye West in the Oval office today @realDonaldTrump Good job. Great leadership
---------------------------------
As you have likely seen the news, Hurricane Michael came through the panhandle of Florida on Wednesday. The City of Tallahassee and Leon County, are 97% without power to homes and businesses. Our office is... https://www.facebook.com/136085581398/posts/10156595833141399/ …
---------------------------------
@wxbrad too rainy & windy to take a pic right now but a huge tree on my street fell, took power lines down in Kannapolis near Cannon YMCA & NCRC #HurricaneMichael #CLTwx
---------------------------------
#EuronewsTonight | "This is one of the strongest hurricanes ever to hit the United States in recorded history" Jonathan Petramala ( @jpetramala), reporter from @accuw