In [1]:
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report,classification
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from nltk import TweetTokenizer
from nltk.corpus import stopwords
from gensim.test.utils import common_dictionary, common_corpus
from gensim.models import LsiModel
import pandas as pd
import re



In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pauls\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#read pickled data
tweets = pd.read_pickle("../data/train3.pkl")

In [4]:
#baseline accuracy
tweets.y.value_counts(1)

0.0    0.336591
1.0    0.207697
2.0    0.175932
3.0    0.152718
4.0    0.087966
5.0    0.039096
Name: y, dtype: float64

In [5]:
# convert text to lower case
tweets['clean_text'] = tweets['tweet'].str.lower()

# remove URLs

tweets['clean_text'] = tweets['clean_text'].apply(lambda x: re.split('https:\/\/.*', str(x))[0])

#remove "RT" string
tweets['clean_text'] = tweets['clean_text'].map(lambda x: x.lstrip('rt'))

# #remove remaining punctuation except for "#"
# tweets['clean_text'] = tweets['clean_text'].str.replace("[^a-zA-Z]", " ")

In [6]:
#Name variables 

X = tweets[["clean_text"]]
y = tweets["y"]

#train test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=y)

In [90]:
# Load stopwords from NLTK and add custom words
mystopwords = stopwords.words('english')
mystopwords.extend(['hurricane','tornado','harvey','irma','joplin','sandy','maria',
                    'like','would','get','x200b','https','one','www','com','org','etc','could'])

In [22]:
# Naive Bayes with CVEC
tknzr = TweetTokenizer()
cvec = CountVectorizer(tokenizer=tknzr.tokenize,stop_words=mystopwords,
                       max_features=3000,max_df=1.0,min_df=2, 
                       ngram_range=(1,1))

# Fit  CountVectorizer on the training data and transform training data.
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train['tweet']).todense(),
                            columns = cvec.get_feature_names())
# Transform our testing data with the already-fit CountVectorizer.
X_test_cvec = pd.DataFrame(cvec.transform(X_test['tweet']).todense(),
                           columns = cvec.get_feature_names())

# instantiate and fit model 
nb = MultinomialNB(alpha=1)
nb.fit(X_train_cvec, y_train)

# Score model on the training set.
print('Naive Bayes model with Count Vectorizer')
print(f'Train data accuracy: {nb.score(X_train_cvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {nb.score(X_test_cvec,y_test):.3f}')

Naive Bayes model with Count Vectorizer
Train data accuracy: 0.924
Test data accuracy: 0.702


In [23]:
# Naive Bayes Bernouilli with Tfidf
tvec = TfidfVectorizer(tokenizer=tknzr.tokenize,stop_words=None,max_features=3000,max_df=1.0,min_df=2, ngram_range=(1,1))

# Fit our CountVectorizer on the training data and transform training data.
X_train_tvec = pd.DataFrame(tvec.fit_transform(X_train['tweet']).todense(),
                            columns = tvec.get_feature_names())
# Transform our testing data with the already-fit CountVectorizer.
X_test_tvec = pd.DataFrame(tvec.transform(X_test['tweet']).todense(),
                           columns = tvec.get_feature_names())

# instantiate and fit model 
nb = MultinomialNB(alpha=1)
nb.fit(X_train_tvec, y_train)

# Score model on the training set.
print('Naive Bayes model with TF-IDF Vectorizer')
print(f'Train data accuracy: {nb.score(X_train_tvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {nb.score(X_test_tvec,y_test):.3f}')

Naive Bayes model with TF-IDF Vectorizer
Train data accuracy: 0.822
Test data accuracy: 0.624


In [42]:
# SVM model with CVEC
cvec = CountVectorizer(stop_words=mystopwords,max_features=800, ngram_range=(1,1))

# Fit  CountVectorizer on the training data and transform training data.
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train['tweet']).todense(),
                            columns = cvec.get_feature_names())
# Transform our testing data with the already-fit CountVectorizer.
X_test_cvec = pd.DataFrame(cvec.transform(X_test['tweet']).todense(),
                           columns = cvec.get_feature_names())
# Instantiate SVM.
# svc = svm.SVC(kernel='poly', C = 1.8, gamma = .05) 
svc = svm.SVC(kernel='rbf', C = 2, gamma = .2)  

# Fit on training data.
svc.fit(X_train_cvec,y_train)

# Score model on the training set.
print('Support Vector Machine model with Count Vectorizer')
print(f'Train data accuracy: {svc.score(X_train_cvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {svc.score(X_test_cvec,y_test):.3f}')

Support Vector Machine model with Count Vectorizer
Train data accuracy: 0.969
Test data accuracy: 0.817


In [7]:
#CountVectorizer 

tknzr = TweetTokenizer()
cvec = CountVectorizer(stop_words="english", tokenizer=tknzr.tokenize)
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train['clean_text']).todense(), columns = cvec.get_feature_names())
X_test_cvec = pd.DataFrame(cvec.transform(X_test['clean_text']).todense(), columns = cvec.get_feature_names())
lr = LogisticRegression(penalty='l1',C=3,random_state=42,solver='liblinear')
lr_model = lr.fit(X_train_cvec, y_train)
# Score model on the training set.
print('Logistic Regression model with Count Vectorizer and TweetTokenzer')
print(f'Train data accuracy: {lr.score(X_train_cvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {lr.score(X_test_cvec,y_test):.3f}')



Logistic Regression model with Count Vectorizer and TweetTokenzer
Train data accuracy: 0.987
Test data accuracy: 0.871




In [None]:
# Latent Semantic Indexing
tknzr = TweetTokenizer()
cvec = CountVectorizer(stop_words="english", tokenizer=tknzr.tokenize)
model = LsiModel(cvec, id2word=common_dictionary)
# vectorized_corpus = model[cvec]


X_train_cvec = pd.DataFrame(model[X_train['tweet']).todense(), columns = cvec.get_feature_names())
X_test_cvec = pd.DataFrame(cvec.transform(X_test['tweet']).todense(), columns = cvec.get_feature_names())
lr = LogisticRegression()
lr_model = lr.fit(X_train_cvec, y_train)
# Score model on the training set.
print('Logistic Regression model with Count Vectorizer and TweetTokenzer')
print(f'Train data accuracy: {lr.score(X_train_cvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {lr.score(X_test_cvec,y_test):.3f}')

In [69]:
# random forest
model = RandomForestClassifier(max_depth=20,random_state=42,min_samples_leaf=5,n_estimators=10)
model.fit(X_train_cvec,y_train)
y_pred = model.predict(X_test_cvec)
y_pred_train = model.predict(X_train_cvec)

print(f'Training R-sq is: {model.score(X_train_cvec,y_train):.3f}')
print(f'Testing R-sq is: {model.score(X_test_cvec,y_test):.3f}')

Training R-sq is: 0.502
Testing R-sq is: 0.427


### Analyze Results of Best Model

In [28]:
# print confusion matrix
# y_pred = lr.predict(X_test_cvec)
# cm = confusion_matrix(y_test, y_pred)
# cm_df = pd.DataFrame(cm, columns=['predict neg', 'predict pos'], index=['actual neg', 'actual pos'])
# cm_df

In [28]:
michael = pd.read_pickle('../data/hurricane_michael.pkl')

In [29]:
michael.shape

(50043, 11)

In [17]:
michael.head()

Unnamed: 0,date,favorites,geo,hashtags,id,mentions,permalink,retweets,text,to,username
0,2018-10-12 23:59:57+00:00,4,,#HurricaneMichael #blessed #UnitedWeStand #tal...,1050898900582838272,@COTNews,https://twitter.com/joeearenas/status/10508989...,0,@COTNews has been working 24/7 to restore serv...,,joeearenas
1,2018-10-12 23:59:53+00:00,1,,,1050898882526371842,,https://twitter.com/LakesideBexley/status/1050...,0,"In the wake of Hurricane Michael, we understan...",,LakesideBexley
2,2018-10-12 23:59:52+00:00,0,,#HurricaneMichael #Florida,1050898882077442048,,https://twitter.com/PRAISETRIUNEGOD/status/105...,0,"Maybe 17 "" #HurricaneMichael Updates: Body Fou...",,PRAISETRIUNEGOD
3,2018-10-12 23:59:49+00:00,0,,,1050898865988222976,,https://twitter.com/aShartee/status/1050898865...,0,In other news praying for those affected by hu...,,aShartee
4,2018-10-12 23:59:43+00:00,0,,#HurricaneMichael #ExcessiveForce,1050898841879236608,,https://twitter.com/MindOfMo/status/1050898841...,0,iSpy 2 or 3 who'd be hard-pressed to RUN in an...,CBSNews,MindOfMo


In [30]:
# convert text to lower case
michael['clean_text'] = michael['text'].str.lower()

# remove URLs
import re
michael['clean_text'] = michael['clean_text'].apply(lambda x: re.split('https:\/\/.*', str(x))[0])

#remove "RT" string
michael['clean_text'] = michael['clean_text'].map(lambda x: x.lstrip('rt'))

# #remove remaining punctuation except for "#"
# michael['clean_text'] = michael['clean_text'].str.replace("[^a-zA-Z]", " ")

In [31]:
# Transform out-of-event testing data with the already-fit CountVectorizer.
michael_cvec = pd.DataFrame(cvec.transform(michael['clean_text']).todense(),
                           columns = cvec.get_feature_names())


In [32]:
michael['pred'] = lr_model.predict(michael_cvec)

In [14]:
#michael.to_pickle('../data/michael_predictions.pkl')

In [16]:
michael.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 295957 entries, 0 to 11274
Data columns (total 13 columns):
date          295957 non-null datetime64[ns, UTC]
favorites     295957 non-null int64
geo           295957 non-null object
hashtags      295957 non-null object
id            295957 non-null object
mentions      295957 non-null object
permalink     295957 non-null object
retweets      295957 non-null int64
text          295957 non-null object
to            33073 non-null object
username      295957 non-null object
clean_text    295957 non-null object
pred          295957 non-null float64
dtypes: datetime64[ns, UTC](1), float64(1), int64(2), object(9)
memory usage: 31.6+ MB


In [33]:
michael.pred.value_counts()

0.0    34302
2.0     7299
1.0     4967
4.0     1591
3.0     1134
5.0      750
Name: pred, dtype: int64

In [None]:
# uninformative = 0
# "Casualties and damage":1,"Caution and advice":2, 
#      "Informative, other":3, "Information Source":4, "Donations of money, goods or services":5})

In [39]:
mask = michael.pred == 1
casualties = michael[mask][0:20]

In [40]:
mask = michael.pred == 2
caution_advice = michael[mask][0:20]

In [41]:
mask = michael.pred == 4
info_source = michael[mask][0:20]

In [42]:
mask = michael.pred == 5
donations = michael[mask][0:20]

In [46]:
cols = ['permalink', 'text']

In [47]:
casualties = casualties[cols]

In [48]:
caution_advice = caution_advice[cols]

In [49]:
info_source = info_source[cols]

In [50]:
donations = donations[cols]

In [52]:
casualties.to_pickle('../data/casualties.pkl')

In [53]:
caution_advice.to_pickle('../data/caution_advice.pkl')

In [54]:
info_source.to_pickle('../data/info_source.pkl')

In [55]:
donations.to_pickle('../data/donations.pkl')