In [1]:
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report,classification
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from nltk import TweetTokenizer
from nltk.corpus import stopwords
from gensim.test.utils import common_dictionary, common_corpus
from gensim.models import LsiModel
import pandas as pd
import re



In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pauls\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
#read pickled data
tweets = pd.read_pickle("../data/train.pkl")

In [10]:
tweets.columns

Index(['tweet_id', 'image_id', 'text_info', 'text_info_conf', 'image_info',
       'image_info_conf', 'text_human', 'text_human_conf', 'image_human',
       'image_human_conf', 'image_damage', 'image_damage_conf', 'tweet_text',
       'image_url', 'image_path', 'event', 'y', 'clean_text'],
      dtype='object')

In [11]:
# drop unneeded columns
cols = ['tweet_id', 'text_info', 'text_human', 'tweet_text', 'event', 'y']

tweets = tweets[cols]

In [16]:
# drop duplicates
tweets.drop_duplicates(inplace=True)
tweets.dropna(inplace=True)

In [17]:
#baseline accuracy
tweets.y.value_counts(1)

1.0    0.491601
2.0    0.278394
3.0    0.095827
4.0    0.075753
5.0    0.034760
6.0    0.016799
0.0    0.005283
7.0    0.001585
Name: y, dtype: float64

In [18]:
tweets.head()

Unnamed: 0,tweet_id,text_info,text_human,tweet_text,event,y
0,905274232590004225,not_informative,not_relevant_or_cant_judge,"CONGRATS ON HITTING YOIR GOAL GUYS, I'm sure t...",harvey,4.0
1,901646074527535105,informative,injured_or_dead_people,RT @ajwamood: #ajwamood : Harvey the first maj...,harvey,6.0
8,901646123080830976,informative,other_relevant_information,RT @yIIeza: When we get back to SCHS after Har...,harvey,1.0
9,901646127895863296,informative,other_relevant_information,Not always good when your city shows up on a s...,harvey,1.0
10,901646131628830721,informative,other_relevant_information,RT @MSNBC: Side by side satellite images compa...,harvey,1.0


In [19]:
# convert text to lower case
tweets['clean_text'] = tweets['tweet_text'].str.lower()

# remove URLs
tweets['clean_text'] = tweets['clean_text'].apply(lambda x: re.split('https:\/\/.*', str(x))[0])

#remove "RT" string
tweets['clean_text'] = tweets['clean_text'].map(lambda x: x.lstrip('rt'))

# #remove remaining punctuation except for "#"
# tweets['clean_text'] = tweets['clean_text'].str.replace("[^a-zA-Z]", " ")

In [47]:
tweets['tweet_chars']=tweets['clean_text'].map(lambda x: len(x))

In [48]:
tweets['tweet_chars'].describe()

count    9465.000000
mean       78.757739
std        22.795045
min         0.000000
25%        63.000000
50%        78.000000
75%        93.000000
max       153.000000
Name: tweet_chars, dtype: float64

In [50]:
#Name variables 

X = tweets[["clean_text","tweet_chars"]]
y = tweets["y"]

#train test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=y)

In [22]:
# Load stopwords from NLTK and add custom words
mystopwords = stopwords.words('english')
mystopwords.extend(['hurricane','tornado','harvey','irma','joplin','sandy','maria',
                    'like','would','get','x200b','https','one','www','com','org','etc','could'])

In [23]:
# Naive Bayes with CVEC
tknzr = TweetTokenizer()
cvec = CountVectorizer(tokenizer=tknzr.tokenize,stop_words=mystopwords,
                       max_features=3000,max_df=1.0,min_df=2, 
                       ngram_range=(1,1))

# Fit  CountVectorizer on the training data and transform training data.
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train['clean_text']).todense(),
                            columns = cvec.get_feature_names())
# Transform our testing data with the already-fit CountVectorizer.
X_test_cvec = pd.DataFrame(cvec.transform(X_test['clean_text']).todense(),
                           columns = cvec.get_feature_names())

# instantiate and fit model 
nb = MultinomialNB(alpha=1)
nb.fit(X_train_cvec, y_train)

# Score model on the training set.
print('Naive Bayes model with Count Vectorizer')
print(f'Train data accuracy: {nb.score(X_train_cvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {nb.score(X_test_cvec,y_test):.3f}')

Naive Bayes model with Count Vectorizer
Train data accuracy: 0.773
Test data accuracy: 0.663


In [24]:
# Naive Bayes Bernouilli with Tfidf
tvec = TfidfVectorizer(tokenizer=tknzr.tokenize,stop_words=None,max_features=3000,max_df=1.0,min_df=2, ngram_range=(1,1))

# Fit our CountVectorizer on the training data and transform training data.
X_train_tvec = pd.DataFrame(tvec.fit_transform(X_train['clean_text']).todense(),
                            columns = tvec.get_feature_names())
# Transform our testing data with the already-fit CountVectorizer.
X_test_tvec = pd.DataFrame(tvec.transform(X_test['clean_text']).todense(),
                           columns = tvec.get_feature_names())

# instantiate and fit model 
nb = MultinomialNB(alpha=1)
nb.fit(X_train_tvec, y_train)

# Score model on the training set.
print('Naive Bayes model with TF-IDF Vectorizer')
print(f'Train data accuracy: {nb.score(X_train_tvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {nb.score(X_test_tvec,y_test):.3f}')

Naive Bayes model with TF-IDF Vectorizer
Train data accuracy: 0.702
Test data accuracy: 0.662


In [None]:
# SVM model with CVEC
cvec = CountVectorizer(stop_words=mystopwords,max_features=800, ngram_range=(1,1))

# Fit  CountVectorizer on the training data and transform training data.
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train['clean_text']).todense(),
                            columns = cvec.get_feature_names())
# Transform our testing data with the already-fit CountVectorizer.
X_test_cvec = pd.DataFrame(cvec.transform(X_test['clean_text']).todense(),
                           columns = cvec.get_feature_names())
# Instantiate SVM.
# svc = svm.SVC(kernel='poly', C = 1.8, gamma = .05) 
svc = svm.SVC(kernel='rbf', C = 2, gamma = .2)  

# Fit on training data.
svc.fit(X_train_cvec,y_train)

# Score model on the training set.
print('Support Vector Machine model with Count Vectorizer')
print(f'Train data accuracy: {svc.score(X_train_cvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {svc.score(X_test_cvec,y_test):.3f}')

In [53]:
# Add in number of characters per tweet
#CountVectorizer 
tknzr = TweetTokenizer()
cvec = CountVectorizer(stop_words="english", tokenizer=tknzr.tokenize)
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train['clean_text']).todense(), columns = cvec.get_feature_names())
X_train_cvec["tweet_chars"] = X_train["tweet_chars"].values
X_test_cvec = pd.DataFrame(cvec.transform(X_test['clean_text']).todense(), columns = cvec.get_feature_names())
X_test_cvec["tweet_chars"] = X_test["tweet_chars"].values
lr = LogisticRegression(penalty='l2',C=1.5,random_state=42, solver='liblinear', multi_class='ovr')
lr_model = lr.fit(X_train_cvec, y_train)

# Score model on the training set.
print('Logistic Regression model with Count Vectorizer and TweetTokenzer')
print(f'Train data accuracy: {lr.score(X_train_cvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {lr.score(X_test_cvec,y_test):.3f}')

Logistic Regression model with Count Vectorizer and TweetTokenzer
Train data accuracy: 0.959
Test data accuracy: 0.709


In [61]:
#CountVectorizer 
tknzr = TweetTokenizer()
cvec = CountVectorizer(stop_words="english", tokenizer=tknzr.tokenize)
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train['clean_text']).todense(), columns = cvec.get_feature_names())
X_test_cvec = pd.DataFrame(cvec.transform(X_test['clean_text']).todense(), columns = cvec.get_feature_names())
lr = LogisticRegression(penalty='l2',C=1.2,random_state=42, solver='liblinear', multi_class='ovr')
lr_model = lr.fit(X_train_cvec, y_train)

# Score model on the training set.
print('Logistic Regression model with Count Vectorizer and TweetTokenzer')
print(f'Train data accuracy: {lr.score(X_train_cvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {lr.score(X_test_cvec,y_test):.3f}')

Logistic Regression model with Count Vectorizer and TweetTokenzer
Train data accuracy: 0.944
Test data accuracy: 0.710


In [None]:
# Latent Semantic Indexing
tknzr = TweetTokenizer()
cvec = CountVectorizer(stop_words="english", tokenizer=tknzr.tokenize)
model = LsiModel(cvec, id2word=common_dictionary)
# vectorized_corpus = model[cvec]


X_train_cvec = pd.DataFrame(model[X_train['tweet']).todense(), columns = cvec.get_feature_names())
X_test_cvec = pd.DataFrame(cvec.transform(X_test['tweet']).todense(), columns = cvec.get_feature_names())
lr = LogisticRegression()
lr_model = lr.fit(X_train_cvec, y_train)
# Score model on the training set.
print('Logistic Regression model with Count Vectorizer and TweetTokenzer')
print(f'Train data accuracy: {lr.score(X_train_cvec,y_train):.3f}')

# Score our model on the testing set.
print(f'Test data accuracy: {lr.score(X_test_cvec,y_test):.3f}')

In [54]:
# random forest
model = RandomForestClassifier(max_depth=20,random_state=42,min_samples_leaf=5,n_estimators=10)
model.fit(X_train_cvec,y_train)
y_pred = model.predict(X_test_cvec)
y_pred_train = model.predict(X_train_cvec)

print(f'Training R-sq is: {model.score(X_train_cvec,y_train):.3f}')
print(f'Testing R-sq is: {model.score(X_test_cvec,y_test):.3f}')

Training R-sq is: 0.557
Testing R-sq is: 0.546


### Analyze Results of Best Model

In [28]:
# print confusion matrix
# y_pred = lr.predict(X_test_cvec)
# cm = confusion_matrix(y_test, y_pred)
# cm_df = pd.DataFrame(cm, columns=['predict neg', 'predict pos'], index=['actual neg', 'actual pos'])
# cm_df

In [55]:
michael = pd.read_pickle('../data/hurricane_michael.pkl')

In [56]:
michael.shape

(50043, 11)

In [57]:
michael.head()

Unnamed: 0,date,favorites,geo,hashtags,id,mentions,permalink,retweets,text,to,username
0,2018-10-12 23:59:57+00:00,4,,#HurricaneMichael #blessed #UnitedWeStand #tal...,1050898900582838272,@COTNews,https://twitter.com/joeearenas/status/10508989...,0,@COTNews has been working 24/7 to restore serv...,,joeearenas
1,2018-10-12 23:59:53+00:00,1,,,1050898882526371842,,https://twitter.com/LakesideBexley/status/1050...,0,"In the wake of Hurricane Michael, we understan...",,LakesideBexley
2,2018-10-12 23:59:52+00:00,0,,#HurricaneMichael #Florida,1050898882077442048,,https://twitter.com/PRAISETRIUNEGOD/status/105...,0,"Maybe 17 "" #HurricaneMichael Updates: Body Fou...",,PRAISETRIUNEGOD
3,2018-10-12 23:59:49+00:00,0,,,1050898865988222976,,https://twitter.com/aShartee/status/1050898865...,0,In other news praying for those affected by hu...,,aShartee
4,2018-10-12 23:59:43+00:00,0,,#HurricaneMichael #ExcessiveForce,1050898841879236608,,https://twitter.com/MindOfMo/status/1050898841...,0,iSpy 2 or 3 who'd be hard-pressed to RUN in an...,CBSNews,MindOfMo


In [58]:
# convert text to lower case
michael['clean_text'] = michael['text'].str.lower()

# remove URLs
import re
michael['clean_text'] = michael['clean_text'].apply(lambda x: re.split('https:\/\/.*', str(x))[0])

#remove "RT" string
michael['clean_text'] = michael['clean_text'].map(lambda x: x.lstrip('rt'))

# #remove remaining punctuation except for "#"
# michael['clean_text'] = michael['clean_text'].str.replace("[^a-zA-Z]", " ")

In [59]:
# Transform out-of-event testing data with the already-fit CountVectorizer.
michael_cvec = pd.DataFrame(cvec.transform(michael['clean_text']).todense(),
                           columns = cvec.get_feature_names())


In [62]:
michael['inform_pred'] = lr_model.predict(michael_cvec)

In [64]:
michael.inform_pred.value_counts(1)

1.0    0.544092
2.0    0.211418
3.0    0.144156
6.0    0.055073
4.0    0.030154
5.0    0.014847
0.0    0.000260
Name: inform_pred, dtype: float64

In [None]:
# "Casualties and damage":1,"Caution and advice":2, 
#      "Unknown":3, "Information Source":4, "Donations of money, goods or services":5})