In [17]:
#imports
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from models import naive_bayes, logistic, random_forest
from scipy.sparse import csr_matrix, hstack


from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the cleaned tweet data; the first CSV column is used as the row index.
data = pd.read_csv(
    'data/clean_data.csv',
    index_col=0,
)

Based on the results from the initial model fitting, it seems as though the numerical features we collected using Tweepy were not particularly useful. This could be because the features themselves are not informative or because the imputation techniques we used for the missing tweets were not good. If we have time, we can go back and try more advanced imputation techniques. For now, let's ignore the numerical features except for date_time. Date_time should be reliable: tweets are ordered chronologically by tweet id, so the missing date_times we imputed should be fairly accurate. Let's also keep the numerical features that we computed directly from the text, like exclamation_mark_count.

In [3]:
# Derive calendar features from the tweet timestamp.
data['date_time'] = pd.to_datetime(data['date_time'])

# Year is deliberately left out: we want to be able to make predictions in future years
# that are not contained in the training data.
# month and hour are kept since these correlate with temperature; dayofweek is kept
# since it can influence sentiment.
for date_part in ('month', 'hour', 'dayofweek'):
    data[date_part] = getattr(data['date_time'].dt, date_part)

data.head()

Unnamed: 0,clean text,tweetid,is_retweet,is_quoted,date_time,retweets,favorites,followers,verified,location,exclamation_mark_count,question_mark_count,imputed,sentiment,month,hour,dayofweek
2,aaaaaand delet glob warm rain tweet cas miss s...,794050846807982080,1,0,2016-11-03 05:37:29,47,79,20106,1,NYC,0,0,False,-1,11,5,3
3,aaaaand go trump admin start remov clim chang ...,828858786286796800,1,0,2017-02-07 07:07:01,2,3,5164,0,n,0,0,True,1,2,7,1
4,aaaand elimin ref glob warm wisconsin,814547316258512896,1,0,2016-12-29 18:58:31,2,3,5164,0,n,0,0,True,1,12,18,3
5,aaaand ep remov clim chang pag stil check ever...,858153629638959106,1,0,2017-04-29 02:59:03,760,637,428254,0,,0,0,False,1,4,2,5
6,ab act clim chang cal show success vis amp pract,890334462004699136,1,0,2017-07-26 22:14:12,38,76,19398,1,"Lakewood, CA",0,0,False,1,7,22,2


## Random Forest

In [4]:
# upsampling seemed to improve results so let's continue doing this
# first, report the share of rows belonging to each sentiment class

n = len(data)
for class_label, sentiment_value in (('class 1: ', 1), ('class 0: ', 0), ('class -1: ', -1)):
    print(class_label, len(data[data['sentiment'] == sentiment_value]) / n)

class 1:  0.6481264232972189
class 0:  0.22859015940928853
class -1:  0.12328341729349251


In [5]:
np.random.seed(123)

# Upsample the minority classes (-1 and 0) with replacement until each has as many rows
# as class 1, which creates a 33-33-33 balance. This relies on class 1 being the largest
# class, as the class proportions printed earlier show.
# NOTE(review): the upsampling happens before the train/test split further down, so copies
# of the same tweet can land in both the training and the test set, which is likely to make
# the held-out metrics optimistic. Consider splitting first and upsampling only the training rows.
n_pos = len(data[data['sentiment']==1])

neg_data = data[data['sentiment']==-1] # sample to choose from
neu_data = data[data['sentiment']==0] # sample to choose from

diffneg = n_pos - len(neg_data) # number of extra negative rows needed
diffneu = n_pos - len(neu_data) # number of extra neutral rows needed

upsample_index_neg = np.random.choice(neg_data.index,size=diffneg) # sample with repetition
upsample_index_neu = np.random.choice(neu_data.index,size=diffneu) # sample with repetition

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same frames in the same
# row order. sample(frac=1) then shuffles the rows.
data_ups = pd.concat(
    [data, neg_data.loc[upsample_index_neg], neu_data.loc[upsample_index_neu]]
).sample(frac=1)

n = len(data_ups)
print('class 1: ', len(data_ups[data_ups['sentiment']==1])/n)
print('class 0: ', len(data_ups[data_ups['sentiment']==0])/n)
print('class -1: ', len(data_ups[data_ups['sentiment']==-1])/n)

class 1:  0.3333333333333333
class 0:  0.3333333333333333
class -1:  0.3333333333333333


### Part 1: Feature Selection

In [7]:
# Hold out 20% of the upsampled rows for testing; "sentiment" is the target and every
# other column stays on the feature side.
X_train, X_test, y_train, y_test = train_test_split(
    data_ups.drop(columns="sentiment"),
    data_ups["sentiment"],
    test_size=0.2,
    random_state=0,
)

In [21]:
# Numerical columns kept as model inputs: the two punctuation counts plus the calendar
# features derived from date_time.
numerical_feature_names = [
    'exclamation_mark_count',
    'question_mark_count',
    'month',
    'hour',
    'dayofweek',
]
X_train_numerical = X_train[numerical_feature_names]

In [11]:
# compare 5 fold cross validation error metrics for count and tfidf vectorizer with
# 1, 2, 3-grams
# random_forest comes from the project's models module; its result is indexed below as
# (accuracy, precision, recall, F score).

ngram_options = [(1,1),(1,2),(1,3),(2,2),(2,3),(3,3)]
metric_labels = ('Accuracy: ', 'Precision: ', 'Recall: ', 'F score: ')

for vectorizer_cls in (CountVectorizer, TfidfVectorizer):
    print(vectorizer_cls)
    for ngram_option in ngram_options:
        rf = random_forest(
            X_train["clean text"].values,
            y_train.values,
            X_train_numerical.values,
            ngram=ngram_option,
            skvectorizer=vectorizer_cls,
        )
        print('n-gram: ', ngram_option)
        for position, metric_label in enumerate(metric_labels):
            print(metric_label, rf[position])

<class 'sklearn.feature_extraction.text.CountVectorizer'>
n-gram:  (1, 1)
Accuracy:  0.8962756265671498
Precision:  0.8971316026935282
Recall:  0.8963747930448666
F score:  0.8952602867572047
n-gram:  (1, 2)
Accuracy:  0.8921496923045584
Precision:  0.89496521274199
Recall:  0.8922580278359604
F score:  0.8911518410132888
n-gram:  (1, 3)
Accuracy:  0.8909296981084432
Precision:  0.8945634687950432
Recall:  0.891035682712479
F score:  0.8899493985250444
n-gram:  (2, 2)
Accuracy:  0.8900424038828756
Precision:  0.8918710181642598
Recall:  0.8901701293288931
F score:  0.8897688771811273
n-gram:  (2, 3)
Accuracy:  0.8864266983029886
Precision:  0.8883014053458954
Recall:  0.8865457950462232
F score:  0.8858787897968792
n-gram:  (3, 3)
Accuracy:  0.8554601561538252
Precision:  0.8624233906219041
Recall:  0.8556111031621623
F score:  0.8528549248367756
<class 'sklearn.feature_extraction.text.TfidfVectorizer'>
n-gram:  (1, 1)
Accuracy:  0.9022648853457224
Precision:  0.9021834624408583
Recall

TFIDF vectorizer with 1-grams had the best performance.

Random forests perform implicit feature selection by splitting each node on the most informative features, but it can still be useful to look at which features the model deemed most important, both to speed up the model and potentially to reuse as a feature-selection step for other models.

In [30]:
# train model on all training data
vectorizer = TfidfVectorizer(ngram_range=(1,1))
text_features = vectorizer.fit_transform(X_train["clean text"].values)
# append the numerical columns to the right of the TF-IDF columns
features = hstack([text_features, csr_matrix(X_train_numerical.values)])
# Fix the seed so the fitted forest (and the feature importances read from it) does not
# depend on how much of the global NumPy random state earlier cells consumed.
rf_classifier = RandomForestClassifier(random_state=0)
rf_classifier.fit(features, y_train);

In [31]:
# Column labels for the fitted feature matrix: vocabulary terms first, then the numerical
# columns, matching the order in which they were stacked.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2; prefer
# get_feature_names_out() (which returns an array, hence the list conversion) and fall
# back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary_terms = vectorizer.get_feature_names_out().tolist()
else:
    vocabulary_terms = list(vectorizer.get_feature_names())
feature_names = vocabulary_terms + numerical_feature_names

In [32]:
# https://towardsdatascience.com/improving-random-forest-in-python-part-1-893916666cd

TOP_N_FEATURES = 100 # how many of the most important features to print

importances = list(rf_classifier.feature_importances_) # higher value = more important
# zip() silently truncates on a length mismatch, so fail loudly if the labels are stale
assert len(feature_names) == len(importances), "feature_names is out of sync with the fitted model"
feature_importances = [(feature, round(importance, 5)) for feature, importance in zip(feature_names, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# slicing instead of indexing range(100) avoids an IndexError when there are fewer features
for feature, importance in feature_importances[:TOP_N_FEATURES]:
    print('feature:', feature,' | ',importance)

feature: chang  |  0.02154
feature: clim  |  0.02121
feature: hour  |  0.01717
feature: glob  |  0.01531
feature: month  |  0.01508
feature: warm  |  0.01427
feature: dayofweek  |  0.01354
feature: deny  |  0.00774
feature: lib  |  0.00699
feature: sci  |  0.0064
feature: scam  |  0.0057
feature: real  |  0.00495
feature: trump  |  0.00479
feature: obam  |  0.00472
feature: exclamation_mark_count  |  0.00457
feature: believ  |  0.00419
feature: fight  |  0.0041
feature: mad  |  0.0039
feature: man  |  0.00383
feature: alarm  |  0.00364
feature: question_mark_count  |  0.00357
feature: hoax  |  0.00353
feature: fak  |  0.0034
feature: caus  |  0.00338
feature: us  |  0.0033
feature: left  |  0.00323
feature: say  |  0.00318
feature: act  |  0.00317
feature: amp  |  0.00308
feature: think  |  0.00301
feature: new  |  0.00294
feature: peopl  |  0.00293
feature: blam  |  0.00291
feature: lik  |  0.00273
feature: tax  |  0.00271
feature: dont  |  0.00268
feature: year  |  0.00265
feature: o

A lot of these features make sense! There are many words related to climate change (chang, clim, warm, glob), belief (deny, scam, real, believ, fak, hoax), and politics (lib, obam, trump). It's also interesting to note that all our numerical features were pretty important.