In [1]:
#imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from models import naive_bayes, logistic, random_forest
from models import naive_bayes, logistic, random_forest

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Read the cleaned dataset from disk, using the first CSV column as the row index.
data = pd.read_csv(
    'data/clean_data.csv',
    index_col=0,
)

## Fitting a Binary Model

In [3]:
# Keep only the pro (1) and anti (-1) tweets, i.e. drop every row whose sentiment is 0.
binary_data = data.loc[data["sentiment"].ne(0)]

In [4]:
# Hold out 20% of the binary data for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    binary_data.drop(columns="sentiment"),
    binary_data["sentiment"],
    test_size=0.2,
    random_state=0,
)

In [5]:
# Pull the numerical feature columns out of the training split.
X_train_numerical = X_train[[
    'is_retweet', 'is_quoted', 'retweets', 'favorites',
    'followers', 'verified',
    'exclamation_mark_count', 'question_mark_count',
]]

#### Perform 5-fold cross validation on each of the potential models

In [6]:
# Score every candidate model on the training text. The helpers come from the
# project's `models` module; their return values are tabulated as accuracy below.
nb, log, rf = (
    fit(X_train["clean text"].values, y_train.values)
    for fit in (naive_bayes, logistic, random_forest)
)
# Same helpers again, this time also passing the numerical features.
log_with_numeric, rf_with_numeric = (
    fit(X_train["clean text"].values, y_train.values, X_train_numerical.values)
    for fit in (logistic, random_forest)
)

In [8]:
# Collect the scores of the binary models into one table and display it.
binary_df = pd.DataFrame({
    'Model': [
        'Naive Bayes (2)',
        'Logistic Regression (2)',
        'Random Forest (2)',
        'Logistic Regression with Numerical (2)',
        'Random Forest with Numerical (2)',
    ],
    'Accuracy': [nb, log, rf, log_with_numeric, rf_with_numeric],
})
binary_df

Unnamed: 0,Model,Accuracy
0,Naive Bayes (2),0.879172
1,Logistic Regression (2),0.884428
2,Random Forest (2),0.875762
3,Logistic Regression with Numerical (2),0.841711
4,Random Forest with Numerical (2),0.873022


## Fitting a Binary Model with Upsampling

Our dataset contains class imbalance, so we will try to account for this by upsampling the -1 class.

In [9]:
# Print the fraction of rows belonging to each class in the binary data.
n = len(binary_data)
print('class 1: ', int(binary_data['sentiment'].eq(1).sum()) / n)
print('class -1: ', int(binary_data['sentiment'].eq(-1).sum()) / n)

class 1:  0.8401842823276826
class -1:  0.1598157176723174


In [10]:
# Randomly resample rows from the smaller class (-1) until the two classes are balanced.
# NOTE(review): the duplicated -1 rows are added *before* the train/test split in the
# next cell, so copies of the same tweet can land in both the training and the test
# set (and presumably in different cross-validation folds), which can inflate the
# reported accuracy. Consider splitting first and upsampling only the training part.
np.random.seed(123)

neg_data = binary_data[binary_data['sentiment']==-1] # pool of rows to sample from
# number of extra -1 rows needed to reach a 50/50 balance
diff = len(binary_data[binary_data['sentiment']==1]) - len(neg_data)
upsample_index = np.random.choice(neg_data.index,size=diff) # sampling with replacement
# DataFrame.append was removed in pandas 2.0; pd.concat produces the same frame.
# .sample(frac=1) shuffles the rows so the added copies are not all at the end.
binary_data_ups = pd.concat([binary_data, neg_data.loc[upsample_index]]).sample(frac=1)

# sanity check: both classes should now make up half of the data
n = len(binary_data_ups)
print('class 1: ', len(binary_data_ups[binary_data_ups['sentiment']==1])/n)
print('class -1: ', len(binary_data_ups[binary_data_ups['sentiment']==-1])/n)

class 1:  0.5
class -1:  0.5


In [11]:
# Hold out 20% of the upsampled binary data for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    binary_data_ups.drop(columns="sentiment"),
    binary_data_ups["sentiment"],
    test_size=0.2,
    random_state=0,
)

In [12]:
# Pull the numerical feature columns out of the (upsampled) training split.
X_train_numerical = X_train[[
    'is_retweet', 'is_quoted', 'retweets', 'favorites',
    'followers', 'verified',
    'exclamation_mark_count', 'question_mark_count',
]]

#### Perform 5-fold cross validation on each of the potential models

In [13]:
# Score every candidate model on the upsampled training text. The helpers come from
# the project's `models` module; their return values are tabulated as accuracy below.
nb, log, rf = (
    fit(X_train["clean text"].values, y_train.values)
    for fit in (naive_bayes, logistic, random_forest)
)
# Same helpers again, this time also passing the numerical features.
log_with_numeric, rf_with_numeric = (
    fit(X_train["clean text"].values, y_train.values, X_train_numerical.values)
    for fit in (logistic, random_forest)
)

In [14]:
# Collect the scores of the binary models trained on upsampled data and display them.
binary_df_ups = pd.DataFrame({
    'Model': [
        'Naive Bayes (2) upsampled',
        'Logistic Regression (2) upsampled',
        'Random Forest (2) upsampled',
        'Logistic Regression with Numerical (2) upsampled',
        'Random Forest with Numerical (2) upsampled',
    ],
    'Accuracy': [nb, log, rf, log_with_numeric, rf_with_numeric],
})
binary_df_ups

Unnamed: 0,Model,Accuracy
0,Naive Bayes (2) upsampled,0.844147
1,Logistic Regression (2) upsampled,0.878985
2,Random Forest (2) upsampled,0.94643
3,Logistic Regression with Numerical (2) upsampled,0.494277
4,Random Forest with Numerical (2) upsampled,0.959406


## Test performance and fitting the final binary model

In [None]:
# NOTE: I don't think we should do this until we have decided on the final model.

#### Fit the best model and obtain the final test error estimate

In [19]:
# Bag-of-words features: the vocabulary is learned from the training text only
# and then reused to encode the test text.
# NOTE(review): X_train / X_test are whichever split was executed most recently;
# when run top to bottom that is the upsampled binary split.
vectorizer = CountVectorizer()
train = vectorizer.fit_transform(X_train["clean text"].values)
test = vectorizer.transform(X_test["clean text"].values)

# Fit logistic regression on the training features.
logistic_classifier = LogisticRegression(max_iter=2000, random_state=0).fit(
    train, y_train.values
)

# Accuracy on the held-out test split.
logistic_classifier.score(test, y_test.values)

0.8761180679785331

##### Fit the final model

In [21]:
# Refit the vectorizer and the logistic regression on the full dataset.
# NOTE(review): this uses `data` (all sentiment values, including 0) although the
# surrounding section is about the binary model - confirm whether `binary_data`
# was intended here.
vectorizer = CountVectorizer()
vec = vectorizer.fit_transform(
    data["clean text"].values
)

# Logistic regression on the bag-of-words matrix; the fitted estimator is the cell output.
logistic_classifier = LogisticRegression(max_iter=2000, random_state=0)
logistic_classifier.fit(vec, data["sentiment"].values)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=2000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
# Show the coefficient array learned by the logistic regression fitted above.
logistic_classifier.coef_

array([[-0.064229  , -0.08045319, -0.01437724, ..., -0.00890921,
         0.67050556,  0.39071954],
       [ 0.39435001, -0.45111389, -0.08785605, ...,  0.33158911,
        -0.13981684, -0.27403148],
       [-0.33012102,  0.53156708,  0.10223329, ..., -0.32267991,
        -0.53068872, -0.11668806]])

## Fitting a 3 Class Model

In [15]:
# Split the full 3-class data into training and testing sets.
# Use the same 80/20 proportion and fixed seed as the other splits in this notebook;
# without random_state the split (and every score derived from it) changes on each run.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels="sentiment", axis=1),
    data["sentiment"],
    test_size=0.2,
    random_state=0,
)

In [16]:
# Pull the numerical feature columns out of the 3-class training split.
X_train_numerical = X_train[[
    'is_retweet', 'is_quoted', 'retweets', 'favorites',
    'followers', 'verified',
    'exclamation_mark_count', 'question_mark_count',
]]

##### Perform 5-fold cross validation on each of the models

In [17]:
# Score every candidate model on the 3-class training text. The helpers come from
# the project's `models` module; their return values are tabulated as accuracy below.
nb, log, rf = (
    fit(X_train["clean text"].values, y_train.values)
    for fit in (naive_bayes, logistic, random_forest)
)
# Same helpers again, this time also passing the numerical features.
log_with_numeric, rf_with_numeric = (
    fit(X_train["clean text"].values, y_train.values, X_train_numerical.values)
    for fit in (logistic, random_forest)
)

In [18]:
# Collect the scores of the 3-class models into one table and display it.
class3_df = pd.DataFrame({
    'Model': [
        'Naive Bayes (3)',
        'Logistic Regression (3)',
        'Random Forest (3)',
        'Logistic Regression with Numerical (3)',
        'Random Forest with Numerical (3)',
    ],
    'Accuracy': [nb, log, rf, log_with_numeric, rf_with_numeric],
})
class3_df

Unnamed: 0,Model,Accuracy
0,Naive Bayes (3),0.701325
1,Logistic Regression (3),0.706386
2,Random Forest (3),0.689271
3,Logistic Regression with Numerical (3),0.648233
4,Random Forest with Numerical (3),0.695114


## Fitting a 3 Class Model with Upsampling

In [19]:
# Print the fraction of rows belonging to each of the three classes.
n = len(data)
print('class 1: ', int(data['sentiment'].eq(1).sum()) / n)
print('class 0: ', int(data['sentiment'].eq(0).sum()) / n)
print('class -1: ', int(data['sentiment'].eq(-1).sum()) / n)

class 1:  0.6481264232972189
class 0:  0.22859015940928853
class -1:  0.12328341729349251


In [20]:
# Randomly resample rows from the two smaller classes (-1 and 0) until all three
# classes are the same size as class 1.
# NOTE(review): the duplicated rows are added *before* the train/test split in the
# next cell, so copies of the same tweet can land in both the training and the test
# set (and presumably in different cross-validation folds), which can inflate the
# reported accuracy. Consider splitting first and upsampling only the training part.
np.random.seed(123)

neg_data = data[data['sentiment']==-1] # pool of -1 rows to sample from
neu_data = data[data['sentiment']==0] # pool of 0 rows to sample from

# number of extra rows each smaller class needs for a 33-33-33 balance
n_pos = len(data[data['sentiment']==1])
diffneg = n_pos - len(neg_data)
diffneu = n_pos - len(neu_data)

upsample_index_neg = np.random.choice(neg_data.index,size=diffneg) # sampling with replacement
upsample_index_neu = np.random.choice(neu_data.index,size=diffneu) # sampling with replacement

# DataFrame.append was removed in pandas 2.0; pd.concat produces the same frame.
# .sample(frac=1) shuffles the rows so the added copies are not all at the end.
data_ups = pd.concat([
    data,
    neg_data.loc[upsample_index_neg],
    neu_data.loc[upsample_index_neu],
]).sample(frac=1)

# sanity check: each class should now make up one third of the data
n = len(data_ups)
print('class 1: ', len(data_ups[data_ups['sentiment']==1])/n)
print('class 0: ', len(data_ups[data_ups['sentiment']==0])/n)
print('class -1: ', len(data_ups[data_ups['sentiment']==-1])/n)

class 1:  0.3333333333333333
class 0:  0.3333333333333333
class -1:  0.3333333333333333


In [21]:
# Hold out 20% of the upsampled 3-class data for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data_ups.drop(columns="sentiment"),
    data_ups["sentiment"],
    test_size=0.2,
    random_state=0,
)

In [22]:
# Pull the numerical feature columns out of the upsampled 3-class training split.
X_train_numerical = X_train[[
    'is_retweet', 'is_quoted', 'retweets', 'favorites',
    'followers', 'verified',
    'exclamation_mark_count', 'question_mark_count',
]]

#### Perform 5-fold cross validation on each of the potential models

In [23]:
# Score every candidate model on the upsampled 3-class training text. The helpers come
# from the project's `models` module; their return values are tabulated as accuracy below.
nb, log, rf = (
    fit(X_train["clean text"].values, y_train.values)
    for fit in (naive_bayes, logistic, random_forest)
)
# Same helpers again, this time also passing the numerical features.
log_with_numeric, rf_with_numeric = (
    fit(X_train["clean text"].values, y_train.values, X_train_numerical.values)
    for fit in (logistic, random_forest)
)

In [24]:
# Collect the scores of the 3-class models trained on upsampled data and display them.
class3_df_ups = pd.DataFrame({
    'Model': [
        'Naive Bayes (3) upsampled',
        'Logistic Regression (3) upsampled',
        'Random Forest (3) upsampled',
        'Logistic Regression with Numerical (3) upsampled',
        'Random Forest with Numerical (3) upsampled',
    ],
    'Accuracy': [nb, log, rf, log_with_numeric, rf_with_numeric],
})
class3_df_ups

Unnamed: 0,Model,Accuracy
0,Naive Bayes (3) upsampled,0.709589
1,Logistic Regression (3) upsampled,0.760542
2,Random Forest (3) upsampled,0.88077
3,Logistic Regression with Numerical (3) upsampled,0.330339
4,Random Forest with Numerical (3) upsampled,0.896475


## Test performance and fitting the final 3 class model

###### Fit the best model and obtain the final test error estimate

In [30]:
# Bag-of-words features: the vocabulary is learned from the training text only
# and then reused to encode the test text.
vectorizer = CountVectorizer()
train = vectorizer.fit_transform(X_train["clean text"].values)
test = vectorizer.transform(X_test["clean text"].values)

# Fit multinomial naive Bayes on the training features.
nb_classifier = MultinomialNB().fit(train, y_train.values)

# Accuracy on the held-out test split.
nb_classifier.score(test, y_test.values)

0.7071487717361303

## Comparing model performance

This is not much better than the null classifier as we found in the exploratory data analysis. To improve this we will next try:
- upsampling
- n-grams
- scaling the numerical data

In [29]:
# Stack the plain and upsampled binary results for side-by-side comparison.
# DataFrame.append was removed in pandas 2.0; pd.concat yields the same table
# (each frame keeps its own 0-4 index, as before).
pd.concat([binary_df, binary_df_ups])

Unnamed: 0,Model,Accuracy
0,Naive Bayes (2),0.879172
1,Logistic Regression (2),0.884428
2,Random Forest (2),0.875762
3,Logistic Regression with Numerical (2),0.841711
4,Random Forest with Numerical (2),0.873022
0,Naive Bayes (2) upsampled,0.844147
1,Logistic Regression (2) upsampled,0.878985
2,Random Forest (2) upsampled,0.94643
3,Logistic Regression with Numerical (2) upsampled,0.494277
4,Random Forest with Numerical (2) upsampled,0.959406


In [30]:
# Stack the plain and upsampled 3-class results for side-by-side comparison.
# DataFrame.append was removed in pandas 2.0; pd.concat yields the same table
# (each frame keeps its own 0-4 index, as before).
pd.concat([class3_df, class3_df_ups])

Unnamed: 0,Model,Accuracy
0,Naive Bayes (3),0.701325
1,Logistic Regression (3),0.706386
2,Random Forest (3),0.689271
3,Logistic Regression with Numerical (3),0.648233
4,Random Forest with Numerical (3),0.695114
0,Naive Bayes (3) upsampled,0.709589
1,Logistic Regression (3) upsampled,0.760542
2,Random Forest (3) upsampled,0.88077
3,Logistic Regression with Numerical (3) upsampled,0.330339
4,Random Forest with Numerical (3) upsampled,0.896475
