In [1]:
#imports
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from models import naive_bayes, logistic, random_forest

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [4]:
# import the data
data = pd.read_csv('data/clean_data.csv',index_col = 0)

## Fitting a Binary Model

In [3]:
#select only tweets that are pro (1) and anti (-1)
binary_data = data[data["sentiment"] != 0]

In [4]:
#split data into testing and training sets 
X_train, X_test, y_train, y_test = train_test_split(binary_data.drop(labels = "sentiment", axis = 1), binary_data["sentiment"], test_size=0.2, random_state=0)

In [5]:
#select just the numerical data
X_train_numerical = X_train.loc[:, ['is_retweet', 'is_quoted', 'retweets', 'favorites', 'followers', 'verified',
               'exclamation_mark_count', 'question_mark_count']]

#### Perform 5-fold cross validation on each of the potential models

In [6]:
#fit the models and obtain test error estimates
nb = naive_bayes(X_train["clean text"].values, y_train.values)
log = logistic(X_train["clean text"].values, y_train.values)
rf = random_forest(X_train["clean text"].values, y_train.values)
log_with_numeric = logistic(X_train["clean text"].values, y_train.values, X_train_numerical.values)
rf_with_numeric = random_forest(X_train["clean text"].values, y_train.values, X_train_numerical.values)

In [7]:
binary_df = pd.DataFrame(data ={'Model':['Naive Bayes',
                                         'Logistic Regression',
                                         'Random Forest', 
                                         'Logistic Regression with Numerical',
                                         'Random Forest with Numerical'], 
                                'Accuracy': [nb[0],log[0],rf[0],log_with_numeric[0],rf_with_numeric[0]],
                                'Precision': [nb[1],log[1],rf[1],log_with_numeric[1],rf_with_numeric[1]],
                                'Recall': [nb[2],log[2],rf[2],log_with_numeric[2],rf_with_numeric[2]],
                                'F score': [nb[3],log[3],rf[3],log_with_numeric[3],rf_with_numeric[3]]})
binary_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F score
0,Naive Bayes,0.879172,0.804868,0.682913,0.719953
1,Logistic Regression,0.884372,0.815195,0.69921,0.736774
2,Random Forest,0.875594,0.800413,0.667119,0.704335
3,Logistic Regression with Numerical,0.841711,0.420855,0.5,0.457025
4,Random Forest with Numerical,0.873246,0.834202,0.629075,0.666645


## Fitting a Binary Model with Upsampling

Our dataset contains class imbalance, so we will try to account for this by upsampling the -1 class.

In [8]:
n = len(binary_data)
print('class 1: ', len(binary_data[binary_data['sentiment']==1])/n)
print('class -1: ', len(binary_data[binary_data['sentiment']==-1])/n)

class 1:  0.8401842823276826
class -1:  0.1598157176723174


In [9]:
# randomly sample values from the smaller class untill the classes are balanced
np.random.seed(123)

diff = (len(binary_data[binary_data['sentiment']==1]) - 
        len(binary_data[binary_data['sentiment']==-1])) # want to creat 50 - 50 balance
neg_data = binary_data[binary_data['sentiment']==-1] # sample to choose from
upsample_index = np.random.choice(neg_data.index,size=diff) # sample with repetition
binary_data_ups = binary_data.append(neg_data.loc[upsample_index]).sample(frac=1)

n = len(binary_data_ups)
print('class 1: ', len(binary_data_ups[binary_data_ups['sentiment']==1])/n)
print('class -1: ', len(binary_data_ups[binary_data_ups['sentiment']==-1])/n)

class 1:  0.5
class -1:  0.5


In [10]:
#split data into testing and training sets 
X_train, X_test, y_train, y_test = train_test_split(binary_data_ups.drop(labels = "sentiment", axis = 1), binary_data_ups["sentiment"], test_size=0.2, random_state=0)

In [11]:
#select just the numerical data
X_train_numerical = X_train.loc[:, ['is_retweet', 'is_quoted', 'retweets', 'favorites', 'followers', 'verified',
               'exclamation_mark_count', 'question_mark_count']]

#### Perform 5-fold cross validation on each of the potential models

In [12]:
#fit the models and obtain test error estimates
nb = naive_bayes(X_train["clean text"].values, y_train.values)
log = logistic(X_train["clean text"].values, y_train.values)
rf = random_forest(X_train["clean text"].values, y_train.values)
log_with_numeric = logistic(X_train["clean text"].values, y_train.values, X_train_numerical.values)
rf_with_numeric = random_forest(X_train["clean text"].values, y_train.values, X_train_numerical.values)

In [13]:
binary_df_ups = pd.DataFrame(data ={'Model':['Naive Bayes upsampled',
                                             'Logistic Regression upsampled',
                                             'Random Forest upsampled', 
                                             'Logistic Regression with Numerical upsampled',
                                             'Random Forest with Numerical upsampled'], 
                                    'Accuracy': [nb[0],log[0],rf[0],log_with_numeric[0],rf_with_numeric[0]],
                                    'Precision': [nb[1],log[1],rf[1],log_with_numeric[1],rf_with_numeric[1]],
                                    'Recall': [nb[2],log[2],rf[2],log_with_numeric[2],rf_with_numeric[2]],
                                    'F score': [nb[3],log[3],rf[3],log_with_numeric[3],rf_with_numeric[3]]})
binary_df_ups

Unnamed: 0,Model,Accuracy,Precision,Recall,F score
0,Naive Bayes upsampled,0.844147,0.844486,0.844082,0.844064
1,Logistic Regression upsampled,0.878985,0.879833,0.878903,0.878879
2,Random Forest upsampled,0.94643,0.949446,0.946292,0.94632
3,Logistic Regression with Numerical upsampled,0.495142,0.460364,0.495896,0.375201
4,Random Forest with Numerical upsampled,0.959406,0.960787,0.959337,0.959365


## Test performance and fitting the final binary model

In [None]:
# note: i don't think we should be doing this until we decide on the final model

#### Fit the best model and obtain the final test error estimate

In [19]:
#vectorize
#vectorizer = CountVectorizer()
#train = vectorizer.fit_transform(X_train["clean text"].values)
#test = vectorizer.transform(X_test["clean text"].values)
        
#run logistic regression
#logistic_classifier = LogisticRegression(random_state=0, max_iter = 2000)
#logistic_classifier.fit(train, y_train.values)
        
#obtain the test accuracy
#logistic_classifier.score(test, y_test.values)

##### Fit the final model

In [20]:
#vectorizer = CountVectorizer()
#vec = vectorizer.fit_transform(data["clean text"].values)
        
#run logistic regression
#logistic_classifier = LogisticRegression(random_state=0, max_iter = 2000)
#logistic_classifier.fit(vec, data["sentiment"].values)

In [21]:
#logistic_classifier.coef_

## Fitting a 3 Class Model

In [14]:
#split data into testing and training sets
X_train, X_test, y_train, y_test = train_test_split(data.drop(labels = "sentiment", axis = 1), data["sentiment"])

In [15]:
#select just the numerical data
X_train_numerical = X_train.loc[:, ['is_retweet', 'is_quoted', 'retweets', 'favorites', 'followers', 'verified',
               'exclamation_mark_count', 'question_mark_count']]

##### Perform 5-fold cross validation on each of the models

In [16]:
#fit the models and obtain test error estimates
nb = naive_bayes(X_train["clean text"].values, y_train.values)
log = logistic(X_train["clean text"].values, y_train.values)
rf = random_forest(X_train["clean text"].values, y_train.values)
log_with_numeric = logistic(X_train["clean text"].values, y_train.values, X_train_numerical.values)
rf_with_numeric = random_forest(X_train["clean text"].values, y_train.values, X_train_numerical.values)

In [17]:
class3_df = pd.DataFrame(data ={'Model':['Naive Bayes',
                                         'Logistic Regression',
                                         'Random Forest', 
                                         'Logistic Regression with Numerical',
                                         'Random Forest with Numerical'], 
                                'Accuracy': [nb[0],log[0],rf[0],log_with_numeric[0],rf_with_numeric[0]],
                                'Precision': [nb[1],log[1],rf[1],log_with_numeric[1],rf_with_numeric[1]],
                                'Recall': [nb[2],log[2],rf[2],log_with_numeric[2],rf_with_numeric[2]],
                                'F score': [nb[3],log[3],rf[3],log_with_numeric[3],rf_with_numeric[3]]})
class3_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F score
0,Naive Bayes,0.701325,0.638655,0.498019,0.526972
1,Logistic Regression,0.698289,0.614781,0.544584,0.56832
2,Random Forest,0.689271,0.611492,0.500463,0.527235
3,Logistic Regression with Numerical,0.648371,0.216124,0.333333,0.262225
4,Random Forest with Numerical,0.695114,0.667204,0.472788,0.500113


## Fitting a 3 Class Model with Upsampling

In [18]:
n = len(data)
print('class 1: ', len(data[data['sentiment']==1])/n)
print('class 0: ', len(data[data['sentiment']==0])/n)
print('class -1: ', len(data[data['sentiment']==-1])/n)

class 1:  0.6481264232972189
class 0:  0.22859015940928853
class -1:  0.12328341729349251


In [19]:
np.random.seed(123)

diffneg = (len(data[data['sentiment']==1]) - 
        len(data[data['sentiment']==-1])) # want to creat 33-33-33 balance
diffneu = (len(data[data['sentiment']==1]) - 
        len(data[data['sentiment']==0])) # want to creat 33-33-33 balance

neg_data = data[data['sentiment']==-1] # sample to choose from
neu_data = data[data['sentiment']==0] # sample to choose from

upsample_index_neg = np.random.choice(neg_data.index,size=diffneg) # sample with repetition
upsample_index_neu = np.random.choice(neu_data.index,size=diffneu) # sample with repetition

data_ups = data.append(neg_data.loc[upsample_index_neg]).append(neu_data.loc[upsample_index_neu]).sample(frac=1)

n = len(data_ups)
print('class 1: ', len(data_ups[data_ups['sentiment']==1])/n)
print('class 0: ', len(data_ups[data_ups['sentiment']==0])/n)
print('class -1: ', len(data_ups[data_ups['sentiment']==-1])/n)

class 1:  0.3333333333333333
class 0:  0.3333333333333333
class -1:  0.3333333333333333


In [20]:
#split data into testing and training sets 
X_train, X_test, y_train, y_test = train_test_split(data_ups.drop(labels = "sentiment", axis = 1), data_ups["sentiment"], test_size=0.2, random_state=0)

In [21]:
#select just the numerical data
X_train_numerical = X_train.loc[:, ['is_retweet', 'is_quoted', 'retweets', 'favorites', 'followers', 'verified',
               'exclamation_mark_count', 'question_mark_count']]

#### Perform 5-fold cross validation on each of the potential models

In [22]:
#fit the models and obtain test error estimates
nb = naive_bayes(X_train["clean text"].values, y_train.values)
log = logistic(X_train["clean text"].values, y_train.values)
rf = random_forest(X_train["clean text"].values, y_train.values)
log_with_numeric = logistic(X_train["clean text"].values, y_train.values, X_train_numerical.values)
rf_with_numeric = random_forest(X_train["clean text"].values, y_train.values, X_train_numerical.values)

In [23]:
class3_df_ups = pd.DataFrame(data ={'Model':['Naive Bayes upsampled',
                                             'Logistic Regression upsampled',
                                             'Random Forest upsampled', 
                                             'Logistic Regression with Numerical upsampled',
                                             'Random Forest with Numerical upsampled'], 
                                    'Accuracy': [nb[0],log[0],rf[0],log_with_numeric[0],rf_with_numeric[0]],
                                    'Precision': [nb[1],log[1],rf[1],log_with_numeric[1],rf_with_numeric[1]],
                                    'Recall': [nb[2],log[2],rf[2],log_with_numeric[2],rf_with_numeric[2]],
                                    'F score': [nb[3],log[3],rf[3],log_with_numeric[3],rf_with_numeric[3]]})
class3_df_ups

Unnamed: 0,Model,Accuracy,Precision,Recall,F score
0,Naive Bayes upsampled,0.709589,0.710537,0.709674,0.707507
1,Logistic Regression upsampled,0.772232,0.771554,0.772363,0.771098
2,Random Forest upsampled,0.88077,0.88248,0.880924,0.878738
3,Logistic Regression with Numerical upsampled,0.334531,0.291014,0.333936,0.170294
4,Random Forest with Numerical upsampled,0.896475,0.896928,0.896583,0.895543


## Test performance and fitting the final binary model

#### Fit the best model and obtain the final test error estimate

In [18]:
#vectorize
#vectorizer = CountVectorizer()
#train = vectorizer.fit_transform(X_train["clean text"].values)
#test = vectorizer.transform(X_test["clean text"].values)
        
#naive bayes    
#nb_classifier = MultinomialNB()
#nb_classifier.fit(train, y_train.values)
      
#obtain the test accuracy
#nb_classifier.score(test, y_test.values)

## Comparing model performance

To improve our results we will next try:
- scaling the numerical data
- upsampling to remove class imbalance (implemented and evaluated)
- n-grams (implemented but not yet evaluated)
- setting max number of words to use
- feature selection with numerical features
- adding penalty (regularizing)
- speed up random forest (run in parallel)
- use MPQA sentiment lexicon


**Binary Results**

In [24]:
binary_df.append(binary_df_ups)

Unnamed: 0,Model,Accuracy,Precision,Recall,F score
0,Naive Bayes,0.879172,0.804868,0.682913,0.719953
1,Logistic Regression,0.884372,0.815195,0.69921,0.736774
2,Random Forest,0.875594,0.800413,0.667119,0.704335
3,Logistic Regression with Numerical,0.841711,0.420855,0.5,0.457025
4,Random Forest with Numerical,0.873246,0.834202,0.629075,0.666645
0,Naive Bayes upsampled,0.844147,0.844486,0.844082,0.844064
1,Logistic Regression upsampled,0.878985,0.879833,0.878903,0.878879
2,Random Forest upsampled,0.94643,0.949446,0.946292,0.94632
3,Logistic Regression with Numerical upsampled,0.495142,0.460364,0.495896,0.375201
4,Random Forest with Numerical upsampled,0.959406,0.960787,0.959337,0.959365


**3 Class Results**

In [25]:
class3_df.append(class3_df_ups)

Unnamed: 0,Model,Accuracy,Precision,Recall,F score
0,Naive Bayes,0.701325,0.638655,0.498019,0.526972
1,Logistic Regression,0.698289,0.614781,0.544584,0.56832
2,Random Forest,0.689271,0.611492,0.500463,0.527235
3,Logistic Regression with Numerical,0.648371,0.216124,0.333333,0.262225
4,Random Forest with Numerical,0.695114,0.667204,0.472788,0.500113
0,Naive Bayes upsampled,0.709589,0.710537,0.709674,0.707507
1,Logistic Regression upsampled,0.772232,0.771554,0.772363,0.771098
2,Random Forest upsampled,0.88077,0.88248,0.880924,0.878738
3,Logistic Regression with Numerical upsampled,0.334531,0.291014,0.333936,0.170294
4,Random Forest with Numerical upsampled,0.896475,0.896928,0.896583,0.895543
