In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from models import naive_bayes, logistic, random_forest
from models import naive_bayes, logistic, random_forest

In [2]:
#import the data
data = pd.read_csv('clean_data.csv.txt',index_col = 0)

### Fitting a Binary Model

In [3]:
#select only tweets that are pro and anti
binary_data = data[data["sentiment"] != 0]

In [4]:
#split data into testing and training sets 
X_train, X_test, y_train, y_test = train_test_split(binary_data.drop(labels = "sentiment", axis = 1), binary_data["sentiment"], test_size=0.2, random_state=0)

In [None]:
#select just the numerical data
X_train_numerical = X_train.loc[:, ['is_retweet', 'is_quoted', 'retweets', 'favorites', 'followers', 'verified',
               'exclamation_mark_count', 'question_mark_count']]

###### Perform 5-fold cross validation on each of the potential models

In [13]:
#fit the models and obtain test error estimates
nb = naive_bayes(X_train["clean text"].values, y_train.values)
log = logistic(X_train["clean text"].values, y_train.values)
rf = random_forest(X_train["clean text"].values, y_train.values)
log_with_numeric = logistic(X_train["clean text"].values, y_train.values, X_train_numerical.values)
rf_with_numeric = random_forest(X_train["clean text"].values, y_train.values, X_train_numerical.values)

In [14]:
binary_df = pd.DataFrame(data ={'Model':['Naive Bayes',
                            'Logistic Regression',
                            'Random Forest', 
                             'Logistic Regression with Numerical',
                            'Random Forest with Numerical'], 'Accuracy': [nb,log,rf,log_with_numeric,rf_with_numeric] } )
binary_df

Unnamed: 0,Model,Accuracy
0,Naive Bayes,0.879396
1,Logistic Regression,0.884261
2,Random Forest,0.873637
3,Logistic Regression with Numerical,0.841711
4,Random Forest with Numerical,0.873637


Logistic regression had the best accuracy rate. This will be our final model

##### Fit the best model and obtain the final test error estimate

In [18]:
#imports
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
#vectorize
vectorizer = CountVectorizer()
train = vectorizer.fit_transform(X_train["clean text"].values)
test = vectorizer.transform(X_test["clean text"].values)
        
#run logistic regression
logistic_classifier = LogisticRegression(random_state=0, max_iter = 2000)
logistic_classifier.fit(train, y_train.values)
        
#obtain the test accuracy
logistic_classifier.score(test, y_test.values)

0.8761180679785331

##### Fit the final model

In [21]:
vectorizer = CountVectorizer()
vec = vectorizer.fit_transform(data["clean text"].values)
        
#run logistic regression
logistic_classifier = LogisticRegression(random_state=0, max_iter = 2000)
logistic_classifier.fit(vec, data["sentiment"].values)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=2000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
logistic_classifier.coef_

array([[-0.064229  , -0.08045319, -0.01437724, ..., -0.00890921,
         0.67050556,  0.39071954],
       [ 0.39435001, -0.45111389, -0.08785605, ...,  0.33158911,
        -0.13981684, -0.27403148],
       [-0.33012102,  0.53156708,  0.10223329, ..., -0.32267991,
        -0.53068872, -0.11668806]])

## Fitting a 3 Class Model

In [24]:
#split data into testing and training sets
X_train, X_test, y_train, y_test = train_test_split(data.drop(labels = "sentiment", axis = 1), data["sentiment"])

In [25]:
#select just the numerical data
X_train_numerical = X_train.loc[:, ['is_retweet', 'is_quoted', 'retweets', 'favorites', 'followers', 'verified',
               'exclamation_mark_count', 'question_mark_count']]

##### Perform 5-fold cross validation on each of the models

In [26]:
#fit the models and obtain test error estimates
nb = naive_bayes(X_train["clean text"].values, y_train.values)
log = logistic(X_train["clean text"].values, y_train.values)
rf = random_forest(X_train["clean text"].values, y_train.values)
log_with_numeric = logistic(X_train["clean text"].values, y_train.values, X_train_numerical.values)
rf_with_numeric = random_forest(X_train["clean text"].values, y_train.values, X_train_numerical.values)

In [28]:
class3_df = pd.DataFrame(data ={'Model':['Naive Bayes',
                            'Logistic Regression',
                            'Random Forest', 
                             'Logistic Regression with Numerical',
                            'Random Forest with Numerical'], 'Accuracy': [nb,log,rf,log_with_numeric,rf_with_numeric] } )
class3_df

Unnamed: 0,Model,Accuracy
0,Naive Bayes,0.701417
1,Logistic Regression,0.693044
2,Random Forest,0.685683
3,Logistic Regression with Numerical,0.647866
4,Random Forest with Numerical,0.691342


###### Fit the best model and obtain the final test error estimate

In [29]:
#imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [30]:
#vectorize
vectorizer = CountVectorizer()
train = vectorizer.fit_transform(X_train["clean text"].values)
test = vectorizer.transform(X_test["clean text"].values)
        
#naive bayes    
nb_classifier = MultinomialNB()
nb_classifier.fit(train, y_train.values)
      
#obtain the test accuracy
nb_classifier.score(test, y_test.values)

0.7071487717361303

This is not much better than the null classifier as we found in the exploratory data analysis. To improve this we will next try:
- upsampling
- n-grams
- scaling the numerical data