# CP1 Baseline notebook

### By Logan Larson

In [1]:
### import packages ###

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings("ignore")


### load data ###

# Read the cleaned tweet file and discard the 'Unnamed: 0' column in one pass
# (presumably a saved row index — confirm against the script that wrote the file).
df = pd.read_csv('clean_schefter_tweets').drop(columns=['Unnamed: 0'])
df.head(3)

Unnamed: 0,Class,Tweet
0,0,rt rank best kicker history top maybe top 3. f...
1,0,april least one trade made (but one waiting
2,1,giant re-signed restricted free agent guard kevin


In [2]:
# build the bag-of-words feature matrix and the binary label vector
def make_xy(df, vectorizer=None):
    """Turn the tweet frame into a sparse feature matrix and a 0/1 label array.

    Parameters
    ----------
    df : DataFrame
        Must expose a ``Tweet`` column (text) and a ``Class`` column (labels).
    vectorizer : object with ``fit_transform``, optional
        Fitted on every row of ``df``. A fresh ``CountVectorizer()`` is used
        when omitted.

    Returns
    -------
    X : sparse matrix in CSC format
        Whatever ``vectorizer.fit_transform`` produced, converted with ``tocsc()``.
    y : ndarray of int
        1 where ``df.Class == 1``, otherwise 0.
    """
    if vectorizer is None:
        vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df.Tweet.values.astype('U'))  # convert object type to unicode
    X = X.tocsc()  # some versions of sklearn return COO format
    # Use the builtin int: the `np.int` alias was deprecated in NumPy 1.20 and
    # removed in 1.24, where it raises AttributeError.
    y = (df.Class == 1).values.astype(int)
    return X, y


# define cross-validation score
def cv_score(clf, X, y, scorefunc, nfold=5):
    """Average ``scorefunc`` over the held-out folds of a K-fold split.

    Parameters
    ----------
    clf : estimator with ``fit``
        Refitted in place on every fold, so on return it holds the fit from
        the last fold.
    X, y : indexable by integer arrays
        Features and labels; rows are selected with the index arrays that
        ``KFold.split`` yields.
    scorefunc : callable ``(clf, X_held_out, y_held_out) -> number``
        Evaluated on the held-out part of each fold.
    nfold : int, default 5
        Number of folds. The default keeps the previous hard-coded behaviour.

    Returns
    -------
    float
        Sum of the per-fold scores divided by ``nfold``.
    """
    total = 0.
    # KFold is used with its default settings, exactly as before
    for train, test in KFold(nfold).split(X):
        clf.fit(X[train], y[train])  # fit on the in-fold rows
        total += scorefunc(clf, X[test], y[test])  # score on the held-out rows
    return total / nfold  # average


# define log-likelihood score function
def log_likelihood(clf, x, y):
    """Sum the log-probability the classifier assigns to each sample's own class.

    Column 0 of ``clf.predict_log_proba(x)`` is read for samples where
    ``y == 0`` and column 1 for every other sample; the two partial sums
    are added together.
    """
    log_probs = clf.predict_log_proba(x)
    is_negative = (y == 0)
    is_positive = ~is_negative
    negative_total = log_probs[is_negative, 0].sum()
    positive_total = log_probs[is_positive, 1].sum()
    return negative_total + positive_total

In [3]:
# vectorize before train/test split
# NOTE(review): make_xy fits the vectorizer on every row of df, so the vocabulary
# is learned from tweets that later land in the test set — confirm this is intended.
X, y = make_xy(df)

# split dataset into a training and test set
# (random_state=1 pins the shuffle so the split is repeatable; no test_size is
# passed, so train_test_split's default proportion applies)
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=1)

## Show train/test classification for both models

#### 1. Naive Bayes model

In [10]:
# Baseline Bernoulli Naive Bayes with the estimator's default settings
NB = BernoulliNB()
NB.fit(xtrain, ytrain)
NBtrain_pred, NBtest_pred = NB.predict(xtrain), NB.predict(xtest)

# Per-class precision / recall / F1 on the training and test splits
print('\n Naive Bayes classifier: \n \n')
print('\n Training Classification Report: \n',
      classification_report(ytrain, NBtrain_pred, labels=[0,1]))
print('\n Test Classification Report: \n',
      classification_report(ytest, NBtest_pred, labels=[0,1]))


 Naive Bayes classifier: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0       0.96      0.89      0.93     19185
           1       0.70      0.88      0.78      5595

    accuracy                           0.89     24780
   macro avg       0.83      0.89      0.85     24780
weighted avg       0.90      0.89      0.89     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0       0.95      0.89      0.92      6391
           1       0.69      0.85      0.76      1870

    accuracy                           0.88      8261
   macro avg       0.82      0.87      0.84      8261
weighted avg       0.89      0.88      0.88      8261



#### 2. Logistic Regression model

In [7]:
# Baseline logistic regression with the estimator's default settings
LR = LogisticRegression()
LR.fit(xtrain, ytrain)
LR_train_pred, LR_test_pred = LR.predict(xtrain), LR.predict(xtest)

# Per-class precision / recall / F1 on the training and test splits
print('\n Logistic Regression classifier: \n \n')
print('\n Training Classification Report: \n',
      classification_report(ytrain, LR_train_pred, labels=[0,1]))
print('\n Test Classification Report: \n',
      classification_report(ytest, LR_test_pred, labels=[0,1]))


 Logistic Regression classifier: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0       0.95      0.97      0.96     19185
           1       0.90      0.82      0.86      5595

    accuracy                           0.94     24780
   macro avg       0.92      0.90      0.91     24780
weighted avg       0.94      0.94      0.94     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0       0.93      0.96      0.94      6391
           1       0.84      0.75      0.79      1870

    accuracy                           0.91      8261
   macro avg       0.88      0.85      0.87      8261
weighted avg       0.91      0.91      0.91      8261



## Determine if there is overfitting

For naive bayes model: 
    - Apparent overfitting in terms of precision for both test and training sets
    - Meanwhile, performs well in terms of recall and overall accuracy
    
For logistic regression model:
    - Interestingly, the opposite seems to be the case for this model
    - Performs well in terms of precision, but overfit in terms of recall
    
Since overfitting is present in both, I'll proceed with regularized models for both naive bayes and logistic regression

## Build regularized models

#### 1. Naive Bayes

In [8]:
# define the grid of parameters to search over
alphas = [.01, .1, 1, 5, 10, 50, 100]
min_df = 0.001

# Pick a random subset of row positions to tune on. Despite the `itest` name,
# this subset is the data that cv_score cross-validates over below.
# random_state is pinned so the selected alpha is reproducible across runs.
_, itest = train_test_split(range(df.shape[0]), random_state=1)
# builtin bool: the `np.bool` alias was removed in NumPy 1.24
mask = np.zeros(df.shape[0], dtype=bool)
mask[itest] = True

# The features do not depend on alpha, so vectorize once instead of once per alpha.
vectorizer = CountVectorizer(min_df=min_df)
Xthis, ythis = make_xy(df, vectorizer)
Xtrainthis = Xthis[mask]
ytrainthis = ythis[mask]

# find the best value for alpha: keep the one with the highest
# cross-validated log-likelihood
best_alpha = None
maxscore = -np.inf
for alpha in alphas:
    clf = BernoulliNB(alpha=alpha)

    cvscore = cv_score(clf, Xtrainthis, ytrainthis, log_likelihood)

    if cvscore > maxscore:
        maxscore = cvscore
        best_alpha = alpha

print('Optimal alpha: {}'.format(best_alpha))

Optimal alpha: 5


In [9]:
# Refit Naive Bayes on the main training split with the selected smoothing value
NB_tuned = BernoulliNB(alpha=best_alpha)
NB_tuned.fit(xtrain, ytrain)

NB_tuned_train_pred, NB_tuned_test_pred = NB_tuned.predict(xtrain), NB_tuned.predict(xtest)

# Per-class precision / recall / F1 on the training and test splits
print('\n Tuned Naive Bayes classifier: \n \n')
print('\n Training Classification Report: \n',
      classification_report(ytrain, NB_tuned_train_pred, labels=[0,1]))
print('\n Test Classification Report: \n',
      classification_report(ytest, NB_tuned_test_pred, labels=[0,1]))


 Tuned Naive Bayes classifier: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.92      0.93     19185
           1       0.73      0.81      0.77      5595

    accuracy                           0.89     24780
   macro avg       0.84      0.86      0.85     24780
weighted avg       0.90      0.89      0.89     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0       0.93      0.91      0.92      6391
           1       0.72      0.78      0.75      1870

    accuracy                           0.88      8261
   macro avg       0.83      0.85      0.84      8261
weighted avg       0.89      0.88      0.88      8261



#### Compared to non-regularized model:

- Less overfit in terms of precision, but more overfit in terms of recall
- Overall accuracy barely changed

#### 2. Logistic regression

In [12]:
# hypertuning C parameter
# 10-fold cross-validated grid search over C, selecting on accuracy
LR2 = LogisticRegression()
parameters = {"C": [0.0001, 0.001, 0.1, 1, 2, 3, 4, 5, 10, 100, 1000, 10000]}
fitmodel = GridSearchCV(LR2, param_grid=parameters, cv=10, scoring="accuracy").fit(xtrain, ytrain)

# GridSearchCV refits the winning configuration on all of xtrain by default
# (refit=True), so reuse that estimator instead of fitting the same model again.
LR_tuned = fitmodel.best_estimator_
LR_tuned_train_pred = LR_tuned.predict(xtrain)
LR_tuned_test_pred = LR_tuned.predict(xtest)

print('Optimal C value:', fitmodel.best_params_['C'])
print('\n Tuned Logistic Regression classifier: \n \n')
print('\n Training Classification Report: \n', classification_report(ytrain, LR_tuned_train_pred, labels=[0,1]))
print('\n Test Classification Report: \n', classification_report(ytest, LR_tuned_test_pred, labels=[0,1]))

Optimal C value: 1

 Tuned Logistic Regression classifier: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0       0.95      0.97      0.96     19185
           1       0.90      0.82      0.86      5595

    accuracy                           0.94     24780
   macro avg       0.92      0.90      0.91     24780
weighted avg       0.94      0.94      0.94     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0       0.93      0.96      0.94      6391
           1       0.84      0.75      0.79      1870

    accuracy                           0.91      8261
   macro avg       0.88      0.85      0.87      8261
weighted avg       0.91      0.91      0.91      8261



#### Compared to non-regularized model

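Since the optimal C turned out to be the same as the default setting (1), the tuned model is identical to the baseline. Note that scikit-learn's `LogisticRegression` applies an L2 penalty by default, so the baseline is itself already regularized rather than non-regularized. Let's instead look at a ridge and lasso classifier.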

In [19]:
# lasso

# Name the solver explicitly: the L1 penalty is not supported by every solver,
# and from scikit-learn 0.22 the default solver ('lbfgs') rejects penalty='l1'.
# 'liblinear' supports L1 and was the default in earlier releases.
lasso = LogisticRegression(C=1, penalty='l1', solver='liblinear').fit(xtrain, ytrain)

lasso_train_pred = lasso.predict(xtrain)
lasso_test_pred = lasso.predict(xtest)

# Per-class precision / recall / F1 on the training and test splits
print('\n Logistic Regression classifier with L1 penalty: \n \n')
print('\n Training Classification Report: \n', classification_report(ytrain, lasso_train_pred, labels=[0,1]))
print('\n Test Classification Report: \n', classification_report(ytest, lasso_test_pred, labels=[0,1]))


 Logistic Regression classifier with L1 penalty: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.97      0.96     19185
           1       0.88      0.80      0.84      5595

    accuracy                           0.93     24780
   macro avg       0.91      0.89      0.90     24780
weighted avg       0.93      0.93      0.93     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0       0.93      0.96      0.94      6391
           1       0.84      0.75      0.79      1870

    accuracy                           0.91      8261
   macro avg       0.89      0.85      0.87      8261
weighted avg       0.91      0.91      0.91      8261



#### Compared to non-regularized model

Performs essentially the same as the baseline model on test data, but slightly worse on most training metrics (class-1 precision, recall and F1 all drop by about 0.02).

In [20]:
# ridge

# Logistic regression with an explicit L2 penalty at C=1
ridge = LogisticRegression(penalty='l2', C=1)
ridge.fit(xtrain, ytrain)

ridge_train_pred, ridge_test_pred = ridge.predict(xtrain), ridge.predict(xtest)

# Per-class precision / recall / F1 on the training and test splits
print('\n Logistic Regression classifier with L2 penalty: \n \n')
print('\n Training Classification Report: \n',
      classification_report(ytrain, ridge_train_pred, labels=[0,1]))
print('\n Test Classification Report: \n',
      classification_report(ytest, ridge_test_pred, labels=[0,1]))


 Logistic Regression classifier with L2 penalty: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0       0.95      0.97      0.96     19185
           1       0.90      0.82      0.86      5595

    accuracy                           0.94     24780
   macro avg       0.92      0.90      0.91     24780
weighted avg       0.94      0.94      0.94     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0       0.93      0.96      0.94      6391
           1       0.84      0.75      0.79      1870

    accuracy                           0.91      8261
   macro avg       0.88      0.85      0.87      8261
weighted avg       0.91      0.91      0.91      8261



#### Compared to non-regularized model:

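Performs the same. This is expected: `penalty='l2'` with `C=1` is exactly the default `LogisticRegression` configuration used for the baseline.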

## Which metric aligns with the business problem?

- False positive: irrelevant tweet classified as relevant
- False negative: relevant tweet classified as irrelevant

False negatives are more acceptable in this problem, so I'm better served by prioritizing precision, TP / (TP + FP), over recall, TP / (TP + FN).

In [23]:
# summary table 

from tabulate import tabulate

# One column per model; the values are transcribed by hand from the
# classification reports printed earlier in the notebook.
model_columns = ['', 'NB', 'NB (tuned)', 'LR', 'LR (tuned)', 'Ridge', 'Lasso']

metric_rows = [
    ['Training Precision (0)', 0.96, 0.94, 0.95, 0.95, 0.95, 0.94],
    ['Training Precision (1)', 0.70, 0.73, 0.90, 0.90, 0.90, 0.88],
    ['Test Precision (0)', 0.95, 0.93, 0.93, 0.93, 0.93, 0.93],
    ['Test Precision (1)', 0.69, 0.72, 0.84, 0.84, 0.84, 0.84],
    ['Training Recall (0)', 0.89, 0.92, 0.97, 0.97, 0.97, 0.97],
    ['Training Recall (1)', 0.88, 0.81, 0.82, 0.82, 0.82, 0.80],
    ['Test Recall (0)', 0.89, 0.91, 0.96, 0.96, 0.96, 0.96],
    ['Test Recall (1)', 0.85, 0.78, 0.75, 0.75, 0.75, 0.75],
    ['Training F1 score (0)', 0.93, 0.93, 0.96, 0.96, 0.96, 0.96],
    ['Training F1 score (1)', 0.78, 0.77, 0.86, 0.86, 0.86, 0.84],
    ['Test F1 score (0)', 0.92, 0.92, 0.94, 0.94, 0.94, 0.94],
    ['Test F1 score (1)', 0.76, 0.75, 0.79, 0.79, 0.79, 0.79],
]

# Class counts per split; a single value is listed per row, which lands in the first model column
support_rows = [
    ['Count - Training (0)', 19185],
    ['Count - Training (1)', 5595],
    ['Count - Test (0)', 6391],
    ['Count - Test (1)', 1870],
]

print(tabulate(metric_rows + support_rows, headers=model_columns))

                              NB    NB (tuned)    LR    LR (tuned)    Ridge    Lasso
----------------------  --------  ------------  ----  ------------  -------  -------
Training Precision (0)      0.96          0.94  0.95          0.95     0.95     0.94
Training Precision (1)      0.7           0.73  0.9           0.9      0.9      0.88
Test Precision (0)          0.95          0.93  0.93          0.93     0.93     0.93
Test Precision (1)          0.69          0.72  0.84          0.84     0.84     0.84
Training Recall (0)         0.89          0.92  0.97          0.97     0.97     0.97
Training Recall (1)         0.88          0.81  0.82          0.82     0.82     0.8
Test Recall (0)             0.89          0.91  0.96          0.96     0.96     0.96
Test Recall (1)             0.85          0.78  0.75          0.75     0.75     0.75
Training F1 score (0)       0.93          0.93  0.96          0.96     0.96     0.96
Training F1 score (1)       0.78          0.77  0.86          0.86

In prioritizing precision, my choice is to use the logistic regression model.