# CP1 Extended Modeling

## By Logan Larson

In [3]:
### import packages ###

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings("ignore")


### load data ###

df = pd.read_csv('clean_schefter_tweets')
del df['Unnamed: 0']
df.head(8)

Unnamed: 0,Class,Tweet
0,0,rt rank best kicker history top maybe top 3. f...
1,0,april least one trade made (but one waiting
2,1,giant re-signed restricted free agent guard kevin
3,0,@profootballtalk aaron retiring least trying g...
4,0,jimmy clausen scheduled fly washington spend s...
5,0,"trip washington enough, redskin also work jimm..."
6,1,jaguar re-signed veteran defensive end reggie ...
7,1,surprise here: bear informed defensive end ale...


In [4]:
# define vectorizer
def make_xy(df, vectorizer=None):   
    if vectorizer is None:
        vectorizer = TfidfVectorizer(
            stop_words='english',
            analyzer='word',
            ngram_range=(1,1)
        )
    X = vectorizer.fit_transform(df.Tweet.values.astype('U')) # convert object type to unicode
    X = X.tocsc()  # some versions of sklearn return COO format
    y = (df.Class == 1).values.astype(np.int)
    return X, y


# define cross-validation score
def cv_score(clf, X, y, scorefunc):
    result = 0.
    nfold = 5
    for train, test in KFold(nfold).split(X): # split data into train/test groups, 5 times
        clf.fit(X[train], y[train]) # fit the classifier, passed is as clf.
        result += scorefunc(clf, X[test], y[test]) # evaluate score function on held-out data
    return result / nfold # average


# define log-likelihood score function
def log_likelihood(clf, x, y):
    prob = clf.predict_log_proba(x)
    irrelevant = y == 0
    relevant = ~irrelevant
    return prob[irrelevant, 0].sum() + prob[relevant, 1].sum()

In [5]:
# vectorize before train/test split
X, y = make_xy(df)

# split dataset into a training and test set
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=1)

## Show train/test classification for both models

#### 1. Bernoulli Naive Bayes model

In [6]:
BNB = BernoulliNB().fit(xtrain, ytrain)
BNB_train_pred = BNB.predict(xtrain)
BNB_test_pred = BNB.predict(xtest)

print('\n Bernoulli Naive Bayes baseline classifier: \n \n')
print('\n Training Classification Report: \n', classification_report(ytrain, BNB_train_pred, digits = 3, labels=[0,1]))
print('\n Test Classification Report: \n', classification_report(ytest, BNB_test_pred, digits = 3, labels=[0,1]))


 Bernoulli Naive Bayes baseline classifier: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0      0.961     0.899     0.929     19185
           1      0.716     0.875     0.787      5595

    accuracy                          0.893     24780
   macro avg      0.838     0.887     0.858     24780
weighted avg      0.906     0.893     0.897     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0      0.952     0.893     0.921      6391
           1      0.697     0.845     0.764      1870

    accuracy                          0.882      8261
   macro avg      0.825     0.869     0.843      8261
weighted avg      0.894     0.882     0.886      8261



#### 2. Multinomial Naive Bayes model

In [7]:
from sklearn.naive_bayes import MultinomialNB

MNB = MultinomialNB()
MNB.fit(xtrain, ytrain)

MNBtrain_pred = MNB.predict(xtrain)
MNBtest_pred = MNB.predict(xtest)

print('\n Multinomial Naive Bayes baseline classifier: \n \n')
print('\n Training Classification Report: \n', classification_report(ytrain, MNBtrain_pred, labels=[0,1], digits=3))
print('\n Test Classification Report: \n', classification_report(ytest, MNBtest_pred, labels=[0,1], digits=3))


 Multinomial Naive Bayes baseline classifier: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0      0.921     0.957     0.938     19185
           1      0.828     0.718     0.769      5595

    accuracy                          0.903     24780
   macro avg      0.875     0.837     0.854     24780
weighted avg      0.900     0.903     0.900     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0      0.912     0.951     0.931      6391
           1      0.805     0.684     0.740      1870

    accuracy                          0.891      8261
   macro avg      0.858     0.818     0.835      8261
weighted avg      0.887     0.891     0.888      8261



#### 3. Logistic Regression model

In [8]:
LR = LogisticRegression().fit(xtrain, ytrain)
LR_train_pred = LR.predict(xtrain)
LR_test_pred = LR.predict(xtest)

print('\n Logistic Regression baseline classifier: \n \n')
print('\n Training Classification Report: \n', classification_report(ytrain, LR_train_pred, digits = 3, labels=[0,1]))
print('\n Test Classification Report: \n', classification_report(ytest, LR_test_pred, digits = 3, labels=[0,1]))


 Logistic Regression baseline classifier: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0      0.931     0.972     0.951     19185
           1      0.886     0.755     0.815      5595

    accuracy                          0.923     24780
   macro avg      0.909     0.863     0.883     24780
weighted avg      0.921     0.923     0.920     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0      0.919     0.967     0.943      6391
           1      0.864     0.709     0.779      1870

    accuracy                          0.909      8261
   macro avg      0.891     0.838     0.861      8261
weighted avg      0.907     0.909     0.906      8261



## Determine if there is overfitting

Strictly speaking in terms of recall, we again have little evidence of overfitting in any of our models. However, given the changes seen in regularizing the baseline models, I'll again proceed with regularization to see if recall on positive cases might improve.

## Determine optimal hyperparameters for regularization

#### 1. Bernoulli Naive Bayes (alpha)

In [9]:
# define the grid of parameters to search over
alphas = [.01, .1, 1, 5, 10, 50, 100]
min_df = 0.001

# find the best value for alpha
best_alpha = None
_, itest = train_test_split(range(df.shape[0]))
mask = np.zeros(df.shape[0], dtype=np.bool)
mask[itest] = True
maxscore = -np.inf
for alpha in alphas:        
    vectorizer = TfidfVectorizer(min_df = min_df)       
    Xthis, ythis = make_xy(df, vectorizer)
    Xtrainthis = Xthis[mask]
    ytrainthis = ythis[mask]
    
    clf = BernoulliNB(alpha=alpha)
    
    cvscore = cv_score(clf, Xtrainthis, ytrainthis, log_likelihood)
    
    if cvscore > maxscore:
        maxscore = cvscore
        BNB_best_alpha = alpha
        
print('Bernoulli NB optimal alpha: {}'.format(BNB_best_alpha))

Bernoulli NB optimal alpha: 5


#### 2. Multinomial Naive Bayes (alpha)

In [10]:
# define the grid of parameters to search over
alphas = [.01, .1, 1, 5, 10, 50, 100]
min_df = 0.001

# find the best value for alpha
best_alpha = None
_, itest = train_test_split(range(df.shape[0]))
mask = np.zeros(df.shape[0], dtype=np.bool)
mask[itest] = True
maxscore = -np.inf
for alpha in alphas:        
    vectorizer = TfidfVectorizer(min_df = min_df)       
    Xthis, ythis = make_xy(df, vectorizer)
    Xtrainthis = Xthis[mask]
    ytrainthis = ythis[mask]
    
    clf = MultinomialNB(alpha=alpha)
    
    cvscore = cv_score(clf, Xtrainthis, ytrainthis, log_likelihood)
    
    if cvscore > maxscore:
        maxscore = cvscore
        MNB_best_alpha = alpha
        
print('Multinomial Naive Bayes optimal alpha: {}'.format(MNB_best_alpha))

Multinomial Naive Bayes optimal alpha: 0.1


#### 3. Logistic Regression (c)

In [11]:
# hypertuning C parameter
LR2 = LogisticRegression()
parameters = {"C": [0.0001, 0.001, 0.1, 1, 2, 3, 4, 5, 10, 100, 1000, 10000]}
fitmodel = GridSearchCV(LR2, param_grid=parameters, cv=10, scoring="accuracy").fit(xtrain,ytrain)
fitmodel.best_params_, fitmodel.best_score_, fitmodel.cv_results_

print('Optimal C value:', fitmodel.best_params_['C'])

Optimal C value: 2


## Build regularized models

#### 1. Bernoulli Naive Bayes model

In [12]:
BNB_tuned = BernoulliNB(alpha=BNB_best_alpha).fit(xtrain, ytrain)

BNB_tuned_train_pred = BNB_tuned.predict(xtrain)
BNB_tuned_test_pred = BNB_tuned.predict(xtest)

print('\n Tuned Bernoulli Naive Bayes classifier: \n \n')
print('\n Training Classification Report: \n', classification_report(ytrain, BNB_tuned_train_pred, digits = 3, labels=[0,1]))
print('\n Test Classification Report: \n', classification_report(ytest, BNB_tuned_test_pred, digits = 3, labels=[0,1]))


 Tuned Bernoulli Naive Bayes classifier: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0      0.936     0.923     0.930     19185
           1      0.749     0.783     0.766      5595

    accuracy                          0.892     24780
   macro avg      0.842     0.853     0.848     24780
weighted avg      0.894     0.892     0.893     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0      0.930     0.921     0.925      6391
           1      0.738     0.763     0.750      1870

    accuracy                          0.885      8261
   macro avg      0.834     0.842     0.838      8261
weighted avg      0.887     0.885     0.886      8261



#### Compared to non-regularized model

We saw improvement in precision, but a significant reduction in recall for the positive class. However, we do see less overfitting in terms of recall

#### 2. Multinomial Naive Bayes model

In [13]:
MNB_tuned = MultinomialNB(alpha=MNB_best_alpha).fit(xtrain, ytrain)

MNB_tuned_train_pred = MNB_tuned.predict(xtrain)
MNB_tuned_test_pred = MNB_tuned.predict(xtest)

print('\n Tuned Multinomial Naive Bayes classifier: \n \n')
print('\n Training Classification Report: \n', classification_report(ytrain, MNB_tuned_train_pred, digits = 3, labels=[0,1]))
print('\n Test Classification Report: \n', classification_report(ytest, MNB_tuned_test_pred, digits = 3, labels=[0,1]))


 Tuned Multinomial Naive Bayes classifier: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0      0.930     0.949     0.939     19185
           1      0.811     0.757     0.783      5595

    accuracy                          0.905     24780
   macro avg      0.871     0.853     0.861     24780
weighted avg      0.903     0.905     0.904     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0      0.916     0.945     0.930      6391
           1      0.788     0.702     0.743      1870

    accuracy                          0.890      8261
   macro avg      0.852     0.823     0.836      8261
weighted avg      0.887     0.890     0.888      8261



*** Compared to non-regularized model:

We see increased recall on both test and training data, but also increased overfitting on the positive class.

#### 3. Logistic regression

In [14]:
LR_tuned = LogisticRegression(C=fitmodel.best_params_['C']).fit(xtrain, ytrain)
LR_tuned_train_pred = LR_tuned.predict(xtrain)
LR_tuned_test_pred = LR_tuned.predict(xtest)

print('\n Tuned Logistic Regression classifier: \n \n')
print('\n Training Classification Report: \n', classification_report(ytrain, LR_tuned_train_pred, digits = 3, labels=[0,1]))
print('\n Test Classification Report: \n', classification_report(ytest, LR_tuned_test_pred, digits = 3, labels=[0,1]))


 Tuned Logistic Regression classifier: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0      0.939     0.971     0.955     19185
           1      0.888     0.785     0.833      5595

    accuracy                          0.929     24780
   macro avg      0.914     0.878     0.894     24780
weighted avg      0.928     0.929     0.928     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0      0.924     0.964     0.943      6391
           1      0.855     0.728     0.786      1870

    accuracy                          0.911      8261
   macro avg      0.890     0.846     0.865      8261
weighted avg      0.908     0.911     0.908      8261



**** Compared to non-regularized model:
We see increased recall performance but more overfitting.

### Experimenting with increased n-gram range

In [15]:
# redefine TF-IDF vectorizer
def make_xy2(df, vectorizer=None):   
    if vectorizer is None:
        vectorizer = TfidfVectorizer(
            stop_words='english',
            analyzer='word',
            ngram_range=(1,5)
        )
    X = vectorizer.fit_transform(df.Tweet.values.astype('U')) # convert object type to unicode
    X = X.tocsc()  # some versions of sklearn return COO format
    y = (df.Class == 1).values.astype(np.int)
    return X, y

In [16]:
# vectorize before train/test split
X, y = make_xy2(df)

# split dataset into a training and test set
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=1)

#### 1. Bernoulli naive Bayes model

In [17]:
NB = BernoulliNB().fit(xtrain, ytrain)
NB_train_pred = NB.predict(xtrain)
NB_test_pred = NB.predict(xtest)

print('\n Naive Bayes classifier: \n \n')
print('\n Training Classification Report: \n', classification_report(ytrain, NB_train_pred, labels=[0,1]))
print('\n Test Classification Report: \n', classification_report(ytest, NB_test_pred, labels=[0,1]))


 Naive Bayes classifier: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0       0.81      1.00      0.89     19185
           1       1.00      0.18      0.30      5595

    accuracy                           0.81     24780
   macro avg       0.90      0.59      0.60     24780
weighted avg       0.85      0.81      0.76     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0       0.78      1.00      0.88      6391
           1       0.98      0.05      0.09      1870

    accuracy                           0.78      8261
   macro avg       0.88      0.52      0.48      8261
weighted avg       0.83      0.78      0.70      8261



#### 2. Multinomial naive Bayes model

In [18]:
MNB = MultinomialNB()
MNB.fit(xtrain, ytrain)

MNBtrain_pred = MNB.predict(xtrain)
MNBtest_pred = MNB.predict(xtest)

print('\n Multinomial Naive Bayes baseline classifier: \n \n')
print('\n Training Classification Report: \n', classification_report(ytrain, MNBtrain_pred, labels=[0,1], digits=3))
print('\n Test Classification Report: \n', classification_report(ytest, MNBtest_pred, labels=[0,1], digits=3))


 Multinomial Naive Bayes baseline classifier: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0      0.866     1.000     0.928     19185
           1      0.998     0.467     0.637      5595

    accuracy                          0.880     24780
   macro avg      0.932     0.734     0.782     24780
weighted avg      0.896     0.880     0.862     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0      0.798     0.999     0.887      6391
           1      0.984     0.135     0.238      1870

    accuracy                          0.804      8261
   macro avg      0.891     0.567     0.563      8261
weighted avg      0.840     0.804     0.740      8261



In [19]:
LR = LogisticRegression().fit(xtrain, ytrain)
LR_train_pred = LR.predict(xtrain)
LR_test_pred = LR.predict(xtest)

print('\n Logistic Regression classifier: \n \n')
print('\n Training Classification Report: \n', classification_report(ytrain, LR_train_pred, digits=3, labels=[0,1]))
print('\n Test Classification Report: \n', classification_report(ytest, LR_test_pred, digits=3, labels=[0,1]))


 Logistic Regression classifier: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0      0.913     0.995     0.952     19185
           1      0.976     0.676     0.799      5595

    accuracy                          0.923     24780
   macro avg      0.945     0.836     0.876     24780
weighted avg      0.927     0.923     0.918     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0      0.868     0.987     0.924      6391
           1      0.918     0.487     0.636      1870

    accuracy                          0.874      8261
   macro avg      0.893     0.737     0.780      8261
weighted avg      0.879     0.874     0.859      8261



In [114]:
# redefine Count vectorizer to use n-grams of one to three words

def make_xy4(df, vectorizer=None):   
    if vectorizer is None:
        vectorizer = CountVectorizer(
            stop_words='english',
            analyzer='word',
            ngram_range=(1,3)
        )
    X = vectorizer.fit_transform(df.Tweet.values.astype('U')) # convert object type to unicode
    X = X.tocsc()  # some versions of sklearn return COO format
    y = (df.Class == 1).values.astype(np.int)
    return X, y

# vectorize before train/test split
X, y = make_xy4(df)

# split dataset into a training and test set
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=1)

BNB = MultinomialNB().fit(xtrain, ytrain)
BNB_train_pred = BNB.predict(xtrain)
BNB_test_pred = BNB.predict(xtest)

print('\n N-gram range: 1-3 \n \n')
print('\n Training Classification Report: \n', classification_report(ytrain, BNB_train_pred, digits = 3, labels=[0,1]))
print('\n Test Classification Report: \n', classification_report(ytest, BNB_test_pred, digits = 3, labels=[0,1]))


 Bernoulli Naive Bayes baseline classifier: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0      0.995     0.977     0.986     19185
           1      0.926     0.983     0.954      5595

    accuracy                          0.978     24780
   macro avg      0.961     0.980     0.970     24780
weighted avg      0.979     0.978     0.979     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0      0.951     0.889     0.919      6391
           1      0.691     0.845     0.760      1870

    accuracy                          0.879      8261
   macro avg      0.821     0.867     0.840      8261
weighted avg      0.892     0.879     0.883      8261



In [115]:
# redefine Count vectorizer to use n-grams of one to four words

def make_xy4(df, vectorizer=None):   
    if vectorizer is None:
        vectorizer = CountVectorizer(
            stop_words='english',
            analyzer='word',
            ngram_range=(1,4)
        )
    X = vectorizer.fit_transform(df.Tweet.values.astype('U')) # convert object type to unicode
    X = X.tocsc()  # some versions of sklearn return COO format
    y = (df.Class == 1).values.astype(np.int)
    return X, y

# vectorize before train/test split
X, y = make_xy4(df)

# split dataset into a training and test set
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=1)

BNB = MultinomialNB().fit(xtrain, ytrain)
BNB_train_pred = BNB.predict(xtrain)
BNB_test_pred = BNB.predict(xtest)

print('\n N-gram range: 1-4 \n \n')
print('\n Training Classification Report: \n', classification_report(ytrain, BNB_train_pred, digits = 3, labels=[0,1]))
print('\n Test Classification Report: \n', classification_report(ytest, BNB_test_pred, digits = 3, labels=[0,1]))


 Bernoulli Naive Bayes baseline classifier: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0      0.999     0.988     0.993     19185
           1      0.961     0.995     0.978      5595

    accuracy                          0.990     24780
   macro avg      0.980     0.992     0.986     24780
weighted avg      0.990     0.990     0.990     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0      0.959     0.869     0.912      6391
           1      0.661     0.874     0.753      1870

    accuracy                          0.870      8261
   macro avg      0.810     0.871     0.832      8261
weighted avg      0.892     0.870     0.876      8261



In [108]:
# redefine Count vectorizer to use n-grams of one to five words

def make_xy4(df, vectorizer=None):   
    if vectorizer is None:
        vectorizer = CountVectorizer(
            stop_words='english',
            analyzer='word',
            ngram_range=(1,5)
        )
    X = vectorizer.fit_transform(df.Tweet.values.astype('U')) # convert object type to unicode
    X = X.tocsc()  # some versions of sklearn return COO format
    y = (df.Class == 1).values.astype(np.int)
    return X, y

# vectorize before train/test split
X, y = make_xy4(df)

# split dataset into a training and test set
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=1)

BNB = MultinomialNB().fit(xtrain, ytrain)
BNB_train_pred = BNB.predict(xtrain)
BNB_test_pred = BNB.predict(xtest)

print('\n N-gram range: 1-5 \n \n')
print('\n Training Classification Report: \n', classification_report(ytrain, BNB_train_pred, digits = 3, labels=[0,1]))
print('\n Test Classification Report: \n', classification_report(ytest, BNB_test_pred, digits = 3, labels=[0,1]))


 Bernoulli Naive Bayes baseline classifier: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0      0.999     0.993     0.996     19185
           1      0.977     0.996     0.986      5595

    accuracy                          0.994     24780
   macro avg      0.988     0.995     0.991     24780
weighted avg      0.994     0.994     0.994     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0      0.965     0.850     0.904      6391
           1      0.636     0.894     0.743      1870

    accuracy                          0.860      8261
   macro avg      0.800     0.872     0.823      8261
weighted avg      0.890     0.860     0.867      8261



In [103]:
# redefine Count vectorizer to use n-grams of two to five words

from sklearn.feature_extraction.text import CountVectorizer

def make_xy3(df, vectorizer=None):   
    if vectorizer is None:
        vectorizer = CountVectorizer(
            stop_words='english',
            analyzer='word',
            ngram_range=(2,5)
        )
    X = vectorizer.fit_transform(df.Tweet.values.astype('U')) # convert object type to unicode
    X = X.tocsc()  # some versions of sklearn return COO format
    y = (df.Class == 1).values.astype(np.int)
    return X, y

# vectorize before train/test split
X, y = make_xy3(df)

# split dataset into a training and test set
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=1)

MNB = MultinomialNB().fit(xtrain, ytrain)
BNB_train_pred = BNB.predict(xtrain)
BNB_test_pred = BNB.predict(xtest)

print('\n N-gram range: 2-5 \n \n')
print('\n Training Classification Report: \n', classification_report(ytrain, BNB_train_pred, digits = 3, labels=[0,1]))
print('\n Test Classification Report: \n', classification_report(ytest, BNB_test_pred, digits = 3, labels=[0,1]))


 Bernoulli Naive Bayes baseline classifier: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0      0.999     0.996     0.997     19185
           1      0.986     0.997     0.991      5595

    accuracy                          0.996     24780
   macro avg      0.992     0.996     0.994     24780
weighted avg      0.996     0.996     0.996     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0      0.969     0.376     0.542      6391
           1      0.310     0.959     0.469      1870

    accuracy                          0.508      8261
   macro avg      0.640     0.667     0.505      8261
weighted avg      0.820     0.508     0.525      8261



This latest multinomial Naive Bayes model that uses a Count Vectorizer and n-grams of range 2 through 5 achieves the highest recall rate yet. The most considerable alternative is a similar model that expands to use n-grams of size 1, which appears to help strike a much better balance between precision and recall. However, as stated earlier, I believe potential users of my model won't have nearly as big of a problem with frequent false positives than they would with only a few false negatives, so for my business problem I'm inclined to maximize recall rate and I thus choose the Multinomial Naive Bayes model using a Count Vectorizer and n-grams of range 2 through 5. While a low precision means I should expect frequent false positives, this model will make up for it by capturing the vast majority of newsworthy tweets.