# CP1 TF-IDF notebook

## By Logan Larson

In [1]:
### import packages ###

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings("ignore")


### load data ###

# Read the cleaned tweets and drop the 'Unnamed: 0' column in one chained
# expression (presumably an index column that was saved with the CSV).
df = pd.read_csv('clean_schefter_tweets').drop(columns=['Unnamed: 0'])
df.head(3)

Unnamed: 0,Class,Tweet
0,0,rt rank best kicker history top maybe top 3. f...
1,0,april least one trade made (but one waiting
2,1,giant re-signed restricted free agent guard kevin


In [4]:
# define vectorizer
def make_xy(df, vectorizer=None):
    """Build a feature matrix and a binary label vector from the tweet frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``Tweet`` column (text) and a ``Class`` column.
    vectorizer : object with a ``fit_transform`` method, optional
        Vectorizer fitted on the tweets. When omitted, a unigram
        ``TfidfVectorizer`` with English stop-word removal is used.

    Returns
    -------
    X : scipy sparse matrix in CSC format
        One row per tweet, as produced by the vectorizer.
    y : numpy.ndarray of int
        1 where ``Class == 1`` and 0 otherwise.
    """
    if vectorizer is None:
        vectorizer = TfidfVectorizer(
            stop_words='english',
            analyzer='word',
            ngram_range=(1, 1)
        )
    X = vectorizer.fit_transform(df.Tweet.values.astype('U'))  # convert object type to unicode
    X = X.tocsc()  # some versions of sklearn return COO format
    # Builtin int instead of the np.int alias, which was removed in NumPy 1.24.
    y = (df.Class == 1).values.astype(int)
    return X, y


# define cross-validation score
def cv_score(clf, X, y, scorefunc, nfold=5):
    """Average a score function over K-fold cross-validation.

    Parameters
    ----------
    clf : estimator with a ``fit`` method
        Refit on the training part of every fold (so it is mutated).
    X, y : indexable features and labels
        Indexed with the integer index arrays yielded by ``KFold.split``.
    scorefunc : callable
        Called as ``scorefunc(clf, X_heldout, y_heldout)``; its return
        values are summed across folds.
    nfold : int, default 5
        Number of folds. The default reproduces the previously hard-coded
        value, so existing four-argument calls behave as before.

    Returns
    -------
    The sum of the per-fold scores divided by ``nfold``.
    """
    total = 0.
    for train, test in KFold(nfold).split(X):  # one (train, test) index pair per fold
        clf.fit(X[train], y[train])  # fit on the fold's training rows
        total += scorefunc(clf, X[test], y[test])  # score on the held-out rows
    return total / nfold  # average over folds


# define log-likelihood score function
def log_likelihood(clf, x, y):
    """Sum the log-probabilities the classifier assigns to the true labels.

    Rows whose label equals 0 contribute column 0 of
    ``clf.predict_log_proba(x)``; every other row contributes column 1.
    """
    log_probs = clf.predict_log_proba(x)
    is_positive = y != 0
    negative_total = log_probs[~is_positive, 0].sum()
    positive_total = log_probs[is_positive, 1].sum()
    return negative_total + positive_total

In [5]:
# Vectorize the whole corpus with make_xy's default (unigram TF-IDF) settings.
# NOTE(review): the vectorizer is fit on every row *before* the split below, so
# its vocabulary and IDF weights are also computed from rows that end up in the
# test set. Consider fitting on the training rows only and transforming the rest.
X, y = make_xy(df)

# Split features and labels into a training and a test set; the fixed
# random_state keeps the partition the same on every run.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=1)

## Show train/test classification for both models

#### 1. Naive Bayes model


In [6]:
# Baseline Bernoulli Naive Bayes with default parameters, fit on the training split.
NB = BernoulliNB()
NB.fit(xtrain, ytrain)

# Predict on both splits so training and test metrics can be compared.
NBtrain_pred, NBtest_pred = NB.predict(xtrain), NB.predict(xtest)

print('\n Naive Bayes classifier: \n \n')
for heading, truth, predicted in (
    ('\n Training Classification Report: \n', ytrain, NBtrain_pred),
    ('\n Test Classification Report: \n', ytest, NBtest_pred),
):
    print(heading, classification_report(truth, predicted, labels=[0, 1]))


 Naive Bayes classifier: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0       0.96      0.90      0.93     19185
           1       0.72      0.87      0.79      5595

    accuracy                           0.89     24780
   macro avg       0.84      0.89      0.86     24780
weighted avg       0.91      0.89      0.90     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0       0.95      0.89      0.92      6391
           1       0.70      0.85      0.76      1870

    accuracy                           0.88      8261
   macro avg       0.82      0.87      0.84      8261
weighted avg       0.89      0.88      0.89      8261



#### 2. Logistic Regression model

In [7]:
# Baseline logistic regression with default parameters, fit on the training split.
LR = LogisticRegression()
LR.fit(xtrain, ytrain)

# Predictions for both splits, used by the two reports below.
LR_train_pred = LR.predict(xtrain)
LR_test_pred = LR.predict(xtest)

print('\n Logistic Regression classifier: \n \n')
for heading, truth, predicted in (
    ('\n Training Classification Report: \n', ytrain, LR_train_pred),
    ('\n Test Classification Report: \n', ytest, LR_test_pred),
):
    print(heading, classification_report(truth, predicted, labels=[0, 1]))


 Logistic Regression classifier: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0       0.93      0.97      0.95     19185
           1       0.89      0.75      0.82      5595

    accuracy                           0.92     24780
   macro avg       0.91      0.86      0.88     24780
weighted avg       0.92      0.92      0.92     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0       0.92      0.97      0.94      6391
           1       0.86      0.71      0.78      1870

    accuracy                           0.91      8261
   macro avg       0.89      0.84      0.86      8261
weighted avg       0.91      0.91      0.91      8261



## Determine if there is overfitting

In the baseline notebook, I determined precision to be the metric of focus. While the Naive Bayes classifier is still overfit in these terms, it is "less overfit" thanks to higher precision on positive observations on both the training and test data. The logistic regression model, on the other hand, performed similarly and is also "less overfit"; in addition, its precision on positive observations on the test data improved. Since the Naive Bayes classifier is the only model that is overfit in terms of precision, I will build a regularized model for it only and not for the logistic regression classifier.

## Build regularized models

#### 1. Naive Bayes

In [12]:
# define the grid of parameters to search over
alphas = [.01, .1, 1, 5, 10, 50, 100]
min_df = 0.001

# Pick the subset of rows the search is cross-validated on: the second part
# returned by train_test_split. The seed makes the selected rows, and therefore
# the chosen alpha, reproducible across kernel restarts.
_, itest = train_test_split(range(df.shape[0]), random_state=1)
# Builtin bool instead of the np.bool alias, which was removed in NumPy 1.24.
mask = np.zeros(df.shape[0], dtype=bool)
mask[itest] = True

# The features do not depend on alpha, so vectorize once instead of once per
# grid value.
# NOTE(review): this vectorizer (min_df only, no stop-word removal) differs from
# make_xy's default, which produced the xtrain/xtest the tuned model is fit on
# later -- confirm that tuning on different features is intended.
vectorizer = TfidfVectorizer(min_df=min_df)
Xthis, ythis = make_xy(df, vectorizer)
Xtrainthis = Xthis[mask]
ytrainthis = ythis[mask]

# find the best value for alpha: keep the one with the highest mean
# cross-validated log-likelihood
best_alpha = None
maxscore = -np.inf
for alpha in alphas:
    clf = BernoulliNB(alpha=alpha)

    cvscore = cv_score(clf, Xtrainthis, ytrainthis, log_likelihood)

    if cvscore > maxscore:
        maxscore = cvscore
        best_alpha = alpha

print('Optimal alpha: {}'.format(best_alpha))

Optimal alpha: 5


In [13]:
# Refit Bernoulli Naive Bayes on the training split using the alpha selected above.
NB_tuned = BernoulliNB(alpha=best_alpha)
NB_tuned.fit(xtrain, ytrain)

# Predictions for both splits, used by the two reports below.
NB_tuned_train_pred, NB_tuned_test_pred = NB_tuned.predict(xtrain), NB_tuned.predict(xtest)

print('\n Tuned Naive Bayes classifier: \n \n')
for heading, truth, predicted in (
    ('\n Training Classification Report: \n', ytrain, NB_tuned_train_pred),
    ('\n Test Classification Report: \n', ytest, NB_tuned_test_pred),
):
    print(heading, classification_report(truth, predicted, labels=[0, 1]))


 Tuned Naive Bayes classifier: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.92      0.93     19185
           1       0.75      0.78      0.77      5595

    accuracy                           0.89     24780
   macro avg       0.84      0.85      0.85     24780
weighted avg       0.89      0.89      0.89     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0       0.93      0.92      0.93      6391
           1       0.74      0.76      0.75      1870

    accuracy                           0.89      8261
   macro avg       0.83      0.84      0.84      8261
weighted avg       0.89      0.89      0.89      8261



#### Compared to non-regularized model

This performs better than the non-regularized model, but it still does not match the logistic regression model. I'll proceed with another logistic regression model, this time using n-grams longer than a single word (1- to 4-grams).

In [14]:
# redefine vectorizer
def make_xy2(df, vectorizer=None):
    """Build a feature matrix and a binary label vector using 1- to 4-grams.

    Same contract as ``make_xy``; only the default vectorizer differs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``Tweet`` column (text) and a ``Class`` column.
    vectorizer : object with a ``fit_transform`` method, optional
        Vectorizer fitted on the tweets. When omitted, a ``TfidfVectorizer``
        over word n-grams of length 1 to 4 with English stop-word removal
        is used.

    Returns
    -------
    X : scipy sparse matrix in CSC format
        One row per tweet, as produced by the vectorizer.
    y : numpy.ndarray of int
        1 where ``Class == 1`` and 0 otherwise.
    """
    if vectorizer is None:
        vectorizer = TfidfVectorizer(
            stop_words='english',
            analyzer='word',
            ngram_range=(1, 4)
        )
    X = vectorizer.fit_transform(df.Tweet.values.astype('U'))  # convert object type to unicode
    X = X.tocsc()  # some versions of sklearn return COO format
    # Builtin int instead of the np.int alias, which was removed in NumPy 1.24.
    y = (df.Class == 1).values.astype(int)
    return X, y

In [15]:
# Re-vectorize the whole corpus with make_xy2's default (1- to 4-gram TF-IDF).
# This rebinds X and y from the earlier unigram cell.
# NOTE(review): the vectorizer is again fit on every row before the split, so
# test rows contribute to the vocabulary and IDF weights.
X, y = make_xy2(df)

# Split features and labels into a training and a test set with the same
# random_state as before; this rebinds xtrain/xtest/ytrain/ytest.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=1)

In [16]:
# Logistic regression with default parameters on the n-gram features; this
# rebinds LR and its prediction arrays from the earlier unigram cell.
LR = LogisticRegression()
LR.fit(xtrain, ytrain)
LR_train_pred, LR_test_pred = LR.predict(xtrain), LR.predict(xtest)

print('\n Logistic Regression classifier: \n \n')
for heading, truth, predicted in (
    ('\n Training Classification Report: \n', ytrain, LR_train_pred),
    ('\n Test Classification Report: \n', ytest, LR_test_pred),
):
    print(heading, classification_report(truth, predicted, labels=[0, 1]))


 Logistic Regression classifier: 
 


 Training Classification Report: 
               precision    recall  f1-score   support

           0       0.92      0.99      0.95     19185
           1       0.97      0.70      0.81      5595

    accuracy                           0.93     24780
   macro avg       0.94      0.85      0.88     24780
weighted avg       0.93      0.93      0.92     24780


 Test Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.98      0.93      6391
           1       0.91      0.53      0.67      1870

    accuracy                           0.88      8261
   macro avg       0.89      0.76      0.80      8261
weighted avg       0.89      0.88      0.87      8261



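In terms of precision, this model performs better on positive cases than on negative ones on the test set (0.91 vs. 0.88) and achieves the highest positive-class precision yet. Note, however, that its recall on positive cases drops to 0.53 on the test set.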