In [1]:
## Import libraries
import pandas as pd
import numpy as np
import re
import itertools
from IPython.display import clear_output
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer, PorterStemmer, LancasterStemmer
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import confusion_matrix, accuracy_score
## Display options: show every column and never truncate the text inside a cell.
pd.options.display.max_columns = None
## None means "no limit". The old -1 sentinel was deprecated in pandas 1.0 and is
## no longer accepted by recent pandas releases.
pd.set_option('display.max_colwidth', None)

In [2]:
## Load the prepared comments DataFrame; the cells below read its
## 'comments_text_lemma' (features) and 'target' (label) columns.
## NOTE(review): read_pickle can execute arbitrary code while unpickling --
## only load pickle files that come from a trusted source.
df = pd.read_pickle('../data/df_comments.pkl')

## Setting Test and Fit Values

In [4]:
## Use a fixed absolute row count (25% of the data, truncated to an int) as the test size,
## so that every train/test split in this notebook holds out the same number of rows
test_size = int(0.25 * len(df))

In [5]:
## Full corpus: lemmatized comment text as the single feature, 'target' as the label
X = df['comments_text_lemma']
y = df['target']

## Hold out `test_size` rows; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=42,
)

In [6]:
## Held-out comments next to their true labels; each model's predictions
## are appended to this frame as extra columns further down
df_test = pd.DataFrame({'comments': X_test, 'target': y_test})

## Model 1: Logistic Regression Standard TfidfVectorizer
Default: 0.70 CV Score

New Best: 0.74 CV Score

In [7]:
## Two-step pipeline: TF-IDF features feeding a logistic regression.
## The step names ('tvec', 'logreg') are the prefixes used in the parameter grid.
pipe = Pipeline(steps=[
    ('tvec', TfidfVectorizer()),
    ('logreg', LogisticRegression()),
])

In [8]:
#### This block (and every later one like it) runs a GridSearchCV on a model that was
#### identified as having a strong CrossVal Score in the EDA stage.
params = dict(
    tvec__ngram_range=[(1, 1), (1, 3)],
    logreg__solver=['liblinear'],
    logreg__C=[500, 600, 800],
)
gs = GridSearchCV(estimator=pipe, param_grid=params, cv=3, n_jobs=5)
gs.fit(X_train, y_train)

## Report: best inner-CV score and parameters, train/test accuracy of the refit model,
## and a 5-fold cross-validation of the whole grid search on the full X, y
print(f'Best score: {round(gs.best_score_, 4)}')
print(f'Best parameters: {gs.best_params_}')
print(f'GS Train Score: {round(gs.score(X_train, y_train), 4)}')
print(f'GS Test Score: {round(gs.score(X_test, y_test), 4)}')
print(f'Cross Val Score: {round(cross_val_score(gs, X, y, cv=5).mean(), 4)}')

## Confusion matrix on the held-out rows (rows = actual class, columns = predicted class)
predictions = gs.predict(X_test)
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm,
    index=['Actual Neg (Lib)', 'Actual Pos (Cons)'],
    columns=['Pred Neg (Lib)', 'Pred Pos (Cons)'],
)
print('', cm_df, '', sep='\n')

## Keep this model's test predictions for the model-combination step
df_test['logreg_predictions'] = predictions

Best score: 0.7589
Best parameters: {'logreg__C': 600, 'logreg__solver': 'liblinear', 'tvec__ngram_range': (1, 3)}
GS Train Score: 1.0
GS Test Score: 0.6875
Cross Val Score: 0.7346

                   Pred Neg (Lib)  Pred Pos (Cons)
Actual Neg (Lib)   63              54             
Actual Pos (Cons)  31              124            



## Model 2: Bagged Multinomial Naive Bayes with CountVectorizer
Default: 0.71 CV Score

New Best: 0.71 CV Score

In [9]:
## Bag-of-words counts feeding a bagging ensemble of Multinomial Naive Bayes models.
## Bagging draws random subsamples, so the ensemble is seeded to make the grid-search
## scores and the stored predictions repeatable from run to run.
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('bag', BaggingClassifier(base_estimator=MultinomialNB(), random_state=42))
])

In [10]:
## Grid: n-gram range for the count vectorizer plus the bagging / Naive Bayes settings
params = dict(
    cvec__ngram_range=[(1, 1), (1, 3), (1, 5)],
    bag__base_estimator__alpha=[0.1, 1.0],  ## Iterating through the base estimate (MultinomialNB)
    bag__max_samples=[.5, .8, .95],
)
gs = GridSearchCV(estimator=pipe, param_grid=params, cv=3, n_jobs=5)
gs.fit(X_train, y_train)

## Report: best inner-CV score and parameters, train/test accuracy of the refit model,
## and a 5-fold cross-validation of the whole grid search on the full X, y
print(f'Best score: {round(gs.best_score_, 4)}')
print(f'Best parameters: {gs.best_params_}')
print(f'GS Train Score: {round(gs.score(X_train, y_train), 4)}')
print(f'GS Test Score: {round(gs.score(X_test, y_test), 4)}')
print(f'Cross Val Score: {round(cross_val_score(gs, X, y, cv=5).mean(), 4)}')

## Confusion matrix on the held-out rows (rows = actual class, columns = predicted class)
predictions = gs.predict(X_test)
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm,
    index=['Actual Neg (Lib)', 'Actual Pos (Cons)'],
    columns=['Pred Neg (Lib)', 'Pred Pos (Cons)'],
)
print('', cm_df, '', sep='\n')

## Keep this model's test predictions for the model-combination step
df_test['nb_predictions'] = predictions

Best score: 0.7234
Best parameters: {'bag__base_estimator__alpha': 0.1, 'bag__max_samples': 0.8, 'cvec__ngram_range': (1, 5)}
GS Train Score: 0.9706
GS Test Score: 0.6912
Cross Val Score: 0.7108

                   Pred Neg (Lib)  Pred Pos (Cons)
Actual Neg (Lib)   74              43             
Actual Pos (Cons)  41              114            



## Gradient Boosting Classifier Standard TfidfVectorizer Model
Default: 0.71 CV Score

New Best: 0.70 Best Score

NOTES: Not sure why I can't replicate the 0.71 CV Score. Likely has to do with the train/test split.

In [11]:
## TF-IDF features feeding a gradient boosting classifier.
## The classifier is seeded so that repeated runs give the same scores; an unseeded
## estimator is one possible source of run-to-run score differences.
pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('gb', GradientBoostingClassifier(random_state=42))
])

In [12]:
## Empty grid: GridSearchCV evaluates only the pipeline's current settings
## (hence "Best parameters: {}"). Options that are currently disabled:
#     'tvec__ngram_range' : [(1,1)],
#     'tvec__max_df' : [.8, .9, 1.0],
#     'gb__n_estimators' : [40, 50],
#     'gb__max_depth' : [3],
#     'gb__max_features' : [0.9, 1.0],
params = dict()

gs = GridSearchCV(estimator=pipe, param_grid=params, cv=3, n_jobs=5)
gs.fit(X_train, y_train)

## Report: best inner-CV score and parameters, train/test accuracy of the refit model,
## and a 5-fold cross-validation of the whole grid search on the full X, y
print(f'Best score: {round(gs.best_score_, 4)}')
print(f'Best parameters: {gs.best_params_}')
print(f'GS Train Score: {round(gs.score(X_train, y_train), 4)}')
print(f'GS Test Score: {round(gs.score(X_test, y_test), 4)}')
print(f'Cross Val Score: {round(cross_val_score(gs, X, y, cv=5).mean(), 4)}')

## Confusion matrix on the held-out rows (rows = actual class, columns = predicted class)
predictions = gs.predict(X_test)
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm,
    index=['Actual Neg (Lib)', 'Actual Pos (Cons)'],
    columns=['Pred Neg (Lib)', 'Pred Pos (Cons)'],
)
print('', cm_df, '', sep='\n')

## Keep this model's test predictions alongside the others in df_test
df_test['gb_predictions'] = predictions

Best score: 0.705
Best parameters: {}
GS Train Score: 0.9192
GS Test Score: 0.6801
Cross Val Score: 0.697

                   Pred Neg (Lib)  Pred Pos (Cons)
Actual Neg (Lib)   54              63             
Actual Pos (Cons)  24              131            



## Logistic Regression TfidfVectorizer (LSA)
Default: 0.71 CV Score

New Best: 0.71 CV Score

In [13]:
## Latent Semantic Analysis features: a TF-IDF term matrix reduced with TruncatedSVD,
## then split with the same test size and seed as the text split above.
## NOTE(review): the vectorizer and the SVD are both fit on the full corpus *before* the
## train/test split, so the held-out rows influence the features. Consider moving both
## steps into the Pipeline so they are fit on training folds only.
tvec = TfidfVectorizer(stop_words='english', min_df=5, max_df=.8)

term_mat = tvec.fit_transform(df['comments_text_lemma'])
## get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when
## the installed version provides it and fall back to the old name otherwise
feature_names = (tvec.get_feature_names_out() if hasattr(tvec, 'get_feature_names_out')
                 else tvec.get_feature_names())
term_df = pd.DataFrame(term_mat.toarray(), columns=feature_names)

## Single source of truth for the number of SVD components (also used for the column names)
n_components = 100

## Seeded so the randomized SVD gives the same components on every run; it is fit on the
## sparse matrix directly, which avoids decomposing the dense copy held in term_df
SVD = TruncatedSVD(n_components=n_components, random_state=42)
svd_matrix = SVD.fit_transform(term_mat)

component_names = ["component_"+str(i+1) for i in range(n_components)]
svd_df = pd.DataFrame(svd_matrix,
                      columns=component_names)

## Same test size and seed as the text split
X_train, X_test, y_train, y_test = train_test_split(svd_df,
                                                    df['target'],
                                                    test_size = test_size,
                                                    random_state=42)

X = svd_df
y = df['target']

In [14]:
## Single-step pipeline: logistic regression on the precomputed SVD components.
## The step name 'logreg' is the prefix used in the parameter grid.
pipe = Pipeline(steps=[
    ('logreg', LogisticRegression()),
])

In [15]:
## Grid: penalty type and intercept on/off for a liblinear logistic regression at C=1.0
params = dict(
    logreg__solver=['liblinear'],
    logreg__C=[1.0],
    logreg__fit_intercept=[True, False],
    logreg__penalty=['l1', 'l2'],
)
gs = GridSearchCV(estimator=pipe, param_grid=params, cv=3, n_jobs=5)
gs.fit(X_train, y_train)

## Report: best inner-CV score and parameters, train/test accuracy of the refit model,
## and a 5-fold cross-validation of the whole grid search on the full X, y
print(f'Best score: {round(gs.best_score_, 4)}')
print(f'Best parameters: {gs.best_params_}')
print(f'GS Train Score: {round(gs.score(X_train, y_train), 4)}')
print(f'GS Test Score: {round(gs.score(X_test, y_test), 4)}')
print(f'Cross Val Score: {round(cross_val_score(gs, X, y, cv=5).mean(), 4)}')

## Confusion matrix on the held-out rows (rows = actual class, columns = predicted class)
predictions = gs.predict(X_test)
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm,
    index=['Actual Neg (Lib)', 'Actual Pos (Cons)'],
    columns=['Pred Neg (Lib)', 'Pred Pos (Cons)'],
)
print('', cm_df, '', sep='\n')

## Keep this model's test predictions for the model-combination step
df_test['lsa_logreg_predictions'] = predictions

Best score: 0.7246
Best parameters: {'logreg__C': 1.0, 'logreg__fit_intercept': False, 'logreg__penalty': 'l2', 'logreg__solver': 'liblinear'}
GS Train Score: 0.7772
GS Test Score: 0.6949
Cross Val Score: 0.7089

                   Pred Neg (Lib)  Pred Pos (Cons)
Actual Neg (Lib)   61              56             
Actual Pos (Cons)  27              128            



## Model Combination

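Notes: Well, I learned a lot, but combining the models by majority vote was only marginally useful: the combined test accuracy does not beat the best individual model.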

In [24]:
## Per-comment sum of the three selected models' predictions (gb_predictions is not included)
df_test['combo'] = (
    df_test['logreg_predictions']
    .add(df_test['nb_predictions'])
    .add(df_test['lsa_logreg_predictions'])
)

In [28]:
## Majority vote: predict 1 when the summed predictions in 'combo' reach 2 (i.e. at least
## two of the three models predicted 1), otherwise 0.
## The vectorized comparison replaces a per-row loop that used chained label indexing
## (df_test['combo'][i]), which is slow and breaks when the index is not unique.
combo_preds = (df_test['combo'] >= 2).astype(int).tolist()
df_test['combo_preds'] = combo_preds

In [29]:
## Test-set accuracy of each individual model and of the majority-vote combination
for label, column in (
    ('Logreg Score:', 'logreg_predictions'),
    ('Logreg LSA Score:', 'lsa_logreg_predictions'),
    ('NB Score:', 'nb_predictions'),
    ('Combo Score:', 'combo_preds'),
):
    print(label, accuracy_score(df_test['target'], df_test[column]))

Logreg Score: 0.6875
Logreg LSA Score: 0.6948529411764706
NB Score: 0.6911764705882353
Combo Score: 0.6911764705882353
