# Modeling using CountVectorizer and Logistic Regression GridSearch
For each combination of transformers and predictors I am exploring different combinations of data.
- All Comments & Submissions
- Comments & Submission with Reddit Scores >10
- Comments only
- Submissions only

In [27]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.compose import ColumnTransformer

import nltk
from nltk.corpus import stopwords

from my_tools.pipes_grids import *
import warnings
warnings.filterwarnings("ignore")

In [24]:
nltk_stops = set(stopwords.words('english'))

In [21]:
try:
    df = pd.read_csv('datasets/data_submissions.csv')
except:
    df = pd.read_csv('https://www.dropbox.com/s/eh48hdw4af7tjc3/data_submissions.csv?dl=0')

In [None]:
df_2020 = df[df['is_2016'] == 0]
df_2016_sample = df[df['is_2016'] == 1].sample(df[df['is_2016'] == 0].shape[0], random_state=737)
df_sample = pd.concat([df_2020,df_2016_sample])
df_sample.to_csv('datasets/df_sample.csv', index=False)

In [25]:
df = pd.read_csv('datasets/df_sample.csv')

## Define the pipeline and gridsearch here to be used for this round of analysis.

In [30]:
ct = ColumnTransformer(
    [('cvec', CountVectorizer(), 'body')],
    remainder='passthrough',
    sparse_threshold=0.3,
    n_jobs=-1,
)

pipe = Pipeline([
    ('ct',ct),
    ('lr', LogisticRegression(solver = 'lbfgs'))
])

pipe_params = {
    'ct__cvec__ngram_range' : [(1,1),(1,2)],
    'ct__cvec__max_features' : [3000,6000,9000],
    'ct__cvec__min_df' : [2,3,10],
    'ct__cvec__max_df' : [.8,.9],
    'ct__cvec__stop_words' : [nltk_stops],
    'lr__penalty' : ['l2']

    
    
}

gs = GridSearchCV(pipe, # what object are we optimizing?
                  pipe_params, # what parameters values are we searching?
                  cv=3,# 3-fold cross-validation.
                 n_jobs = -1) 

## Define functions to automate the model fitting, gridsearch & score outputs
see [`my_tools/pipes_grids.py`](my_tools/pipes_grids.py)

## Model Creation and Scoring

In [31]:
X = df[['body']]
y = df['is_2016']
model_base = model_go(X=X,
        y=y,
      gridsearch=gs)
model_base[0].best_params_

Best score: 0.8486998697624287
Training score: 0.9213634526429245
Test score: 0.8539387308533917
Cross val score: 0.8523821622048414
Baseline Accuracy: 0.5
Accuracy: 0.8539387308533917
Misclassification rate: 0.14606126914660833
Sensitivity & Recall: 0.8425419401896426
Specificity: 0.8653355215171408
Precision: 0.8621944392610562
Balanced Accuracy Score: 0.7078774617067833


{'ct__cvec__max_df': 0.8,
 'ct__cvec__max_features': 9000,
 'ct__cvec__min_df': 3,
 'ct__cvec__ngram_range': (1, 1),
 'ct__cvec__stop_words': {'a',
  'about',
  'above',
  'after',
  'again',
  'against',
  'ain',
  'all',
  'am',
  'an',
  'and',
  'any',
  'are',
  'aren',
  "aren't",
  'as',
  'at',
  'be',
  'because',
  'been',
  'before',
  'being',
  'below',
  'between',
  'both',
  'but',
  'by',
  'can',
  'couldn',
  "couldn't",
  'd',
  'did',
  'didn',
  "didn't",
  'do',
  'does',
  'doesn',
  "doesn't",
  'doing',
  'don',
  "don't",
  'down',
  'during',
  'each',
  'few',
  'for',
  'from',
  'further',
  'had',
  'hadn',
  "hadn't",
  'has',
  'hasn',
  "hasn't",
  'have',
  'haven',
  "haven't",
  'having',
  'he',
  'her',
  'here',
  'hers',
  'herself',
  'him',
  'himself',
  'his',
  'how',
  'i',
  'if',
  'in',
  'into',
  'is',
  'isn',
  "isn't",
  'it',
  "it's",
  'its',
  'itself',
  'just',
  'll',
  'm',
  'ma',
  'me',
  'mightn',
  "mightn't",
  'more

In [38]:
model_base[0].

<bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('ct',
                 ColumnTransformer(n_jobs=-1, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('cvec',
                                                  CountVectorizer(analyzer='word',
                                                                  binary=False,
                                                                  decode_error='strict',
                                                                  dtype=<class 'numpy.int64'>,
                                                                  encoding='utf-8',
                                                                  input='content',
                                                                  lowercase=True,
                                                                  max_df=0.

In [32]:
X = df[['body','vaderSentiment']]
y = df['is_2016']
model_vader = model_go(X=X,
        y=y,
        gridsearch=gs)
model_vader[0].best_params_

Best score: 0.8467013966856783
Training score: 0.920757174293798
Test score: 0.8525255288110868
Cross val score: 0.8487222984842638
Baseline Accuracy: 0.5
Accuracy: 0.8525255288110868
Misclassification rate: 0.14747447118891321
Sensitivity & Recall: 0.8337892049598833
Specificity: 0.8712618526622903
Precision: 0.8662498815951502
Balanced Accuracy Score: 0.7050510576221736


{'ct__cvec__max_df': 0.8,
 'ct__cvec__max_features': 9000,
 'ct__cvec__min_df': 2,
 'ct__cvec__ngram_range': (1, 1),
 'ct__cvec__stop_words': {'a',
  'about',
  'above',
  'after',
  'again',
  'against',
  'ain',
  'all',
  'am',
  'an',
  'and',
  'any',
  'are',
  'aren',
  "aren't",
  'as',
  'at',
  'be',
  'because',
  'been',
  'before',
  'being',
  'below',
  'between',
  'both',
  'but',
  'by',
  'can',
  'couldn',
  "couldn't",
  'd',
  'did',
  'didn',
  "didn't",
  'do',
  'does',
  'doesn',
  "doesn't",
  'doing',
  'don',
  "don't",
  'down',
  'during',
  'each',
  'few',
  'for',
  'from',
  'further',
  'had',
  'hadn',
  "hadn't",
  'has',
  'hasn',
  "hasn't",
  'have',
  'haven',
  "haven't",
  'having',
  'he',
  'her',
  'here',
  'hers',
  'herself',
  'him',
  'himself',
  'his',
  'how',
  'i',
  'if',
  'in',
  'into',
  'is',
  'isn',
  "isn't",
  'it',
  "it's",
  'its',
  'itself',
  'just',
  'll',
  'm',
  'ma',
  'me',
  'mightn',
  "mightn't",
  'more

In [33]:
X = df[['body','senti_score']]
y = df['is_2016']
model_sentiscore = model_go(X=X,
        y=y,
        gridsearch=gs)
model_sentiscore[0].best_params_

Best score: 0.8068666636726995
Training score: 0.8079220370952531
Test score: 0.7947210065645515
Cross val score: 0.8022633137541619
Baseline Accuracy: 0.5
Accuracy: 0.7947210065645515
Misclassification rate: 0.20527899343544853
Sensitivity & Recall: 0.6840809628008753
Specificity: 0.9053610503282276
Precision: 0.8784685634000703
Balanced Accuracy Score: 0.5894420131291027


{'ct__cvec__max_df': 0.8,
 'ct__cvec__max_features': 9000,
 'ct__cvec__min_df': 10,
 'ct__cvec__ngram_range': (1, 2),
 'ct__cvec__stop_words': {'a',
  'about',
  'above',
  'after',
  'again',
  'against',
  'ain',
  'all',
  'am',
  'an',
  'and',
  'any',
  'are',
  'aren',
  "aren't",
  'as',
  'at',
  'be',
  'because',
  'been',
  'before',
  'being',
  'below',
  'between',
  'both',
  'but',
  'by',
  'can',
  'couldn',
  "couldn't",
  'd',
  'did',
  'didn',
  "didn't",
  'do',
  'does',
  'doesn',
  "doesn't",
  'doing',
  'don',
  "don't",
  'down',
  'during',
  'each',
  'few',
  'for',
  'from',
  'further',
  'had',
  'hadn',
  "hadn't",
  'has',
  'hasn',
  "hasn't",
  'have',
  'haven',
  "haven't",
  'having',
  'he',
  'her',
  'here',
  'hers',
  'herself',
  'him',
  'himself',
  'his',
  'how',
  'i',
  'if',
  'in',
  'into',
  'is',
  'isn',
  "isn't",
  'it',
  "it's",
  'its',
  'itself',
  'just',
  'll',
  'm',
  'ma',
  'me',
  'mightn',
  "mightn't",
  'mor