# Modeling using TFIDFVectorizer and Logistic Regression GridSearch
For each combination of transformers and predictors I am exploring different combinations of data.
- All Comments & Submissions
- Comments & Submission with Reddit Scores >10
- Comments only
- Submissions only

In [2]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.compose import ColumnTransformer

import nltk
from nltk.corpus import stopwords

from my_tools.pipes_grids import *
import warnings
warnings.filterwarnings("ignore")

In [3]:
sk_stop = set(stop_words.ENGLISH_STOP_WORDS)
nltk_stops = set(stopwords.words('english'))

In [12]:
try:
    df = pd.read_csv('datasets/data_submissions.csv')
except:
    df = pd.read_csv('https://www.dropbox.com/s/eh48hdw4af7tjc3/data_submissions.csv?dl=0')

In [4]:
df = pd.read_csv('datasets/df_sample.csv')

## Define the pipeline and gridsearch here to be used for this round of analysis.

In [5]:
ct = ColumnTransformer(
    [('tvec',TfidfVectorizer(), 'body')],
    remainder='passthrough',
    sparse_threshold=0.3,
    n_jobs=-1,
)

pipe = Pipeline([
    ('ct',ct),
    ('lr', LogisticRegression(solver = 'lbfgs'))
])

pipe_params = {
    'ct__tvec__tokenizer' : [LemmaTokenizer(),RedditIt()],
    'ct__tvec__ngram_range' : [(1,1),(1,2)],
    'ct__tvec__max_features' : [3000,6000,9000],
    'ct__tvec__min_df' : [2,3,10],
    'ct__tvec__max_df' : [.8,.9],
    'ct__tvec__stop_words' : [nltk_stops],
    'lr__penalty' : ['l2']

    
    
}

gs = GridSearchCV(pipe, # what object are we optimizing?
                  pipe_params, # what parameters values are we searching?
                  cv=3,# 3-fold cross-validation.
                 n_jobs = -1) 

## Define functions to automate the model fitting, gridsearch & score outputs
see [`my_tools/pipes_grids.py`](my_tools/pipes_grids.py)

## Model Creation and Scoring

### All Comments and Submission

In [6]:
X = df[['body']]
y = df['is_2016']
model_base = model_go(X=X,
        y=y,
      gridsearch=gs)
model_base[0].best_params_

Best score: 0.8511698926662774
Training score: 0.8932501010463916
Test score: 0.8577224653537564
Cross val score: 0.8535726634849379
Baseline Accuracy: 0.5
Accuracy: 0.8577224653537564
Misclassification rate: 0.1422775346462436
Sensitivity & Recall: 0.8418125455871627
Specificity: 0.8736323851203501
Precision: 0.8694792353328938
Balanced Accuracy Score: 0.7154449307075128


{'ct__tvec__max_df': 0.8,
 'ct__tvec__max_features': 9000,
 'ct__tvec__min_df': 3,
 'ct__tvec__ngram_range': (1, 1),
 'ct__tvec__stop_words': {'a',
  'about',
  'above',
  'after',
  'again',
  'against',
  'ain',
  'all',
  'am',
  'an',
  'and',
  'any',
  'are',
  'aren',
  "aren't",
  'as',
  'at',
  'be',
  'because',
  'been',
  'before',
  'being',
  'below',
  'between',
  'both',
  'but',
  'by',
  'can',
  'couldn',
  "couldn't",
  'd',
  'did',
  'didn',
  "didn't",
  'do',
  'does',
  'doesn',
  "doesn't",
  'doing',
  'don',
  "don't",
  'down',
  'during',
  'each',
  'few',
  'for',
  'from',
  'further',
  'had',
  'hadn',
  "hadn't",
  'has',
  'hasn',
  "hasn't",
  'have',
  'haven',
  "haven't",
  'having',
  'he',
  'her',
  'here',
  'hers',
  'herself',
  'him',
  'himself',
  'his',
  'how',
  'i',
  'if',
  'in',
  'into',
  'is',
  'isn',
  "isn't",
  'it',
  "it's",
  'its',
  'itself',
  'just',
  'll',
  'm',
  'ma',
  'me',
  'mightn',
  "mightn't",
  'more

In [None]:
X = df[['body','vaderSentiment']]
y = df['is_2016']
model_vader = model_go(X=X,
        y=y,
        gridsearch=gs)
model_vader[0].best_params_

In [None]:
X = df[['body','senti_score']]
y = df['is_2016']
model_sentiscore = model_go(X=X,
        y=y,
        gridsearch=gs)
model_sentiscore[0].best_params_