# Modeling using TFIDFVectorizer and RandomForestClassifier & GridSearch
For each combination of transformers and predictors I am exploring different combinations of data.
- All Comments & Submissions
- Comments & Submission with Reddit Scores >10
- Comments only
- Submissions only

In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.compose import ColumnTransformer

import nltk
from nltk.corpus import stopwords

from my_tools.pipes_grids import *


In [2]:
sk_stop = set(stop_words.ENGLISH_STOP_WORDS)
nltk_stops = set(stopwords.words('english'))

In [3]:
try:
    df = pd.read_csv('datasets/data_submissions.csv')
except:
    df = pd.read_csv('https://www.dropbox.com/s/eh48hdw4af7tjc3/data_submissions.csv?dl=0')

In [3]:
df = pd.read_csv('datasets/df_sample.csv')

## Define the pipeline and gridsearch here to be used for this round of analysis.

In [4]:
ct = ColumnTransformer(
    [('tvec',TfidfVectorizer(), 'body')],
    remainder='passthrough',
    sparse_threshold=0.3,
    n_jobs=-1,
)

pipe = Pipeline([
    ('ct',ct),
    ('rfc', RandomForestClassifier(n_jobs=-1))
])

pipe_params = {
    'ct__tvec__ngram_range' : [(1,1),(1,2)],
    'ct__tvec__max_features' : [1100,1500,1750],
    'ct__tvec__min_df' : [.01, .025],
    'ct__tvec__max_df' : [.8,.9],
    'ct__tvec__stop_words' : [nltk_stops],
    'rfc__n_estimators' : [10],
    'rfc__min_samples_split' : [2,3],
    'rfc__min_samples_leaf' : [2,3,4],
    'rfc__max_features': ['sqrt','log2'],
    
}

gs = GridSearchCV(pipe, # what object are we optimizing?
                  pipe_params, # what parameters values are we searching?
                  cv=3,# 3-fold cross-validation.
                 n_jobs = -1) 

## Define functions to automate the model fitting, gridsearch & score outputs
see [`my_tools/pipes_grids.py`](my_tools/pipes_grids.py)

## Model Creation and Scoring

In [5]:
X = df[['body']]
y = df['is_2016']
model_base = model_go(X=X,
        y=y,
      gridsearch=gs)
model_base[0].best_params_

Best score: 0.7886558584452329
Training score: 0.8655409350159429
Test score: 0.7897064186725018
Cross val score: 0.7894866969095269
Baseline Accuracy: 0.5
Accuracy: 0.7897064186725018
Misclassification rate: 0.2102935813274982
Sensitivity & Recall: 0.7647702407002188
Specificity: 0.8146425966447848
Precision: 0.8049131561270512
Balanced Accuracy Score: 0.5794128373450036


{'ct__tvec__max_df': 0.8,
 'ct__tvec__max_features': 1500,
 'ct__tvec__min_df': 0.01,
 'ct__tvec__ngram_range': (1, 1),
 'ct__tvec__stop_words': {'a',
  'about',
  'above',
  'after',
  'again',
  'against',
  'ain',
  'all',
  'am',
  'an',
  'and',
  'any',
  'are',
  'aren',
  "aren't",
  'as',
  'at',
  'be',
  'because',
  'been',
  'before',
  'being',
  'below',
  'between',
  'both',
  'but',
  'by',
  'can',
  'couldn',
  "couldn't",
  'd',
  'did',
  'didn',
  "didn't",
  'do',
  'does',
  'doesn',
  "doesn't",
  'doing',
  'don',
  "don't",
  'down',
  'during',
  'each',
  'few',
  'for',
  'from',
  'further',
  'had',
  'hadn',
  "hadn't",
  'has',
  'hasn',
  "hasn't",
  'have',
  'haven',
  "haven't",
  'having',
  'he',
  'her',
  'here',
  'hers',
  'herself',
  'him',
  'himself',
  'his',
  'how',
  'i',
  'if',
  'in',
  'into',
  'is',
  'isn',
  "isn't",
  'it',
  "it's",
  'its',
  'itself',
  'just',
  'll',
  'm',
  'ma',
  'me',
  'mightn',
  "mightn't",
  'm

In [6]:
X = df[['body','vaderSentiment']]
y = df['is_2016']
model_vader = model_go(X=X,
        y=y,
        gridsearch=gs)
model_vader[0].best_params_

Best score: 0.7887681322135897
Training score: 0.8679435936587776
Test score: 0.7929886943836616
Cross val score: 0.7904750519596432
Baseline Accuracy: 0.5
Accuracy: 0.7929886943836616
Misclassification rate: 0.2070113056163384
Sensitivity & Recall: 0.7591174325309993
Specificity: 0.8268599562363238
Precision: 0.814278728606357
Balanced Accuracy Score: 0.585977388767323


{'ct__tvec__max_df': 0.8,
 'ct__tvec__max_features': 1500,
 'ct__tvec__min_df': 0.01,
 'ct__tvec__ngram_range': (1, 2),
 'ct__tvec__stop_words': {'a',
  'about',
  'above',
  'after',
  'again',
  'against',
  'ain',
  'all',
  'am',
  'an',
  'and',
  'any',
  'are',
  'aren',
  "aren't",
  'as',
  'at',
  'be',
  'because',
  'been',
  'before',
  'being',
  'below',
  'between',
  'both',
  'but',
  'by',
  'can',
  'couldn',
  "couldn't",
  'd',
  'did',
  'didn',
  "didn't",
  'do',
  'does',
  'doesn',
  "doesn't",
  'doing',
  'don',
  "don't",
  'down',
  'during',
  'each',
  'few',
  'for',
  'from',
  'further',
  'had',
  'hadn',
  "hadn't",
  'has',
  'hasn',
  "hasn't",
  'have',
  'haven',
  "haven't",
  'having',
  'he',
  'her',
  'here',
  'hers',
  'herself',
  'him',
  'himself',
  'his',
  'how',
  'i',
  'if',
  'in',
  'into',
  'is',
  'isn',
  "isn't",
  'it',
  "it's",
  'its',
  'itself',
  'just',
  'll',
  'm',
  'ma',
  'me',
  'mightn',
  "mightn't",
  'm

In [7]:
X = df[['body','senti_score']]
y = df['is_2016']
model_sentiscore = model_go(X=X,
        y=y,
        gridsearch=gs)
model_sentiscore[0].best_params_

Best score: 0.79159743117618
Training score: 0.8703013427942695
Test score: 0.7957695113056164
Cross val score: 0.7910583481759444
Baseline Accuracy: 0.5
Accuracy: 0.7957695113056164
Misclassification rate: 0.20423048869438365
Sensitivity & Recall: 0.7679613420860686
Specificity: 0.8235776805251641
Precision: 0.8131878741069705
Balanced Accuracy Score: 0.5915390226112327


{'ct__tvec__max_df': 0.9,
 'ct__tvec__max_features': 1500,
 'ct__tvec__min_df': 0.01,
 'ct__tvec__ngram_range': (1, 1),
 'ct__tvec__stop_words': {'a',
  'about',
  'above',
  'after',
  'again',
  'against',
  'ain',
  'all',
  'am',
  'an',
  'and',
  'any',
  'are',
  'aren',
  "aren't",
  'as',
  'at',
  'be',
  'because',
  'been',
  'before',
  'being',
  'below',
  'between',
  'both',
  'but',
  'by',
  'can',
  'couldn',
  "couldn't",
  'd',
  'did',
  'didn',
  "didn't",
  'do',
  'does',
  'doesn',
  "doesn't",
  'doing',
  'don',
  "don't",
  'down',
  'during',
  'each',
  'few',
  'for',
  'from',
  'further',
  'had',
  'hadn',
  "hadn't",
  'has',
  'hasn',
  "hasn't",
  'have',
  'haven',
  "haven't",
  'having',
  'he',
  'her',
  'here',
  'hers',
  'herself',
  'him',
  'himself',
  'his',
  'how',
  'i',
  'if',
  'in',
  'into',
  'is',
  'isn',
  "isn't",
  'it',
  "it's",
  'its',
  'itself',
  'just',
  'll',
  'm',
  'ma',
  'me',
  'mightn',
  "mightn't",
  'm