# Modeling using TFIDF on `'body'`, additional features, Bagging with Decision Tree & GridSearch

In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.compose import ColumnTransformer
from tokenizer.tokenizer import RedditTokenizer
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score

import nltk
from nltk import word_tokenize  
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer

import regex
import time

from my_tools.pipes_grids import *

In [2]:
nltk_stops = set(stopwords.words('english'))

In [None]:
try:
    df = pd.read_csv('datasets/data_submissions.csv')
except:
    df = pd.read_csv('https://www.dropbox.com/s/eh48hdw4af7tjc3/data_submissions.csv?dl=0')

In [3]:
df = pd.read_csv('datasets/df_sample.csv')

## Define the pipeline and gridsearch here to be used for this round of analysis.

In [4]:
ct = ColumnTransformer(
    [('tvec',TfidfVectorizer(), 'body')],
    remainder='passthrough',
    sparse_threshold=0.3,
    n_jobs=-1,
)

pipe = Pipeline([
    ('ct',ct),
    ('bagclass',  BaggingClassifier(n_jobs=-1))
])

pipe_params = {
    'ct__tvec__tokenizer' : [RedditIt()],
    'ct__tvec__ngram_range' : [(1,1),(1,2)],
    'ct__tvec__max_features' : [None],
    'ct__tvec__min_df' : [.02,.05],
    'ct__tvec__max_df' : [.9,.85],
    'ct__tvec__stop_words' : [nltk_stops],
    'bagclass__n_estimators' : [10],

    

    
    
    
}


gs = GridSearchCV(pipe, # what object are we optimizing?
                  pipe_params, # what parameters values are we searching?
                  cv=3,# 3-fold cross-validation.
                 n_jobs = -1) 

## Define functions to automate the model fitting, gridsearch & score outputs
see [`my_tools/pipes_grids.py`](my_tools/pipes_grids.py)

## Model Creation and Scoring

### All Comments and Submission

In [5]:
X = df[['body']]
y = df['is_2016']
model_base = model_go(X=X,
        y=y,
      gridsearch=gs)
model_base[0].best_params_

Best score: 0.7469798356312031
Training score: 0.9892666277450937
Test score: 0.7518234865061999
Cross val score: 0.7498318515259561
Baseline Accuracy: 0.5
Accuracy: 0.7518234865061999
Misclassification rate: 0.2481765134938001
Sensitivity & Recall: 0.6734135667396062
Specificity: 0.8302334062727936
Precision: 0.798659169550173
Balanced Accuracy Score: 0.5036469730123998


{'bagclass__n_estimators': 10,
 'ct__tvec__max_df': 0.9,
 'ct__tvec__max_features': None,
 'ct__tvec__min_df': 0.02,
 'ct__tvec__ngram_range': (1, 2),
 'ct__tvec__stop_words': {'a',
  'about',
  'above',
  'after',
  'again',
  'against',
  'ain',
  'all',
  'am',
  'an',
  'and',
  'any',
  'are',
  'aren',
  "aren't",
  'as',
  'at',
  'be',
  'because',
  'been',
  'before',
  'being',
  'below',
  'between',
  'both',
  'but',
  'by',
  'can',
  'couldn',
  "couldn't",
  'd',
  'did',
  'didn',
  "didn't",
  'do',
  'does',
  'doesn',
  "doesn't",
  'doing',
  'don',
  "don't",
  'down',
  'during',
  'each',
  'few',
  'for',
  'from',
  'further',
  'had',
  'hadn',
  "hadn't",
  'has',
  'hasn',
  "hasn't",
  'have',
  'haven',
  "haven't",
  'having',
  'he',
  'her',
  'here',
  'hers',
  'herself',
  'him',
  'himself',
  'his',
  'how',
  'i',
  'if',
  'in',
  'into',
  'is',
  'isn',
  "isn't",
  'it',
  "it's",
  'its',
  'itself',
  'just',
  'll',
  'm',
  'ma',
  'me',

In [6]:
X = df[['body','vaderSentiment']]
y = df['is_2016']
model_vader = model_go(X=X,
        y=y,
        gridsearch=gs)
model_vader[0].best_params_

Best score: 0.7496070418107513
Training score: 0.9886828041496385
Test score: 0.7529631655725748
Cross val score: 0.7473167890024546
Baseline Accuracy: 0.5
Accuracy: 0.7529631655725748
Misclassification rate: 0.2470368344274252
Sensitivity & Recall: 0.6741429613420861
Specificity: 0.8317833698030634
Precision: 0.8003030631020673
Balanced Accuracy Score: 0.5059263311451496


{'bagclass__n_estimators': 10,
 'ct__tvec__max_df': 0.9,
 'ct__tvec__max_features': None,
 'ct__tvec__min_df': 0.02,
 'ct__tvec__ngram_range': (1, 2),
 'ct__tvec__stop_words': {'a',
  'about',
  'above',
  'after',
  'again',
  'against',
  'ain',
  'all',
  'am',
  'an',
  'and',
  'any',
  'are',
  'aren',
  "aren't",
  'as',
  'at',
  'be',
  'because',
  'been',
  'before',
  'being',
  'below',
  'between',
  'both',
  'but',
  'by',
  'can',
  'couldn',
  "couldn't",
  'd',
  'did',
  'didn',
  "didn't",
  'do',
  'does',
  'doesn',
  "doesn't",
  'doing',
  'don',
  "don't",
  'down',
  'during',
  'each',
  'few',
  'for',
  'from',
  'further',
  'had',
  'hadn',
  "hadn't",
  'has',
  'hasn',
  "hasn't",
  'have',
  'haven',
  "haven't",
  'having',
  'he',
  'her',
  'here',
  'hers',
  'herself',
  'him',
  'himself',
  'his',
  'how',
  'i',
  'if',
  'in',
  'into',
  'is',
  'isn',
  "isn't",
  'it',
  "it's",
  'its',
  'itself',
  'just',
  'll',
  'm',
  'ma',
  'me',

In [None]:
X = df[['body','senti_score']]
y = df['is_2016']
model_sentiscore = model_go(X=X,
        y=y,
        gridsearch=gs)
model_sentiscore[0].best_params_