# Toxic Comments Classification
### See the README file for additional details on the analysis process.

Disclaimer: The dataset displayed contains text that may be considered profane, vulgar, or offensive.

In [1]:
import string
import spacy
import pandas as pd
import numpy as np
from pprint import pprint
import pickle
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [2]:
# Forcing pandas to display all data (instead of cutting off columns & rows in the view)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
# Loading in training file
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

### Exploratory Data Analysis

In [4]:
# Checking the overall shape of the data
train_df.shape

(159571, 8)

In [5]:
# Looking at data to get a feel for it overall
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
# Looking at the columns & types
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
id               159571 non-null object
comment_text     159571 non-null object
toxic            159571 non-null int64
severe_toxic     159571 non-null int64
obscene          159571 non-null int64
threat           159571 non-null int64
insult           159571 non-null int64
identity_hate    159571 non-null int64
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [7]:
# Getting counts for target columns
train_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].apply(pd.Series.value_counts)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,144277,157976,151122,159093,151694,158166
1,15294,1595,8449,478,7877,1405


In [8]:
# Getting ratios for target columns
# Looks like there are strongly imbalanced classes - may need to address this when modeling
train_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]\
            .apply(pd.Series.value_counts)/train_df.shape[0]

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.904156,0.990004,0.947052,0.997004,0.950636,0.991195
1,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805


In [9]:
# Checking for correlations between target variables
# Looks like comments that fall under one category often fall under other categories
c = train_df.corr().abs()
s = c.unstack()
s.sort_values(ascending=False)

identity_hate  identity_hate    1.000000
insult         insult           1.000000
severe_toxic   severe_toxic     1.000000
obscene        obscene          1.000000
threat         threat           1.000000
toxic          toxic            1.000000
insult         obscene          0.741272
obscene        insult           0.741272
toxic          obscene          0.676515
obscene        toxic            0.676515
insult         toxic            0.647518
toxic          insult           0.647518
obscene        severe_toxic     0.403014
severe_toxic   obscene          0.403014
insult         severe_toxic     0.375807
severe_toxic   insult           0.375807
insult         identity_hate    0.337736
identity_hate  insult           0.337736
toxic          severe_toxic     0.308619
severe_toxic   toxic            0.308619
identity_hate  obscene          0.286867
obscene        identity_hate    0.286867
identity_hate  toxic            0.266009
toxic          identity_hate    0.266009
severe_toxic   i

In [10]:
# Looking at a few of the insult labeled comments to get a feel for this type of data
train_df[train_df['insult']==1].head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0
55,0020e7119b96eeeb,Stupid peace of shit stop deleting my stuff as...,1,1,1,0,1,0
56,0020fd96ed3b8c8b,=Tony Sidaway is obviously a fistfuckee. He lo...,1,0,1,0,1,0


In [11]:
# Looking at a few of the identity hate labeled comments to get a feel for this type of data
train_df[train_df['identity_hate']==1].head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
105,00472b8e2d38d1ea,A pair of jew-hating weiner nazi schmucks.,1,0,1,0,1,1
176,006b94add72ed61c,I think that your a Fagget get a oife and burn...,1,0,1,1,1,1
218,008e0818dde894fb,"Kill all niggers. \n\nI have hard, that others...",1,0,1,0,1,1
238,0097dd5c29bf7a15,u r a tw@ fuck off u gay boy.U r smelly.Fuck u...,1,0,1,0,1,1


### Natural Language Processing

In [12]:
# Loading in spacy
nlp = spacy.load('en')

In [13]:
# Taking a sample of train_df for faster testing
train_df = train_df.sample(n=10000, random_state=42)

In [14]:
# Setting up X and y
X_train = train_df['comment_text']
y_train = train_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
X_test = test_df['comment_text']

In [15]:
# Setting up stop words
stop_words = set(list(ENGLISH_STOP_WORDS) + ['wikipedia'])

In [16]:
def lematize(text):
    '''
    Cleans text and processes into tokens using spacy
    '''
    # Characters to keep - letters and spaces
    valid = string.ascii_letters + ' '

    # Replacing symbols with spaces and fixing other symbols
    cleaned_text = text.replace('@', ' ').replace(
        '\n', ' ').replace('&amp;', 'and').replace('&', 'and').replace(
        "’", "'")

    # Removing everything that is not a letter or a space and lower all text
    cleaned_text = ''.join([char.lower() for char in cleaned_text if char in valid])

    # Running the doc through spacy
    spacy_text = nlp(cleaned_text)

    # Getting tokens using spacy (leaving out any extra spaces)
    tokens = [token.lemma_ for token in spacy_text if not token.is_space]

    # Removing tokens that don't exist in spacy's vocabulary
    tokens = [token for token in tokens if token in nlp.vocab or token == '-PRON-']

    return tokens

### Testing Models

In [17]:
# Creating text processing & modeling pipeline for gridsearching hyper-parameters
grid_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words, tokenizer=lematize)),
    ('model', DecisionTreeClassifier()),
])

In [18]:
# Setting up parameter list for gridsearching
param_list = [
    {
        'tfidf__max_features': [150, 200]
    },
    {
        'model': [KNeighborsClassifier()],
        'model__n_neighbors': [5, 15]
    },
    {
        'model': [DecisionTreeClassifier()],
        'model__min_samples_split': [.5, 1.0],
        'model__max_depth': [5, 15]
    },
    {
        'model': [RandomForestClassifier()],
        'model__min_samples_split': [.5, 1.0],
        'model__max_depth': [5, 15]
    }
]

In [19]:
# Grid searching using the pipeline's parameters
g = GridSearchCV(grid_pipeline, param_list, cv=5, n_jobs=3,
                 verbose=10, scoring='f1_weighted')
g.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] tfidf__max_features=150 .........................................
[CV] tfidf__max_features=150 .........................................
[CV] tfidf__max_features=150 .........................................
[CV]  tfidf__max_features=150, score=0.29349531997663303, total= 1.4min
[CV] tfidf__max_features=150 .........................................
[CV]  tfidf__max_features=150, score=0.35027714232084456, total= 1.4min
[CV] tfidf__max_features=150 .........................................


[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:  2.4min


[CV]  tfidf__max_features=150, score=0.3079202324079883, total= 1.4min
[CV] tfidf__max_features=200 .........................................
[CV]  tfidf__max_features=150, score=0.38933316247355015, total= 1.2min
[CV] tfidf__max_features=200 .........................................
[CV]  tfidf__max_features=150, score=0.34905230668509835, total= 1.2min
[CV] tfidf__max_features=200 .........................................
[CV] . tfidf__max_features=200, score=0.406615596666596, total= 1.2min
[CV] tfidf__max_features=200 .........................................
[CV]  tfidf__max_features=200, score=0.2970473015207166, total= 1.4min
[CV] tfidf__max_features=200 .........................................


[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:  7.4min


[CV]  tfidf__max_features=200, score=0.39082358040801524, total= 1.4min
[CV] model=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'), model__n_neighbors=5 
[CV]  tfidf__max_features=200, score=0.4390734334939202, total= 1.4min
[CV] model=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'), model__n_neighbors=5 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  tfidf__max_features=200, score=0.40824108199490106, total= 1.4min
[CV] model=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'), model__n_neighbors=5 
[CV]  model=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'), model__n_neighbors=5, score=0.16534783463164107, total= 1.4min[CV]  model=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'), model__n_neighbors=5, score=0.17770391494603638, total= 1.4min

[CV] model=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'), model__n_neighbors=5 


[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed: 10.3min


[CV] model=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'), model__n_neighbors=5 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  model=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'), model__n_neighbors=5, score=0.17416312337850232, total= 1.8min
[CV] model=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'), model__n_neighbors=15 
[CV]  model=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'), model__n_neighbors=5, score=0.1485973714346385, total= 1.5min
[CV]  model=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'), model__n_neighbors=5, score=0.18950658998415443, total= 1.5min
[CV] model=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  model=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'), model__n_neighbors=15, score=0.07032413484610416, total= 1.4min
[CV] model=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'), model__n_neighbors=15 
[CV]  model=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'), model__n_neighbors=15, score=0.08296738767753259, total= 1.3min
[CV]  model=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'), model__n_neighbors=15, score=0.08578608695396257, total= 1.3min
[CV] model=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkow

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  model=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'), model__n_neighbors=15, score=0.09448986841307688, total= 1.3min
[CV] model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), model__max_depth=5, model__min_samples_split=0.5 


[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed: 17.6min


[CV]  model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), model__max_depth=5, model__min_samples_split=0.5, score=0.5105280617311602, total= 1.1min
[CV] model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), model__max_depth=5, model__min_samples_split=0.5 
[CV]  model=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbo

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), model__max_depth=5, model__min_samples_split=0.5, score=0.49085887410899515, total= 1.2min
[CV] model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), model__max_depth=5, model__min_samples_split=0.5 
[CV]  model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), model__max_depth=5, model__min_samples_split=0.5, score=0.5399225283257397, total= 1.2min
[CV] model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), model__max_depth=5, model__min_samples_split=1.0 
[CV]  model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=N

[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed: 22.2min


[CV]  model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), model__max_depth=5, model__min_samples_split=1.0, score=0.32670780370332997, total= 1.2min
[CV] model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), model__max_depth=5, model__min_samples_split=1.0 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), model__max_depth=5, model__min_samples_split=1.0, score=0.3408423634293707, total= 1.2min
[CV] model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), model__max_depth=15, model__min_samples_split=0.5 
[CV]  model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), model__max_depth=15, model__min_samples_split=0.5, score=0.5744181767155271, total= 1.4min
[CV]  model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), model__max_depth=15, model__min_samples_split=0.5, score=0.5881070152117103, total= 1.4min

[CV] model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
    

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), model__max_depth=15, model__min_samples_split=0.5, score=0.586597583872821, total= 1.5min
[CV]  model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), model__max_depth=15, model__min_samples_split=0.5, score=0.6115154968284486, total= 1.5min

[CV] model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
     

[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed: 29.8min


[CV]  model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), model__max_depth=15, model__min_samples_split=1.0, score=0.413455895345608, total= 1.7min
[CV] model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), model__max_depth=15, model__min_samples_split=1.0 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), model__max_depth=15, model__min_samples_split=1.0, score=0.32670780370332997, total= 1.5min
[CV] model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), model__max_depth=15, model__min_samples_split=1.0 
[CV]  model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_node

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  model=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), model__max_depth=15, model__min_samples_split=1.0, score=0.41605762260158985, total= 1.3min
[CV] model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__max_depth=5, model__min_samples_split=0.5 
[CV]  model=RandomForestClassifier(bootstrap=True, class_weigh

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__max_depth=5, model__min_samples_split=0.5, score=0.0, total= 1.2min
[CV] model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__max_depth=5, model__min_samples_split=1.0 

[CV]  m

[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed: 37.7min


[CV]  model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__max_depth=5, model__min_samples_split=0.5, score=0.0, total= 1.2min
[CV] model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__max_depth=5, model__min_samples_split=1.0 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__max_depth=5, model__min_samples_split=1.0, score=0.0, total= 1.7min
[CV] model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__max_depth=5, model__min_samples_split=1.0 
[CV]  mo

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__max_depth=5, model__min_samples_split=1.0, score=0.0, total= 1.3min
[CV] model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__max_depth=15, model__min_samples_split=0.5 
[CV]  m

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__max_depth=15, model__min_samples_split=0.5, score=0.04485538296620321, total= 1.5min
[CV] model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__max_depth=15, model__min_samples_s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__max_depth=15, model__min_samples_split=0.5, score=0.0216232924129996, total= 2.1min
[CV] model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__max_depth=15, model__min_samples_sp

[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed: 48.4min



[CV]  model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__max_depth=15, model__min_samples_split=1.0, score=0.0, total= 2.1min
[CV] model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__max_depth=15, model__min_samples_split=1.0 
[CV] 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__max_depth=15, model__min_samples_split=1.0, score=0.0, total= 1.9min
[CV]  model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__max_depth=15, model__min_samples_split=1.0, score

[Parallel(n_jobs=3)]: Done  60 out of  60 | elapsed: 51.6min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...      min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))]),
       fit_params=None, iid=True, n_jobs=3,
       param_grid=[{'tfidf__max_features': [150, 200]}, {'model': [KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')], 'model__n_neighbors': [5, 15]}, {'model': [DecisionTreeClassifier(class_we...           warm_start=False)], 'model__min_samples_split': [0.5, 1.0], 'model__max_depth': [5, 15]}],
       pre_dispatch='2*n_jobs', refit=True, return_tra

In [21]:
# Printing best hyperparameters and the associated score
print('Best Params: {}, \n\nBest Score: {}'.format(g.best_params_, g.best_score_))

Best Params: {'model': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=15,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=0.5,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), 'model__max_depth': 15, 'model__min_samples_split': 0.5}, 

Best Score: 0.5831845346837926


### Final Model and Predictions

In [22]:
# Creating final pipeline using best model & parameters
final_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words, tokenizer=lematize, max_features=200)),
    ('model', DecisionTreeClassifier(max_depth=15, min_samples_split=0.5)),
])

In [23]:
# Fitting the final model
final_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=200, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...      min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

In [91]:
# Finding & displaying important features
final_model = final_pipeline.steps[1][1]
tokens = np.array(final_pipeline.steps[0][1].get_feature_names())
importances = final_model.feature_importances_
print('Top 10 Tokens: \n{}'.format(tokens[np.argsort(importances)[-1:-11:-1]]))

Top 10 Tokens: 
['fuck' '-PRON-' 'bullshit' 'life' 'like' 'big' 'little' 'stop' 'hate'
 'block']


In [92]:
# Pickling the final model for reproducibility
with open('../models/final_model.pkl', 'wb') as file:
    pickle.dump(final_model, file)

In [93]:
# Using the final model to make 100 example predictions and display those probabilities
final_pipeline.predict_proba(X_test[:100])

[array([[ 0.89458689,  0.10541311],
        [ 0.95724281,  0.04275719],
        [ 0.95724281,  0.04275719],
        [ 0.95724281,  0.04275719],
        [ 0.95724281,  0.04275719],
        [ 0.95724281,  0.04275719],
        [ 0.95724281,  0.04275719],
        [ 0.95724281,  0.04275719],
        [ 0.95724281,  0.04275719],
        [ 0.89458689,  0.10541311],
        [ 0.95724281,  0.04275719],
        [ 0.95724281,  0.04275719],
        [ 0.95724281,  0.04275719],
        [ 0.95724281,  0.04275719],
        [ 0.95724281,  0.04275719],
        [ 0.95724281,  0.04275719],
        [ 0.95724281,  0.04275719],
        [ 0.95724281,  0.04275719],
        [ 0.95724281,  0.04275719],
        [ 0.68965517,  0.31034483],
        [ 0.68965517,  0.31034483],
        [ 0.95724281,  0.04275719],
        [ 0.78823529,  0.21176471],
        [ 0.95724281,  0.04275719],
        [ 0.95724281,  0.04275719],
        [ 0.89458689,  0.10541311],
        [ 0.95724281,  0.04275719],
        [ 0.95724281,  0.042