Use a RF to stack LSTM predictions with engineered features

In [1]:
import os
import logging

In [2]:
dir_path = os.path.realpath('..')

## Import data

In [3]:
import numpy as np
import pandas as pd

In [4]:
path = 'data/processed/stacking.csv'

full_path = os.path.join(dir_path, path)
df = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df.shape))

Dataset has 19171 rows, 13 columns.


In [5]:
# fill NaN with string "unknown"
df.fillna('unknown',inplace=True)

## Feature engineering

In [6]:
df['processed'] = df['comment_text'].str.split()
df['uppercase_count'] = df['processed'].apply(lambda x: sum(1 for t in x if t.isupper() and len(t)>2))
df = df.drop(['processed'], axis=1)

In [7]:
df.head()

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text,toxic_pred,severe_toxic_pred,obscene_pred,threat_pred,insult_pred,identity_hate_pred,uppercase_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
343671421527,0,0,0,0,0,0,I can possibly see this information being adde...,0.000365,1e-06,0.000278,2e-06,5.3e-05,4e-06,0
484437552026,0,0,0,0,0,0,Grandmaster take a look at your own source at ...,0.005549,4.5e-05,0.002324,6e-05,0.002056,0.000581,0
818275822896,0,0,0,0,0,0,I just added details to show the actual scope ...,0.000923,4e-06,0.000714,7e-06,0.000228,3.9e-05,3
788660525714,0,0,0,0,0,0,"""\n\nHi. I am offended by your use of the word...",0.013041,4.5e-05,0.00463,6.7e-05,0.003697,0.00021,1
398705378355,0,0,0,0,0,0,"What is unacceptable, and deserves an apology ...",0.001005,2e-06,0.000333,3e-06,0.0001,6e-06,0


In [11]:
from sklearn.model_selection import train_test_split

seed = 42
np.random.seed(seed)
test_size = 0.2
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
corpus = 'comment_text'

X = df.drop(target + [corpus], axis=1)
y = df[target]


Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=test_size, random_state=seed)

In [13]:
X.head()

Unnamed: 0_level_0,toxic_pred,severe_toxic_pred,obscene_pred,threat_pred,insult_pred,identity_hate_pred,uppercase_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
343671421527,0.000365,1e-06,0.000278,2e-06,5.3e-05,4e-06,0
484437552026,0.005549,4.5e-05,0.002324,6e-05,0.002056,0.000581,0
818275822896,0.000923,4e-06,0.000714,7e-06,0.000228,3.9e-05,3
788660525714,0.013041,4.5e-05,0.00463,6.7e-05,0.003697,0.00021,1
398705378355,0.001005,2e-06,0.000333,3e-06,0.0001,6e-06,0


## Model fit

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss

In [20]:
#Tuning the model
param_grid = { "n_estimators"      : [150, 200, 250],
           "max_depth"         : [4, 8],
           "min_samples_split" : [4, 8] ,
           "bootstrap": [True]}

In [23]:
%%time
clf = RandomForestClassifier(random_state=seed)

clf_cv = GridSearchCV(clf, param_grid, cv=5)
clf_cv.fit(Xtrain, ytrain)

CPU times: user 4min 26s, sys: 2.53 s, total: 4min 28s
Wall time: 4min 30s


In [69]:
hold_out_preds

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate,0,1,2,3,4,5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
771050023016,,,,,,,0.001502,0.000000,0.000261,0.000004,0.000141,0.000000
752085470079,,,,,,,0.037541,0.000000,0.004826,0.000303,0.013168,0.000345
260894087160,,,,,,,0.001747,0.000000,0.000333,0.000004,0.000164,0.000000
608267384331,,,,,,,0.527004,0.015641,0.074116,0.043282,0.126909,0.022031
743416916956,,,,,,,0.002980,0.000000,0.000659,0.000011,0.000307,0.000000
159004111621,,,,,,,0.001330,0.000000,0.000235,0.000004,0.000136,0.000000
576654454044,,,,,,,0.001316,0.000000,0.000233,0.000004,0.000135,0.000000
900304929759,,,,,,,0.004226,0.000000,0.001490,0.000078,0.000674,0.000000
830539624013,,,,,,,0.002021,0.000000,0.000347,0.000004,0.000145,0.000000
819090374692,,,,,,,0.014643,0.000000,0.003417,0.000706,0.003540,0.000000


In [70]:
# concatenating features with lstm preds
y_pred = clf_cv.predict_proba(Xtest)
hold_out_preds = pd.DataFrame(index=ytest.index, columns=target)
i = 0
for label in target:
    hold_out_preds[label] = y_pred[i][:,1]
    i += 1
losses = []

for label in target:
    loss = log_loss(ytest[label], hold_out_preds[label])
    losses.append(loss)
    print("{} log loss is {} .".format(label, loss))
    
print("Combined log loss: {} .".format(np.mean(losses)))

toxic log loss is 0.09787929010850749 .
severe_toxic log loss is 0.026725898185510445 .
obscene log loss is 0.05470538602085354 .
threat log loss is 0.010094469485710533 .
insult log loss is 0.06404614090386715 .
identity_hate log loss is 0.032323306720357 .
Combined log loss: 0.04762908190413436 .


In [77]:
# Comparing to original preds
for label in target:
    loss = log_loss(ytest[label], Xtest[label+'_pred'])
    losses.append(loss)
    print("{} log loss is {} .".format(label, loss))
    
print("Combined log loss: {} .".format(np.mean(losses)))

toxic log loss is 0.09974531484030262 .
severe_toxic log loss is 0.02043052522355615 .
obscene log loss is 0.05536954658653976 .
threat log loss is 0.010520945531656771 .
insult log loss is 0.06303551417441342 .
identity_hate log loss is 0.02521755794712749 .
Combined log loss: 0.046356294446222135 .


In [78]:
clf_cv.best_params_

{'bootstrap': True,
 'max_depth': 8,
 'min_samples_split': 8,
 'n_estimators': 200}

## Rf only

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss

In [20]:
#Tuning the model
param_grid = { "n_estimators"      : [150, 200, 250],
           "max_depth"         : [4, 8],
           "min_samples_split" : [4, 8] ,
           "bootstrap": [True]}

In [82]:
%%time
clf = RandomForestClassifier(random_state=seed)

clf_cv = GridSearchCV(clf, param_grid, cv=5)
clf_cv.fit(Xtrain['uppercase_count'].reshape(-1,1), ytrain)



CPU times: user 2min 23s, sys: 1.04 s, total: 2min 24s
Wall time: 2min 25s


In [84]:
# features only
y_pred = clf_cv.predict_proba(Xtest['uppercase_count'].reshape(-1,1))
hold_out_preds = pd.DataFrame(index=ytest.index, columns=target)
i = 0
for label in target:
    hold_out_preds[label] = y_pred[i][:,1]
    i += 1
losses = []

for label in target:
    loss = log_loss(ytest[label], hold_out_preds[label])
    losses.append(loss)
    print("{} log loss is {} .".format(label, loss))
    
print("Combined log loss: {} .".format(np.mean(losses)))

  if __name__ == '__main__':


toxic log loss is 0.30907860864538406 .
severe_toxic log loss is 0.039100500903573615 .
obscene log loss is 0.20099846187848056 .
threat log loss is 0.016168582699123156 .
insult log loss is 0.19658491496102598 .
identity_hate log loss is 0.04781023162879908 .
Combined log loss: 0.13495688345273107 .


In [88]:
hold_out_preds

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
771050023016,0.093615,0.009622,0.052160,0.002611,0.051097,0.007506
752085470079,0.076465,0.012159,0.046952,0.002283,0.039784,0.004249
260894087160,0.076465,0.012159,0.046952,0.002283,0.039784,0.004249
608267384331,0.093615,0.009622,0.052160,0.002611,0.051097,0.007506
743416916956,0.093615,0.009622,0.052160,0.002611,0.051097,0.007506
159004111621,0.093615,0.009622,0.052160,0.002611,0.051097,0.007506
576654454044,0.093615,0.009622,0.052160,0.002611,0.051097,0.007506
900304929759,0.093615,0.009622,0.052160,0.002611,0.051097,0.007506
830539624013,0.093615,0.009622,0.052160,0.002611,0.051097,0.007506
819090374692,0.093615,0.009622,0.052160,0.002611,0.051097,0.007506


In [86]:
ytest.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,3835.0,3835.0,3835.0,3835.0,3835.0,3835.0
mean,0.098566,0.007301,0.05189,0.002347,0.050587,0.008344
std,0.298117,0.085146,0.221835,0.048393,0.219181,0.090977
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [77]:
# Comparing to original preds
for label in target:
    loss = log_loss(ytest[label], Xtest[label+'_pred'])
    losses.append(loss)
    print("{} log loss is {} .".format(label, loss))
    
print("Combined log loss: {} .".format(np.mean(losses)))

toxic log loss is 0.09974531484030262 .
severe_toxic log loss is 0.02043052522355615 .
obscene log loss is 0.05536954658653976 .
threat log loss is 0.010520945531656771 .
insult log loss is 0.06303551417441342 .
identity_hate log loss is 0.02521755794712749 .
Combined log loss: 0.046356294446222135 .


In [78]:
clf_cv.best_params_

{'bootstrap': True,
 'max_depth': 8,
 'min_samples_split': 8,
 'n_estimators': 200}