Use logistic regression to A/B test engineered features

In [11]:
import os
import logging

In [12]:
dir_path = os.path.realpath('..')

## Import data

In [13]:
import numpy as np
import pandas as pd

In [15]:
path = 'data/raw/train.csv'

full_path = os.path.join(dir_path, path)
df = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df.shape))

Dataset has 95851 rows, 7 columns.


In [16]:
# fill NaN with string "unknown"
df.fillna('unknown',inplace=True)

## Feature engineering

In [17]:
# Uppercase count
df['processed'] = df['comment_text'].str.split()
df['uppercase_count'] = df['processed'].apply(lambda x: sum(1 for t in x if t.isupper() and len(t)>2))
df = df.drop(['processed'], axis=1)

In [18]:
df.head()

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,uppercase_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0,0
27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0,0
54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0,0
77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0,0
79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0,0


## Pre-processing

In [19]:
from sklearn.model_selection import train_test_split

seed = 42
np.random.seed(seed)
test_size = 0.2
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
corpus = 'comment_text'

X = df.drop(target, axis=1)
y = df[target]


Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=test_size, random_state=seed)

In [20]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer 

vectorizer = TfidfVectorizer( max_features = 20000, ngram_range = ( 1, 3 ), sublinear_tf = True )
Xtrain_TFIDF = vectorizer.fit_transform(Xtrain[corpus])
Xtest_TFIDF = vectorizer.transform(Xtest[corpus])

CPU times: user 46.3 s, sys: 1.26 s, total: 47.5 s
Wall time: 47.5 s


## Model train - benchmark

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.multiclass import OneVsRestClassifier

In [103]:
%%time

# Fit model
param_grid = {'estimator__C': [1, 10, 100] }

# Fit model
clf = OneVsRestClassifier(LogisticRegression(), n_jobs=-1)
clf_cv = GridSearchCV(clf, param_grid, cv=3, verbose=1)
clf_cv.fit(Xtrain_TFIDF, ytrain)

# Evaluate
y_pred = clf_cv.predict_proba(Xtest_TFIDF)
hold_out_preds = pd.DataFrame(y_pred, index=ytest.index, columns=target)

losses = []
for label in target:
    loss = log_loss(ytest[label], hold_out_preds[label])
    losses.append(loss)
    print("{} log loss is {} .".format(label, loss))
    
print("Combined log loss: {} .".format(np.mean(losses)))
print(clf_cv.best_params_)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  1.2min finished


toxic log loss is 0.11785787188111052 .
severe_toxic log loss is 0.028077677127375727 .
obscene log loss is 0.06643701773534948 .
threat log loss is 0.01023452352151921 .
insult log loss is 0.08809528921186026 .
identity_hate log loss is 0.026494130137366796 .
Combined log loss: 0.056199418269096996 .
{'estimator__C': 10}
CPU times: user 11.6 s, sys: 2.17 s, total: 13.8 s
Wall time: 1min 22s


## Model train - with uppercase count

In [92]:
from scipy import sparse

Xtrain_upper = sparse.hstack((Xtrain_TFIDF, sparse.csr_matrix(Xtrain['uppercase_count'].values).T))
Xtest_upper = sparse.hstack((Xtest_TFIDF, sparse.csr_matrix(Xtest['uppercase_count'].values).T))

In [99]:
%%time

#Tune
param_grid = {'estimator__C': [0.1, 1, 10, 100] }

# Fit model
clf = OneVsRestClassifier(LogisticRegression(), n_jobs=-1)
clf_cv = GridSearchCV(clf, param_grid, cv=3, verbose=1)
clf_cv.fit(Xtrain_upper, ytrain)

# Evaluate
y_pred = clf_cv.predict_proba(Xtest_upper)
hold_out_preds = pd.DataFrame(y_pred, index=ytest.index, columns=target)

losses = []
for label in target:
    loss = log_loss(ytest[label], hold_out_preds[label])
    losses.append(loss)
    print("{} log loss is {} .".format(label, loss))
    
print("Combined log loss: {} .".format(np.mean(losses)))
print(clf_cv.best_params_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  1.9min finished


toxic log loss is 0.11767262272812692 .
severe_toxic log loss is 0.027748306941294806 .
obscene log loss is 0.06653476231333189 .
threat log loss is 0.010313098881364245 .
insult log loss is 0.08820827454349409 .
identity_hate log loss is 0.026470834420777287 .
Combined log loss: 0.05615798330473154 .
{'estimator__C': 10}
CPU times: user 15.7 s, sys: 3.1 s, total: 18.8 s
Wall time: 2min 8s
