Use logistic regression to A/B test engineered features.

- Benchmark Combined log loss: 0.05601320817782069
- Uppercase Count Combined log loss: 0.05493923371472729

In [5]:
import os
import logging
import numpy as np
import pandas as pd

## Import data

In [6]:
def load_data(path):
    """Read a CSV file located relative to the parent of the working directory.

    Parameters
    ----------
    path : str
        Location of the CSV file; it is joined onto ``os.path.realpath('..')``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, using the first row as header and the first column
        as index. The frame's shape is printed as a side effect.
    """
    parent_dir = os.path.realpath('..')
    frame = pd.read_csv(os.path.join(parent_dir, path), header=0, index_col=0)
    n_rows, n_cols = frame.shape
    print("Dataset has {} rows, {} columns.".format(n_rows, n_cols))
    return frame

In [7]:
# Load the processed train and test splits, in that order.
df_train, df_test = [
    load_data(csv_path)
    for csv_path in ('data/processed/train.csv', 'data/processed/test.csv')
]

Dataset has 159571 rows, 11 columns.
Dataset has 153164 rows, 5 columns.


In [8]:
df_train.head()

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,uppercase_count,bad_words,typos,length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1,0,4,43
000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1,0,3,17
000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0,0,0,42
0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0,0,3,113
0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0,0,0,13


## Pre-processing

In [10]:
# Alias (not a copy) of the training frame; X and y are derived from `df` in the next cell.
df = df_train

In [11]:
from sklearn.model_selection import train_test_split

# Reproducibility and split configuration.
seed = 42
test_size = 0.2
np.random.seed(seed)

# Column roles: the raw text column and the six label columns.
corpus = 'comment_text'
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Separate the labels from everything else (text and engineered features stay in X).
y = df[target]
X = df.drop(columns=target)

# Hold out a `test_size` fraction of the rows for evaluation.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

In [12]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer 

# TF-IDF over 1- to 3-grams, limited to 20,000 features, with sublinear
# term-frequency scaling.
vectorizer = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 3),
    sublinear_tf=True,
)

# Fit on the training split only, then reuse the fitted vectorizer for the hold-out split.
Xtrain_TFIDF = vectorizer.fit_transform(Xtrain[corpus])
Xtest_TFIDF = vectorizer.transform(Xtest[corpus])

CPU times: user 1min 40s, sys: 4.97 s, total: 1min 45s
Wall time: 1min 46s


In [13]:
%%time

# Vectorize the submission comments with the vocabulary and IDF weights that
# were already fitted on the training split. Calling fit_transform here would
# refit the vectorizer on the test comments, so the resulting columns would no
# longer correspond to the features the classifier is trained on.
Xsubmission_TFIDF = vectorizer.transform(df_test[corpus])

CPU times: user 1min 58s, sys: 10.5 s, total: 2min 9s
Wall time: 2min 28s


## Model training

In [16]:
from scipy import sparse

# Engineered numeric columns that get appended to the right of each TF-IDF matrix.
features = ['uppercase_count', 'bad_words', 'typos', 'length']


def _with_engineered(tfidf, frame):
    """Stack the engineered columns of `frame` to the right of the `tfidf` matrix."""
    extra = sparse.csr_matrix(frame[features].values)
    return sparse.hstack((tfidf, extra))


Xtrain_combined = _with_engineered(Xtrain_TFIDF, Xtrain)
Xtest_combined = _with_engineered(Xtest_TFIDF, Xtest)
Xsubmission = _with_engineered(Xsubmission_TFIDF, df_test)

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.multiclass import OneVsRestClassifier

In [23]:
%%time

# Search space: the C parameter of the wrapped logistic regression.
param_grid = {'estimator__C': [0.1, 1, 10]}

# One logistic regression per label column; the grid search selects a single C
# value that is shared by all of them.
clf = OneVsRestClassifier(estimator=LogisticRegression(), n_jobs=-1)
clf_cv = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, verbose=1)
clf_cv.fit(Xtrain_combined, ytrain)

# Predicted probabilities on the hold-out split, one column per label.
y_pred = clf_cv.predict_proba(Xtest_combined)
hold_out_preds = pd.DataFrame(data=y_pred, columns=target, index=ytest.index)

# Per-label log loss, followed by their unweighted mean as the combined score.
losses = []
for label in target:
    loss = log_loss(ytest[label], hold_out_preds[label])
    print("{} log loss is {} .".format(label, loss))
    losses.append(loss)

print("Combined log loss: {} .".format(np.mean(losses)))
print(clf_cv.best_params_)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  7.3min finished


toxic log loss is 0.1181271841550637 .
severe_toxic log loss is 0.027992274427895425 .
obscene log loss is 0.06681333244229817 .
threat log loss is 0.021545038308391853 .
insult log loss is 0.0803736299363951 .
identity_hate log loss is 0.05453448420067547 .
Combined log loss: 0.061564323911786616 .
{'estimator__C': 10}
CPU times: user 21 s, sys: 4.96 s, total: 26 s
Wall time: 8min 37s


## Outputs

In [24]:
def save_df(df, path):
    """Write a DataFrame to a CSV file relative to the parent of the working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist; the header row and the index are both written.
    path : str
        Destination of the CSV file; it is joined onto ``os.path.realpath('..')``.

    Notes
    -----
    The destination directory is created when it does not exist yet, so a
    missing output folder does not abort the export.
    """
    full_path = os.path.join(os.path.realpath('..'), path)

    # to_csv does not create missing parent directories, so make sure they exist.
    out_dir = os.path.dirname(full_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df.to_csv(full_path, header=True, index=True)
    print('Dataframe ({}, {}) saved as csv.'.format(*df.shape))

In [26]:
# Predict label probabilities for the submission matrix and save them,
# indexed like df_test with one column per target label.
model_name = 'LR'
y_submission = clf_cv.predict_proba(Xsubmission)
submission = pd.DataFrame(data=y_submission, columns=target, index=df_test.index)
save_df(submission, os.path.join('data/submissions/', model_name + '.csv'))

Dataframe (153164, 6) saved as csv.
