Use logistic regression to A/B test engineered features.

- Benchmark Combined log loss: 0.05601320817782069
- Uppercase Count Combined log loss: 0.05493923371472729

In [1]:
import os
import logging

In [2]:
# Absolute path of the parent of the current working directory; the data
# paths used below are joined onto it.
dir_path = os.path.realpath(os.pardir)

## Import data

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Training data: read the raw CSV, taking the first row as the header and the
# first column as the index.
path = 'data/raw/train.csv'
full_path = os.path.join(dir_path, path)

df = pd.read_csv(full_path, index_col=0, header=0)
n_rows, n_cols = df.shape
print("Dataset has {} rows, {} columns.".format(n_rows, n_cols))

Dataset has 159571 rows, 7 columns.


In [14]:
# Submission (test) data: same CSV layout as the training file, read the same
# way (first row = header, first column = index).
path = 'data/raw/test.csv'
full_path = os.path.join(dir_path, path)

df_test = pd.read_csv(full_path, index_col=0, header=0)
test_rows, test_cols = df_test.shape
print("Dataset has {} rows, {} columns.".format(test_rows, test_cols))

Dataset has 153164 rows, 1 columns.


In [15]:
# Replace every missing value in both frames with the placeholder string
# "unknown" so later string operations never see NaN.
df = df.fillna('unknown')
df_test = df_test.fillna('unknown')

## Feature engineering

In [6]:
# Uppercase count: for each comment, the number of whitespace-separated tokens
# that are entirely upper-case and longer than two characters.
tokens = df['comment_text'].str.split()
df['uppercase_count'] = tokens.apply(
    lambda words: sum(1 for word in words if len(word) > 2 and word.isupper())
)

In [17]:
# Uppercase count for submission: same feature as on the training frame,
# computed on the test comments.
def _count_uppercase_tokens(words):
    """Count tokens that are fully upper-case and longer than two characters."""
    return len([word for word in words if word.isupper() and len(word) > 2])


test_tokens = df_test['comment_text'].str.split()
df_test['uppercase_count'] = test_tokens.apply(_count_uppercase_tokens)

In [7]:
# Preview the first five rows, including the new uppercase_count column.
df.head(n=5)

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,uppercase_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1
000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1
000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0
0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0
0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0


## Pre-processing

In [8]:
from sklearn.model_selection import train_test_split

# Reproducibility and hold-out configuration.
seed = 42
test_size = 0.2
np.random.seed(seed)

# Name of the free-text column and of the six label columns.
corpus = 'comment_text'
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Labels are the target columns; features are every remaining column.
y = df[target]
X = df.drop(columns=target)

# Hold out `test_size` of the rows for evaluation.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

In [9]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer 

# TF-IDF over word 1- to 3-grams, capped at 20000 features, with sublinear
# term-frequency scaling enabled.
tfidf_params = dict(
    max_features=20000,
    ngram_range=(1, 3),
    sublinear_tf=True,
)
vectorizer = TfidfVectorizer(**tfidf_params)

# Fit on the training split only; the hold-out split is merely transformed.
Xtrain_TFIDF = vectorizer.fit_transform(Xtrain[corpus])
Xtest_TFIDF = vectorizer.transform(Xtest[corpus])

CPU times: user 1min 9s, sys: 2.42 s, total: 1min 11s
Wall time: 1min 11s


In [18]:
%%time

# Vectorise the submission comments with the vectorizer that was already
# fitted on the training split. Use transform(), not fit_transform():
# re-fitting here would learn a new vocabulary and IDF weights from the test
# comments, so the resulting columns would no longer correspond to the
# features the classifier is trained on.
Xsubmission_TFIDF = vectorizer.transform(df_test[corpus])

CPU times: user 1min 21s, sys: 2.75 s, total: 1min 23s
Wall time: 1min 23s


## Model training

In [19]:
from scipy import sparse

# Concat engineered features
def _with_uppercase_column(tfidf_matrix, frame):
    """Append the frame's uppercase_count values as one extra sparse column."""
    extra_column = sparse.csr_matrix(frame['uppercase_count'].values).T
    return sparse.hstack((tfidf_matrix, extra_column))


Xtrain_upper = _with_uppercase_column(Xtrain_TFIDF, Xtrain)
Xtest_upper = _with_uppercase_column(Xtest_TFIDF, Xtest)
Xsubmission = _with_uppercase_column(Xsubmission_TFIDF, df_test)

In [13]:
%%time

# Import everything this cell uses so it runs on a fresh kernel instead of
# relying on names left over from an earlier session.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier

# Tune: candidate values for C of the wrapped LogisticRegression
# (the `estimator__` prefix addresses the estimator inside OneVsRestClassifier).
param_grid = {'estimator__C': [0.1, 1, 10, 100]}

# Fit model: one logistic regression per label, tuned with 3-fold grid search.
clf = OneVsRestClassifier(LogisticRegression(), n_jobs=-1)
clf_cv = GridSearchCV(clf, param_grid, cv=3, verbose=1)
clf_cv.fit(Xtrain_upper, ytrain)

# Evaluate: predicted probabilities on the hold-out split, one column per label.
y_pred = clf_cv.predict_proba(Xtest_upper)
hold_out_preds = pd.DataFrame(y_pred, index=ytest.index, columns=target)

# Log loss per label, then their unweighted mean as the combined score.
losses = []
for label in target:
    loss = log_loss(ytest[label], hold_out_preds[label])
    losses.append(loss)
    print("{} log loss is {} .".format(label, loss))

print("Combined log loss: {} .".format(np.mean(losses)))
print(clf_cv.best_params_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  3.7min finished


toxic log loss is 0.11764567880513536 .
severe_toxic log loss is 0.028174758155668073 .
obscene log loss is 0.06678863438342879 .
threat log loss is 0.007392555208941596 .
insult log loss is 0.08052120886640897 .
identity_hate log loss is 0.029112566868780936 .
Combined log loss: 0.05493923371472729 .
{'estimator__C': 10}
CPU times: user 27.8 s, sys: 4.57 s, total: 32.4 s
Wall time: 4min 9s


In [20]:
# submissions: predict label probabilities for the test comments and write
# them to data/submissions/<model_name>.csv under dir_path.
model_name = 'LR'

# Bind only y_submission here so the hold-out predictions stored in y_pred by
# the evaluation cell are not overwritten.
y_submission = clf_cv.predict_proba(Xsubmission)
submission = pd.DataFrame(y_submission, index=df_test.index, columns=target)

path = 'data/submissions/' + model_name + '.csv'
full_path = os.path.join(dir_path, path)

# Create the output directory if needed; to_csv does not create missing folders.
os.makedirs(os.path.dirname(full_path), exist_ok=True)
submission.to_csv(full_path, header=True, index=True)