# 1. Loading Data

In [1]:
from IPython.display import clear_output

def hint(message):
    """Show ``message`` as the only output of the current cell.

    Whatever the cell has displayed so far is cleared first, so repeated
    calls behave like a single status line that overwrites itself.
    """
    # wait=False is clear_output's default: clear immediately, then print.
    clear_output(wait=False)
    print(message)

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Target columns of the training frame; the notebook fits one model per label.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Read the training set, replacing the status line once loading has finished.
hint('loading data...')
train = pd.read_csv('data/train.csv')
hint('Done')

Done


# 2. Pre-processing

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Keyword options that both TF-IDF vectorizers have in common.
tfidf_common = dict(sublinear_tf=True, strip_accents='unicode')

hint("Building char vectors...")
# Character n-grams of length 2 to 6, limited to 50000 features.
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 6),
    max_features=50000,
    **tfidf_common
)
Xchar = char_vectorizer.fit_transform(train['comment_text'])

hint("Building word vectors...")
# Unigrams only; the token pattern also keeps one-character tokens.
# Limited to 10000 features.
word_vectorizer = TfidfVectorizer(
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=10000,
    **tfidf_common
)
Xword = word_vectorizer.fit_transform(train['comment_text'])

hint('Done')

Done


# 3. Finding C Value (Inverse Regularization Strength)

In [4]:
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score

def pr(X, y_i, y):
    """Add-one smoothed per-column total of the rows of ``X`` labelled ``y_i``.

    The rows of ``X`` where ``y == y_i`` are summed column-wise; one is added
    to every column total and the result is divided by the number of selected
    rows plus one.

    Args:
        X: 2-D feature matrix supporting boolean row indexing and ``.sum(0)``.
        y_i: label value whose rows are selected.
        y: array of labels, one per row of ``X``.

    Returns:
        The smoothed column totals divided by (selected row count + 1).
    """
    selected = (y == y_i)
    column_totals = X[selected].sum(0)
    return (column_totals + 1) / (selected.sum() + 1)

def find_best_C(label='toxic', Cs=(4, 2, 1, 0.5, 0.25), random_state=None):
    """Cross-validate the logistic-regression ``C`` on one target column.

    Both TF-IDF matrices (``Xchar`` and ``Xword``) are re-weighted by the log
    of the ratio ``pr(X, 1, Y) / pr(X, 0, Y)`` for the chosen label and
    stacked side by side. A ``LogisticRegressionCV`` is then fitted over the
    candidate grid with 3-fold CV, scored by ROC AUC.

    Args:
        label: column of ``train`` used as the target. Defaults to
            ``'toxic'``, the column the search was originally hard-wired to.
        Cs: candidate ``C`` values, tried in the given order.
        random_state: forwarded to ``LogisticRegressionCV``; pass an int to
            make the stochastic ``sag`` solver repeatable. ``None`` keeps the
            previous (unseeded) behaviour.

    Returns:
        The fitted model's ``C_`` attribute. It is array-like; the training
        cell of this notebook reads its first element.
    """
    Y = train[label].values
    r_char = np.log(pr(Xchar, 1, Y) / pr(Xchar, 0, Y))
    r_word = np.log(pr(Xword, 1, Y) / pr(Xword, 0, Y))
    X = hstack([Xchar.multiply(r_char), Xword.multiply(r_word)])
    lg = LogisticRegressionCV(
        Cs=list(Cs),
        cv=3,
        max_iter=200,
        class_weight='balanced',
        scoring='roc_auc',
        solver='sag',
        random_state=random_state
    )
    lg.fit(X, Y)
    return lg.C_

# Toggle: run the cross-validated search, or fall back to C = 1.
search_for_c = True
# Keep best_C array-like in both branches: the training cell reads best_C[0],
# which raises TypeError on a bare scalar fallback.
best_C = find_best_C() if search_for_c else np.array([1.0])
# Format the element, not the array: implicit conversion of a 1-element array
# to a scalar is deprecated in recent NumPy releases.
hint('Best C is %.4f' % best_C[0])

Best C is 0.5000


# 4. Analysis and Submission

In [5]:
# Run switches read by the cells that follow.
make_submission = True   # load the test set, fit on train and predict it
scoring = True           # report cross-validated AUC for every label

if make_submission:
    hint("Loading test set")
    test = pd.read_csv('data/test.csv')

    hint("Transforming")
    # Re-use the vectorizers fitted on the training comments (transform only),
    # so test features share the training vocabulary.
    test_comments = test['comment_text']
    Xword_ = word_vectorizer.transform(test_comments)
    Xchar_ = char_vectorizer.transform(test_comments)

hint("Done")

Done


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# Per-label cross-validated AUCs (filled only when `scoring` is set).
scores = []
# `test` is created only under `make_submission`, so the prediction matrix is
# allocated only in that case; a scoring-only run no longer hits a NameError.
preds = np.zeros((test.shape[0], len(labels))) if make_submission else None
# best_C may be the array taken from LogisticRegressionCV.C_ or a plain scalar
# fallback; reduce it to a single float either way.
C_value = float(np.ravel(best_C)[0])

for i, label in enumerate(labels):
    print("Working on label %s" % label)
    Y = train[label].values
    # Per-feature log-ratio weights for this label, one vector per feature
    # family (word TF-IDF and char TF-IDF).
    r_word = np.log(pr(Xword, 1, Y) / pr(Xword, 0, Y))
    r_char = np.log(pr(Xchar, 1, Y) / pr(Xchar, 0, Y))
    lg = LogisticRegression(C=C_value, max_iter=200, class_weight='balanced', solver='sag')
    X = hstack([Xword.multiply(r_word), Xchar.multiply(r_char)])
    if scoring:
        print("Scoring...")
        score = np.mean(cross_val_score(lg, X, Y, cv=3, scoring='roc_auc'))
        print("CVed AUC for class %s: %.4f" % (label, score))
        scores.append(score)
    if make_submission:
        print("Fitting...")
        lg.fit(X, Y)
        print("Predicting...")
        # The test matrices get the same weights, in the same column order.
        X_ = hstack([Xword_.multiply(r_word), Xchar_.multiply(r_char)])
        # Column 1 of predict_proba is the probability of the positive class.
        preds[:, i] = lg.predict_proba(X_)[:, 1]

Working on label toxic
Scoring...
CVed AUC for class toxic: 0.9811
Fitting...
Predicting...
Working on label severe_toxic
Scoring...




CVed AUC for class severe_toxic: 0.9838
Fitting...
Predicting...
Working on label obscene
Scoring...
CVed AUC for class obscene: 0.9924
Fitting...
Predicting...
Working on label threat
Scoring...




CVed AUC for class threat: 0.9752
Fitting...
Predicting...
Working on label insult
Scoring...
CVed AUC for class insult: 0.9852
Fitting...
Predicting...
Working on label identity_hate
Scoring...




CVed AUC for class identity_hate: 0.9770
Fitting...
Predicting...


In [7]:
# Average of the per-label cross-validated AUCs collected in `scores`.
if scoring:
    print("Global CVed AUC: %.4f" % np.mean(scores))

Global CVed AUC: 0.9825


In [8]:
if make_submission:
    from time import strftime

    # NOTE: despite the status text, nothing is uploaded here; this cell only
    # writes a timestamped CSV to a relative path.
    hint("Uploading...")
    file_name = 'submission_%s.csv' % strftime("%Y%m%d-%H%M%S")

    # The test ids first, then one predicted-probability column per label.
    id_frame = pd.DataFrame({'id': test['id']})
    label_frame = pd.DataFrame(preds, columns=labels)
    submission = pd.concat([id_frame, label_frame], axis=1)
    submission.to_csv(file_name, index=False)
    hint("Done")

Done
