In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
%matplotlib inline

In [117]:
import nltk

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import classifier

# preprocess data

In [4]:
subset_train = 1
max_features = 50000

In [5]:
datapath = '../data/'

In [6]:
train = pd.read_csv(datapath + 'train.csv')
test = pd.read_csv(datapath + 'test.csv')
sub = pd.read_csv(datapath + 'sample_submission.csv')
train = train.iloc[:int(len(train) * subset_train), :]
test = test.fillna('unknown')
comment_types = train.columns[2:]

In [7]:
len(train)

95851

In [8]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)

In [9]:
X = vectorizer.fit_transform(train['comment_text'].values)

In [10]:
y = train.loc[:, comment_types].values
X_test = vectorizer.transform(test.loc[:, 'comment_text'])
y_test = sub.loc[:, comment_types].values

In [27]:
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                    random_state=42,
                                                    stratify=np.sum(y, 1),
                                                    test_size=0.2)

In [23]:
lr = LogisticRegression()

In [32]:
yhat = np.zeros_like(y_val).astype(float)

for i, comment_type in enumerate(comment_types):
    lr.fit(X_train, y_train[:, i])
    yhat[:, i] = lr.predict_proba(X_val)[:, 1]

In [33]:
yhat[:, 0].shape

(19171,)

In [34]:
classifier.lloss(yhat, y_val)

0.061486948389721859

In [35]:
yhat = np.zeros_like(y_test).astype(float)

for i, comment_type in enumerate(comment_types):
    lr.fit(X, y[:, i])
    yhat[:, i] = lr.predict_proba(X_test)[:, 1]

In [190]:
sub.loc[:, comment_types] = yhat

In [191]:
sub.to_csv(datapath + 'submission.csv', index = False)

## using a pipeline to test different preprocessing ideas

In [68]:
tfidf = TfidfTransformer(sublinear_tf=True)
vectorizer = CountVectorizer(analyzer='char',
                             stop_words='english',
                             max_features=max_features,
                            ngram_range=(3, 6),
                            min_df=3,
                            max_df=.9)

In [128]:
pattern = r'''(?x)([A-Z]\.)+|\d+:\d+|(https?://)?(\w+\.)(\w{2,})+([\w/]+)?|[@\#]?\w+(?:[-']\w+)*|\$\d+(\.\d+)?%?|\\[Uu]\w+|\.\.\.|[!?]+'''

In [131]:
train['comment_text'].values[1]

'"\n\n Please do not vandalize pages, as you did with this edit to W. S. Merwin. If you continue to do so, you will be blocked from editing.    "'

In [135]:
from nltk.tokenize import TweetTokenizer
tknzer = TweetTokenizer(preserve_case=False,
                       reduce_len=True,
                       strip_handles=True)

In [69]:
X = vectorizer.fit_transform(train['comment_text'].fillna('unknown').values.astype(str))
X = tfidf.fit_transform(X)

In [137]:
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                    random_state=42,
                                                    stratify=np.sum(y, 1),
                                                    test_size=0.2)

In [138]:
lr = LogisticRegression(C=5.0)

In [139]:
yhat = np.zeros_like(y_val).astype(float)

for i, comment_type in enumerate(comment_types):
    lr.fit(X_train, y_train[:, i])
    yhat[:, i] = lr.predict_proba(X_val)[:, 1]

In [140]:
classifier.lloss(yhat, y_val)

0.051561544426453922

In [53]:
# previous .5192 (4.0)
# previous .5186 (5.0)
# previous .5196 (6.0)

In [87]:
X_test = vectorizer.transform(test.loc[:, 'comment_text'].fillna('unknown').values)
X_test = tfidf.transform(X_test)

In [88]:
# and the whole thing
yhat = np.zeros_like(y_test).astype(float)

for i, comment_type in enumerate(comment_types):
    lr.fit(X, y[:, i])
    yhat[:, i] = lr.predict_proba(X_test)[:, 1]

In [89]:
sub.loc[:, comment_types] = yhat

In [90]:
sub.to_csv(datapath + 'submission.csv', index = False)

## trying random forest

In [112]:
rf = RandomForestClassifier(n_jobs=-1,
                           max_features='auto',
                           n_estimators=100,
                           min_samples_leaf=10)

In [113]:
%time rf.fit(X_train, y_train)

CPU times: user 14min 5s, sys: 1e+03 ms, total: 14min 6s
Wall time: 3min 54s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=10,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [114]:
y_hat = rf.predict_proba(X_val)

In [115]:
y_hat = np.array(y_hat)[:, :, 1].T

In [116]:
classifier.lloss(y_hat, y_val)

0.063100800501228557

In [24]:
# and the real thing
rf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [25]:
y_hat = rf.predict_proba(X_test)
y_hat = np.array(y_hat)[:, :, 1].T

In [26]:
sub.loc[:, comment_types] = y_hat
sub.to_csv(datapath + 'submission.csv', index = False)

## trying extra trees

In [162]:
et = ExtraTreesClassifier(n_jobs=-1)

In [163]:
%time et.fit(X_train, y_train)

CPU times: user 1min 39s, sys: 72 ms, total: 1min 39s
Wall time: 30.6 s


ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [164]:
y_hat = np.array(et.predict_proba(X_val))[:, :, 1].T

In [165]:
classifier.lloss(y_hat, y_val)

0.12734564390771935