In [1]:
%matplotlib inline
import os
import re
import string
import shutil

from matplotlib import pyplot as plt

import pandas as pd
import numpy as np
import scipy as sp

import sklearn as sk
from sklearn.feature_selection import mutual_info_classif, SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight

from gensim.models import KeyedVectors

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, LSTM, Input, GlobalMaxPooling1D, Conv1D, MaxPooling1D, Embedding, Flatten, Concatenate
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from tqdm import tqdm, tqdm_notebook, tnrange, trange

Using TensorFlow backend.


In [2]:
# Progress-bar setup for the notebook session.
# NOTE(review): per the tqdm docs a monitor_interval of 0 switches off tqdm's
# background monitor thread — confirm against the installed tqdm version.
tqdm.monitor_interval = 0
# Calls .pandas() on a freshly created notebook bar; presumably this registers
# tqdm's progress_* helpers on pandas objects (no visible cell uses them) — TODO confirm.
tqdm_notebook().pandas()




In [None]:
import pathlib
import pickle

# Directory holding the bag-of-words artifacts read below. This cell only
# reads from it, so it is deliberately not created on the fly: a missing
# directory surfaces as FileNotFoundError on the first load instead of leaving
# an empty folder behind.
targdir = pathlib.Path("WORDBAG")


def _load_pickle(path):
    """Return the object unpickled from ``path``.

    SECURITY: unpickling can execute arbitrary code; only load files that were
    written by a trusted run of this project.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# X_trains / Y_train / selectors are indexed by label name in later cells;
# tfidf_vect is used to transform the test comments.
X_trains = _load_pickle(targdir/"X_trains.P")
Y_train = _load_pickle(targdir/"Y_train.P")
tfidf_vect = _load_pickle(targdir/"vectorizer.P")
selectors = _load_pickle(targdir/"selectors.P")

In [3]:
# Training comments: raw text, cleaned text and the six label columns.
train = pd.read_pickle(
    'CLEAN/train.P.gz',
    compression='gzip',
)

In [4]:
# Comments to be scored; the submission file is written from this frame.
test = pd.read_pickle(
    'CLEAN/test.P.gz',
    compression='gzip',
)

In [5]:
# Quick look at the first five training rows.
train.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned_comments
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,False,False,False,False,False,False,explanation \n why the edit make under my user...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,False,False,False,False,False,False,daww he match this background colour I be seem...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",False,False,False,False,False,False,hey man I be really not try to edit war its ju...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",False,False,False,False,False,False,more \n I can not make any real suggestion on ...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",False,False,False,False,False,False,you sir be my hero any chance you remember wha...


In [6]:
# Row/column count followed by the column index, one per line.
print(train.shape, train.columns, sep='\n')

(159571, 9)
Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate', 'cleaned_comments'],
      dtype='object')


In [7]:
# Label columns; one binary model is trained and scored per entry.
cats = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [17]:
def test_skmodel(cat, models):
    """Print a classification report for one label's model.

    ``models`` maps label name -> fitted estimator. The model for ``cat`` is
    scored on ``X_trains[cat]`` against ``Y_train[cat]`` — the same data the
    models in this notebook are fitted on, so the report is in-sample.
    """
    predicted = models[cat].predict(X_trains[cat])
    report = classification_report(Y_train[cat], predicted)
    print(report)

In [18]:
# One MultinomialNB per label. The 'balanced' sample weights come from
# sklearn's compute_sample_weight, so the rare positive class is weighted up
# during fitting.
mnbs = {}
for cat in cats:
    labels = Y_train[cat]
    balanced_weights = compute_sample_weight('balanced', labels)
    mnbs[cat] = MultinomialNB(alpha=1e-6).fit(
        X_trains[cat], labels, sample_weight=balanced_weights
    )

In [19]:
# In-sample report for each label's MultinomialNB model.
for cat in cats:
    heading = "%s:" % cat
    print(heading)
    test_skmodel(cat, mnbs)

toxic:
             precision    recall  f1-score   support

      False       1.00      0.98      0.99    144277
       True       0.82      0.96      0.88     15294

avg / total       0.98      0.98      0.98    159571

severe_toxic:
             precision    recall  f1-score   support

      False       1.00      0.99      1.00    157976
       True       0.51      1.00      0.68      1595

avg / total       1.00      0.99      0.99    159571

obscene:
             precision    recall  f1-score   support

      False       1.00      0.98      0.99    151122
       True       0.70      0.99      0.82      8449

avg / total       0.98      0.98      0.98    159571

threat:
             precision    recall  f1-score   support

      False       1.00      1.00      1.00    159093
       True       0.43      1.00      0.60       478

avg / total       1.00      1.00      1.00    159571

insult:
             precision    recall  f1-score   support

      False       1.00      0.98      0.

In [60]:
from sklearn.linear_model import SGDClassifier

# One elastic-net, log-loss linear model per label, trained with SGD.
# class_weight='balanced' is used here instead of explicit sample weights.
# random_state is pinned because SGD shuffles the training data each epoch;
# without it every run yields slightly different models and submissions.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss'; 'log'
# is kept to match the version that produced the recorded outputs — confirm
# before upgrading.
sgds = dict()
for cat in cats:
    sgd = SGDClassifier(
        loss='log',
        penalty='elasticnet',
        alpha=1e-6,
        l1_ratio=0.25,
        tol=1e-6,
        max_iter=2000,
        n_jobs=-1,
        class_weight='balanced',
        learning_rate='optimal',
        random_state=0,
    )
    sgd.fit(X_trains[cat], Y_train[cat])
    sgds[cat] = sgd

In [61]:
# In-sample report for each label's SGD model.
for cat in cats:
    heading = "%s:" % cat
    print(heading)
    test_skmodel(cat, sgds)

toxic:
             precision    recall  f1-score   support

      False       1.00      0.97      0.99    144277
       True       0.79      0.99      0.88     15294

avg / total       0.98      0.97      0.98    159571

severe_toxic:
             precision    recall  f1-score   support

      False       1.00      0.99      1.00    157976
       True       0.60      1.00      0.75      1595

avg / total       1.00      0.99      0.99    159571

obscene:
             precision    recall  f1-score   support

      False       1.00      0.99      1.00    151122
       True       0.85      1.00      0.92      8449

avg / total       0.99      0.99      0.99    159571

threat:
             precision    recall  f1-score   support

      False       1.00      1.00      1.00    159093
       True       0.67      1.00      0.80       478

avg / total       1.00      1.00      1.00    159571

insult:
             precision    recall  f1-score   support

      False       1.00      0.98      0.

In [22]:
from sklearn.ensemble import AdaBoostClassifier

# One AdaBoost ensemble per label, boosting MultinomialNB base learners.
# No sample or class weighting is applied in this cell.
adabs = {}
for cat in cats:
    booster = AdaBoostClassifier(
        base_estimator=MultinomialNB(alpha=1e-6),
        n_estimators=200,
        learning_rate=0.25
    )
    adabs[cat] = booster.fit(X_trains[cat], Y_train[cat])

In [23]:
# In-sample report for each label's AdaBoost model.
for cat in cats:
    heading = "%s:" % cat
    print(heading)
    test_skmodel(cat, adabs)

toxic:
             precision    recall  f1-score   support

      False       1.00      1.00      1.00    144277
       True       1.00      0.99      1.00     15294

avg / total       1.00      1.00      1.00    159571

severe_toxic:
             precision    recall  f1-score   support

      False       1.00      1.00      1.00    157976
       True       0.97      1.00      0.98      1595

avg / total       1.00      1.00      1.00    159571

obscene:
             precision    recall  f1-score   support

      False       1.00      1.00      1.00    151122
       True       1.00      0.99      1.00      8449

avg / total       1.00      1.00      1.00    159571

threat:
             precision    recall  f1-score   support

      False       1.00      1.00      1.00    159093
       True       0.98      1.00      0.99       478

avg / total       1.00      1.00      1.00    159571

insult:
             precision    recall  f1-score   support

      False       1.00      1.00      1.

In [25]:
def get_model_avgs(cat, ens, features=None):
    """Average the second-column class probability across several model sets.

    Parameters
    ----------
    cat : label name used to index every model dict (and ``X_trains``).
    ens : sequence of dicts mapping label name -> fitted model that exposes
        ``predict_proba``.
    features : optional feature matrix to score. Defaults to ``X_trains[cat]``,
        which keeps existing two-argument calls unchanged; pass another matrix
        (e.g. test-set features) to reuse the same averaging elsewhere.

    Returns
    -------
    pandas.Series indexed 0..n-1 holding, per row, the mean over the models of
    column 1 of ``predict_proba``. Empty if ``ens`` is empty.
    """
    if features is None:
        features = X_trains[cat]
    # Build every column first and construct the frame once, rather than
    # growing an empty DataFrame one column at a time.
    member_probas = {
        i: models[cat].predict_proba(features)[:, 1]
        for i, models in enumerate(ens)
    }
    return pd.DataFrame(member_probas).mean(axis=1)

def get_ens_preds(cat, ens):
    """Turn the ensemble's averaged probabilities into hard 0/1 predictions.

    A row is labelled 1 when the average from ``get_model_avgs(cat, ens)`` is
    at least 0.5. The result is an integer Series named ``'preds'``.
    """
    averaged = get_model_avgs(cat, ens)
    votes = averaged >= 0.5
    return votes.astype(int).rename('preds')

def test_skmodel_ensemble(cat, ens):
    """Print a classification report for the averaged ensemble on one label.

    Hard predictions come from ``get_ens_preds(cat, ens)`` and are compared
    with ``Y_train[cat]``.
    """
    truth = Y_train[cat]
    voted = get_ens_preds(cat, ens)
    print(classification_report(truth, voted))

In [62]:
# Report for the averaged MultinomialNB + SGD ensemble, one per label.
# The label name is printed first so the reports can be told apart, in the
# same way as the single-model report cells.
for cat in cats:
    print("%s:" % cat)
    test_skmodel_ensemble(cat, [mnbs, sgds])

             precision    recall  f1-score   support

      False       1.00      0.98      0.99    144277
       True       0.86      0.97      0.91     15294

avg / total       0.98      0.98      0.98    159571

             precision    recall  f1-score   support

      False       1.00      0.99      1.00    157976
       True       0.63      1.00      0.77      1595

avg / total       1.00      0.99      0.99    159571

             precision    recall  f1-score   support

      False       1.00      0.99      1.00    151122
       True       0.86      1.00      0.92      8449

avg / total       0.99      0.99      0.99    159571

             precision    recall  f1-score   support

      False       1.00      1.00      1.00    159093
       True       0.77      1.00      0.87       478

avg / total       1.00      1.00      1.00    159571

             precision    recall  f1-score   support

      False       1.00      0.98      0.99    151694
       True       0.78      1.00 

In [51]:
# Frame pairing each training comment with hard 0/1 predictions per label.
preds_df = pd.DataFrame()
preds_df['id'] = train['id']
preds_df['comment_text'] = train['comment_text'].values
preds_df['cleaned_comments'] = train['cleaned_comments'].values

for cat in cats:
    # get_ens_preds returns a Series indexed 0..n-1. Its raw values are
    # assigned so the column is filled positionally, like the text columns
    # above, instead of being aligned on index (which would produce NaN if
    # train ever carried a non-default index).
    # NOTE(review): only the MultinomialNB models are passed here, not
    # [mnbs, sgds] — confirm this is intended.
    preds_df[cat] = get_ens_preds(cat, [mnbs]).values

In [52]:
# Show a bounded preview rather than asking the notebook to render the whole frame.
preds_df.head(10)

Unnamed: 0,id,comment_text,cleaned_comments,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,explanation \n why the edit make under my user...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,daww he match this background colour I be seem...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",hey man I be really not try to edit war its ju...,0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",more \n I can not make any real suggestion on ...,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",you sir be my hero any chance you remember wha...,0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",congratulation from me as well use the tool we...,0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,cocksucker before you pis around on my work,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,your vandalism to the matt shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,sorry if the word nonsense be offensive to you...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,alignment on this subject and which be contrar...,0,0,0,0,0,0


In [53]:
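# Disabled debugging aid: for one label, list the rows predicted 1 whose
# training label is 0 (false positives).
# cat = 'identity_hate'

# preds_df.where(preds_df[cat].astype(int) == 1).where(train[cat].astype(int)==0).dropna(how='all')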

In [44]:
import gc
# Run a garbage-collection pass, presumably to free memory before the test-set
# features are built; the number shown under the cell is gc.collect()'s return value.
gc.collect()

0

In [45]:
# Vectorise the cleaned test comments with the vectorizer loaded from WORDBAG.
all_test_terms = tfidf_vect.transform(test['cleaned_comments'])

In [46]:
# Per-label test features: each label's selector is applied to the shared
# term matrix.
X_tests = {
    cat: selectors[cat].transform(all_test_terms)
    for cat in cats
}

In [63]:
# Score the test set: each label column becomes the mean of the MultinomialNB
# and SGD probabilities taken from column 1 of predict_proba. The mean is
# computed on the raw arrays so values are written positionally and cannot be
# misaligned against test's index (Series + Series assignment aligns on index
# and would yield NaN for a non-default index).
for cat in cats:
    member_probas = [
        models[cat].predict_proba(X_tests[cat])[:, 1]
        for models in (mnbs, sgds)
    ]
    test[cat] = np.mean(member_probas, axis=0)

In [64]:
# Show a bounded preview of the scored test frame rather than the whole thing.
test.head(10)

Unnamed: 0,id,comment_text,cleaned_comments,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,yo bitch ja rule be more succesful then you wi...,0.999998,0.425709,0.999913,4.099008e-01,0.999528,0.497790
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,from rfc the title be fine as it be imo,0.016324,0.000941,0.024992,2.105033e-06,0.016870,0.001008
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",source zawe ashton on lapland,0.191457,0.165568,0.167901,7.180652e-03,0.165887,0.164772
3,00017563c3f7919a,":If you have a look back at the source, the in...",if you have a look back at the source the info...,0.003146,0.000569,0.001079,5.115975e-05,0.001705,0.000313
4,00017695ad8997eb,I don't anonymously edit articles at all.,I do not anonymously edit article at all,0.076120,0.000632,0.127904,1.487069e-06,0.115827,0.000565
5,0001ea8717f6de06,Thank you for understanding. I think very high...,thank you for understand I think very highly o...,0.049374,0.000919,0.025690,3.806402e-04,0.035886,0.000726
6,00024115d4cbde0f,Please do not add nonsense to Wikipedia. Such ...,please do not add nonsense to wikipedia such e...,0.003165,0.000126,0.001641,9.824095e-07,0.002217,0.001079
7,000247e83dcc1211,:Dear god this site is horrible.,dear god this site be horrible,0.833509,0.002698,0.237861,4.623485e-05,0.301575,0.001201
8,00025358d4737918,""" \n Only a fool can believe in such numbers. ...",only a fool can believe in such number the cor...,0.152480,0.000417,0.053887,1.130967e-06,0.043350,0.000065
9,00026d1092fe71cc,== Double Redirects == \n\n When fixing double...,double redirect when fix double redirect do no...,0.006331,0.000224,0.009152,3.482646e-06,0.005155,0.000315


In [65]:
# Write the id column plus one probability column per label.
submission_columns = ['id'] + cats
submission = test[submission_columns]
submission.to_csv('submission.csv', index=False)