In [5]:
%matplotlib inline
import os
import re
import string
import shutil

from matplotlib import pyplot as plt

import pandas as pd
import numpy as np
import scipy as sp

import sklearn as sk
from sklearn.feature_selection import mutual_info_classif, SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight

from gensim.models import KeyedVectors

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, LSTM, Input, GlobalMaxPooling1D, Conv1D, MaxPooling1D, Embedding, Flatten, Concatenate
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from tqdm import tqdm, tqdm_notebook, tnrange, trange

In [6]:
tqdm.monitor_interval = 0
tqdm_notebook().pandas()




In [2]:
train = pd.read_pickle('CLEAN/train.P.gz', compression='gzip')

In [3]:
test = pd.read_pickle('CLEAN/test.P.gz', compression='gzip')

In [8]:
train.iloc[0:5]

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned_comments
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
22256635,"Nonsense? kiss off, geek. what I said is true...",True,False,False,False,False,False,nonsense kiss off geek what I say be true I wi...
27450690,"""\n\n Please do not vandalize pages, as you di...",False,False,False,False,False,False,please do not vandalize page as you do with th...
54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",False,False,False,False,False,False,point of interest I remove the point of intere...
77493077,Asking some his nationality is a Racial offenc...,False,False,False,False,False,False,ask some his nationality be a racial offence w...
79357270,The reader here is not going by my say so for ...,False,False,False,False,False,False,the reader here be not go by my say so for eth...


In [9]:
cats = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

In [6]:
# train.where(train['identity_hate'] != 0).where(train['toxic'] == 0).dropna(how='all').shape

(63, 10)

In [8]:
# train.where(train[cats].sum(axis='columns') == 0).dropna(how='all').shape

(86061, 10)

In [9]:
# train["cleaned_comments"] = train["cleaned_comments"].astype(str)
# test["cleaned_comments"] = test["cleaned_comments"].astype(str)

In [10]:
print(train.shape)
print(train.columns)

(95851, 8)
Index(['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate', 'cleaned_comments'],
      dtype='object')


In [11]:
def get_vect_vocab(data, target_maxlen):
    cvect = CountVectorizer(ngram_range=(1,3), max_features=target_maxlen)
    cvect.fit(data)
    return set(cvect.vocabulary_.keys())
    
def generate_vectorizer(target_maxlen=40000):
    terms = set()
    df = train.where(train[cats].sum(axis=1)==0).dropna(how='all').cleaned_comments
    terms.update(get_vect_vocab(df,target_maxlen=target_maxlen))
    for cat in cats:
        df = train.where(train[cat]==1 ).dropna(how='all').cleaned_comments
        terms.update(get_vect_vocab(df,target_maxlen=target_maxlen))
    return CountVectorizer(ngram_range=(1,3),vocabulary=terms),terms

In [12]:
cvect,vocab = generate_vectorizer()
voc_size = len(vocab)
print(voc_size)
X_train = cvect.transform(train.cleaned_comments)
Y_train = train[cats]

128313


In [13]:
cat_counts=pd.DataFrame(Y_train.values).sum().to_dict()
total = Y_train.shape[0]
cat_weight = dict()
for cat in cat_counts:
    cat_weight[cat] = (total-cat_counts[cat]) / total
abnorm_total = (Y_train.sum(axis=1)>0).apply(int).sum()
norm_weight = abnorm_total/total
def calc_weight_for(s_obj):
    if s_obj.sum() == 0:
        return norm_weight
    else:
        weights = list()
        for i in range(s_obj.shape[0]):
            if s_obj[i]:
                weights.append(cat_weight[i])
        return max(weights)
samp_weight = Y_train.apply(calc_weight_for,axis=1)

In [14]:
def to_preds(prob):
    return int(prob >= 0.5)

round_preds = np.vectorize(to_preds)

def custom_metric(y_true, y_pred):
    return keras.backend.mean(keras.metrics.binary_accuracy(y_true, keras.backend.round(y_pred)))

def test_model(model, train_data, cat):
    # result = model.predict(X_train, batch_size=128, verbose=1)
    result = model.predict(train_data, batch_size=128, verbose=1)
    res_df = pd.DataFrame(round_preds(result))
#     res_df.columns = [cat]
    print(classification_report(Y_train[cat], res_df))

In [11]:
# all_comments = pd.concat([train.cleaned_comments,test.cleaned_comments])
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(all_comments)
# word_index = tokenizer.word_index
# print('Found %s unique tokens.' % len(word_index))
# train_sequences = tokenizer.texts_to_sequences(train.cleaned_comments)
# test_sequences = tokenizer.texts_to_sequences(test.cleaned_comments)
# sequence_pad_len=5000
# train_data = pad_sequences(train_sequences, maxlen=sequence_pad_len)
# test_data = pad_sequences(test_sequences, maxlen=sequence_pad_len)

In [12]:
# max_comment_len = 0
# big_lim = 4000
# big_lens = {'set_name':list(),'index':list(),'comment_len':list()}
# i=0
# for s in train_sequences:
#     max_comment_len=max(max_comment_len, len(s))
#     if len(s) > big_lim:
#         big_lens['set_name'].append('train')
#         big_lens['index'].append(i)
#         big_lens['comment_len'].append(len(s))
#     i += 1
# i=0
# for s in test_sequences:
#     max_comment_len=max(max_comment_len, len(s))
#     if len(s) > big_lim:
#         big_lens['set_name'].append('test')
#         big_lens['index'].append(i)
#         big_lens['comment_len'].append(len(s))
#     i += 1
# big_lens_df = pd.DataFrame(big_lens)
# max_comment_len
# import seaborn as sn
# sn.distplot(a=big_lens_df.comment_len)

In [13]:
# embedding_dim = 25
# word2vec = KeyedVectors.load_word2vec_format(
#             os.path.join("W2V",
#                 'w2v.twitter.27B.%dd.txt' % embedding_dim),
#             binary=False)

In [14]:
# embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
# for word, i in word_index.items():
#     if word in word2vec.wv.vocab:
#         embedding_matrix[i] = word2vec.wv[word]

In [15]:
# keras.backend.clear_session()
# conv_model = Sequential()
# conv_model.add(Embedding(len(word_index) + 1, 
#                             embedding_dim, 
#                             weights=[embedding_matrix], 
#                             input_length=sequence_pad_len)
#               )
# conv_model.add(Conv1D(128, 5, activation='relu'))
# conv_model.add(MaxPooling1D(5))
# conv_model.add(Conv1D(128, 5, activation='relu'))
# conv_model.add(MaxPooling1D(5))
# conv_model.add(Conv1D(128, 5, activation='relu'))
# conv_model.add(MaxPooling1D(35))
# conv_model.add(Flatten())
# conv_model.add(Dense(128, activation='relu'))
# conv_model.add(Dense(128, activation='relu'))
# conv_model.add(Dense(len(cats), activation='hard_sigmoid'))

# model.compile(loss='mean_squared_error',
#               optimizer='rmsprop',
#               metrics=['acc', custom_metric])
# model.summary()

In [16]:
# model.fit(x=train_data, y=Y_train, batch_size=32, sample_weight=samp_weight,epochs=2)

In [78]:
# test_model(model, train_data)

In [15]:
# cvect = CountVectorizer(ngram_range=(1,3),lowercase=False)
# all_terms = cvect.fit_transform(train.cleaned_comments)
tfidf_vect = TfidfVectorizer(ngram_range=(1,3),lowercase=False)
all_terms = tfidf_vect.fit_transform(train.cleaned_comments)

In [16]:
selector = SelectKBest(chi2, k=200000)
# X_train = selector.fit_transform(all_terms, Y_train[['toxic', cat]])
all_selected = selector.fit_transform(all_terms, Y_train)

In [17]:
# sample_weights = compute_sample_weight('balanced',Y_train[cat])
sample_weights = compute_sample_weight('balanced', Y_train)

In [18]:
# tfidf_trans = TfidfTransformer()
# X_train = tfidf_trans.fit_transform(selected, Y_train)

In [19]:
X_train.shape

(95851, 128313)

In [20]:
X_trains = dict()
for cat in cats:
    selector = SelectKBest(chi2, k=200000)
    # X_train = selector.fit_transform(all_terms, Y_train[['toxic', cat]])
    selected = selector.fit_transform(all_terms, Y_train[cat])
    X_trains[cat] = selected

In [21]:
def test_skmodel(cat, models):
    preds = models[cat].predict(X_trains[cat])
    print(classification_report(Y_train[cat], preds))

In [22]:
# from sklearn.neighbors import KNeighborsClassifier
mnbs = dict()
for cat in cats:
    mnb = MultinomialNB(alpha=1e-6)
    sample_weights = compute_sample_weight('balanced', Y_train[cat])
    mnb.fit(X_trains[cat], Y_train[cat], sample_weight=sample_weights)
#     mnb.fit(X_trains[cat], Y_train[cat])
    mnbs[cat] = mnb

In [23]:
for cat in cats:
    print("%s:" % cat)
    test_skmodel(cat, mnbs)

toxic:
             precision    recall  f1-score   support

      False       1.00      0.99      0.99     86614
       True       0.88      0.98      0.93      9237

avg / total       0.99      0.99      0.99     95851

severe_toxic:
             precision    recall  f1-score   support

      False       1.00      0.99      1.00     94886
       True       0.64      1.00      0.78       965

avg / total       1.00      0.99      0.99     95851

obscene:
             precision    recall  f1-score   support

      False       1.00      0.98      0.99     90742
       True       0.72      1.00      0.84      5109

avg / total       0.99      0.98      0.98     95851

threat:
             precision    recall  f1-score   support

      False       1.00      1.00      1.00     95546
       True       0.63      1.00      0.77       305

avg / total       1.00      1.00      1.00     95851

insult:
             precision    recall  f1-score   support

      False       1.00      0.98      0.

In [24]:
from sklearn.linear_model import SGDClassifier
sgds = dict()
for cat in cats:
    sgd = SGDClassifier(
        loss='log', 
        penalty='l2',
        alpha=1e-9,
        tol=1e-6,
        max_iter=2000,
        n_jobs=-1,
        class_weight='balanced',
        learning_rate='optimal',
#         verbose=3
    )
#     sample_weights = compute_sample_weight('balanced', Y_train[cat])
#     mnb.fit(X_trains[cat], Y_train[cat], sample_weight=sample_weights)
    sgd.fit(X_trains[cat], Y_train[cat])
    sgds[cat] = sgd
# sample_weights = compute_sample_weight('balanced', Y_train)

In [25]:
for cat in cats:
    print("%s:" % cat)
    test_skmodel(cat, sgds)

toxic:
             precision    recall  f1-score   support

      False       1.00      1.00      1.00     86614
       True       0.98      1.00      0.99      9237

avg / total       1.00      1.00      1.00     95851

severe_toxic:
             precision    recall  f1-score   support

      False       1.00      0.99      0.99     94886
       True       0.45      0.99      0.62       965

avg / total       0.99      0.99      0.99     95851

obscene:
             precision    recall  f1-score   support

      False       1.00      1.00      1.00     90742
       True       0.95      1.00      0.98      5109

avg / total       1.00      1.00      1.00     95851

threat:
             precision    recall  f1-score   support

      False       1.00      0.99      1.00     95546
       True       0.33      0.98      0.49       305

avg / total       1.00      0.99      1.00     95851

insult:
             precision    recall  f1-score   support

      False       1.00      1.00      1.

In [None]:
from sklearn.ensemble import AdaBoostClassifier
adabs = dict()
for cat in cats:
    adab = SGDClassifier(
        loss='log', 
        penalty='l2',
        alpha=1e-9,
        tol=1e-6,
        max_iter=2000,
        n_jobs=-1,
        class_weight='balanced',
        learning_rate='optimal',
#         verbose=3
    )
#     sample_weights = compute_sample_weight('balanced', Y_train[cat])
#     mnb.fit(X_trains[cat], Y_train[cat], sample_weight=sample_weights)
    adab.fit(X_trains[cat], Y_train[cat])
    adabs[cat] = adab
# sample_weights = compute_sample_weight('balanced', Y_train)

In [163]:
def get_model_avgs(cat, ens):
    ens_preds = pd.DataFrame()
    for i in range(len(ens)):
        probas = ens[i][cat].predict_proba(X_trains[cat])
        ens_preds[i] = probas[:,1]
    return ens_preds.mean(axis=1)

def get_ens_preds(cat, ens):
    ens_preds = pd.DataFrame()
    ens_preds['preds'] = get_model_avgs(cat, ens) >= 0.5
    return  ens_preds['preds'].astype(int)

def test_skmodel_ensemble(cat, ens):
    print(classification_report(Y_train[cat], get_ens_preds(cat, ens)))

In [164]:
for cat in cats:
    test_skmodel_ensemble(cat, [mnbs, sgds])

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     86614
          1       0.98      1.00      0.99      9237

avg / total       1.00      1.00      1.00     95851



  np.exp(prob, prob)


             precision    recall  f1-score   support

          0       1.00      0.98      0.99     94886
          1       0.37      1.00      0.54       965

avg / total       0.99      0.98      0.99     95851

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     90742
          1       0.94      1.00      0.97      5109

avg / total       1.00      1.00      1.00     95851

             precision    recall  f1-score   support

          0       1.00      0.99      0.99     95546
          1       0.18      1.00      0.30       305

avg / total       1.00      0.99      0.99     95851

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     91086
          1       0.93      1.00      0.97      4765

avg / total       1.00      1.00      1.00     95851

             precision    recall  f1-score   support

          0       1.00      0.99      1.00     95037
          1       0.46      1.00 

In [165]:
preds_df = pd.DataFrame()
for col in ['id', 'comment_text', 'cleaned_comments']:
    preds_df[col] = train[col]
for cat in cats:
    preds_df[cat] = pd.Series(get_ens_preds(cat, [sgds, mnbs]))
preds_df


  np.exp(prob, prob)


Unnamed: 0,id,comment_text,cleaned_comments,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",nonsense kiss off geek what -PRON- say be true...,1,0,0,0,0,0
1,27450690,"""\r\n\r\n Please do not vandalize pages, as yo...",please do not vandalize page as -PRON- do with...,0,0,0,0,0,0
2,54037174,"""\r\n\r\n """"Points of interest"""" \r\n\r\nI rem...",point of interest -PRON- remove the point of i...,0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,ask some -PRON- nationality be a racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,the reader here be not go by -PRON- say so for...,0,0,0,0,0,0
5,82428052,Fried chickens \r\n\r\nIs dat sum fried chickens?,fry chicken be dat sum fry chicken,0,0,0,0,0,0
6,87311443,Why can you put English for example on some pl...,why can -PRON- put english for example on some...,0,0,0,0,0,0
7,114749757,Guy Fawkes \r\n\r\nim a resident in bridgwater...,guy fawk -PRON- be a resident in bridgwater an...,0,0,0,0,0,0
8,138560519,as far as nicknames go this article is embarra...,as far as nickname go this article be embarras...,0,0,0,0,0,0
9,139353149,Woodland Meadows\r\nGood to hear that you corr...,woodland meadow \r\n good to hear that -PRON- ...,0,0,0,0,0,0


In [166]:
cat = 'insult'
preds_df.where(preds_df[cat]  == 1).where(train[cat] != 1).dropna(how='all')

Unnamed: 0,id,comment_text,cleaned_comments,toxic,severe_toxic,obscene,threat,insult,identity_hate
26,2.936680e+08,"""\r\nThe Graceful Slick....\r\nIs non other th...",the graceful slick \r\n be non other than an u...,1.0,0.0,1.0,0.0,1.0,0.0
234,2.414065e+09,"""\r\nok a direct quote from the page for fuck ...",ok a direct quote from the page for fuck be ot...,1.0,0.0,1.0,0.0,1.0,0.0
359,3.733982e+09,Why do you like to smear fecal matter all over...,why do -PRON- like to smear fecal matter all o...,1.0,0.0,0.0,0.0,1.0,0.0
717,7.542551e+09,Ram it up ur ass very hard till ur eyes water,ram -PRON- up ur ass very hard till ur eye water,0.0,0.0,1.0,0.0,1.0,0.0
1758,1.808836e+10,"Hi \r\n\r\nDear Raimagini,\r\n\r\nI would requ...",hi dear raimagini \r\n -PRON- would request al...,0.0,0.0,1.0,1.0,1.0,0.0
2786,2.851414e+10,fucking liberal prick and get a life!,fuck liberal prick and get a life,1.0,1.0,1.0,1.0,1.0,1.0
3862,4.004265e+10,69.47.197.27 Ip fuck you.,number19727 ip fuck -PRON-,1.0,1.0,1.0,0.0,1.0,0.0
5543,5.786735e+10,And your a PRICK TO EMPOWER ASSHOLES LIKE PROPOL,and -PRON- a prick to empower asshole like propol,1.0,0.0,0.0,0.0,1.0,0.0
6289,6.535200e+10,how bout \r\n\r\nyou fuck off,how bout -PRON- fuck off,1.0,1.0,1.0,1.0,1.0,0.0
7209,7.487965e+10,do you arrogant bastards have nothing better d...,do -PRON- arrogant bastard have nothing good d...,1.0,1.0,1.0,0.0,1.0,0.0


In [92]:
preds_df.iloc[717]

id                                                        7542550676
comment_text           Ram it up ur ass very hard till ur eyes water
cleaned_comments    ram -PRON- up ur ass very hard till ur eye water
toxic                                                              1
severe_toxic                                                       0
obscene                                                            1
threat                                                             0
insult                                                             1
identity_hate                                                      0
Name: 717, dtype: object

In [None]:
sample_weights = compute_sample_weight('balanced', Y_train)

In [190]:
keras.backend.clear_session()
#input layer
MLP_model = Sequential()
MLP_model.add(Dense(256, input_shape=(X_train.shape[1],), activation='relu'))
#hidden layers
MLP_model.add(Dense(256, activation='relu'))
MLP_model.add(Dense(256, activation='relu'))
MLP_model.add(Dense(256, activation='relu'))
#output layer
# MLP_model.add(Dense(len(cats), activation='hard_sigmoid'))
MLP_model.add(Dense(1, activation='hard_sigmoid'))

MLP_model.compile(optimizer='adam',
             loss='mse',
             metrics=['acc', custom_metric])
MLP_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               51200256  
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_3 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_4 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 257       
Total params: 51,397,889
Trainable params: 51,397,889
Non-trainable params: 0
_________________________________________________________________


In [191]:
MLP_model.fit(x=X_train, y=Y_train[cat], batch_size=256, sample_weight=sample_weights, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x18d0be6bb38>

In [192]:
# MLP_model.optimizer.lr = keras.backend.variable(1e-4, name='lr')
MLP_model.fit(x=X_train, y=Y_train[cat], batch_size=256, sample_weight=sample_weights, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x18d0b560fd0>

In [193]:
MLP_model.fit(x=X_train, y=Y_train[cat], batch_size=256, sample_weight=sample_weights, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

KeyboardInterrupt: 

In [None]:
MLP_model.fit(x=X_train, y=Y_train[cat], batch_size=256, epochs=3)

In [None]:
test_model(MLP_model, X_train, cat)

In [166]:
test_input = selector.transform(cvect.transform(test.cleaned_comments))

In [167]:

test_preds = MLP_model.predict(test_input , batch_size=256,verbose=1)



In [168]:
import pickle 
f = open('%s_preds.P' % cat,'wb')
pickle.dump(test_preds, f)
f.close()

In [165]:
MLP_model.save('%s.keras.mdl' % cat)

In [154]:
train.where(train['severe_toxic'] == True).where(train['toxic']  != True).dropna(how='all')

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned_comments


In [76]:
import gc
gc.collect()

69648