In [1]:
%matplotlib inline
import os
import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
# os.environ["CUDA_VISIBLE_DEVICES"] = ""

import re
import string
import shutil

from matplotlib import pyplot as plt

import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sn

import sklearn as sk
from sklearn.feature_selection import mutual_info_classif, SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight

from gensim.models import KeyedVectors

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, LSTM, Input, GlobalMaxPooling1D, Conv1D, MaxPooling1D, Embedding, Flatten, Concatenate
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from tqdm import tqdm, tqdm_notebook, tnrange, trange
import hyperas
from hyperas import optim
from hyperas.distributions import choice, uniform, conditional
from hyperopt import Trials, STATUS_OK, tpe, STATUS_FAIL
from keras_tqdm import TQDMNotebookCallback

Using TensorFlow backend.


In [2]:
tqdm.monitor_interval = 0
tqdm_notebook().pandas()
sn.set_context('notebook')




In [20]:
cats = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
cat_col_i = {'toxic':0,'severe_toxic':1,'obscene':2,'threat':3,'insult':4,'identity_hate':5}

In [3]:
train = pd.read_pickle('CLEAN/train.P.gz', compression='gzip')
test = pd.read_pickle('CLEAN/test.P.gz', compression='gzip')

In [13]:
import pickle

with open("NN_INPUTS/train.seq.P", 'rb') as f:
    train_data = pickle.load(f)
with open("NN_INPUTS/test.seq.P", 'rb') as f:
    test_data = pickle.load(f)
with open("NN_INPUTS/embedding.matrix.P", 'rb') as f:
    embedding_matrix = pickle.load(f)
with open("NN_INPUTS/word_index.P", 'rb') as f:
    word_index = pickle.load(f)

## Assemble the Model

In [14]:
EMBEDDING_DIM=100
MAX_SEQUENCE_LENGTH=400
MAX_NUM_WORDS=25000

In [21]:
def custom_compute_class_weights(y):
    cat_counts = y.sum(axis='rows')
    total_comments = y.shape[0]
    results = (total_comments / cat_counts).tolist()
    result_dict = dict()
    for i in range(len(results)):
        result_dict[i] = results[i]
    return result_dict

In [58]:
Y_train = pd.DataFrame()
Y_train['anytoxic'] = train[cats].astype(int).sum(axis='columns') > 0

In [126]:
def generate_model():    
    model = Sequential()
    model.add(Embedding(MAX_NUM_WORDS + 1,
                        EMBEDDING_DIM,
                        input_length=MAX_SEQUENCE_LENGTH,
                        weights=[embedding_matrix],
                        trainable=True
                       )
             )
    # model.add(Conv1D(250, 3, activation='relu'))
    # model.add(MaxPooling1D(3))
    model.add(Conv1D(250, 3, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(1, activation='hard_sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    import gc
    gc.collect()
    print(model.summary())
    return model

In [100]:
# sample_weight = compute_sample_weight('balanced', train[cats])

In [111]:
class_weight = custom_compute_class_weights(train[cats])

In [164]:
def test_model(cat, model):
    model_preds = model.predict_proba(x=train_data)
    print(classification_report(np.asarray(train[cat], dtype='float32'), model_preds>0.5))
    return model_preds

def fit_model(cat, model, epochs=5):
    model.fit(
        x=train_data, 
        y=np.asarray(train[cat], dtype='float32'), 
        epochs=epochs, 
        batch_size=128, 
        verbose=0,
        class_weight=compute_class_weight('balanced', [False, True],train[cat]),
        callbacks=[TQDMNotebookCallback(leave_inner=True, leave_outer=False)]
    )
    return test_model(cat, model)

In [137]:
models=dict()
pred_dict = dict()

In [138]:
for cat in cats:
    models[cat] = generate_model()
    pred_dict[cat]=fit_model(cat, models[cat], epochs=15)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 400, 100)          2500100   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 398, 250)          75250     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 250)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 128)               32128     
_________________________________________________________________
dense_8 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 129       
Total params: 2,624,119
Trainable params: 2,624,119
Non-trainable params: 0
_________________________________________________________________



        0.0       1.00      0.98      0.99    144277
        1.0       0.82      0.99      0.90     15294

avg / total       0.98      0.98      0.98    159571

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 400, 100)          2500100   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 398, 250)          75250     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 250)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 128)               32128     
_________________________________________________________________
dense_11 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_12 (Dense)             (None, 1)        

             precision    recall  f1-score   support

        0.0       1.00      0.99      1.00    157976
        1.0       0.52      0.97      0.68      1595

avg / total       0.99      0.99      0.99    159571

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 400, 100)          2500100   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 398, 250)          75250     
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 250)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 128)               32128     
_________________________________________________________________
dense_14 (Dense)             (None, 128)               16512     
___________________________________________________________


        0.0       1.00      0.99      0.99    151122
        1.0       0.81      0.99      0.89      8449

avg / total       0.99      0.99      0.99    159571

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 400, 100)          2500100   
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 398, 250)          75250     
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 250)               0         
_________________________________________________________________
dense_16 (Dense)             (None, 128)               32128     
_________________________________________________________________
dense_17 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_18 (Dense)             (None, 1)        


        0.0       1.00      1.00      1.00    159093
        1.0       0.00      0.00      0.00       478

avg / total       0.99      1.00      1.00    159571



  'precision', 'predicted', average, warn_for)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 400, 100)          2500100   
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 398, 250)          75250     
_________________________________________________________________
global_max_pooling1d_7 (Glob (None, 250)               0         
_________________________________________________________________
dense_19 (Dense)             (None, 128)               32128     
_________________________________________________________________
dense_20 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_21 (Dense)             (None, 1)                 129       
Total params: 2,624,119
Trainable params: 2,624,119
Non-trainable params: 0
_________________________________________________________________



        0.0       1.00      0.98      0.99    151694
        1.0       0.77      0.99      0.87      7877

avg / total       0.99      0.98      0.99    159571

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 400, 100)          2500100   
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 398, 250)          75250     
_________________________________________________________________
global_max_pooling1d_8 (Glob (None, 250)               0         
_________________________________________________________________
dense_22 (Dense)             (None, 128)               32128     
_________________________________________________________________
dense_23 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_24 (Dense)             (None, 1)        


        0.0       1.00      0.99      1.00    158166
        1.0       0.61      0.95      0.74      1405

avg / total       1.00      0.99      0.99    159571



In [124]:
import gc
gc.collect()
keras.backend.clear_session()

In [142]:
pred_df = pd.DataFrame()

In [147]:
for cat in cats:
    pred_df[cat] = pd.Series(pred_dict[cat].flatten())

In [148]:
pred_df

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
4,0.000000,0.000000,0.043969,0.0,0.000000,0.000000
5,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
6,1.000000,1.000000,1.000000,0.0,1.000000,0.000000
7,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
8,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
9,0.000000,0.000000,0.000000,0.0,0.000000,0.000000


In [168]:

threat_weight = compute_class_weight('balanced', [False, True],train[cat])

In [169]:
threat_weight[0] *= 0.01

In [171]:
threat_weight[1] *= 100

In [172]:
threat_weight

array([  5.04441536e-03,   5.67868327e+03])

In [174]:
models['threat'].fit(
        x=train_data, 
        y=np.asarray(train[cat], dtype='float32'), 
        epochs=5, 
        batch_size=128, 
        verbose=0,
        class_weight=threat_weight,
        callbacks=[TQDMNotebookCallback(leave_inner=True, leave_outer=False)]
    )

<keras.callbacks.History at 0x7efc28c494a8>

In [175]:
threat_preds = test_model('threat', models['threat'])


        0.0       1.00      1.00      1.00    159093
        1.0       0.00      0.00      0.00       478

avg / total       0.99      1.00      1.00    159571



  'precision', 'predicted', average, warn_for)


In [177]:
test_pred_df = pd.DataFrame()
test_pred_df['id'] = test['id']

In [180]:
for cat in cats:
    print("\nprocessing: %s" % cat)
    if cat == "threat":
        test_pred_df[cat] = 0
    else:
        test_pred_df[cat] = models[cat].predict_proba(test_data)

processing: toxic
processing: insult

In [181]:
cats

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [186]:
test_pred_df

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,1.000000,0.000000,1.000000,0,1.000000,0.849143
1,0000247867823ef7,0.000000,0.000000,0.000000,0,0.000000,0.000000
2,00013b17ad220c46,0.000000,0.000000,0.000000,0,0.000000,0.000000
3,00017563c3f7919a,0.000000,0.000000,0.000000,0,0.000000,0.000000
4,00017695ad8997eb,0.000000,0.000000,0.000000,0,0.000000,0.000000
5,0001ea8717f6de06,0.000000,0.000000,0.000000,0,0.000000,0.000000
6,00024115d4cbde0f,0.000000,0.000000,0.000000,0,0.000000,0.000000
7,000247e83dcc1211,1.000000,0.000000,0.000000,0,0.000000,0.000000
8,00025358d4737918,0.000000,0.000000,0.000000,0,0.000000,0.000000
9,00026d1092fe71cc,0.000000,0.000000,0.000000,0,0.000000,0.000000


In [187]:
ens_df = pd.read_csv("SUBMIT/ens.csv")

In [195]:
ens_df.to_csv("SUBMIT/convolution.csv", index=False)

In [196]:
combined_df = pd.DataFrame()
combined_df['id'] = test['id']
for cat in cats:
    if cat == 'threat':
        combined_df[cat] = ens_df[cat]
    else:
        combined_df[cat] = test_pred_df[cat]

In [197]:
combined_df.to_csv("SUBMIT/convolution_with_threat_from_ens.csv", index=False)