* Based on https://www.kaggle.com/michaelsnell/conv1d-dpcnn-in-keras/code
* Paper - http://ai.tencent.com/ailab/media/publications/ACL3-Brady.pdf

In [1]:
#dpcnn http://ai.tencent.com/ailab/media/publications/ACL3-Brady.pdf
#dpcnn with conv1d, model architecture and all parameters copied from neptune-ml since it's publicly available
#https://github.com/neptune-ml/kaggle-toxic-starter/blob/master/best_configs/fasttext_dpcnn.yaml
#Got it to PLB 0.984 with 10fold cv on local computer after playing with parameters
#Try to improve score on your own local pc or throw it in the blender with the rest of them :)

import os
import gc
import numpy as np
import pandas as pd
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

from keras import backend as K
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from keras.models import Model
from keras.layers import Input, Dense, Embedding, MaxPooling1D, Conv1D, SpatialDropout1D
from keras.layers import add, Dropout, Activation, PReLU, BatchNormalization, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras.optimizers import Adam, Nadam
from keras import initializers, regularizers, constraints, callbacks
from keras.callbacks import Callback, CSVLogger, ModelCheckpoint, EarlyStopping



  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Data preparation

In [2]:
# Which pre-trained word vectors to use: 'fasttext' or 'glove'.
embeddings = 'fasttext'

# Any value other than 'fasttext' selects the GloVe vectors.
EMBEDDING_FILE = (
    '../data/fasttext/crawl-300d-2M.vec'
    if embeddings == 'fasttext'
    else '../data/glove/glove.840B.300d.txt'
)

max_features = 100000  # first argument of the Embedding layer; 30000 is the noted alternative
maxlen = 200           # length of each input sequence fed to the model
embed_size = 300       # width of each embedding vector
prefix = 'c1'          # tag used in the cached pickle file names ('x' or 'c1')

print(EMBEDDING_FILE)

../data/fasttext/crawl-300d-2M.vec


In [3]:
# Load the pre-computed word sequences and the embedding matrix
# (see notebook "LSTM + FastText_GloVe + MaxPool", which produces them).

import pickle

train_feats_path = '../models/{}_train_feat_{}_seq_{}.pkl'.format(prefix, max_features, maxlen)
test_feats_path = '../models/{}_test_feat_{}_seq_{}.pkl'.format(prefix, max_features, maxlen)
embedding_matrix_path = '../models/{}_{}_embedding_matrix_feat_{}.pkl'.format(prefix, embeddings, max_features)
print(train_feats_path)

# The cache files were written with:
#pickle.dump(x_train, open(train_feats_path, 'wb'))
#pickle.dump(x_test, open(test_feats_path, 'wb'))
#pickle.dump(embedding_matrix, open(embedding_matrix_path, 'wb'))


def _load_pickle(path):
    """Unpickle the object stored at `path`, closing the file once it is read."""
    # NOTE: unpickling can execute arbitrary code -- only load cache files
    # that you generated yourself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


x_train = _load_pickle(train_feats_path)
x_test = _load_pickle(test_feats_path)
embedding_matrix = _load_pickle(embedding_matrix_path)

# Only the six label columns of the training CSV are needed from here on.
train = pd.read_csv('../data/train.csv')
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
submission = pd.read_csv('../data/sample_submission.csv')

del train

../models/c1_train_feat_100000_seq_200.pkl


In [4]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC during training.

    Every `interval` epochs the model predicts on the held-out data, the
    score returned by `roc_auc_score` is printed, and the highest score
    seen so far is kept in `self.best`.

    Parameters
    ----------
    validation_data : tuple
        `(X_val, y_val)` pair; it is unpacked into exactly two values.
    interval : int
        Evaluate on epochs whose (0-based) index is a multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this class (not Callback) in super(): super(Callback, self)
        # starts the lookup *after* Callback and so skips Callback.__init__.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data
        self.stopped_epoch = 0
        self.best = 0

    def on_epoch_end(self, epoch, logs=None):
        """Print the validation ROC-AUC for this epoch and update `self.best`.

        `logs` is accepted for compatibility with the callback signature and
        is not read.
        """
        if epoch % self.interval != 0:
            return

        y_pred = self.model.predict(self.X_val, verbose=0)
        score = roc_auc_score(self.y_val, y_pred)
        print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

        if score > self.best:
            self.best = score
        # Stopping as soon as the score fails to improve is deliberately
        # disabled. To re-enable it, add an else-branch with:
        #     self.stopped_epoch = epoch
        #     self.model.stop_training = True
        #     print('Epoch %05d: early stopping' % (self.stopped_epoch + 1))

def schedule(ind):
    """Return the learning rate for epoch index `ind`.

    The rate follows a fixed table; once `ind` runs past the end of the
    table the last entry is returned.
    """
    rates = (0.001, 0.0005, 0.0001, 0.0001)
    if ind < len(rates):
        return rates[ind]
    return rates[-1]

### Model

In [5]:
# Run Keras on a TensorFlow session limited to 4 intra-op and 4 inter-op threads.
session_conf = tf.ConfigProto(
    intra_op_parallelism_threads=4,
    inter_op_parallelism_threads=4,
)
tf_session = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(tf_session)

# --- model hyper-parameters -------------------------------------------------
filter_nr = 32          # filters in every Conv1D layer
filter_size = 3         # Conv1D kernel size
max_pool_size = 3       # pooling window used in each repeating block
max_pool_strides = 2    # pooling stride used in each repeating block
dense_nr = 256          # units of the dense layer before the output layer
spatial_dropout = 0.3   # rate of the SpatialDropout1D after the embedding
dense_dropout = 0.5     # rate of the Dropout before the output layer
train_embed = False     # whether the embedding weights are trainable
cnn_depth = 7           # proposed in the paper

# L2 penalties applied to the kernels and biases of the Conv1D layers.
conv_kern_reg = regularizers.l2(1e-5)
conv_bias_reg = regularizers.l2(1e-5)

# Ideas to try:
#   - spatial_dropout = 0.4
#   - remove dense dropout
#   - remove first additional CNN
#   - add linear activation
#   - use Conv filter 256
# a repeating block in the CNN
def conv_conv_pool_block(inp):
    """Build one repeating stage: pool, two conv units, then a residual add.

    `inp` is max-pooled first; the pooled tensor is passed through two
    Conv1D -> BatchNormalization -> PReLU units and finally added back to
    the pooled tensor, which is the returned value.
    """
    pooled = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(inp)

    features = pooled
    for _ in range(2):
        # 'same' padding keeps the sequence length so the add below lines up.
        features = Conv1D(
            filter_nr,
            kernel_size=filter_size,
            padding='same',
            activation='linear',
            kernel_regularizer=conv_kern_reg,
            bias_regularizer=conv_bias_reg,
        )(features)
        features = BatchNormalization()(features)
        features = PReLU()(features)

    return add([features, pooled])

def build_model():
    """Assemble and return the (uncompiled) DPCNN Keras model.

    Layout: frozen-or-trainable embedding (per `train_embed`) with spatial
    dropout, a first block of two conv units with a projected shortcut,
    `cnn_depth - 1` calls to `conv_conv_pool_block`, global max pooling and
    a dense head ending in 6 sigmoid outputs.
    """

    def conv_bn_prelu(tensor):
        # One Conv1D -> BatchNormalization -> PReLU unit.
        tensor = Conv1D(
            filter_nr,
            kernel_size=filter_size,
            padding='same',
            activation='linear',
            kernel_regularizer=conv_kern_reg,
            bias_regularizer=conv_bias_reg,
        )(tensor)
        tensor = BatchNormalization()(tensor)
        return PReLU()(tensor)

    comment = Input(shape=(maxlen,))
    embedded = Embedding(
        max_features,
        embed_size,
        weights=[embedding_matrix],
        trainable=train_embed,
    )(comment)
    embedded = SpatialDropout1D(spatial_dropout)(embedded)

    first_block = conv_bn_prelu(conv_bn_prelu(embedded))

    # The embedding has embed_size channels while the block output has
    # filter_nr, so the shortcut is projected (ReLU, then a linear Dense to
    # filter_nr units) before the two are added. The projection would be
    # unnecessary if filter_nr == embed_size. An earlier variant used a
    # kernel-size-1 Conv1D followed by PReLU here instead.
    shortcut = Activation('relu')(embedded)
    shortcut = Dense(filter_nr, activation='linear')(shortcut)
    merged = add([first_block, shortcut])

    for _ in range(cnn_depth - 1):
        merged = conv_conv_pool_block(merged)

    pooled = GlobalMaxPooling1D()(merged)

    hidden = Dense(dense_nr, activation='linear')(pooled)
    hidden = BatchNormalization()(hidden)
    hidden = PReLU()(hidden)
    hidden = Dropout(dense_dropout)(hidden)
    probabilities = Dense(6, activation='sigmoid')(hidden)

    return Model(comment, probabilities)

In [6]:
# Training settings for the single hold-out run.
batch_size = 128
epochs = 30

# Adam without built-in decay; alternatives noted so far: Nadam(lr=0.001), Nadam(lr=0.002).
opt = Adam(lr=0.001, decay=0)

model = build_model()
model.compile(
    optimizer=opt,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 300)     30000000    input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 200, 300)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 200, 32)      28832       spatial_dropout1d_1[0][0]        
__________________________

In [None]:
# Single hold-out run: 95% of the data for training, the rest for validation.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.95, random_state=233
)

# Callbacks that are passed to fit(): the per-epoch learning-rate table and
# the ROC-AUC report on the validation split.
lr = callbacks.LearningRateScheduler(schedule)
ra_val = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

# Constructed but NOT passed to fit() below, so nothing is written by them.
checkPoint = ModelCheckpoint('../snapshots/weights.{epoch:02d}-{val_loss:.2f}.hdf5')
csv_logger = CSVLogger('../training.log')

fit_callbacks = [lr, ra_val]
model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=fit_callbacks,
    verbose=1,
)

### Stratified k-fold learning

In [9]:
## Stratified k-fold training
#
# The data is split into `n_splits` stratified folds, but only the first
# `n_folds` of them are trained; the test predictions of those models are
# averaged into `y_pred`.

n_splits = 20   # number of stratified splits (each validation part is 1/n_splits of the data)
n_folds = 10    # number of splits that are actually trained and averaged
batch_size = 128
epochs = 30
predict_batch_size = 1024
run_id = 'dpcnn_fasttext_128'
learning_rate = 0.005

kfold = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = 32)

early_stop = EarlyStopping(verbose=2, monitor = "val_loss", mode = "min", patience = 5)

pred = np.zeros((x_test.shape[0], 6))
# Pack each row of binary labels into bytes so that the folds are stratified
# on the label combination rather than on a single column.
y_packed = np.packbits(y_train, axis=1)

for i, (train_idx, valid_idx) in enumerate(kfold.split(x_train, y_packed)):
    print("Running fold {} / {}".format(i + 1, n_folds))
    print("Training / Valid set counts {} / {}".format(train_idx.shape, valid_idx.shape))

    # Fresh model AND fresh optimizer for every fold, so that no optimizer
    # state is shared with the model trained on the previous fold.
    opt = Adam(lr = learning_rate, decay = 0) #Nadam(lr=0.002) #optimizer
    model = build_model()
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
    #model.summary()

    xs_train, ys_train = x_train[train_idx], y_train[train_idx]
    xs_valid, ys_valid = x_train[valid_idx], y_train[valid_idx]

    weights_path = '../snapshots/' + run_id + '_fold_' + str(i) + '.hdf5'
    roc_auc = RocAucEvaluation(validation_data=(xs_valid, ys_valid), interval=1)
    check_point = ModelCheckpoint(weights_path, monitor = "val_loss",
                                  verbose = 1, save_best_only = True, mode = "min")
    # Start a new log on the first fold and append afterwards; a logger in
    # overwrite mode would keep only the last fold's history.
    csv_logger = CSVLogger('../training.log', append = (i > 0))

    # training
    history = model.fit(xs_train, ys_train, batch_size = batch_size, epochs = epochs, validation_data = (xs_valid, ys_valid),
                          verbose = 2, callbacks=[roc_auc, csv_logger, check_point, early_stop])

    # Early stopping ends training `patience` epochs after the best one, so
    # the in-memory weights are not the best ones. Restore the checkpoint
    # with the lowest val_loss before predicting. `best` stays at infinity
    # if no checkpoint was written during this fit (e.g. NaN val_loss).
    if np.isfinite(check_point.best):
        model.load_weights(weights_path)

    # predict
    pred += model.predict(x_test, batch_size = predict_batch_size, verbose = 1)

    if (i + 1) == n_folds: break

# Average over the models that were actually trained.
y_pred = pred/n_folds

Running fold 1 / 10
Training / Valid set counts (151582,) / (7989,)
Train on 151582 samples, validate on 7989 samples
Epoch 1/30
 - 62s - loss: 0.0668 - acc: 0.9783 - val_loss: 0.0549 - val_acc: 0.9813

 ROC-AUC - epoch: 1 - score: 0.976472 


Epoch 00001: val_loss improved from inf to 0.05492, saving model to ../snapshots/dpcnn_fasttext_128_fold_0.hdf5
Epoch 2/30
 - 56s - loss: 0.0555 - acc: 0.9813 - val_loss: 0.0522 - val_acc: 0.9824

 ROC-AUC - epoch: 2 - score: 0.979186 


Epoch 00002: val_loss improved from 0.05492 to 0.05221, saving model to ../snapshots/dpcnn_fasttext_128_fold_0.hdf5
Epoch 3/30
 - 56s - loss: 0.0538 - acc: 0.9815 - val_loss: 0.0531 - val_acc: 0.9822

 ROC-AUC - epoch: 3 - score: 0.981561 


Epoch 00003: val_loss did not improve
Epoch 4/30
 - 56s - loss: 0.0529 - acc: 0.9820 - val_loss: 0.0506 - val_acc: 0.9828

 ROC-AUC - epoch: 4 - score: 0.983929 


Epoch 00004: val_loss improved from 0.05221 to 0.05055, saving model to ../snapshots/dpcnn_fasttext_128_fold_0.h

Epoch 4/30
 - 61s - loss: 0.0531 - acc: 0.9820 - val_loss: 0.0519 - val_acc: 0.9829

 ROC-AUC - epoch: 4 - score: 0.980774 


Epoch 00004: val_loss improved from 0.05451 to 0.05192, saving model to ../snapshots/dpcnn_fasttext_128_fold_2.hdf5
Epoch 5/30
 - 61s - loss: 0.0519 - acc: 0.9822 - val_loss: 0.0529 - val_acc: 0.9822

 ROC-AUC - epoch: 5 - score: 0.983379 


Epoch 00005: val_loss did not improve
Epoch 6/30
 - 59s - loss: 0.0507 - acc: 0.9826 - val_loss: 0.0690 - val_acc: 0.9780

 ROC-AUC - epoch: 6 - score: 0.980911 


Epoch 00006: val_loss did not improve
Epoch 7/30
 - 57s - loss: 0.0499 - acc: 0.9828 - val_loss: 0.0494 - val_acc: 0.9828

 ROC-AUC - epoch: 7 - score: 0.986149 


Epoch 00007: val_loss improved from 0.05192 to 0.04936, saving model to ../snapshots/dpcnn_fasttext_128_fold_2.hdf5
Epoch 8/30
 - 57s - loss: 0.0493 - acc: 0.9830 - val_loss: 0.0511 - val_acc: 0.9821

 ROC-AUC - epoch: 8 - score: 0.986594 


Epoch 00008: val_loss did not improve
Epoch 9/30
 - 56s - loss


 ROC-AUC - epoch: 4 - score: 0.983190 


Epoch 00004: val_loss did not improve
Epoch 5/30
 - 58s - loss: 0.0511 - acc: 0.9825 - val_loss: 0.0470 - val_acc: 0.9836

 ROC-AUC - epoch: 5 - score: 0.982055 


Epoch 00005: val_loss improved from 0.04727 to 0.04701, saving model to ../snapshots/dpcnn_fasttext_128_fold_4.hdf5
Epoch 6/30
 - 58s - loss: 0.0504 - acc: 0.9828 - val_loss: 0.0474 - val_acc: 0.9839

 ROC-AUC - epoch: 6 - score: 0.984157 


Epoch 00006: val_loss did not improve
Epoch 7/30
 - 57s - loss: 0.0497 - acc: 0.9829 - val_loss: 0.0483 - val_acc: 0.9837

 ROC-AUC - epoch: 7 - score: 0.985074 


Epoch 00007: val_loss did not improve
Epoch 8/30
 - 57s - loss: 0.0491 - acc: 0.9829 - val_loss: 0.0489 - val_acc: 0.9838

 ROC-AUC - epoch: 8 - score: 0.982512 


Epoch 00008: val_loss did not improve
Epoch 9/30
 - 57s - loss: 0.0487 - acc: 0.9833 - val_loss: 0.0482 - val_acc: 0.9831

 ROC-AUC - epoch: 9 - score: 0.982753 


Epoch 00009: val_loss did not improve
Epoch 10/30
 - 57s - l

Epoch 3/30
 - 59s - loss: 0.0546 - acc: 0.9818 - val_loss: 0.0527 - val_acc: 0.9827

 ROC-AUC - epoch: 3 - score: 0.977780 


Epoch 00003: val_loss did not improve
Epoch 4/30
 - 59s - loss: 0.0530 - acc: 0.9822 - val_loss: 0.0516 - val_acc: 0.9829

 ROC-AUC - epoch: 4 - score: 0.978577 


Epoch 00004: val_loss improved from 0.05219 to 0.05156, saving model to ../snapshots/dpcnn_fasttext_128_fold_7.hdf5
Epoch 5/30
 - 59s - loss: 0.0519 - acc: 0.9824 - val_loss: 0.0528 - val_acc: 0.9824

 ROC-AUC - epoch: 5 - score: 0.979064 


Epoch 00005: val_loss did not improve
Epoch 6/30
 - 59s - loss: 0.0510 - acc: 0.9825 - val_loss: 0.0513 - val_acc: 0.9828

 ROC-AUC - epoch: 6 - score: 0.980473 


Epoch 00006: val_loss improved from 0.05156 to 0.05132, saving model to ../snapshots/dpcnn_fasttext_128_fold_7.hdf5
Epoch 7/30
 - 57s - loss: 0.0500 - acc: 0.9828 - val_loss: 0.0500 - val_acc: 0.9833

 ROC-AUC - epoch: 7 - score: 0.982419 


Epoch 00007: val_loss improved from 0.05132 to 0.04996, saving

 - 58s - loss: 0.0473 - acc: 0.9834 - val_loss: 0.0464 - val_acc: 0.9838

 ROC-AUC - epoch: 11 - score: 0.988716 


Epoch 00011: val_loss did not improve
Epoch 12/30
 - 58s - loss: 0.0471 - acc: 0.9835 - val_loss: 0.0460 - val_acc: 0.9831

 ROC-AUC - epoch: 12 - score: 0.989453 


Epoch 00012: val_loss did not improve
Epoch 00012: early stopping


In [None]:
# Write the submission from `y_pred`, the fold-averaged test predictions
# computed by the k-fold cell. Do not recompute it with
# model.predict(x_test) here: `model` is only the last fold's network, so
# that would silently discard the ensemble average.
# The sample submission is read from the same ../data directory as the
# other input files of this notebook.
submission = pd.read_csv('../data/sample_submission.csv')
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('dpcnn_test_preds.csv', index=False)