* Based on https://www.kaggle.com/michaelsnell/conv1d-dpcnn-in-keras/code
* Paper - http://ai.tencent.com/ailab/media/publications/ACL3-Brady.pdf

In [3]:
# DPCNN (Deep Pyramid CNN) paper: http://ai.tencent.com/ailab/media/publications/ACL3-Brady.pdf
# DPCNN built with Conv1D; the model architecture and all parameters are copied from neptune-ml, since it is publicly available:
# https://github.com/neptune-ml/kaggle-toxic-starter/blob/master/best_configs/fasttext_dpcnn.yaml
# Reached 0.984 on the public leaderboard (PLB) with 10-fold CV on a local computer after playing with the parameters.
# Try to improve the score on your own local PC, or throw it in the blender with the rest of them :)

import os
import gc
import numpy as np
import pandas as pd
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

from keras import backend as K
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from keras.models import Model
from keras.layers import Input, Dense, Embedding, MaxPooling1D, Conv1D, SpatialDropout1D
from keras.layers import add, Dropout, PReLU, BatchNormalization, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras.optimizers import Adam, Nadam
from keras import initializers, regularizers, constraints, callbacks

### Data preparation

In [4]:
# Which pre-trained word vectors to use: 'glove' or 'fasttext'.
embeddings = 'glove'

# Any value other than 'fasttext' selects the GloVe vector file.
EMBEDDING_FILE = (
    '../data/fasttext/crawl-300d-2M.vec'
    if embeddings == 'fasttext'
    else '../data/glove/glove.840B.300d.txt'
)

max_features = 100000  # alternatives noted by the author: 100000, 30000
maxlen = 200
embed_size = 300
prefix = 'c1'  # alternatives noted by the author: 'x', 'c1'

print(EMBEDDING_FILE)

../data/glove/glove.840B.300d.txt


In [5]:
# Load the cached word sequences and embedding matrix
# (see notebook "LSTM + FastText_GloVe + MaxPool").

import pickle

train_feats_path = '../models/{}_train_feat_{}_seq_{}.pkl'.format(prefix, max_features, maxlen)
test_feats_path = '../models/{}_test_feat_{}_seq_{}.pkl'.format(prefix, max_features, maxlen)
embedding_matrix_path = '../models/{}_{}_embedding_matrix_feat_{}.pkl'.format(prefix, embeddings, max_features)
print(train_feats_path)

# The cache files read below were written with:
#pickle.dump(x_train, open(train_feats_path, 'wb'))
#pickle.dump(x_test, open(test_feats_path, 'wb'))
#pickle.dump(embedding_matrix, open(embedding_matrix_path, 'wb'))


def _load_pickle(path):
    """Return the object pickled at ``path``, closing the file afterwards.

    NOTE: ``pickle.load`` can execute arbitrary code; only use it on cache
    files that you produced yourself.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


x_train = _load_pickle(train_feats_path)
x_test = _load_pickle(test_feats_path)
embedding_matrix = _load_pickle(embedding_matrix_path)

# Only the six label columns are needed from the raw training CSV.
train = pd.read_csv('../data/train.csv')
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
submission = pd.read_csv('../data/sample_submission.csv')

# Drop the raw dataframe now that the label matrix has been extracted.
del train

../models/c1_train_feat_100000_seq_200.pkl


In [6]:
class RocAucEvaluation(Callback):
    """Keras callback that prints validation ROC-AUC and stops training early.

    On every epoch whose zero-based index is a multiple of ``interval`` the
    model predicts on the validation inputs and the result is scored with
    ``roc_auc_score``. Training is stopped the first time an evaluated score
    does not exceed the best score seen so far.

    Args:
        validation_data: ``(X_val, y_val)`` pair used for scoring. The default
            empty tuple cannot be unpacked and raises ``ValueError``, so a
            pair must be supplied in practice.
        interval: evaluate only on epochs where ``epoch % interval == 0``.
    """

    def __init__(self, validation_data=(), interval=1):
        # Start the lookup at this class so that ``Callback.__init__`` runs;
        # ``super(Callback, self)`` would start after Callback and skip it.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data
        self.stopped_epoch = 0
        self.best = 0

    def on_epoch_end(self, epoch, logs=None):
        """Score the model on the validation data and stop if ROC-AUC stalls.

        Args:
            epoch: zero-based epoch index supplied by Keras.
            logs: metrics dict supplied by Keras; not used here.
        """
        if epoch % self.interval != 0:
            return

        y_pred = self.model.predict(self.X_val, verbose=0)
        score = roc_auc_score(self.y_val, y_pred)
        print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

        # stopping condition - ROC stops improving
        if score > self.best:
            self.best = score
        else:
            self.stopped_epoch = epoch
            self.model.stop_training = True
            print('Epoch %05d: early stopping' % (self.stopped_epoch + 1))

    @staticmethod
    def schedule(ind):
        """Return the learning rate for zero-based epoch index ``ind``.

        Indices past the end of the table reuse its last entry.
        """
        rates = [0.001, 0.0005, 0.0001, 0.0001]
        return rates[ind] if ind < len(rates) else rates[-1]


# The training cell refers to ``schedule`` as a plain module-level function,
# so expose it under that name as well as on the class.
schedule = RocAucEvaluation.schedule

### Model

In [10]:
# Use 4 threads for both intra-op and inter-op parallelism in the TensorFlow session.
session_conf = tf.ConfigProto(
    intra_op_parallelism_threads=4,
    inter_op_parallelism_threads=4,
)
K.set_session(tf.Session(graph=tf.get_default_graph(), config=session_conf))

# --- model hyperparameters ---

# Convolution / pooling settings.
filter_nr = 64
filter_size = 3
max_pool_size = 3
max_pool_strides = 2

# Width of the dense layer placed before the output layer.
dense_nr = 256

# Dropout rates: on the embedded sequence and after the dense layer.
spatial_dropout = 0.2
dense_dropout = 0.5

# Whether the embedding weights are trainable.
train_embed = False

# Network depth; value proposed in the paper.
cnn_depth = 7

# L2 penalties on the convolution kernels and biases.
conv_kern_reg = regularizers.l2(1e-5)
conv_bias_reg = regularizers.l2(1e-5)

# try:
# spatial_dropout = 0.4
# remove dense dropout
# remove first additional CNN
# add linear activation
# use Conv filter 256

# a repeating block in the CNN
def conv_conv_pool_block(inp):
    """Max-pool ``inp``, apply two conv stages, and add the pooled input back.

    Each stage is Conv1D (linear activation) -> BatchNormalization -> PReLU,
    with ``filter_nr`` filters and 'same' padding. The pooled tensor serves as
    the shortcut of the residual sum.

    Args:
        inp: tensor produced by the previous block.

    Returns:
        The sum of the second stage's output and the pooled input.
    """
    pooled = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(inp)

    features = pooled
    for _ in range(2):
        features = Conv1D(
            filter_nr,
            kernel_size=filter_size,
            padding='same',
            activation='linear',
            kernel_regularizer=conv_kern_reg,
            bias_regularizer=conv_bias_reg,
        )(features)
        features = BatchNormalization()(features)
        features = PReLU()(features)

    return add([features, pooled])

def get_model():
    """Build the uncompiled DPCNN classifier.

    Layout: embedding (initialised from ``embedding_matrix``) -> spatial
    dropout -> first two-stage conv block with a 1x1-conv shortcut ->
    ``cnn_depth - 1`` pooling blocks -> global max pooling -> dense head with
    six sigmoid outputs.

    Returns:
        A Keras ``Model`` mapping a ``(maxlen,)`` input to 6 sigmoid outputs.
    """
    # Settings shared by every convolution created in this function.
    conv_kwargs = dict(
        padding='same',
        activation='linear',
        kernel_regularizer=conv_kern_reg,
        bias_regularizer=conv_bias_reg,
    )

    tokens = Input(shape=(maxlen,))
    embedded = Embedding(
        max_features,
        embed_size,
        weights=[embedding_matrix],
        trainable=train_embed,
    )(tokens)
    embedded = SpatialDropout1D(spatial_dropout)(embedded)

    # First block: two Conv1D -> BatchNormalization -> PReLU stages, no pooling.
    first_block = embedded
    for _ in range(2):
        first_block = Conv1D(filter_nr, kernel_size=filter_size, **conv_kwargs)(first_block)
        first_block = BatchNormalization()(first_block)
        first_block = PReLU()(first_block)

    # The embedded comment goes through a kernel-size-1 convolution so that its
    # shape matches the block output before the two are added. With
    # filter_nr == embed_size (300 here) this step could be skipped and the
    # embedding added to the block output directly.
    shortcut = Conv1D(filter_nr, kernel_size=1, **conv_kwargs)(embedded)
    shortcut = PReLU()(shortcut)
    pyramid = add([first_block, shortcut])

    # Remaining depth is made of the repeating pool + conv + conv block.
    for _ in range(cnn_depth - 1):
        pyramid = conv_conv_pool_block(pyramid)

    pooled = GlobalMaxPooling1D()(pyramid)

    hidden = Dense(dense_nr, activation='linear')(pooled)
    hidden = BatchNormalization()(hidden)
    hidden = PReLU()(hidden)
    hidden = Dropout(dense_dropout)(hidden)
    predictions = Dense(6, activation='sigmoid')(hidden)

    return Model(tokens, predictions)

In [11]:
# Training settings.
batch_size = 128
epochs = 4

# Optimizer. Alternatives left by the author:
#opt = Nadam(lr=0.001)
# Adam with lr=0.003, decay=0.01, beta_1=0.9, beta_2=0.999
opt = Adam(lr=1e-3)

# Build the network, attach loss/optimizer/metric, and print the layer table.
model = get_model()
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 200, 300)     30000000    input_2[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_2 (SpatialDro (None, 200, 300)     0           embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_16 (Conv1D)              (None, 200, 64)      57664       spatial_dropout1d_2[0][0]        
__________________________________________________________________________________________________
batch_norm

In [None]:
# Hold out 5% of the training data for validation (fixed seed for repeatability).
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
ra_val = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

# ModelCheckpoint and CSVLogger are not imported by name in this file, so they
# are reached through the imported `callbacks` module.
# NOTE(review): neither object is passed to fit() below, so no snapshots or
# training log are written; add them to the callbacks list if that is wanted.
checkPoint = callbacks.ModelCheckpoint('../snapshots/weights.{epoch:02d}-{val_loss:.2f}.hdf5')
csv_logger = callbacks.CSVLogger('../training.log')

# The per-epoch learning-rate table is reachable as RocAucEvaluation.schedule.
lr = callbacks.LearningRateScheduler(RocAucEvaluation.schedule)

# Train on the split produced above (X_tra / y_tra), validating on the held-out part.
model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[lr, ra_val],
    verbose=1,
)

In [None]:
# Predict the six label probabilities for the test sequences.
y_pred = model.predict(x_test)

# Read the submission template from the same ../data directory that the rest
# of the notebook uses for the competition files.
submission = pd.read_csv('../data/sample_submission.csv')
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('dpcnn_test_preds.csv', index=False)