Reference: https://www.kaggle.com/umbertogriffo/combined-gru-and-cnn-fasttext-badwords/code

In [13]:
import os
import time
import gc

import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

import h5py

from keras.models import Model, load_model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Dropout
from keras.layers import GRU, LSTM, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, Conv1D
from keras.optimizers import Adam, Nadam
from keras.preprocessing import text, sequence
from keras.callbacks import Callback, CSVLogger, ModelCheckpoint, EarlyStopping

import warnings
warnings.filterwarnings('ignore')

os.environ['OMP_NUM_THREADS'] = '4'

In [2]:
# Pre-trained word vectors to use: 'glove' or 'fasttext'.
embeddings = 'glove'

# Any value other than 'fasttext' falls back to the GloVe file.
EMBEDDING_FILE = (
    '../data/fasttext/crawl-300d-2M.vec'
    if embeddings == 'fasttext'
    else '../data/glove/glove.840B.300d.txt'
)

max_features = 100000  # vocabulary cap given to the tokenizer (30000 was also tried)
maxlen = 200           # every comment is padded / truncated to this many tokens
embed_size = 300       # dimensionality of the pre-trained word vectors
prefix = 'c1'          # tag used in the cache file names (alternative: 'x')

print(EMBEDDING_FILE)

../data/glove/glove.840B.300d.txt


In [None]:
# Read the training set, the test set and the submission template.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
submission = pd.read_csv('../data/sample_submission.csv')

# Raw comment strings; missing comments are replaced by the literal text "fillna".
X_train, X_test = (
    frame["comment_text"].fillna("fillna").values for frame in (train, test)
)
# One binary column per toxicity label.
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

# The data frames are no longer needed once the arrays are extracted.
del train, test

### Learning

In [None]:
# Fit the vocabulary on train and test comments together, keeping at most
# `max_features` words when converting text to integer sequences.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

# Replace the raw strings by lists of word indices (the names are rebound).
X_train, X_test = (
    tokenizer.texts_to_sequences(comments) for comments in (X_train, X_test)
)

# Pad / truncate every sequence to exactly `maxlen` indices.
x_train, x_test = (
    sequence.pad_sequences(token_ids, maxlen=maxlen) for token_ids in (X_train, X_test)
)

def get_coefs(word, *arr):
    """Return ``(word, vector)`` where ``vector`` is ``arr`` converted to a float32 array."""
    return word, np.asarray(arr, dtype='float32')


def load_embeddings(path, dim):
    """Read a text embedding file into a ``{word: float32 vector}`` dict.

    Each line is expected to hold a token followed by ``dim`` space-separated
    numbers. The vector is taken from the *last* ``dim`` fields, so tokens that
    themselves contain spaces are parsed instead of breaking the float
    conversion. Lines with fewer than ``dim + 1`` fields (for example the
    "<count> <dim>" header of ``.vec`` files) are skipped. When a token occurs
    more than once the last occurrence wins.
    """
    index = {}
    # `with` closes the file; the encoding is explicit so the result does not
    # depend on the platform's default locale.
    with open(path, encoding='utf-8') as handle:
        for line in handle:
            fields = line.rstrip().split(' ')
            if len(fields) <= dim:
                continue
            word, vector = get_coefs(' '.join(fields[:-dim]), *fields[-dim:])
            index[word] = vector
    return index


embeddings_index = load_embeddings(EMBEDDING_FILE, embed_size)

word_index = tokenizer.word_index
# Tokenizer indices start at 1 (row 0 stays all-zero), so a vocabulary of N
# words needs N + 1 rows. Without the "+ 1" the last word of a vocabulary
# smaller than `max_features` would index one past the end of the matrix.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    # Words ranked beyond the cap have no row in the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # Words without a pre-trained vector keep their all-zero row. Vectors of
    # the wrong length are skipped rather than silently broadcast over the row.
    if embedding_vector is not None and len(embedding_vector) == embed_size:
        embedding_matrix[i] = embedding_vector

gc.collect()

In [None]:
# Display the embedding matrix shape; compare it with len(word_index) and max_features.
embedding_matrix.shape

In [3]:
import pickle

# Cache files for the padded sequences and the embedding matrix. The names
# encode the settings they were built with.
train_feats_path = '../models/{}_train_feat_{}_seq_{}.pkl'.format(prefix, max_features, maxlen)
test_feats_path = '../models/{}_test_feat_{}_seq_{}.pkl'.format(prefix, max_features, maxlen)
embedding_matrix_path = '../models/{}_{}_embedding_matrix_feat_{}.pkl'.format(prefix, embeddings, max_features)


def _read_pickle(path):
    """Load one pickled object and close the file afterwards."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _write_pickle(obj, path):
    """Pickle `obj` to `path` and close the file afterwards."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


_cache_paths = (train_feats_path, test_feats_path, embedding_matrix_path)
if all(os.path.exists(path) for path in _cache_paths):
    # Cache present: reuse it instead of the preprocessing results.
    # NOTE: pickle can execute arbitrary code - only load files you wrote yourself.
    x_train, x_test, embedding_matrix = (_read_pickle(path) for path in _cache_paths)
else:
    # Cache missing: store the arrays produced by the preprocessing cells so
    # later sessions can skip tokenising and embedding loading.
    _write_pickle(x_train, train_feats_path)
    _write_pickle(x_test, test_feats_path)
    _write_pickle(embedding_matrix, embedding_matrix_path)

# Labels and the submission template are always re-read from the CSV files.
train = pd.read_csv('../data/train.csv')
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
submission = pd.read_csv('../data/sample_submission.csv')

del train

In [8]:
class RocAucEvaluation(Callback):
    """Keras callback that prints validation ROC-AUC and stops training once it stops improving.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` used for scoring. The empty default cannot be
        unpacked, so a pair must always be supplied.
    interval : int
        Score every ``interval``-th epoch (epochs are counted from 0).

    Attributes
    ----------
    best : float
        Highest ROC-AUC seen so far (starts at 0).
    stopped_epoch : int
        Zero-based epoch at which training was stopped (0 if it never was).
    """

    def __init__(self, validation_data=(), interval=1):
        # super(Callback, self) would skip Callback.__init__ and call the
        # initialiser of Callback's parent instead; name this class so the
        # base callback sets itself up.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data
        self.stopped_epoch = 0
        self.best = 0

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation set and request a stop if the score did not improve.

        `logs` is accepted for compatibility with the Keras callback interface
        and is not used.
        """
        if epoch % self.interval != 0:
            return

        y_pred = self.model.predict(self.X_val, verbose=0)
        score = roc_auc_score(self.y_val, y_pred)
        print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

        # stopping condition - ROC stops improving
        if score > self.best:
            self.best = score
            return

        # No patience: the first non-improving evaluation ends training.
        self.stopped_epoch = epoch
        self.model.stop_training = True
        print('Epoch %05d: early stopping' % (self.stopped_epoch + 1))
                

def get_model():
    """Build the (uncompiled) BiLSTM -> CNN multi-label classifier.

    Architecture, in order:
      1. Embedding initialised from the global ``embedding_matrix``; its
         vocabulary size and vector width come from the matrix itself so the
         layer always matches the weights it is given.
      2. SpatialDropout1D(0.4) and a bidirectional LSTM (80 units per direction)
         returning the full sequence.
      3. Two Conv1D branches over the LSTM output (64 filters, windows 2 and 3),
         each followed by Dropout(0.2).
      4. Global average and global max pooling of the LSTM output and of both
         convolution branches, concatenated.
      5. Dense(64, relu) -> Dropout(0.2) -> Dense(6, sigmoid), one output per label.

    Reads the notebook globals ``maxlen`` and ``embedding_matrix``.

    Returns
    -------
    keras.models.Model
    """
    vocab_size, vector_dim = embedding_matrix.shape

    # Named `inp` so the builtin `input` is not shadowed.
    inp = Input(shape=(maxlen, ))
    x = Embedding(vocab_size, vector_dim, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.4)(x)
    x = Bidirectional(LSTM(80, return_sequences=True, recurrent_dropout=0.2, dropout=0.2))(x)

    # http://konukoii.com/blog/2018/02/19/twitter-sentiment-analysis-using-combined-lstm-cnn-models/
    # For text, CNN -> LSTM (or GRU) doesn't seem to work well, but LSTM -> CNN works really well.
    branches = [x]
    for window in (2, 3):
        conv = Conv1D(filters=64, kernel_size=window, padding='valid', kernel_initializer="he_uniform")(x)
        branches.append(Dropout(0.2)(conv))

    # Global average and max pooling over time for every branch, in the order
    # avg/max of the LSTM output, then of the window-2 and window-3 branches.
    # https://www.quora.com/What-is-global-average-pooling
    pooled = []
    for branch in branches:
        pooled.append(GlobalAveragePooling1D()(branch))
        pooled.append(GlobalMaxPooling1D()(branch))

    conc = concatenate(pooled)

    hidden = Dense(64, activation="relu")(conc)
    hidden = Dropout(0.2)(hidden)

    output = Dense(6, activation="sigmoid")(hidden)

    return Model(inputs=inp, outputs=output)

In [9]:
# Build a fresh network and compile it with binary cross-entropy loss and a
# Nadam optimizer (learning rate 0.001); accuracy is tracked as a metric.
model = get_model()
opt = Nadam(lr=0.001)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 200, 300)     30000000    input_2[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 200, 300)     0           embedding_2[0][0]                
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 200, 160)     243840      spatial_dropout1d_1[0][0]        
__________________________

In [None]:
# Hyper-parameters for the single hold-out run (a batch size of 32 was also tried).
batch_size = 128
epochs = 3

# Keep 95% of the rows for training; the remaining rows form the validation set.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.95, random_state=233
)

# Callbacks: ROC-AUC reporting / stopping, per-epoch snapshots, CSV log.
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
CheckPoint = ModelCheckpoint('../snapshots/cnn_weights.{epoch:02d}-{val_loss:.2f}.hdf5')
csv_logger = CSVLogger('../training.log')

hist = model.fit(
    X_tra, y_tra,
    validation_data=(X_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    verbose=2,
    callbacks=[RocAuc, csv_logger, CheckPoint],
)

In [None]:
# Score the test set with whichever model is currently bound to `model`.
y_pred = model.predict(x_test, batch_size=1024)

### Stratified k-fold training & inference

In [16]:
## Stratified k-fold training

n_splits = 20   # number of stratified splits; each validation fold is 1/n_splits of the rows
n_folds = 4     # how many of those splits are actually trained and averaged
batch_size = 256
epochs = 10
predict_batch_size = 1024
run_id = 'cnn_glove'
learning_rate = 0.002

kfold = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = 32)

# Not passed to fit() below; stopping is handled by RocAucEvaluation.
early_stop = EarlyStopping(verbose=2)

pred = np.zeros((x_test.shape[0], 6))
# Pack each row's binary label columns into one byte so the folds are
# stratified on the label combination.
y_packed = np.packbits(y_train, axis=1)
folds_run = 0

for i, (train_idx, valid_idx) in enumerate(kfold.split(x_train, y_packed)):
    print("Running fold {} / {}".format(i + 1, n_folds))
    print("Training / Valid set counts {} / {}".format(train_idx.shape, valid_idx.shape))

    # Drop the previous fold's model before building the next one.
    model = None
    gc.collect()
    model = get_model()
    # A fresh optimizer per fold: sharing one instance would carry its
    # internal state over from one model to the next.
    opt = Nadam(lr=learning_rate)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
    #model.summary()

    xs_train, ys_train = x_train[train_idx], y_train[train_idx]
    xs_valid, ys_valid = x_train[valid_idx], y_train[valid_idx]

    CheckPoint = ModelCheckpoint('../snapshots/' + run_id + '_fold_' + str(i) + '_weights.{epoch:02d}-{val_loss:.2f}.hdf5')
    RocAuc = RocAucEvaluation(validation_data=(xs_valid, ys_valid), interval=1)
    # The first fold starts a new log and later folds append to it, so the
    # history of every fold is kept instead of only the last one.
    csv_logger = CSVLogger('../training.log', append=(i > 0))

    # training
    history = model.fit(xs_train, ys_train, batch_size = batch_size, epochs = epochs, validation_data = (xs_valid, ys_valid),
                          verbose = 2, callbacks=[RocAuc, csv_logger, CheckPoint])
    # predict
    pred += model.predict(x_test, batch_size = predict_batch_size, verbose = 1)
    folds_run += 1

    if folds_run == n_folds: break

# Average over the folds that actually ran.
y_pred = pred / folds_run

Running fold 1 / 4
Training / Valid set counts (151582,) / (7989,)
Train on 151582 samples, validate on 7989 samples
Epoch 1/10
 - 456s - loss: 0.0593 - acc: 0.9789 - val_loss: 0.0465 - val_acc: 0.9820

 ROC-AUC - epoch: 1 - score: 0.982814 

Epoch 2/10
 - 449s - loss: 0.0433 - acc: 0.9832 - val_loss: 0.0489 - val_acc: 0.9813

 ROC-AUC - epoch: 2 - score: 0.986696 

Epoch 3/10
 - 449s - loss: 0.0381 - acc: 0.9847 - val_loss: 0.0442 - val_acc: 0.9833

 ROC-AUC - epoch: 3 - score: 0.986360 

Epoch 00003: early stopping
Running fold 2 / 4
Training / Valid set counts (151588,) / (7983,)
Train on 151588 samples, validate on 7983 samples
Epoch 1/10
 - 458s - loss: 0.0639 - acc: 0.9780 - val_loss: 0.0560 - val_acc: 0.9796

 ROC-AUC - epoch: 1 - score: 0.976736 

Epoch 2/10
 - 454s - loss: 0.0432 - acc: 0.9831 - val_loss: 0.0523 - val_acc: 0.9804

 ROC-AUC - epoch: 2 - score: 0.981848 

Epoch 3/10
 - 450s - loss: 0.0379 - acc: 0.9849 - val_loss: 0.0497 - val_acc: 0.9803

 ROC-AUC - epoch: 3 - 

KeyboardInterrupt: 

In [None]:
# Reload two saved fold snapshots, score the test set with each, and average
# the two prediction arrays.
mod1 = load_model('../snapshots/lstm_glove_fold_0_weights.02-0.04.hdf5')
pred1 = mod1.predict(x_test, batch_size = predict_batch_size, verbose = 1)

mod2 = load_model('../snapshots/lstm_glove_fold_1_weights.02-0.04.hdf5')
pred2 = mod2.predict(x_test, batch_size = predict_batch_size, verbose = 1)

y_pred1 = 0.5 * (pred1 + pred2)

### Saving model and predictions

In [None]:
# Reload one saved snapshot from ../snapshots/ and use it to score the test set.
model_name = 'lstm_glove_fold_0_weights.03-0.04.hdf5'
model = load_model('../snapshots/' + model_name)
y_pred = model.predict(x_test, batch_size=1024)
# Display the predictions as the cell output.
y_pred

In [None]:
# Fill the six label columns of the submission template with the current
# `y_pred` and write it out. The file name describes a specific model
# configuration - TODO confirm it matches the run that produced `y_pred`.
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('../submissions/cnn_2_window_2_3_filter_64_l1_lstm80_spatial_dr_0_4_lstm_dr_0_2_dense_64_dr_0_2_glove_ep3_batch_128.csv', index=False)

In [None]:
# Preview the first rows of the submission frame.
submission.head()

### Records

#### CNN (filters 64, window 2 (dropout 0.2) and window 3 (dropout 0.2)) + LSTM (80, dropout=0.2, recurrent_dropout=0.2 - Spatial Dropout 0.4) + GloVe + last dense 64 dropout (0.2) - Ep3 - Batch size 128, Max features 100,000

Train on 151592 samples, validate on 7979 samples
Epoch 1/3
 - 902s - loss: 0.0565 - acc: 0.9801 - val_loss: 0.0450 - val_acc: 0.9818

 ROC-AUC - epoch: 1 - score: 0.986522 

Epoch 2/3
 - 880s - loss: 0.0417 - acc: 0.9837 - val_loss: 0.0421 - val_acc: 0.9832

 ROC-AUC - epoch: 2 - score: 0.989570 

Epoch 3/3
 - 872s - loss: 0.0372 - acc: 0.9849 - val_loss: 0.0414 - val_acc: 0.9841

 ROC-AUC - epoch: 3 - score: 0.990200 
 
 * **LB 0.9830**


#### CNN (filters 64, window 2 (dropout 0.2) and window 3 (dropout 0.2)) + LSTM (80, dropout=0.2, recurrent_dropout=0.2 - Spatial Dropout 0.4) + GloVe + last dense 64 dropout (0.1) - Ep3 - Batch size 128, Max features 100,000


* Name: **cnn_2_window_2_3_filter_64_l1_lstm80_spatial_dr_0_4_lstm_dr_0_2_fasttext_ep3_batch_128.csv**

Train on 151592 samples, validate on 7979 samples
Epoch 1/3
 - 881s - loss: 0.0577 - acc: 0.9798 - val_loss: 0.0449 - val_acc: 0.9822

 ROC-AUC - epoch: 1 - score: 0.984894 

Epoch 2/3
 - 875s - loss: 0.0413 - acc: 0.9839 - val_loss: 0.0415 - val_acc: 0.9837

 ROC-AUC - epoch: 2 - score: 0.989601 

Epoch 3/3
 - 870s - loss: 0.0365 - acc: 0.9854 - val_loss: 0.0406 - val_acc: 0.9839

 ROC-AUC - epoch: 3 - score: 0.990294 

* **LB 0.9833**




#### CNN (filters 64, window 2 (dropout 0.2) and window 3 (dropout 0.2) ) + LSTM (64, dropout=0.2, recurrent_dropout=0.2 - Spatial Dropout 0.4) + FastText + MaxPool - Ep2 - Batch size 128, Max features 100,000

* Total params: 30,332,486

**Attempt 1**

Train on 151592 samples, validate on 7979 samples
Epoch 1/2
 - 876s - loss: 0.0325 - acc: 0.9866 - val_loss: 0.0430 - val_acc: 0.9837

 ROC-AUC - epoch: 1 - score: 0.989429 

Epoch 2/2
 - 877s - loss: 0.0292 - acc: 0.9879 - val_loss: 0.0446 - val_acc: 0.9836

 ROC-AUC - epoch: 2 - score: 0.989135 
 
 
 **Attempt 2**
 
 * name: **cnn_2_window_2_3_l1_lstm256_spatial_dr_0_4_lstm_dr_0_2_fasttext_maxpool_ep2_batch_128.csv**
 
 Train on 151592 samples, validate on 7979 samples
Epoch 1/2
 - 887s - loss: 0.0552 - acc: 0.9803 - val_loss: 0.0445 - val_acc: 0.9821

 ROC-AUC - epoch: 1 - score: 0.987122 

Epoch 2/2
 - 868s - loss: 0.0409 - acc: 0.9840 - val_loss: 0.0413 - val_acc: 0.9837

 ROC-AUC - epoch: 2 - score: 0.989921 


#### CNN (filters 64, window 2 and window 3) + LSTM (256, dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.4) + FastText + MaxPool - Ep3 - Batch size 256, Max features 100,000


Train on 151592 samples, validate on 7979 samples
Epoch 1/3
 - 655s - loss: 0.0576 - acc: 0.9794 - val_loss: 0.0547 - val_acc: 0.9788

 ROC-AUC - epoch: 1 - score: 0.988215 

Epoch 2/3
 - 623s - loss: 0.0427 - acc: 0.9837 - val_loss: 0.0498 - val_acc: 0.9813

 ROC-AUC - epoch: 2 - score: 0.988926 

Epoch 3/3
 - 624s - loss: 0.0379 - acc: 0.9851 - val_loss: 0.0546 - val_acc: 0.9815

 ROC-AUC - epoch: 3 - score: 0.988650 

**Attempt 2**

Train on 151592 samples, validate on 7979 samples
Epoch 1/3
 - 627s - loss: 0.0346 - acc: 0.9863 - val_loss: 0.0566 - val_acc: 0.9806

 ROC-AUC - epoch: 1 - score: 0.988491 

Epoch 2/3
 - 628s - loss: 0.0311 - acc: 0.9877 - val_loss: 0.0479 - val_acc: 0.9818

 ROC-AUC - epoch: 2 - score: 0.987230 

Epoch 3/3
 - 629s - loss: 0.0283 - acc: 0.9888 - val_loss: 0.0552 - val_acc: 0.9801

 ROC-AUC - epoch: 3 - score: 0.986574 

**Attempt 3**

Train on 151592 samples, validate on 7979 samples
Epoch 1/3
 - 639s - loss: 0.0572 - acc: 0.9794 - val_loss: 0.0913 - val_acc: 0.9669

 ROC-AUC - epoch: 1 - score: 0.983046 

Epoch 2/3
 - 635s - loss: 0.0409 - acc: 0.9841 - val_loss: 0.0500 - val_acc: 0.9802

 ROC-AUC - epoch: 2 - score: 0.989757 

Epoch 3/3
 - 637s - loss: 0.0361 - acc: 0.9857 - val_loss: 0.0575 - val_acc: 0.9774

 ROC-AUC - epoch: 3 - score: 0.988192 


**Attempt 4**

* Total params: 31,312,390

Train on 151592 samples, validate on 7979 samples
Epoch 1/2
 - 643s - loss: 0.0571 - acc: 0.9797 - val_loss: 0.0528 - val_acc: 0.9790

 ROC-AUC - epoch: 1 - score: 0.988197 

Epoch 2/2
 - 640s - loss: 0.0411 - acc: 0.9841 - val_loss: 0.0443 - val_acc: 0.9831

 ROC-AUC - epoch: 2 - score: 0.989429 

**LB 0.9837**


#### CNN (filters 128, window 2) + GRU (256) + FastText + MaxPool - Ep3 - dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.4 - Batch size 128, Max features 100,000


Train on 151592 samples, validate on 7979 samples
Epoch 1/5
 - 724s - loss: 0.0551 - acc: 0.9803 - val_loss: 0.0943 - val_acc: 0.9755

 ROC-AUC - epoch: 1 - score: 0.981225 

Epoch 2/5
 - 722s - loss: 0.0431 - acc: 0.9835 - val_loss: 0.0731 - val_acc: 0.9794

 ROC-AUC - epoch: 2 - score: 0.983226 

Epoch 3/5
 - 722s - loss: 0.0381 - acc: 0.9851 - val_loss: 0.0652 - val_acc: 0.9781

 ROC-AUC - epoch: 3 - score: 0.985856 

Epoch 4/5
 - 722s - loss: 0.0438 - acc: 0.9848 - val_loss: 0.0803 - val_acc: 0.9714

 ROC-AUC - epoch: 4 - score: 0.973663 

Epoch 5/5

#### CNN (filters 64, window 2) + GRU (256) + FastText + MaxPool - Ep3 - dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.4 - Batch size 128, Max features 100,000

Train on 151592 samples, validate on 7979 samples
Epoch 1/5
 - 717s - loss: 0.0550 - acc: 0.9802 - val_loss: 0.0644 - val_acc: 0.9816

 ROC-AUC - epoch: 1 - score: 0.987303 

Epoch 2/5
 - 714s - loss: 0.0422 - acc: 0.9837 - val_loss: 0.0643 - val_acc: 0.9816

 ROC-AUC - epoch: 2 - score: 0.973863 

Epoch 3/5
 - 714s - loss: 0.0465 - acc: 0.9829 - val_loss: 0.1030 - val_acc: 0.9644

 ROC-AUC - epoch: 3 - score: 0.978108 

Epoch 4/5