Reference: https://www.kaggle.com/umbertogriffo/combined-gru-and-cnn-fasttext-badwords/code

In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Dropout
from keras.layers import GRU, LSTM, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, Conv1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
warnings.filterwarnings('ignore')

import os
import time
import gc

os.environ['OMP_NUM_THREADS'] = '4'

Using TensorFlow backend.


In [2]:
# Which pre-trained word vectors to use: 'glove' or 'fasttext'.
embeddings = 'fasttext'

# Any value other than 'fasttext' falls back to the GloVe vectors.
EMBEDDING_FILE = (
    '../data/fasttext/crawl-300d-2M.vec'
    if embeddings == 'fasttext'
    else '../data/glove/glove.840B.300d.txt'
)

max_features = 100000  # vocabulary size cap (values noted by the author: 100000, 30000)
maxlen = 200           # length every comment is padded / truncated to
embed_size = 300       # width of one embedding vector

print(EMBEDDING_FILE)

../data/fasttext/crawl-300d-2M.vec


In [None]:
# Read the train, test and sample-submission CSVs (paths are relative to the notebook).
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
submission = pd.read_csv('../data/sample_submission.csv')

# Raw comment text; missing comments are replaced by the placeholder string "fillna".
X_train = train["comment_text"].fillna("fillna").values
X_test = test["comment_text"].fillna("fillna").values

# One target column per label, in the order used for the submission columns.
y_train = train[
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
].values

# The DataFrames are not needed once the arrays have been extracted.
del train, test

### Learning

In [None]:
# Fit one vocabulary over train + test text and encode every comment as a list
# of integer word ids, keeping only the `max_features` most frequent words.
# NOTE: X_train / X_test are overwritten with the encoded sequences, so the
# data-loading cell must be re-run before this cell is executed a second time.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Pad / truncate every sequence to exactly `maxlen` ids.
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)


def get_coefs(word, *arr):
    """Turn the fields of one embedding line into a ``(word, float32 vector)`` pair."""
    return word, np.asarray(arr, dtype='float32')


# Parse the pre-trained vectors into a word -> vector dict.
# * `with` closes the (very large) file deterministically.
# * rsplit is limited to `embed_size` splits so a token that itself contains
#   spaces stays in one piece instead of leaking into the numeric fields.
# * Lines without exactly `embed_size` numbers are skipped, e.g. the
#   "<count> <dim>" header at the top of fastText .vec files.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    for line in embedding_file:
        fields = line.rstrip().rsplit(' ', embed_size)
        if len(fields) != embed_size + 1:
            continue
        word, vector = get_coefs(*fields)
        embeddings_index[word] = vector

word_index = tokenizer.word_index
# Keras word ids start at 1 (0 is the padding id), so a vocabulary of V words
# needs V + 1 rows; sizing the matrix with len(word_index) alone overflowed on
# the last id whenever the vocabulary was smaller than `max_features`.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # Words without a pre-trained vector keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

gc.collect()

In [4]:
# Sanity check: display the (rows, columns) shape of the embedding matrix.
#len(word_index), max_features
embedding_matrix.shape

(100000, 300)

In [3]:
import pickle


def _load_pickle(path):
    """Load one pickled object from `path`, closing the file afterwards.

    SECURITY: unpickling can execute arbitrary code; only load cache files
    that were produced locally by this notebook.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Cache files carry a "_feat_<max_features>" tag only for vocabularies larger
# than 30000; the smaller setting uses the untagged file names.
feat_tag = '_feat_' + str(max_features) if max_features > 30000 else ''
x_train_path = '../models/x_train' + feat_tag + '_seq_200.pkl'
x_test_path = '../models/x_test' + feat_tag + '_seq_200.pkl'
embedding_matrix_path = '../models/fasttext_embedding_matrix' + feat_tag + '.pkl'

# To (re)create the caches after running the tokenisation cell:
#with open(x_train_path, 'wb') as f: pickle.dump(x_train, f)
#with open(x_test_path, 'wb') as f: pickle.dump(x_test, f)
#with open(embedding_matrix_path, 'wb') as f: pickle.dump(embedding_matrix, f)

x_train = _load_pickle(x_train_path)
x_test = _load_pickle(x_test_path)
embedding_matrix = _load_pickle(embedding_matrix_path)

# The labels and the submission template are not cached, so read them from CSV.
train = pd.read_csv('../data/train.csv')
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
submission = pd.read_csv('../data/sample_submission.csv')

del train

In [8]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after epochs.

    Args:
        validation_data: ``(X_val, y_val)`` pair; ``X_val`` is passed to
            ``model.predict`` and ``y_val`` to ``roc_auc_score``.
        interval: the score is computed when ``epoch % interval == 0``, where
            ``epoch`` is the zero-based index Keras passes to ``on_epoch_end``.

    Raises:
        ValueError: if ``validation_data`` is not a two-item pair (this
            includes the empty default).
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this class, not Callback: super(Callback, self) starts the MRO
        # lookup *after* Callback and therefore skipped Callback.__init__.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Compute and print the validation ROC-AUC for the finished epoch.

        `logs` is accepted for compatibility with the Keras callback signature
        and is not read; it defaults to None instead of a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # epoch is zero-based, so report it one-based like the Keras log.
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


def get_model():
    """Build the (uncompiled) embedding -> BiGRU -> Conv1D -> pooling classifier.

    Reads the module-level `maxlen`, `embed_size` and `embedding_matrix`.

    Returns:
        A Keras ``Model`` mapping a ``(maxlen,)`` sequence of word ids to six
        sigmoid outputs, one per target label.
    """
    # Take the vocabulary size from the weight matrix itself so the layer
    # always matches the weights, even when the matrix has fewer rows than
    # `max_features`.
    vocab_size = embedding_matrix.shape[0]

    # Named `inp` rather than `input` to avoid shadowing the builtin.
    inp = Input(shape=(maxlen, ))
    x = Embedding(vocab_size, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.4)(x)
    x = Bidirectional(GRU(256, return_sequences=True, recurrent_dropout=0.5))(x)
    x = Dropout(0.5)(x)

    # http://konukoii.com/blog/2018/02/19/twitter-sentiment-analysis-using-combined-lstm-cnn-models/
    # For text, CNN -> LSTM (or GRU) doesn't seem to work well, but LSTM -> CNN works really well.
    x = Conv1D(filters=128, kernel_size=2, padding='valid', kernel_initializer="he_uniform")(x)
    x = Dropout(0.5)(x)

    # Global average pooling operation for temporal data.
    # https://www.quora.com/What-is-global-average-pooling
    avg_pool = GlobalAveragePooling1D()(x)
    # Global max pooling operation for temporal data.
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    output = Dense(6, activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=output)

    return model

In [9]:
# Build the network, attach the loss / optimizer / metric, then print the layer table.
model = get_model()
model.compile(
    optimizer='nadam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 200, 300)     30000000    input_2[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_2 (SpatialDro (None, 200, 300)     0           embedding_2[0][0]                
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 200, 512)     855552      spatial_dropout1d_2[0][0]        
__________________________________________________________________________________________________
dropout_3 

In [None]:
batch_size = 128  # 32 is the alternative noted by the author
epochs = 5

# Hold out 5% of the training rows for validation, with a fixed seed for the split.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train,
    y_train,
    train_size=0.95,
    random_state=233,
)

# Prints the ROC-AUC on the held-out rows after every epoch.
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
    verbose=2,
)

Train on 151592 samples, validate on 7979 samples
Epoch 1/5
 - 724s - loss: 0.0551 - acc: 0.9803 - val_loss: 0.0943 - val_acc: 0.9755

 ROC-AUC - epoch: 1 - score: 0.981225 

Epoch 2/5
 - 722s - loss: 0.0431 - acc: 0.9835 - val_loss: 0.0731 - val_acc: 0.9794

 ROC-AUC - epoch: 2 - score: 0.983226 

Epoch 3/5
 - 722s - loss: 0.0381 - acc: 0.9851 - val_loss: 0.0652 - val_acc: 0.9781

 ROC-AUC - epoch: 3 - score: 0.985856 

Epoch 4/5
 - 722s - loss: 0.0438 - acc: 0.9848 - val_loss: 0.0803 - val_acc: 0.9714

 ROC-AUC - epoch: 4 - score: 0.973663 

Epoch 5/5


In [None]:
# Score the test sequences and copy the six output columns into the submission frame.
y_pred = model.predict(x_test, batch_size=1024)
submission[
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
] = y_pred

# NOTE(review): the file name mentions "gru128" and "ep1", while this notebook
# builds GRU(256) and trains for 5 epochs - confirm the name is still intended.
submission.to_csv(
    '../submissions/gru_l1_gru128_spatial_dr_0_4_gpu_dr_0_5_fasttext_maxpool_ep1_batch_128.csv',
    index=False,
)

In [None]:
# Preview the first rows of the submission frame.
submission.head()

### Records



#### CNN (filters 64, window 2) + GRU (256) + FastText + MaxPool - Ep3 - dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.4 - Batch size 128, Max features 100,000

Train on 151592 samples, validate on 7979 samples
Epoch 1/5
 - 717s - loss: 0.0550 - acc: 0.9802 - val_loss: 0.0644 - val_acc: 0.9816

 ROC-AUC - epoch: 1 - score: 0.987303 

Epoch 2/5
 - 714s - loss: 0.0422 - acc: 0.9837 - val_loss: 0.0643 - val_acc: 0.9816

 ROC-AUC - epoch: 2 - score: 0.973863 

Epoch 3/5
 - 714s - loss: 0.0465 - acc: 0.9829 - val_loss: 0.1030 - val_acc: 0.9644

 ROC-AUC - epoch: 3 - score: 0.978108 

Epoch 4/5

**Attempt 2**

Train on 151592 samples, validate on 7979 samples
Epoch 1/5
 - 724s - loss: 0.0551 - acc: 0.9803 - val_loss: 0.0943 - val_acc: 0.9755

 ROC-AUC - epoch: 1 - score: 0.981225 

Epoch 2/5
 - 722s - loss: 0.0431 - acc: 0.9835 - val_loss: 0.0731 - val_acc: 0.9794

 ROC-AUC - epoch: 2 - score: 0.983226 

Epoch 3/5
 - 722s - loss: 0.0381 - acc: 0.9851 - val_loss: 0.0652 - val_acc: 0.9781

 ROC-AUC - epoch: 3 - score: 0.985856 

Epoch 4/5
 - 722s - loss: 0.0438 - acc: 0.9848 - val_loss: 0.0803 - val_acc: 0.9714

 ROC-AUC - epoch: 4 - score: 0.973663 

Epoch 5/5