Reference: https://www.kaggle.com/yekenot/pooled-gru-fasttext/output

In [None]:
import os
import time
import gc

import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import StratifiedKFold

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Dropout, TimeDistributed
from keras.layers import GRU, LSTM, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, Conv1D
from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, Multiply
from keras.layers import RepeatVector, Activation, Lambda, Average
from keras.optimizers import Adam, Nadam
from keras.preprocessing import text, sequence
from keras.callbacks import Callback, CSVLogger, ModelCheckpoint, EarlyStopping
from keras.models import load_model
from keras import regularizers

import warnings
warnings.filterwarnings('ignore')

os.environ['OMP_NUM_THREADS'] = '10'

In [None]:
# Which pre-trained word vectors to use: 'fasttext' or 'glove'.
embeddings = 'fasttext'

# Any value other than 'fasttext' selects the GloVe file.
EMBEDDING_FILE = (
    '../data/fasttext/crawl-300d-2M.vec'
    if embeddings == 'fasttext'
    else '../data/glove/glove.840B.300d.txt'
)

# Vocabulary cap, padded sequence length and embedding width.
max_features, maxlen, embed_size = 100000, 200, 300  # max_features also tried: 30000
# Prefix used in the names of the cached feature / embedding pickles.
prefix = 'c1'  # also tried: 'x'

print(EMBEDDING_FILE)

In [None]:
# Read the competition CSVs: submission template, test set and training set.
submission = pd.read_csv('../data/sample_submission.csv')
test = pd.read_csv('../data/test.csv')
train = pd.read_csv('../data/train.csv')

# Comment text as numpy arrays; missing comments become the literal string "fillna".
X_train, X_test = (
    frame["comment_text"].fillna("fillna").values for frame in (train, test)
)
# One column per label, in the same order used when writing the submission.
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

# Only the extracted arrays are used from here on.
del train, test

### Preprocessing: tokenization and embedding matrix

In [None]:
# Fit a single vocabulary on train + test comments; only the max_features
# most frequent words are kept when texts are converted to sequences.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

# Train split: word-index sequences, then fixed-length (maxlen) arrays.
X_train = tokenizer.texts_to_sequences(X_train)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)

# Test split: same treatment with the same tokenizer.
X_test = tokenizer.texts_to_sequences(X_test)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

def get_coefs(word, *arr):
    """Return ``(word, vector)`` with the coefficients converted to a float32 array."""
    return word, np.asarray(arr, dtype='float32')


def load_embeddings_index(path, dim):
    """Parse a text embedding file into a ``{word: float32 vector}`` dict.

    Arguments:
    path -- file with one "<token> <v1> ... <v_dim>" entry per line
    dim -- number of coefficients expected per entry

    Lines that do not carry exactly `dim` coefficients are skipped. This drops
    the "<count> <dim>" header line of fastText .vec files, which would
    otherwise be stored as a word with a 1-element vector.
    """
    index = {}
    # The context manager closes the file; the encoding is explicit so that
    # parsing does not depend on the platform default.
    with open(path, encoding='utf8') as lines:
        for line in lines:
            # Split from the right at most `dim` times so that a token which
            # itself contains spaces stays in one piece.
            fields = line.rstrip().rsplit(' ', dim)
            if len(fields) != dim + 1:
                continue
            word, vector = get_coefs(*fields)
            index[word] = vector
    return index


embeddings_index = load_embeddings_index(EMBEDDING_FILE, embed_size)

word_index = tokenizer.word_index
# Keras word indices start at 1 (index 0 is never assigned to a word), so a
# vocabulary of V words needs V + 1 rows unless max_features caps it first.
# NOTE(review): the model builders pass max_features as the Embedding input
# size, so a vocabulary smaller than max_features still needs attention there.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    # Skip indices that have no row in the matrix.
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    # Words without a pre-trained vector keep their all-zero row. Vectors of
    # the wrong length are ignored rather than broadcast across the row.
    if embedding_vector is not None and embedding_vector.shape == (embed_size,):
        embedding_matrix[i] = embedding_vector

gc.collect()

In [None]:
# Sanity check of the embedding matrix dimensions (rows, embed_size);
# compare the row count with len(word_index) and max_features.
embedding_matrix.shape

In [None]:
import pickle

# Cache file names encode the prefix, vocabulary size and sequence length
# (features) or the embedding type and vocabulary size (embedding matrix).
train_feats_path = '../models/{}_train_feat_{}_seq_{}.pkl'.format(prefix, max_features, maxlen)
test_feats_path = '../models/{}_test_feat_{}_seq_{}.pkl'.format(prefix, max_features, maxlen)
embedding_matrix_path = '../models/{}_{}_embedding_matrix_feat_{}.pkl'.format(prefix, embeddings, max_features)
print(train_feats_path)


def _load_pickle(path):
    """Load one cached object and close the file handle afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only load caches that were written by this notebook.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# To (re)create the caches from freshly computed arrays, run these once:
#with open(train_feats_path, 'wb') as f: pickle.dump(x_train, f)
#with open(test_feats_path, 'wb') as f: pickle.dump(x_test, f)
#with open(embedding_matrix_path, 'wb') as f: pickle.dump(embedding_matrix, f)

x_train = _load_pickle(train_feats_path)
x_test = _load_pickle(test_feats_path)
embedding_matrix = _load_pickle(embedding_matrix_path)

# Labels and the submission template are re-read here, so this cell does not
# depend on the CSV-loading cell having been run.
train = pd.read_csv('../data/train.csv')
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
submission = pd.read_csv('../data/sample_submission.csv')

del train

In [None]:
class RocAucEvaluation(Callback):
    """Print validation ROC-AUC after epochs and stop at the first non-improvement.

    When ``epoch % interval == 0`` the attached model predicts on the held-out
    inputs, the ROC-AUC against the held-out labels is printed, and training is
    stopped as soon as a score fails to exceed the best one seen so far
    (there is no patience).

    Arguments:
    validation_data -- (X_val, y_val) pair; X_val is passed to model.predict unchanged
    interval -- evaluate on epochs whose 0-based index is a multiple of this value
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this class in super(): super(Callback, self) starts the lookup
        # *after* Callback and therefore never runs Callback.__init__.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data
        self.stopped_epoch = 0
        # Best ROC-AUC seen so far; kept for the lifetime of the instance.
        self.best = 0

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation data and request a stop if the score did not improve."""
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

            # stopping condition - ROC stops improving
            if score > self.best:
                self.best = score
            else:
                self.stopped_epoch = epoch
                self.model.stop_training = True
                print('Epoch %05d: early stopping' % (self.stopped_epoch + 1))

def get_model(): # base
    """Build the baseline pooled Bi-LSTM classifier (not compiled).

    Embedding initialised from embedding_matrix -> spatial dropout -> Bi-LSTM
    -> per-timestep Dense(100, relu) -> dropout. Global average and max pooling
    over time are concatenated and fed to six sigmoid outputs.

    Reads the notebook globals maxlen, max_features, embed_size and embedding_matrix.
    """
    tokens = Input(shape=(maxlen,))

    seq = Embedding(input_dim=max_features, output_dim=embed_size,
                    weights=[embedding_matrix])(tokens)
    seq = SpatialDropout1D(rate=0.4)(seq)
    seq = Bidirectional(
        LSTM(units=64, return_sequences=True, dropout=0.2, recurrent_dropout=0.0)
    )(seq)
    seq = TimeDistributed(Dense(units=100, activation="relu"))(seq)
    seq = Dropout(rate=0.1)(seq)

    # Summarise the sequence by its mean and its max over time.
    pooled = concatenate([GlobalAveragePooling1D()(seq), GlobalMaxPooling1D()(seq)])
    probabilities = Dense(units=6, activation="sigmoid")(pooled)

    return Model(inputs=tokens, outputs=probabilities)

def get_model_2(): # 2 dense final layers
    """Build the pooled Bi-LSTM classifier with an extra dense head (not compiled).

    Same encoder as the baseline, with Dropout(0.2) after the per-timestep
    dense layer. The pooled features then pass through Dense(64, relu) and
    Dropout(0.1) before the six sigmoid outputs.

    Reads the notebook globals maxlen, max_features, embed_size and embedding_matrix.
    """
    tokens = Input(shape=(maxlen,))

    seq = Embedding(input_dim=max_features, output_dim=embed_size,
                    weights=[embedding_matrix])(tokens)
    seq = SpatialDropout1D(rate=0.4)(seq)
    seq = Bidirectional(
        LSTM(units=64, return_sequences=True, dropout=0.2, recurrent_dropout=0.0)
    )(seq)
    seq = TimeDistributed(Dense(units=100, activation="relu"))(seq)
    seq = Dropout(rate=0.2)(seq)
    # A second TimeDistributed Dense(100, relu) + Dropout(0.2) stage was tried
    # at this point and is currently disabled.

    # Summarise the sequence by its mean and its max over time.
    pooled = concatenate([GlobalAveragePooling1D()(seq), GlobalMaxPooling1D()(seq)])

    # Dense head on top of the pooled features.
    hidden = Dense(units=64, activation="relu")(pooled)
    hidden = Dropout(rate=0.1)(hidden)
    probabilities = Dense(units=6, activation="sigmoid")(hidden)

    return Model(inputs=tokens, outputs=probabilities)


def get_model_3(): #regularized
    """Build the baseline pooled Bi-LSTM with L2 penalties on the LSTM (not compiled).

    Identical to the baseline architecture except that the LSTM kernel and bias
    each carry an L2 regulariser with factor 0.00001.

    Reads the notebook globals maxlen, max_features, embed_size and embedding_matrix.
    """
    kernel_l2 = regularizers.l2(0.00001)
    bias_l2 = regularizers.l2(0.00001)

    tokens = Input(shape=(maxlen,))

    seq = Embedding(input_dim=max_features, output_dim=embed_size,
                    weights=[embedding_matrix])(tokens)
    seq = SpatialDropout1D(rate=0.4)(seq)
    seq = Bidirectional(
        LSTM(units=64, return_sequences=True, dropout=0.2, recurrent_dropout=0.0,
             kernel_regularizer=kernel_l2, bias_regularizer=bias_l2)
    )(seq)
    seq = TimeDistributed(Dense(units=100, activation="relu"))(seq)
    seq = Dropout(rate=0.1)(seq)

    # Summarise the sequence by its mean and its max over time.
    pooled = concatenate([GlobalAveragePooling1D()(seq), GlobalMaxPooling1D()(seq)])
    probabilities = Dense(units=6, activation="sigmoid")(pooled)

    return Model(inputs=tokens, outputs=probabilities)

def get_model_4(): # 2 LSTMs separately pooled
    """Build a classifier with two parallel Bi-LSTMs of different width (not compiled).

    A 64-unit and a 128-unit Bi-LSTM read the same embedded sequence. Their
    per-timestep outputs are concatenated, pooled (average + max over time) and
    passed through Dense(64, relu) + Dropout(0.1) to six sigmoid outputs.

    Reads the notebook globals maxlen, max_features, embed_size and embedding_matrix.
    """
    tokens = Input(shape=(maxlen,))

    shared = Embedding(input_dim=max_features, output_dim=embed_size,
                       weights=[embedding_matrix])(tokens)
    shared = SpatialDropout1D(rate=0.4)(shared)

    narrow = Bidirectional(
        LSTM(units=64, return_sequences=True, dropout=0.2, recurrent_dropout=0.0)
    )(shared)
    wide = Bidirectional(
        LSTM(units=128, return_sequences=True, dropout=0.2, recurrent_dropout=0.0)
    )(shared)
    per_step = concatenate([narrow, wide])
    # (No TimeDistributed dense stage in this variant; it was tried and disabled.)

    # Summarise the joint sequence by its mean and its max over time.
    pooled = concatenate([GlobalAveragePooling1D()(per_step), GlobalMaxPooling1D()(per_step)])

    hidden = Dense(units=64, activation='relu')(pooled)
    hidden = Dropout(rate=0.1)(hidden)
    probabilities = Dense(units=6, activation="sigmoid")(hidden)

    return Model(inputs=tokens, outputs=probabilities)

In [None]:
# Build the L2-regularised variant and compile it with binary cross-entropy
# over the six sigmoid outputs.
model = get_model_3()
# Alternative optimizer that was tried:
#opt = Adam(lr=0.005, decay=0.01, beta_1=0.9, beta_2=0.999)
opt = Nadam(lr=0.001)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
model.summary()

### Attention

In [None]:
# Defined shared layers as global variables
repeator = RepeatVector(maxlen)
concatenator = Concatenate(axis=-1)
densor1 = Dense(10, activation = "tanh")
densor2 = Dense(1, activation = "relu")
activator = Activation('softmax', name='attention_weights') # We are using a custom softmax(axis = 1) loaded in this notebook
dotor = Dot(axes = 1)

n_a = 32
n_s = 64
post_activation_LSTM_cell = LSTM(n_s, return_state = True)

In [None]:
def one_step_attention(a, s_prev):
    """
    Performs one step of attention and returns the resulting context vector.

    The previous post-attention state is repeated along the time axis, joined
    with every Bi-LSTM output, scored by the two shared dense layers, turned
    into attention weights by the shared `activator`, and finally used to take
    a weighted combination of the Bi-LSTM outputs.

    Arguments:
    a -- Bi-LSTM outputs for all time steps, tensor of shape (m, Tx, features)
    s_prev -- previous hidden state of the (post-attention) LSTM, tensor of shape (m, n_s)

    Returns:
    context -- context vector, input of the next (post-attention) LSTM cell
    """
    # One copy of s_prev per time step, so it can be concatenated with "a".
    tiled_state = repeator(s_prev)
    # Energies: a small two-layer network applied to [a ; s_prev] at every step.
    energies = densor2(densor1(concatenator([a, tiled_state])))
    # Attention weights "alphas", then their dot product with "a" over the time axis.
    return dotor([activator(energies), a])

In [None]:
# combination of a single LSTM output and all LSTM state pooling combination 
def get_model_attention(Ty, n_a, n_s): #is all you need attention ?
    """
    Attention model combined with pooled Bi-LSTM features (not compiled).

    The final state of the post-attention LSTM and the average/max-pooled
    Bi-LSTM outputs each pass through Dense(64, relu) + Dropout(0.1). The two
    results are concatenated and fed to six sigmoid outputs.

    Arguments:
    Ty -- number of attention / post-attention LSTM steps to unroll
    n_a -- hidden state size of the Bi-LSTM (currently unused: 64 units are hard-coded)
    n_s -- hidden state size of the post-attention LSTM

    Returns:
    model -- Keras model instance taking [token ids, s0, c0] as inputs
    """
    tokens = Input(shape=(maxlen,))
    s0 = Input(shape=(n_s,), name='s0')
    c0 = Input(shape=(n_s,), name='c0')

    encoded = Embedding(input_dim=max_features, output_dim=embed_size,
                        weights=[embedding_matrix])(tokens)
    encoded = SpatialDropout1D(rate=0.2)(encoded)
    encoded = Bidirectional(
        LSTM(units=64, return_sequences=True, dropout=0.1, recurrent_dropout=0.0)
    )(encoded)

    # Unroll Ty steps: attend over the encoder outputs, then advance the
    # shared post-attention LSTM from its previous hidden and cell state.
    hidden, cell = s0, c0
    for _step in range(Ty):
        context = one_step_attention(encoded, hidden)
        hidden, _state_h, cell = post_activation_LSTM_cell(context, initial_state=[hidden, cell])

    # Branch 1: the last post-attention hidden state.
    attended = Dense(units=64, activation="relu")(hidden)
    attended = Dropout(rate=0.1)(attended)

    # Branch 2: mean and max of the encoder outputs over time.
    pooled = concatenate([GlobalAveragePooling1D()(encoded), GlobalMaxPooling1D()(encoded)])
    pooled = Dense(units=64, activation="relu")(pooled)
    pooled = Dropout(rate=0.1)(pooled)

    merged = concatenate([attended, pooled])
    probabilities = Dense(units=6, activation="sigmoid")(merged)

    return Model(inputs=[tokens, s0, c0], outputs=probabilities)

In [None]:
# using dense layers for output

def get_model_attention_2(Ty, n_a, n_s): #is all you need attention ?
    """
    Attention model whose per-step outputs are concatenated (not compiled).

    After every attention step the post-attention hidden state goes through
    its own Dense(6, relu) + Dropout(0.1). All Ty results are concatenated and
    fed to six sigmoid outputs.

    Arguments:
    Ty -- number of attention / post-attention LSTM steps to unroll
    n_a -- hidden state size of the Bi-LSTM (currently unused: 64 units are hard-coded)
    n_s -- hidden state size of the post-attention LSTM

    Returns:
    model -- Keras model instance taking [token ids, s0, c0] as inputs
    """
    tokens = Input(shape=(maxlen,))
    s0 = Input(shape=(n_s,), name='s0')
    c0 = Input(shape=(n_s,), name='c0')

    encoded = Embedding(input_dim=max_features, output_dim=embed_size,
                        weights=[embedding_matrix])(tokens)
    encoded = SpatialDropout1D(rate=0.2)(encoded)
    encoded = Bidirectional(
        LSTM(units=64, return_sequences=True, dropout=0.1, recurrent_dropout=0.0)
    )(encoded)

    hidden, cell = s0, c0
    step_outputs = []
    for _step in range(Ty):
        # Attend over the encoder outputs, then advance the shared post-attention LSTM.
        context = one_step_attention(encoded, hidden)
        hidden, _state_h, cell = post_activation_LSTM_cell(context, initial_state=[hidden, cell])

        # NOTE(review): a new Dense layer is created on every step, so these
        # projections do not share weights across steps.
        step_out = Dense(units=6, activation="relu")(hidden)
        step_outputs.append(Dropout(rate=0.1)(step_out))

    merged = concatenate(step_outputs)
    probabilities = Dense(units=6, activation="sigmoid")(merged)

    return Model(inputs=[tokens, s0, c0], outputs=probabilities)

In [None]:
# using average for output
def get_model_attention_3(Ty, n_a, n_s): #is all you need attention ?
    """
    Attention model whose per-step outputs are averaged (not compiled).

    After every attention step the post-attention hidden state goes through
    its own Dense(64, relu) + Dropout(0.1). The Ty results are averaged
    element-wise and fed to six sigmoid outputs.

    Arguments:
    Ty -- number of attention / post-attention LSTM steps to unroll
    n_a -- hidden state size of the Bi-LSTM (currently unused: 64 units are hard-coded)
    n_s -- hidden state size of the post-attention LSTM

    Returns:
    model -- Keras model instance taking [token ids, s0, c0] as inputs
    """
    tokens = Input(shape=(maxlen,))
    s0 = Input(shape=(n_s,), name='s0')
    c0 = Input(shape=(n_s,), name='c0')

    encoded = Embedding(input_dim=max_features, output_dim=embed_size,
                        weights=[embedding_matrix])(tokens)
    encoded = SpatialDropout1D(rate=0.2)(encoded)
    encoded = Bidirectional(
        LSTM(units=64, return_sequences=True, dropout=0.1, recurrent_dropout=0.0)
    )(encoded)

    hidden, cell = s0, c0
    step_outputs = []
    for _step in range(Ty):
        # Attend over the encoder outputs, then advance the shared post-attention LSTM.
        context = one_step_attention(encoded, hidden)
        hidden, _state_h, cell = post_activation_LSTM_cell(context, initial_state=[hidden, cell])

        # NOTE(review): a new Dense layer is created on every step, so these
        # projections do not share weights across steps.
        step_out = Dense(units=64, activation="relu")(hidden)
        step_outputs.append(Dropout(rate=0.1)(step_out))

    averaged = Average()(step_outputs)
    probabilities = Dense(units=6, activation="sigmoid")(averaged)

    return Model(inputs=[tokens, s0, c0], outputs=probabilities)

In [None]:
# Build the averaged-output attention model. Ty is set to maxlen, so the
# attention loop is unrolled maxlen times. This rebinds `model`, replacing any
# model built by an earlier cell.
model = get_model_attention_3(maxlen, n_a, n_s)
# Alternative optimizer that was tried:
#opt = Nadam(lr=0.001)
opt = Adam(lr=0.005, decay=0.01, beta_1=0.9, beta_2=0.999)

model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
model.summary()

### Training & inference

In [None]:
# Hold out 5% of the training data for validation (fixed seed for a repeatable split).
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
# Prints validation ROC-AUC each epoch and stops at the first non-improvement.
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
# Writes a snapshot after every epoch; epoch number and val_loss go into the file name.
CheckPoint = ModelCheckpoint('../snapshots/weights.{epoch:02d}-{val_loss:.2f}.hdf5')
csv_logger = CSVLogger('../training.log')
# NOTE(review): early_stop is created here but is not passed to any fit() call in this notebook.
early_stop = EarlyStopping(patience=1, verbose=2)

# Size of the training split (compare with x_train.shape).
X_tra.shape

In [None]:
# regular models
batch_size = 128
epochs = 30

hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc, csv_logger, CheckPoint], verbose=2)

In [None]:
# attention models
s0 = np.zeros((X_tra.shape[0], n_s))
c0 = np.zeros((X_tra.shape[0], n_s))

s0_val = np.zeros((X_val.shape[0], n_s))
c0_val = np.zeros((X_val.shape[0], n_s))

RocAuc = RocAucEvaluation(validation_data=([X_val, np.copy(s0_val), np.copy(c0_val)], y_val), interval=1)

epochs = 10
batch_size = 128

hist = model.fit([X_tra, s0, c0], y_tra, epochs=epochs, batch_size=batch_size, validation_data=([X_val, s0_val, c0_val], y_val),
                 callbacks=[csv_logger, RocAuc, CheckPoint], verbose=2)

In [None]:
# Test-set prediction for the attention models: zero initial states sized to
# the test set. This rebinds s0 / c0 from the training cell.
s0 = np.zeros((x_test.shape[0], n_s))
c0 = np.zeros((x_test.shape[0], n_s))

y_pred = model.predict([x_test, s0, c0], batch_size=1024)

In [None]:
# Test-set prediction for the single-input (non-attention) models.
y_pred = model.predict(x_test, batch_size=1024)

### Stratified k-fold training & inference

In [None]:
## Stratified k-fold training

n_folds = 5     # number of folds that are actually trained
n_splits = 20   # number of partitions; each fold validates on 1/n_splits of the data
batch_size = 128
epochs = 20
predict_batch_size = 1024
run_id = 'gru_reg_fasttext_128'


def make_optimizer():
    """Return a fresh optimizer instance.

    Keras optimizers keep their state (such as the iteration counter) on the
    instance. Sharing one instance between the per-fold models would let later
    folds continue from the state left by earlier folds, so each fold gets its
    own optimizer.
    """
    return Nadam(lr=0.001)
    # Alternative that was tried:
    #return Adam(lr=0.003, decay=0.01, beta_1=0.9, beta_2=0.999)


kfold = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = 32)

csv_logger = CSVLogger('../training.log', append=True)
# NOTE(review): early_stop is created but not passed to fit() below.
early_stop = EarlyStopping(verbose=2)

pred = np.zeros((x_test.shape[0], 6))
# Pack the six binary labels of each row into one byte so that the label
# combination can serve as the stratification class.
y_packed = np.packbits(y_train, axis=1)

folds_run = 0
for i, (train_idx, valid_idx) in enumerate(kfold.split(x_train, y_packed)):
    print("Running fold {} / {}".format(i + 1, n_folds))
    print("Training / Valid set counts {} / {}".format(train_idx.shape, valid_idx.shape))

    # Drop the reference to the previous fold's model before building a new one.
    model = None
    model = get_model_3()
    opt = make_optimizer()
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
    #model.summary()

    xs_train, ys_train = x_train[train_idx], y_train[train_idx]
    xs_valid, ys_valid = x_train[valid_idx], y_train[valid_idx]

    # Per-fold snapshot names and a fresh ROC-AUC callback (its best score starts at 0).
    CheckPoint = ModelCheckpoint('../snapshots/' + run_id + '_fold_' + str(i) + '_weights.{epoch:02d}-{val_loss:.2f}.hdf5')
    RocAuc = RocAucEvaluation(validation_data=(xs_valid, ys_valid), interval=1)

    # training
    history = model.fit(xs_train, ys_train, batch_size = batch_size, epochs = epochs, validation_data = (xs_valid, ys_valid),
                          verbose = 2, callbacks=[RocAuc, csv_logger, CheckPoint])
    # predict
    pred += model.predict(x_test, batch_size = predict_batch_size, verbose = 1)
    folds_run += 1

    # Only the first n_folds of the n_splits partitions are used.
    if folds_run == n_folds: break

# Average over the folds that actually ran. This equals n_folds unless the
# splitter produced fewer partitions than requested.
y_pred = pred / folds_run

### Saving model and predictions

In [None]:
# Model loading and predicting

predict_batch_size = 1024
# Sub-folder of ../snapshots/ holding the chosen snapshot of each fold.
model_folder = 'gru_fasttext_128_95p_reg/'
models_files = ['gru_reg_fasttext_128_fold_0_weights.04-0.04.hdf5', 'gru_reg_fasttext_128_fold_1_weights.02-0.04.hdf5', 
               'gru_reg_fasttext_128_fold_2_weights.03-0.04.hdf5', 'gru_reg_fasttext_128_fold_3_weights.02-0.04.hdf5',
               'gru_reg_fasttext_128_fold_4_weights.02-0.04.hdf5']
# One score per entry of models_files, in the same order; used as ranking
# input for the weighted average further down.
# NOTE(review): presumably the validation ROC-AUC of each snapshot - confirm.
scores = [0.989979, 0.988774, 0.991019, 0.989545, 0.989145]


# Load each snapshot in turn and predict the test set; preds holds one array per model.
preds = [load_model('../snapshots/'+model_folder+f).predict(x_test, batch_size = predict_batch_size, verbose = 1) for f in models_files]

In [None]:
# predictions - equal averaging

# Equal-weight ensemble: element-wise mean over the per-model prediction arrays.
y_pred = np.array(preds).mean(axis=0)
y_pred

In [None]:
# weighted average of predictions according to the score of each fold

# argsort applied twice turns the scores into 0-based ranks (lowest score -> 0).
score_rank = np.array(scores).argsort().argsort()
print(score_rank+1)
# Weight each model by its 1-based rank, so the best-scoring model weighs most.
y_pred = np.average(preds, axis=0, weights=score_rank+1)
y_pred

In [None]:
# individual model prediction

# Predict with a single saved snapshot instead of an ensemble; rebinds `model` and `y_pred`.
model_name = 'weights.02-0.04.hdf5'
model = load_model('../snapshots/' + model_name)

y_pred = model.predict(x_test, batch_size=1024)
y_pred

In [None]:
# save
# Write whichever y_pred was computed last into the six label columns of the
# submission template and save it under ../submissions/.
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
outfile = 'gru_fasttext_128_95p_reg.5fold_weighted_avg.csv'
submission.to_csv('../submissions/' + outfile, index=False)

In [None]:
# Preview the first rows of the submission frame.
submission.head()

## LSTM

### standard LSTM + FastText + TimeDistributed(tanh activation)

Running fold 1 / 5
Training / Valid set counts (143603,) / (15968,)
Train on 143603 samples, validate on 15968 samples
Epoch 1/20
 - 396s - loss: 0.0757 - acc: 0.9755 - val_loss: 0.0468 - val_acc: 0.9825

 ROC-AUC - epoch: 1 - score: 0.977484 

Epoch 2/20
 - 391s - loss: 0.0451 - acc: 0.9833 - val_loss: 0.0449 - val_acc: 0.9832

 ROC-AUC - epoch: 2 - score: 0.980896 

Epoch 3/20
 - 390s - loss: 0.0401 - acc: 0.9845 - val_loss: 0.0436 - val_acc: 0.9833

 ROC-AUC - epoch: 3 - score: 0.983498 

Epoch 4/20
 - 391s - loss: 0.0365 - acc: 0.9857 - val_loss: 0.0427 - val_acc: 0.9836

 ROC-AUC - epoch: 4 - score: 0.984817 

Epoch 5/20
 - 391s - loss: 0.0334 - acc: 0.9868 - val_loss: 0.0438 - val_acc: 0.9831

 ROC-AUC - epoch: 5 - score: 0.986026 

#### LSTM (maxlen 200, Units 64) + FastText + MaxPool, LSTM - dropout=0.2, recurrent_dropout=0.0, Spatial Dropout 0.4, TimeDistributed (100, relu, dr 0.2), final dense 64 - dr 0.1 - Batch size 256, - Ep5, Max features 100,000  - With cleaned text

Total params: 30,200,986

Train on 151592 samples, validate on 7979 samples
Epoch 1/10
 - 402s - loss: 0.0653 - acc: 0.9786 - val_loss: 0.0482 - val_acc: 0.9812

 ROC-AUC - epoch: 1 - score: 0.984191 

Epoch 2/10
 - 398s - loss: 0.0410 - acc: 0.9841 - val_loss: 0.0440 - val_acc: 0.9827

 ROC-AUC - epoch: 2 - score: 0.988491 

Epoch 3/10
 - 398s - loss: 0.0368 - acc: 0.9855 - val_loss: 0.0429 - val_acc: 0.9827

 ROC-AUC - epoch: 3 - score: 0.990585 

Epoch 4/10
 - 398s - loss: 0.0335 - acc: 0.9867 - val_loss: 0.0418 - val_acc: 0.9837

 ROC-AUC - epoch: 4 - score: 0.990823 


#### Best out of stratified - LSTM (maxlen 200, Units 64) + GloVe + MaxPool, LSTM - dropout=0.2, recurrent_dropout=0.0, Spatial Dropout 0.4, TimeDistributed (100, relu, dr 0.1), final dense 64 - dr 0.1 - Batch size 256, - Ep3, nadam 0.002, Max features 100,000 

Train on 151582 samples, validate on 7989 samples
Epoch 1/10
 - 409s - loss: 0.0579 - acc: 0.9795 - val_loss: 0.0447 - val_acc: 0.9832

 ROC-AUC - epoch: 1 - score: 0.984229 

Epoch 2/10
 - 405s - loss: 0.0407 - acc: 0.9841 - val_loss: 0.0415 - val_acc: 0.9831

 ROC-AUC - epoch: 2 - score: 0.988578 

Epoch 3/10
 - 405s - loss: 0.0357 - acc: 0.9857 - val_loss: 0.0443 - val_acc: 0.9826

 ROC-AUC - epoch: 3 - score: 0.989446 

#### LSTM (maxlen 200, Units 64) + GloVe + MaxPool, LSTM - dropout=0.2, recurrent_dropout=0.0, Spatial Dropout 0.4, TimeDistributed (100, relu, dr 0.1), final dense 64 - dr 0.1 - Batch size 256, - Ep5, Max features 100,000 

Total params: 30,213,034

Train on 151592 samples, validate on 7979 samples
Epoch 1/5
 - 408s - loss: 0.0664 - acc: 0.9768 - val_loss: 0.0516 - val_acc: 0.9804

 ROC-AUC - epoch: 1 - score: 0.982877 

Epoch 2/5
 - 405s - loss: 0.0434 - acc: 0.9833 - val_loss: 0.0450 - val_acc: 0.9819

 ROC-AUC - epoch: 2 - score: 0.988467 

Epoch 3/5
 - 404s - loss: 0.0391 - acc: 0.9846 - val_loss: 0.0424 - val_acc: 0.9830

 ROC-AUC - epoch: 3 - score: 0.990159 

Epoch 4/5
 - 402s - loss: 0.0358 - acc: 0.9856 - val_loss: 0.0412 - val_acc: 0.9834

 ROC-AUC - epoch: 4 - score: 0.990784 

Epoch 5/5
 - 403s - loss: 0.0329 - acc: 0.9866 - val_loss: 0.0416 - val_acc: 0.9834

 ROC-AUC - epoch: 5 - score: 0.990300 

Epoch 00005: early stopping

**LB 0.9832**

#### LSTM (maxlen 200, Units 64) + GloVe + MaxPool, LSTM - dropout=0.2, recurrent_dropout=0.0, Spatial Dropout 0.4, TimeDistributed (100, relu, dr 0.1), final dense 64 - dr 0.1 - Batch size 256, - Ep5, Max features 100,000 

Total params: 30,213,034

Name: **lstm_l1_64_spatial_dr_0_4_lstm_dr_0_2_timedistributed_dense_relu_100_dr_0_1_amaxpool_dense_64_dr_0_1_glove_ep6_batch_256_nadam_001.csv**

Train on 151592 samples, validate on 7979 samples
Epoch 1/6
 - 416s - loss: 0.0637 - acc: 0.9785 - val_loss: 0.0473 - val_acc: 0.9819

 ROC-AUC - epoch: 1 - score: 0.984785 

Epoch 2/6
 - 397s - loss: 0.0427 - acc: 0.9836 - val_loss: 0.0431 - val_acc: 0.9830

 ROC-AUC - epoch: 2 - score: 0.989311 

Epoch 3/6
 - 398s - loss: 0.0387 - acc: 0.9846 - val_loss: 0.0412 - val_acc: 0.9838

 ROC-AUC - epoch: 3 - score: 0.990382 

Epoch 4/6
 - 399s - loss: 0.0354 - acc: 0.9858 - val_loss: 0.0420 - val_acc: 0.9829

 ROC-AUC - epoch: 4 - score: 0.990670 

Epoch 5/6
 - 399s - loss: 0.0326 - acc: 0.9866 - val_loss: 0.0418 - val_acc: 0.9840

 ROC-AUC - epoch: 5 - score: 0.990223 

Epoch 6/6
 - 400s - loss: 0.0302 - acc: 0.9876 - val_loss: 0.0433 - val_acc: 0.9838

 ROC-AUC - epoch: 6 - score: 0.989657 

**LB 0.9843**

#### LSTM (maxlen 200, Units 64) + Glove + MaxPool, LSTM - dropout=0.2, recurrent_dropout=0.0, Spatial Dropout 0.4, TimeDistributed (100, relu, dr 0.1) - Batch size 256, - Ep5, Max features 100,000 

Total params: 30,200,986, Train on 151592 samples, validate on 7979 samples
Name: **lstm_l1_64_spatial_dr_0_4_lstm_dr_0_2_timedistributed_dense_relu_100_glove_ep5_batch_256_nadam_001**

Epoch 1/5
 - 420s - loss: 0.0662 - acc: 0.9778 - val_loss: 0.0469 - val_acc: 0.9824

 ROC-AUC - epoch: 1 - score: 0.982196 

Epoch 2/5
 - 413s - loss: 0.0424 - acc: 0.9837 - val_loss: 0.0430 - val_acc: 0.9831

 ROC-AUC - epoch: 2 - score: 0.987828 

Epoch 3/5
 - 410s - loss: 0.0382 - acc: 0.9850 - val_loss: 0.0411 - val_acc: 0.9840

 ROC-AUC - epoch: 3 - score: 0.989994 

Epoch 4/5
 - 410s - loss: 0.0352 - acc: 0.9861 - val_loss: 0.0409 - val_acc: 0.9839

 ROC-AUC - epoch: 4 - score: 0.990150 

Epoch 5/5
 - 409s - loss: 0.0324 - acc: 0.9871 - val_loss: 0.0417 - val_acc: 0.9836

 ROC-AUC - epoch: 5 - score: 0.990166


#### LSTM (maxlen 200, Units 64) + Glove + MaxPool, LSTM - dropout=0.5, recurrent_dropout=0.0 - Spatial Dropout 0.4, TimeDistributed (100, relu) - Batch size 128, - Ep5, Max features 100,000 

* Total params: 30,200,986

Train on 151592 samples, validate on 7979 samples
Epoch 1/5
 - 806s - loss: 0.0571 - acc: 0.9797 - val_loss: 0.0450 - val_acc: 0.9825

 ROC-AUC - epoch: 1 - score: 0.984705 

Epoch 2/5
 - 803s - loss: 0.0406 - acc: 0.9843 - val_loss: 0.0427 - val_acc: 0.9827

 ROC-AUC - epoch: 2 - score: 0.989365 

Epoch 3/5
 - 799s - loss: 0.0365 - acc: 0.9856 - val_loss: 0.0408 - val_acc: 0.9840

 ROC-AUC - epoch: 3 - score: 0.990350 

Epoch 4/5
 - 799s - loss: 0.0330 - acc: 0.9868 - val_loss: 0.0425 - val_acc: 0.9824

 ROC-AUC - epoch: 4 - score: 0.990180 

Epoch 5/5
 - 801s - loss: 0.0299 - acc: 0.9879 - val_loss: 0.0423 - val_acc: 0.9837

 ROC-AUC - epoch: 5 - score: 0.989926 


#### LSTM (maxlen 200, Units 256) + FastText + MaxPool - Ep2 - dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.4 - Batch size 128, Max features 100,000

* **Name: lstm_l1_256_spatial_dr_0_4_lstm_dr_0_5_glove_maxpool_ep3_batch_128_nadam.csv**
* Total params: 31,146,886

Train on 151592 samples, validate on 7979 samples
Epoch 1/2
 - 932s - loss: 0.0507 - acc: 0.9813 - val_loss: 0.0585 - val_acc: 0.9818

 ROC-AUC - epoch: 1 - score: 0.987029 

Epoch 2/2
 - 917s - loss: 0.0391 - acc: 0.9847 - val_loss: 0.0505 - val_acc: 0.9822

 ROC-AUC - epoch: 2 - score: 0.988892 

#### LSTM (maxlen 200, Units 256) + FastText + MaxPool - Ep3 - dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.4 - Batch size 256, Max features 100,000

* Train on 151592 samples, validate on 7979 samples
* Epoch 1/3
 - 640s - loss: 0.0539 - acc: 0.9802 - val_loss: 0.0602 - val_acc: 0.9822

 ROC-AUC - epoch: 1 - score: 0.988412 

* Epoch 2/3
 - 635s - loss: 0.0390 - acc: 0.9847 - val_loss: 0.0533 - val_acc: 0.9815

 ROC-AUC - epoch: 2 - score: 0.989449 

* Epoch 3/3
 - 635s - loss: 0.0337 - acc: 0.9865 - val_loss: 0.0488 - val_acc: 0.9826

 ROC-AUC - epoch: 3 - score: 0.988610 
 
* **LB 0.9824**


#### LSTM (maxlen 200, Units 256) + GloVe + MaxPool - Ep3 - dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.4 - Batch size 256, Max features 100,000

Total params: 31,146,886

Train on 151592 samples, validate on 7979 samples
Epoch 1/3
 - 921s - loss: 0.0500 - acc: 0.9815 - val_loss: 0.0584 - val_acc: 0.9821

 ROC-AUC - epoch: 1 - score: 0.987247 

Epoch 2/3
 - 918s - loss: 0.0383 - acc: 0.9849 - val_loss: 0.0500 - val_acc: 0.9824

 ROC-AUC - epoch: 2 - score: 0.989146 

Epoch 3/3
 - 917s - loss: 0.0332 - acc: 0.9867 - val_loss: 0.0483 - val_acc: 0.9818

 ROC-AUC - epoch: 3 - score: 0.988098 