Reference: https://www.kaggle.com/yekenot/pooled-gru-fasttext/output

In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Dropout
from keras.layers import GRU, LSTM, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
warnings.filterwarnings('ignore')

import os
import time
import gc

os.environ['OMP_NUM_THREADS'] = '4'

Using TensorFlow backend.


In [9]:
embeddings = 'glove' #'glove', 'fasttext

if embeddings == 'fasttext':
    EMBEDDING_FILE = '../data/fasttext/crawl-300d-2M.vec'
else:
    EMBEDDING_FILE = '../data/glove/glove.840B.300d.txt'    

max_features = 30000 #100000
maxlen = 200
embed_size = 300

print(EMBEDDING_FILE)

../data/glove/glove.840B.300d.txt


In [10]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
submission = pd.read_csv('../data/sample_submission.csv')

X_train = train["comment_text"].fillna("fillna").values
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"].fillna("fillna").values

del train
del test

### Learning

In [11]:
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE))

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector
        
gc.collect()

0

In [12]:
len(word_index), max_features
embedding_matrix.shape
#len(word_index)

(30000, 300)

In [13]:
import pickle

mode = 'load' #'write'

if max_features > 30000:
    embedding_matrix_file = embeddings + '_embedding_matrix_feat_' + str(max_features) + '.pkl'
    
    if mode == 'write':
        pickle.dump(x_train, open('../models/x_train_feat_' + str(max_features) + '_seq_200.pkl', 'wb'))
        pickle.dump(x_test, open('../models/x_test_feat_' + str(max_features) + '_seq_200.pkl', 'wb'))
        pickle.dump(embedding_matrix, open('../models/' + embedding_matrix_file, 'wb'))
    else:        
        x_train = pickle.load( open('../models/x_train_feat_' + str(max_features) + '_seq_200.pkl', 'rb') )
        x_test = pickle.load( open('../models/x_test_feat_' + str(max_features) + '_seq_200.pkl', 'rb') )
        embedding_matrix = pickle.load( open('../models/' + embedding_matrix_file, 'rb') )
else:
    embedding_matrix_file = embeddings + '_embedding_matrix.pkl'
    
    if mode == 'write':    
        pickle.dump(x_train, open('../models/x_train_seq_200.pkl', 'wb'))
        pickle.dump(x_test, open('../models/x_test_seq_200.pkl', 'wb'))
        pickle.dump(embedding_matrix, open('../models/' + embedding_matrix_file, 'wb'))
    else:
        x_train = pickle.load( open('../models/x_train_seq_200.pkl', 'rb') )
        x_test = pickle.load( open('../models/x_test_seq_200.pkl', 'rb') )
        embedding_matrix = pickle.load( open('../models/' + embedding_matrix_file, 'rb') )
    
#train = pd.read_csv('../data/train.csv')
#y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
#submission = pd.read_csv('../data/sample_submission.csv')

#del train

In [None]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


def get_model():
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.4)(x)
    x = Bidirectional(GRU(128, return_sequences=True, recurrent_dropout=0.5))(x)
    x = Dropout(0.5)(x)
    x = GRU(128, return_sequences=True, recurrent_dropout=0.5)(x)
    x = Dropout(0.5)(x)
    
    # global pooling layer
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(6, activation="sigmoid")(conc)
    
    model = Model(inputs=inp, outputs=outp)

    return model

def get_model_2():
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.4)(x)
    x = Bidirectional(GRU(256, return_sequences=True, recurrent_dropout=0.5))(x)
    x = Dropout(0.5)(x)
    
    # 2-layer GPU
    #x = GRU(256, return_sequences=True, recurrent_dropout=0.5)(x)
    #x = Dropout(0.5)(x)
    
    #x = TimeDistributed(Dense(64, activation = "relu"))(x) # time distributed  (sigmoid)
    
    # global pooling layer
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(6, activation="sigmoid")(conc)
    
    model = Model(inputs=inp, outputs=outp)

    return model

def get_model_3(): #LSTM
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.4)(x)
    x = Bidirectional(LSTM(256, return_sequences=True, recurrent_dropout=0.5))(x)
    x = Dropout(0.5)(x)

    # 2-layer GPU
    #x = GRU(256, return_sequences=True, recurrent_dropout=0.5)(x)
    #x = Dropout(0.5)(x)
    
    # global pooling layer
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(6, activation="sigmoid")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    
    return model    

In [None]:
model = get_model() 
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
model.summary()

In [None]:
batch_size = 128 # 32
epochs = 2

X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc], verbose=2)


In [None]:
y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('../submissions/gru_l1_gru128_spatial_dr_0_4_gpu_dr_0_5_fasttext_maxpool_ep1_batch_128.csv', index=False)

In [None]:
submission.head()

### Records

#### GRU (maxlen 100, Units 64) + FastText + MaxPool - Ep2 
Train on 151592 samples, validate on 7979 samples

* Total params: 9,184,806

* Epoch 1/2
 - 1442s - loss: 0.0499 - acc: 0.9820 - val_loss: 0.0464 - val_acc: 0.9821

 ROC-AUC - epoch: 1 - score: 0.987249 

* Epoch 2/2
 - 1436s - loss: 0.0379 - acc: 0.9852 - val_loss: 0.0449 - val_acc: 0.9825

 ROC-AUC - epoch: 2 - score: 0.987349 
* ** LB 0.9812 **
 
 
 #### GRU (maxlen 100, , Units 64) + FastText + MaxPool - Ep3 

 Train on 151592 samples, validate on 7979 samples
* Epoch 1/3
 - 1430s - loss: 0.0316 - acc: 0.9876 - val_loss: 0.0468 - val_acc: 0.9823

 ROC-AUC - epoch: 1 - score: 0.986716 

* Epoch 2/3
 - 1397s - loss: 0.0204 - acc: 0.9922 - val_loss: 0.0584 - val_acc: 0.9810

* ROC-AUC - epoch: 3 - score: 0.984497 

* ** LB 0.6068 **

 #### GRU (maxlen 200, Units 64) + FastText + MaxPool - Ep3 

Train on 151592 samples, validate on 7979 samples
* Total params: 9,184,806

* Epoch 1/2
 - 2454s - loss: 0.0476 - acc: 0.9823 - val_loss: 0.0440 - val_acc: 0.9834

 ROC-AUC - epoch: 1 - score: 0.988144 

* Epoch 2/2
 - 2556s - loss: 0.0370 - acc: 0.9854 - val_loss: 0.0445 - val_acc: 0.9828

 ROC-AUC - epoch: 2 - score: 0.988467 

* Epoch 2/2

* ** LB 0.9813 **

 #### GRU (maxlen 200, Units 128) + FastText + MaxPool - Ep2

* Total params: 9,332,550

* Train on 151592 samples, validate on 7979 samples
* Epoch 1/2
 - 3034s - loss: 0.0487 - acc: 0.9821 - val_loss: 0.0435 - val_acc: 0.9832

 ROC-AUC - epoch: 1 - score: 0.988728 

* Epoch 2/2
 - 3028s - loss: 0.0367 - acc: 0.9855 - val_loss: 0.0436 - val_acc: 0.9834

 ROC-AUC - epoch: 2 - score: 0.988866 
 
 * **LB 0.9823**
 
#### GRU (maxlen 200, Units 256) + FastText + MaxPool - Ep2
 
* Total params: 9,861,702
 
* Train on 151592 samples, validate on 7979 samples
* Epoch 1/2
 - 5415s - loss: 0.0476 - acc: 0.9823 - val_loss: 0.0430 - val_acc: 0.9836

 ROC-AUC - epoch: 1 - score: 0.988848 

* Epoch 2/2
 - 5429s - loss: 0.0370 - acc: 0.9855 - val_loss: 0.0444 - val_acc: 0.9827

 ROC-AUC - epoch: 2 - score: 0.988651 
 
 
#### GRU (maxlen 200, Units 128) + FastText + MaxPool - Ep2 -  dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.4
  
*  Total params: 9,332,550

* Train on 151592 samples, validate on 7979 samples

* Epoch 1/2
 - 3130s - loss: 0.0537 - acc: 0.9807 - val_loss: 0.0446 - val_acc: 0.9827

 ROC-AUC - epoch: 1 - score: 0.987232 

* Epoch 2/2
 - 3007s - loss: 0.0418 - acc: 0.9839 - val_loss: 0.0435 - val_acc: 0.9835

 ROC-AUC - epoch: 2 - score: 0.988311 
 
 
#### GRU (maxlen 200, Units 128) + FastText + MaxPool - Ep1 -  dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.4 - Batch size 128, Max features 30,000

* Total params: 9,332,550
 
* Train on 151592 samples, validate on 7979 samples
Epoch 1/1
 - 2375s - loss: 0.0373 - acc: 0.9853 - val_loss: 0.0438 - val_acc: 0.9832

* ROC-AUC - epoch: 1 - score: 0.988689 

* **LB 0.9832**

 
#### GRU (maxlen 200, Units 256) + FastText + MaxPool - Ep2 -  dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.5 - Batch size 128, Max features 30,000
 
* Total params: 9,861,702

* Train on 151592 samples, validate on 7979 samples
Epoch 1/2
 - 701s - loss: 0.0445 - acc: 0.9831 - val_loss: 0.0478 - val_acc: 0.9815
 - ROC-AUC - epoch: 1 - score: 0.987924 

* Epoch 2/2
 - 704s - loss: 0.0406 - acc: 0.9843 - val_loss: 0.0453 - val_acc: 0.9823
 - ROC-AUC - epoch: 2 - score: 0.989146
 
 
#### GRU (maxlen 200, Units 256) + FastText + MaxPool - Ep5 -  dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.5 - Batch size 128, Max features 30,000 - Optimizer: nadam?

* Total params: 9,861,702
 
* Train on 151592 samples, validate on 7979 samples
* Epoch 1/5
 - 702s - loss: 0.0384 - acc: 0.9849 - val_loss: 0.0445 - val_acc: 0.9827

 ROC-AUC - epoch: 1 - score: 0.989080 

* Epoch 2/5
 - 703s - loss: 0.0365 - acc: 0.9857 - val_loss: 0.0441 - val_acc: 0.9826

 ROC-AUC - epoch: 2 - score: 0.989194 

* Epoch 3/5
 - 702s - loss: 0.0348 - acc: 0.9862 - val_loss: 0.0451 - val_acc: 0.9825

 ROC-AUC - epoch: 3 - score: 0.988923 

* Epoch 4/5
 - 702s - loss: 0.0332 - acc: 0.9867 - val_loss: 0.0481 - val_acc: 0.9815

 ROC-AUC - epoch: 4 - score: 0.988581 
 
* Epoch 5/5
 - 702s - loss: 0.0320 - acc: 0.9872 - val_loss: 0.0477 - val_acc: 0.9825

 ROC-AUC - epoch: 5 - score: 0.988151 
 
 
#### GRU (maxlen 200, Units 256) + FastText + MaxPool - Ep2 -  dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.5 - Batch size 256, Max features 30,000 - Optimizer: nadam

* Total params: 9,478,854

* Train on 151592 samples, validate on 7979 samples
* Epoch 1/2
 - 447s - loss: 0.0588 - acc: 0.9794 - val_loss: 0.0510 - val_acc: 0.9801

 - ROC-AUC - epoch: 1 - score: 0.985962 

* Epoch 2/2
 - 445s - loss: 0.0434 - acc: 0.9835 - val_loss: 0.0496 - val_acc: 0.9805

 - ROC-AUC - epoch: 2 - score: 0.987739 

#### GRU (maxlen 200, Units 256) + FastText + MaxPool - Ep2 -  dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.5 - Batch size 256, Max features 30,000 - Optimizer: adam

* Total params: 9,861,702

* Train on 151592 samples, validate on 7979 samples
Epoch 1/2
 - 444s - loss: 0.0693 - acc: 0.9768 - val_loss: 0.0536 - val_acc: 0.9799

 - ROC-AUC - epoch: 1 - score: 0.976985 

* Epoch 2/2
 - 442s - loss: 0.0466 - acc: 0.9827 - val_loss: 0.0484 - val_acc: 0.9811

 - ROC-AUC - epoch: 2 - score: 0.986464 
 
#### GRU (maxlen 200, Units 256) + FastText + MaxPool - Ep2 -  dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.5 - Batch size 128, Max features 30,000 - Optimizer: adam

* Train on 151592 samples, validate on 7979 samples
* Epoch 1/2
 - 701s - loss: 0.0619 - acc: 0.9789 - val_loss: 0.0537 - val_acc: 0.9792

 - ROC-AUC - epoch: 1 - score: 0.982604 

* Epoch 2/2
  - 692s - loss: 0.0451 - acc: 0.9830 - val_loss: 0.0476 - val_acc: 0.9815
  - ROC-AUC - epoch: 2 - score: 0.987292 
 
  
#### GRU (maxlen 200, Units 256) + FastText + MaxPool - Ep2 -  dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.5 - Batch size 128, Max features 30,000 - Optimizer: nadam


Train on 151592 samples, validate on 7979 samples
Epoch 1/3
 - 698s - loss: 0.0546 - acc: 0.9805 - val_loss: 0.0460 - val_acc: 0.9829

 ROC-AUC - epoch: 1 - score: 0.987629 

Epoch 2/3
 - 696s - loss: 0.0424 - acc: 0.9837 - val_loss: 0.0439 - val_acc: 0.9827

 ROC-AUC - epoch: 2 - score: 0.988500 

Epoch 3/3
 - 694s - loss: 0.0395 - acc: 0.9846 - val_loss: 0.0486 - val_acc: 0.9813

 ROC-AUC - epoch: 3 - score: 0.987784 
      

#### GRU (maxlen 200, Units 256) + FastText + MaxPool - Ep5 -  dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.4 - Batch size 256, Max features 30,000 - Optimizer: adam


Train on 151592 samples, validate on 7979 samples
Epoch 1/5
 - 442s - loss: 0.0388 - acc: 0.9850 - val_loss: 0.0471 - val_acc: 0.9811

 ROC-AUC - epoch: 1 - score: 0.988906 

Epoch 2/5
 - 443s - loss: 0.0366 - acc: 0.9855 - val_loss: 0.0440 - val_acc: 0.9825

 ROC-AUC - epoch: 2 - score: 0.989002 

Epoch 3/5
 - 443s - loss: 0.0349 - acc: 0.9861 - val_loss: 0.0441 - val_acc: 0.9830

 ROC-AUC - epoch: 3 - score: 0.989010 

Epoch 4/5
 - 443s - loss: 0.0330 - acc: 0.9868 - val_loss: 0.0460 - val_acc: 0.9819

 ROC-AUC - epoch: 4 - score: 0.988410 

Epoch 5/5


#### GRU (maxlen 200, Units 256) + FastText + MaxPool - Ep3 -  dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.4 - Batch size 256, Max features 30,000 - Optimizer: adam

Train on 151592 samples, validate on 7979 samples
Epoch 1/3
 - 444s - loss: 0.0676 - acc: 0.9773 - val_loss: 0.0487 - val_acc: 0.9815

 ROC-AUC - epoch: 1 - score: 0.982159 

Epoch 2/3
 - 442s - loss: 0.0450 - acc: 0.9832 - val_loss: 0.0479 - val_acc: 0.9812

 ROC-AUC - epoch: 2 - score: 0.987763 

Epoch 3/3
 - 442s - loss: 0.0412 - acc: 0.9841 - val_loss: 0.0447 - val_acc: 0.9825

 ROC-AUC - epoch: 3 - score: 0.988995 
 
* **LB 0.9821**
 
 
#### GRU (maxlen 200, Units 256) + FastText + MaxPool - Ep3 -  dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.4 - Batch size 128, Max features 30,000 - Optimizer: adam

 
 Train on 151592 samples, validate on 7979 samples
Epoch 1/3
 - 687s - loss: 0.0396 - acc: 0.9847 - val_loss: 0.0465 - val_acc: 0.9815

 ROC-AUC - epoch: 1 - score: 0.988641 

Epoch 2/3
 - 687s - loss: 0.0371 - acc: 0.9854 - val_loss: 0.0442 - val_acc: 0.9829

 ROC-AUC - epoch: 2 - score: 0.988676 

Epoch 3/3
 - 687s - loss: 0.0350 - acc: 0.9861 - val_loss: 0.0446 - val_acc: 0.9824

 ROC-AUC - epoch: 3 - score: 0.988746
 
 ----
 
 #### GRU (maxlen 200, Units 256) + FastText + MaxPool - Ep2 -  dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.4 - Batch size 256, Max features 100,000 - Optimizer: adam

* Total params: 30,861,702

Train on 151592 samples, validate on 7979 samples
Epoch 1/2
 - 457s - loss: 0.0679 - acc: 0.9769 - val_loss: 0.0484 - val_acc: 0.9814

 ROC-AUC - epoch: 1 - score: 0.982566 

Epoch 2/2
 - 456s - loss: 0.0451 - acc: 0.9830 - val_loss: 0.0491 - val_acc: 0.9803

 ROC-AUC - epoch: 2 - score: 0.986838 


#### GRU (maxlen 200, Units 256) + FastText + MaxPool - Ep5 -  dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.4 - Batch size 256, Max features 100,000 - Optimizer: adam


Train on 151592 samples, validate on 7979 samples
Epoch 1/5
 - 456s - loss: 0.0407 - acc: 0.9843 - val_loss: 0.0442 - val_acc: 0.9825

 ROC-AUC - epoch: 1 - score: 0.989354 

Epoch 2/5
 - 456s - loss: 0.0375 - acc: 0.9852 - val_loss: 0.0439 - val_acc: 0.9825

 ROC-AUC - epoch: 2 - score: 0.989724 

Epoch 3/5


#### GRU (maxlen 200, Units 256) + FastText + MaxPool - Ep3 -  dropout (external) =0.5, recurrent_dropout=0.5 - Spatial Dropout 0.4 - Batch size 256, Max features 100,000 - Optimizer: adam

**Attempt 1**

Train on 151592 samples, validate on 7979 samples
Epoch 1/3
 - 473s - loss: 0.0530 - acc: 0.9806 - val_loss: 0.0598 - val_acc: 0.9826

 ROC-AUC - epoch: 1 - score: 0.989032 

Epoch 2/3
 - 461s - loss: 0.0390 - acc: 0.9847 - val_loss: 0.0513 - val_acc: 0.9827

 ROC-AUC - epoch: 2 - score: 0.989377 
 
**Attempt 2**
 
Train on 151592 samples, validate on 7979 samples
Epoch 1/3
 - 462s - loss: 0.0507 - acc: 0.9814 - val_loss: 0.0720 - val_acc: 0.9767

 ROC-AUC - epoch: 1 - score: 0.988744 

Epoch 2/3
 - 462s - loss: 0.0385 - acc: 0.9848 - val_loss: 0.0509 - val_acc: 0.9819

 ROC-AUC - epoch: 2 - score: 0.989492 

Epoch 3/3
 - 462s - loss: 0.0335 - acc: 0.9865 - val_loss: 0.0599 - val_acc: 0.9772

 ROC-AUC - epoch: 3 - score: 0.988826 
 
**Attempt 2** (With nadam, external Dropout and ep 2)
 
 Train on 151592 samples, validate on 7979 samples
Epoch 1/2
 - 464s - loss: 0.0534 - acc: 0.9804 - val_loss: 0.0620 - val_acc: 0.9736

 ROC-AUC - epoch: 1 - score: 0.988188 

Epoch 2/2
 - 461s - loss: 0.0393 - acc: 0.9846 - val_loss: 0.0528 - val_acc: 0.9788

 ROC-AUC - epoch: 2 - score: 0.989730
 
 * **LB 0.9827**

 ---

## LSTM

#### LSTM (maxlen 200, Units 256) + FastText + MaxPool - Ep5 -  dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.4  - Batch size 128, Max features 100,000 - Optimizer: nadam

* Total params: 9,442,374

Train on 151592 samples, validate on 7979 samples
Epoch 1/5
 - 910s - loss: 0.0502 - acc: 0.9816 - val_loss: 0.0590 - val_acc: 0.9825

 ROC-AUC - epoch: 1 - score: 0.989202 

Epoch 2/5
 - 906s - loss: 0.0379 - acc: 0.9851 - val_loss: 0.0505 - val_acc: 0.9823

 ROC-AUC - epoch: 2 - score: 0.989451 

Epoch 3/5
 - 906s - loss: 0.0323 - acc: 0.9871 - val_loss: 0.0492 - val_acc: 0.9823

 ROC-AUC - epoch: 3 - score: 0.988694 

Epoch 4/5
 - 906s - loss: 0.0266 - acc: 0.9895 - val_loss: 0.0501 - val_acc: 0.9801

 ROC-AUC - epoch: 4 - score: 0.986024 

----

##  Stacked GRU

#### L2 - GRU (maxlen 200, Units 128) + FastText + MaxPool - Ep2 -  dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.4 - Batch size 128, Max features 30,000
 
* Total params: 9,478,854

* Train on 151592 samples, validate on 7979 samples
* Epoch 1/2
 - 3217s - loss: 0.0448 - acc: 0.9830 - val_loss: 0.0531 - val_acc: 0.9832

 - ROC-AUC - epoch: 1 - score: 0.987085 

* Epoch 2/2
 - 3194s - loss: 0.0393 - acc: 0.9846 - val_loss: 0.0582 - val_acc: 0.9809

 - ROC-AUC - epoch: 2 - score: 0.987217 
 
 
#### L2 - GRU (maxlen 200, Units 256) + FastText + MaxPool - Ep5 -  dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.4 - Batch size 256, Max features 100,000

* Total params: 31,449,222
 
 ----

## SpatialDropout rate

##### 0.3 - GRU (maxlen 200, Units 128) + FastText + MaxPool - Ep3 (trained on GPU - K80)
 
* Train on 151592 samples, validate on 7979 samples
Epoch 1/2
 - 2073s - loss: 0.0494 - acc: 0.9819 - val_loss: 0.0435 - val_acc: 0.9832

* ROC-AUC - epoch: 1 - score: 0.988933 

Epoch 2/2

##### 0.3 - GRU (maxlen 200, Units 128) + FastText + MaxPool - Ep3 (trained on CPU quad core)

Train on 151592 samples, validate on 7979 samples
Epoch 1/2
 - 2891s - loss: 0.0489 - acc: 0.9820 - val_loss: 0.0437 - val_acc: 0.9830

 ROC-AUC - epoch: 1 - score: 0.988025 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.988614 

* **LB 0.9822**

##### 0.4 - GRU (maxlen 200, Units 128) + FastText + MaxPool - Ep3 (trained on GPU - V100)

* Train on 151592 samples, validate on 7979 samples
* Epoch 1/2
 - 2189s - loss: 0.0499 - acc: 0.9816 - val_loss: 0.0438 - val_acc: 0.9832

 ROC-AUC - epoch: 1 - score: 0.988505 

* Epoch 2/2
 - 2149s - loss: 0.0384 - acc: 0.9850 - val_loss: 0.0435 - val_acc: 0.9837

 ROC-AUC - epoch: 2 - score: 0.988554 
 
 * **0.9825**

-------

## Multiple Dense layers

#### GRU (maxlen 200, Units 256) + FastText -> Spatial Dropout 0.4, dropout (external) =0.5, recurrent_dropout=0.5 -> MaxPool -> Dense (64, relu) - Ep5 - Batch size 256, Max features 100,000 - Optimizer: adam

Train on 151592 samples, validate on 7979 samples
Epoch 1/5
 - 459s - loss: 0.0643 - acc: 0.9775 - val_loss: 0.0518 - val_acc: 0.9791

 ROC-AUC - epoch: 1 - score: 0.983117 

Epoch 2/5
 - 458s - loss: 0.0463 - acc: 0.9824 - val_loss: 0.0478 - val_acc: 0.9828

 ROC-AUC - epoch: 2 - score: 0.986829 

Epoch 3/5
 - 458s - loss: 0.0418 - acc: 0.9834 - val_loss: 0.0501 - val_acc: 0.9804

 ROC-AUC - epoch: 3 - score: 0.986362 

Epoch 4/5
 - 458s - loss: 0.0379 - acc: 0.9847 - val_loss: 0.0478 - val_acc: 0.9829

 ROC-AUC - epoch: 4 - score: 0.985988 

Epoch 5/5
 - 458s - loss: 0.0349 - acc: 0.9855 - val_loss: 0.0482 - val_acc: 0.9823

 ROC-AUC - epoch: 5 - score: 0.986694 
 
**Attempt 2 (Batch 128) **
 
 Train on 151592 samples, validate on 7979 samples
Epoch 1/5
 - 706s - loss: 0.0357 - acc: 0.9854 - val_loss: 0.0511 - val_acc: 0.9808

 ROC-AUC - epoch: 1 - score: 0.985459 

Epoch 2/5
 - 706s - loss: 0.0332 - acc: 0.9864 - val_loss: 0.0547 - val_acc: 0.9804

 ROC-AUC - epoch: 2 - score: 0.984432 

Epoch 3/5
 - 706s - loss: 0.0312 - acc: 0.9870 - val_loss: 0.0572 - val_acc: 0.9812

 ROC-AUC - epoch: 3 - score: 0.984002 