Reference: https://www.kaggle.com/yekenot/pooled-gru-fasttext/output

In [1]:
import numpy as np
# Seed NumPy's global RNG right after import.
# NOTE(review): only NumPy is seeded here; no seed is set for the Keras backend in this cell.
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Dropout
from keras.layers import GRU, LSTM, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

import os
import time
import gc

# NOTE(review): this is set after numpy/keras have already been imported; confirm that the
# thread limit is still picked up, or move the assignment above those imports.
os.environ['OMP_NUM_THREADS'] = '4'

Using TensorFlow backend.


In [2]:
# Which pre-trained word vectors to use: 'glove' or 'fasttext'.
embeddings = 'glove'

# Any value other than 'fasttext' falls back to the GloVe file.
EMBEDDING_FILE = (
    '../data/fasttext/crawl-300d-2M.vec'
    if embeddings == 'fasttext'
    else '../data/glove/glove.840B.300d.txt'
)

# Vocabulary cap shared by the tokenizer and the embedding layer
# (values noted so far: 100000, 30000).
max_features = 100000
# Length every comment is padded/truncated to.
maxlen = 200
# Dimensionality of the word vectors.
embed_size = 300

print(EMBEDDING_FILE)

../data/glove/glove.840B.300d.txt


In [None]:
# Read the training set, the test set and the submission template.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
submission = pd.read_csv('../data/sample_submission.csv')

# Comment strings; a missing comment becomes the literal placeholder "fillna".
X_train = train["comment_text"].fillna("fillna").values
X_test = test["comment_text"].fillna("fillna").values

# Targets: one column per label, in the order listed here.
y_train = train[
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
].values

# Only the extracted arrays are used from here on, so drop the frames.
del train, test

### Learning

In [None]:
# Fit one vocabulary on the train and test comments together; `num_words` caps
# the indices produced by `texts_to_sequences` at `max_features`.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

# NOTE: X_train / X_test are overwritten with index sequences below, so this
# cell cannot be re-run without first re-running the data-loading cell.

# Training comments -> index sequences -> fixed-length integer matrix.
X_train = tokenizer.texts_to_sequences(X_train)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)

# Test comments get exactly the same treatment.
X_test = tokenizer.texts_to_sequences(X_test)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    `word` is returned unchanged; every remaining positional argument is
    treated as one vector component and the lot is converted to a 1-D
    float32 NumPy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
def _read_embeddings_index(path, dim):
    """Parse a text embedding file into a ``{token: float32 vector}`` dict.

    Each usable line is ``<token> <v1> ... <v_dim>`` separated by single spaces.

    * The file is opened as UTF-8 inside a ``with`` block so the handle is
      closed when parsing finishes.
    * Lines are split from the right at most `dim` times, so a token that
      itself contains spaces (these occur in glove.840B.300d) stays intact
      instead of having its pieces fed to the float conversion.
    * Lines that do not carry exactly `dim` components are skipped, such as the
      ``<count> <dim>`` header line of fastText ``.vec`` files.

    If a token appears more than once, the last occurrence wins.
    """
    index = {}
    with open(path, encoding='utf-8') as fh:
        for line in fh:
            parts = line.rstrip().rsplit(' ', dim)
            if len(parts) != dim + 1:
                continue
            token, vector = get_coefs(*parts)
            index[token] = vector
    return index


embeddings_index = _read_embeddings_index(EMBEDDING_FILE, embed_size)

word_index = tokenizer.word_index
# Tokenizer word indices start at 1 (0 is the padding index), so a vocabulary of
# N words needs N + 1 rows. When the vocabulary is larger than `max_features`
# this is simply `max_features`.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    # Guard on the real row count so a small vocabulary cannot index past the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # Words without a pre-trained vector keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

gc.collect()

In [4]:
# Sanity check: the matrix should be (number of embedding rows, embed_size).
embedding_matrix.shape

(100000, 300)

In [3]:
import pickle

# 'write' saves the arrays built by the preprocessing cells; any other value
# (normally 'load') restores them from disk so those cells can be skipped.
mode = 'load' #'write', 'load'


def _pickle_dump(obj, path):
    """Serialize `obj` to `path`, closing the file afterwards."""
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)


def _pickle_load(path):
    """Return the object stored at `path`, closing the file afterwards.

    SECURITY: unpickling can execute arbitrary code. Only load cache files
    that were written by this notebook.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Cache file names. Runs with max_features > 30000 carry a `_feat_<max_features>`
# tag; smaller runs use the untagged names.
# NOTE(review): `_seq_200` is hard-coded rather than derived from `maxlen`, so a
# cache written with a different `maxlen` would be mislabelled.
if max_features > 30000:
    feat_tag = '_feat_' + str(max_features)
else:
    feat_tag = ''

embedding_matrix_file = embeddings + '_embedding_matrix' + feat_tag + '.pkl'
x_train_file = '../models/x_train' + feat_tag + '_seq_200.pkl'
x_test_file = '../models/x_test' + feat_tag + '_seq_200.pkl'

if mode == 'write':
    _pickle_dump(x_train, x_train_file)
    _pickle_dump(x_test, x_test_file)
    _pickle_dump(embedding_matrix, '../models/' + embedding_matrix_file)
else:
    x_train = _pickle_load(x_train_file)
    x_test = _pickle_load(x_test_file)
    embedding_matrix = _pickle_load('../models/' + embedding_matrix_file)

# The labels and the submission template are not cached, so re-read them here.
train = pd.read_csv('../data/train.csv')
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
submission = pd.read_csv('../data/sample_submission.csv')

del train

In [5]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after epochs.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair. It is unpacked into exactly two values, so the
        default empty tuple raises ``ValueError``; a pair must be supplied.
    interval : int
        Evaluate when ``epoch % interval == 0``. `epoch` is the zero-based
        index passed to `on_epoch_end`, so the first epoch is always scored.
    """

    def __init__(self, validation_data=(), interval=1):
        # The original `super(Callback, self)` started the lookup *after*
        # Callback in the MRO, so Callback.__init__ never ran.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation set and print the result (1-based epoch number).

        `logs` is accepted for interface compatibility and not used; its
        default is None rather than a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


def get_model():
    """Build the (uncompiled) bidirectional-LSTM classifier.

    Architecture, in order: input of `maxlen` token ids -> embedding
    initialised from `embedding_matrix` -> SpatialDropout1D(0.4) ->
    Bidirectional LSTM (256 units per direction, full sequence returned,
    recurrent dropout 0.5) -> Dropout(0.5) -> global average pooling and
    global max pooling over time, concatenated -> Dense(6, sigmoid).

    Reads the module-level `maxlen` and `embedding_matrix`.

    Returns
    -------
    keras.models.Model
    """
    # Size the embedding layer from the weight matrix it is initialised with,
    # so the layer and its weights always agree even when the matrix has fewer
    # rows than `max_features`.
    vocab_size, vector_dim = embedding_matrix.shape

    inp = Input(shape=(maxlen, ))
    x = Embedding(vocab_size, vector_dim, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.4)(x)
    x = Bidirectional(LSTM(256, return_sequences=True, recurrent_dropout=0.5))(x)
    x = Dropout(0.5)(x)

    # Summarise the per-timestep outputs in two ways and use both.
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(6, activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=outp)

    return model

In [6]:
# Build the network, attach loss/optimizer/metric, and print the layer table.
model = get_model()
model.compile(
    optimizer='nadam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 300)     30000000    input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 200, 300)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 200, 512)     1140736     spatial_dropout1d_1[0][0]        
__________________________________________________________________________________________________
dropout_1 

In [8]:
batch_size = 128
epochs = 2

# Keep 95% of the rows for training; the remainder is the validation split
# (fixed random_state so the split is repeatable).
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train,
    y_train,
    train_size=0.95,
    random_state=233,
)

# Report validation ROC-AUC after every epoch.
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
    verbose=2,
)

Train on 151592 samples, validate on 7979 samples
Epoch 1/2
 - 915s - loss: 0.0283 - acc: 0.9887 - val_loss: 0.0505 - val_acc: 0.9793

 ROC-AUC - epoch: 1 - score: 0.987319 

Epoch 2/2
 - 917s - loss: 0.0239 - acc: 0.9906 - val_loss: 0.0525 - val_acc: 0.9796

 ROC-AUC - epoch: 2 - score: 0.985392 



In [None]:
# Predict the six label probabilities for the test set and write them into the
# submission template.
y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred

# The file name used to hard-code "fasttext", "ep3" and "batch_256" regardless
# of the actual run. Derive those parts from the live configuration so the
# saved file always describes the model that produced it.
submission_file = (
    '../submissions/lstm_l1_256_spatial_dr_0_4_gpu_dr_0_5_'
    '{}_maxpool_ep{}_batch_{}_nadam.csv'.format(embeddings, epochs, batch_size)
)
submission.to_csv(submission_file, index=False)

In [None]:
# Preview the first rows of the submission frame.
submission.head()

## LSTM

#### LSTM (maxlen 200, Units 256) + GloVe + MaxPool - Ep3 - dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.4 - Batch size 256, Max features 100,000

Total params: 31,146,886

Train on 151592 samples, validate on 7979 samples
Epoch 1/3
 - 921s - loss: 0.0500 - acc: 0.9815 - val_loss: 0.0584 - val_acc: 0.9821

 ROC-AUC - epoch: 1 - score: 0.987247 

Epoch 2/3
 - 918s - loss: 0.0383 - acc: 0.9849 - val_loss: 0.0500 - val_acc: 0.9824

 ROC-AUC - epoch: 2 - score: 0.989146 

Epoch 3/3
 - 917s - loss: 0.0332 - acc: 0.9867 - val_loss: 0.0483 - val_acc: 0.9818

 ROC-AUC - epoch: 3 - score: 0.988098 
 

#### LSTM (maxlen 200, Units 256) + FastText + MaxPool - Ep3 - dropout=0.5, recurrent_dropout=0.5 - Spatial Dropout 0.4 - Batch size 256, Max features 100,000

* Train on 151592 samples, validate on 7979 samples
* Epoch 1/3
 - 640s - loss: 0.0539 - acc: 0.9802 - val_loss: 0.0602 - val_acc: 0.9822

 ROC-AUC - epoch: 1 - score: 0.988412 

* Epoch 2/3
 - 635s - loss: 0.0390 - acc: 0.9847 - val_loss: 0.0533 - val_acc: 0.9815

 ROC-AUC - epoch: 2 - score: 0.989449 

* Epoch 3/3
 - 635s - loss: 0.0337 - acc: 0.9865 - val_loss: 0.0488 - val_acc: 0.9826

 ROC-AUC - epoch: 3 - score: 0.988610 
 
* **Kaggle leaderboard (LB) score: 0.9824**