Reference: https://www.kaggle.com/yekenot/pooled-gru-fasttext/output

In [1]:
import os
import warnings

# Native BLAS/OpenMP runtimes generally read OMP_NUM_THREADS when they are
# loaded, so the cap must be in the environment *before* numpy and the Keras
# backend are imported; setting it afterwards is usually a no-op.
os.environ['OMP_NUM_THREADS'] = '4'

import numpy as np
np.random.seed(42)  # seed NumPy's global RNG before Keras is imported
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

# Suppress library warnings from here on so the training log stays readable.
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
# Path to the pre-trained word-vector file read when building the embeddings.
EMBEDDING_FILE = '../data/fasttext/crawl-300d-2M.vec'

# Training rows, test rows, and the submission template.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
submission = pd.read_csv('../data/sample_submission.csv')

# Raw comment strings; a missing comment becomes the literal token "fillna".
X_train = train["comment_text"].fillna("fillna").values
X_test = test["comment_text"].fillna("fillna").values

# One target column per label, stacked into a 2-D array.
y_train = train[
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
].values

In [3]:
max_features = 30000  # vocabulary cap handed to the tokenizer (num_words)
maxlen = 100          # every comment is padded/truncated to this many tokens
embed_size = 300      # width of each word vector in the embedding matrix

# The vocabulary is fitted on the train and test comments together.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

# X_train / X_test deliberately keep holding the raw text: rebinding them to
# token-id lists would make a second run of this cell fit and convert integer
# sequences instead of comments. Only the padded arrays get new names.
x_train = sequence.pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=maxlen)
x_test = sequence.pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=maxlen)

def get_coefs(word, *arr):
    """Convert one split line of the vector file into ``(word, float32 array)``.

    ``word`` is the first field of the line; every remaining field is parsed
    as a float32 component of the vector.
    """
    return word, np.asarray(arr, dtype='float32')


# The with-block closes the file once the index is built, and the explicit
# encoding keeps the read independent of the platform's default codec.
with open(EMBEDDING_FILE, encoding='utf-8') as vec_file:
    parsed_lines = (get_coefs(*o.rstrip().rsplit(' ')) for o in vec_file)
    # fastText .vec files begin with a "<vocab size> <dimension>" header line,
    # which would otherwise be stored as a word with a length-1 vector. Keeping
    # only vectors of exactly embed_size components drops it (and any other
    # malformed line).
    embeddings_index = {
        word: vector
        for word, vector in parsed_lines
        if vector.shape[0] == embed_size
    }

word_index = tokenizer.word_index
# Keras tokenizer indices start at 1 (0 is never assigned to a word), so the
# matrix needs len(word_index) + 1 rows to hold the largest index whenever the
# vocabulary is smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    # Words ranked beyond the cap have no row in the matrix.
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    # Words without a pre-trained vector keep their all-zero row. The shape
    # check stops NumPy from broadcasting a wrong-length entry (e.g. a
    # length-1 vector parsed from a header line) across the whole row.
    if embedding_vector is not None and embedding_vector.shape == (embed_size,):
        embedding_matrix[i] = embedding_vector

In [4]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after an epoch.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair. It is unpacked into two attributes, so a
        two-element pair must be supplied; the empty default cannot be unpacked.
    interval : int
        The score is computed only when the zero-based epoch index is a
        multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # super(Callback, self) starts the MRO lookup *after* Callback and so
        # skips Callback.__init__; naming this class runs the base initialiser.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the ROC-AUC score.

        ``epoch`` is the zero-based epoch index; ``logs`` is accepted for
        signature compatibility and is not read here.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # Report the epoch one-based, matching Keras' own progress output.
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


def get_model():
    """Build the (uncompiled) pooled bidirectional-GRU classifier.

    The network takes ``maxlen`` token ids, looks them up in an embedding layer
    initialised from ``embedding_matrix``, applies spatial dropout and a
    bidirectional GRU that returns the full sequence, then concatenates global
    average and global max pooling over time before a 6-unit sigmoid layer.

    Returns
    -------
    keras.models.Model
        The assembled model; compiling is left to the caller.
    """
    # Size the layer from the weight matrix itself so the declared dimensions
    # can never disagree with the weights that are loaded into it.
    vocab_rows, vector_dim = embedding_matrix.shape

    inp = Input(shape=(maxlen, ))
    x = Embedding(vocab_rows, vector_dim, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(80, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(6, activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=outp)

    return model

In [6]:
# Instantiate the network, attach loss/optimizer/metric, and print its layers.
model = get_model()
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 100, 300)     9000000     input_2[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_2 (SpatialDro (None, 100, 300)     0           embedding_2[0][0]                
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 100, 160)     182880      spatial_dropout1d_2[0][0]        
__________________________________________________________________________________________________
global_ave

In [7]:
# Training hyper-parameters.
batch_size = 32
epochs = 2

# Hold out part of the padded training data for validation (95% is kept for
# fitting); the fixed random_state makes the split repeatable.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train,
    y_train,
    train_size=0.95,
    random_state=233,
)

# Callback that reports ROC-AUC on the held-out split after every epoch.
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
    verbose=2,
)


Train on 151592 samples, validate on 7979 samples
Epoch 1/2
 - 1442s - loss: 0.0499 - acc: 0.9820 - val_loss: 0.0464 - val_acc: 0.9821

 ROC-AUC - epoch: 1 - score: 0.987249 

Epoch 2/2
 - 1436s - loss: 0.0379 - acc: 0.9852 - val_loss: 0.0449 - val_acc: 0.9825

 ROC-AUC - epoch: 2 - score: 0.987349 



In [8]:
# Score the padded test comments in batches of 1024.
y_pred = model.predict(x_test, batch_size=1024)

# Overwrite the six label columns of the submission template with the
# predictions, then write the frame out without its index.
submission[
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
] = y_pred
submission.to_csv(
    '../submissions/gru_fasttext_maxpool_ep2.csv',
    index=False,
)

In [14]:
# Display the first rows of the submission frame as a quick sanity check.
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.997853,0.685455,0.979116,0.02447501,0.971196,0.529763
1,0000247867823ef7,0.000159,1.9e-05,3.8e-05,3.980839e-07,4.1e-05,1.2e-05
2,00013b17ad220c46,0.003413,0.000457,0.000872,1.196693e-05,0.000658,0.000184
3,00017563c3f7919a,0.000527,4.9e-05,0.000138,2.173727e-05,0.000243,4.5e-05
4,00017695ad8997eb,0.007083,0.000317,0.000932,1.647572e-05,0.000587,0.000176


### Records

#### GRU + FastText + MaxPool - Ep2 
Train on 151592 samples, validate on 7979 samples

* Epoch 1/2
 - 1442s - loss: 0.0499 - acc: 0.9820 - val_loss: 0.0464 - val_acc: 0.9821

 ROC-AUC - epoch: 1 - score: 0.987249 

* Epoch 2/2
 - 1436s - loss: 0.0379 - acc: 0.9852 - val_loss: 0.0449 - val_acc: 0.9825

 ROC-AUC - epoch: 2 - score: 0.987349 
**LB 0.9812**