In [20]:
import numpy as np
# Seed NumPy's global random generator before anything else runs.
np.random.seed(42)
import pandas as pd

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [22]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

In [23]:
import warnings
warnings.filterwarnings('ignore')

In [24]:
import os
# Request four threads through the standard OMP_NUM_THREADS environment variable.
os.environ.update(OMP_NUM_THREADS='4')

In [34]:
# Space-separated word-vector text file; it is parsed line by line into embeddings_index.
# Presumably the fastText Common Crawl 300-d vectors, judging by the path -- TODO confirm.
EMBEDDING_FILE = 'data/fasttext_300d_crawl_2m/crawl-300d-2M.vec'

In [26]:
# Read the training and test tables from their CSV files.
train, test = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))

In [27]:
# Comment text for both splits, plus the six label columns of the training table.
X_train = train.comment_text.values
X_test = test.comment_text.values
y_train = train.loc[:, ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

In [28]:
# Text-pipeline hyperparameters:
#   max_features -- vocabulary cap handed to the Tokenizer (num_words) and the Embedding layer
#   maxlen       -- length every token sequence is padded/truncated to
#   embed_size   -- width of each row in the embedding matrix
max_features, maxlen, embed_size = 30000, 100, 300

In [29]:
# Fit a single vocabulary over the train and test text; num_words caps the ids emitted later.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts([*X_train, *X_test])
# NOTE: X_train / X_test are rebound from raw strings to lists of token ids here, so
# re-running this cell on its own would feed token ids back into the tokenizer.
X_train, X_test = (tokenizer.texts_to_sequences(docs) for docs in (X_train, X_test))
# Bring every sequence to exactly maxlen entries.
x_train, x_test = (sequence.pad_sequences(ids, maxlen=maxlen) for ids in (X_train, X_test))

In [31]:
def get_coefs(word, *arr):
    """Turn one split embedding line into a ``(word, vector)`` pair.

    Args:
        word: the token (first field of the line).
        *arr: the remaining fields, each convertible to a float.

    Returns:
        A tuple of ``word`` and a 1-D float32 array built from ``arr``.
    """
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector

In [35]:
# Parse the word-vector file into {word: float32 vector}.
# - The file is opened in a `with` block so the handle is closed once parsing ends.
# - UTF-8 is requested explicitly instead of relying on the platform default encoding.
# - Only entries with exactly embed_size values are kept. fastText .vec files begin
#   with a "<vocab_size> <dim>" header line, which would otherwise be stored as a
#   one-element "vector" and could later broadcast across a whole embedding row.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf-8') as vec_file:
    for line in vec_file:
        token, coefs = get_coefs(*line.rstrip().rsplit(' '))
        if coefs.shape[0] == embed_size:
            embeddings_index[token] = coefs

In [36]:
word_index = tokenizer.word_index
# Tokenizer ids start at 1 (0 is reserved), so a vocabulary of N words needs
# N + 1 rows. The cap at max_features stays because the tokenizer was built
# with num_words=max_features and never emits an id at or above it.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    # Ignore ids that fall outside the matrix (words beyond the vocabulary cap).
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # Words with no pretrained vector keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [37]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC AUC on a held-out set at the end of epochs.

    Args:
        validation_data: ``(X_val, y_val)`` pair to score against. It must be a
            2-tuple; the empty default cannot be unpacked and raises ``ValueError``.
        interval: score only on epochs whose zero-based index is a multiple of
            this value (1 means every epoch).
    """

    def __init__(self, validation_data=(), interval=1):
        super().__init__()
        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out inputs and print the resulting ROC AUC.

        ``logs`` is accepted for the callback interface but is not read.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # epoch is zero-based; report it one-based.
            print('\n ROC_AUC - epoch: {} - score: {:0.6f} \n'.format(epoch+1, score))

In [39]:
def get_model():
    """Build and compile the comment classifier.

    Architecture: embedding initialised from ``embedding_matrix`` -> spatial
    dropout (0.2) -> bidirectional GRU (80 units per direction, full sequence
    output) -> concatenated global average and global max pooling -> dense
    layer with 6 sigmoid outputs.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    inp = Input(shape=(maxlen,))
    # Take the vocabulary size from the weight matrix itself so the layer and its
    # initial weights always agree, even if the matrix has fewer than max_features rows.
    vocab_rows = embedding_matrix.shape[0]
    x = Embedding(vocab_rows, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(80, return_sequences=True))(x)
    # Summarise the GRU output sequence two ways and join the results.
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(6, activation='sigmoid')(conc)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

# Build the network once; the cells below train it and use it for prediction.
model = get_model()

In [40]:
# Training settings passed to model.fit: samples per gradient step and number of epochs.
batch_size, epochs = 32, 2

In [41]:
# Hold out 5% of the padded training data for validation (fixed random_state).
# NOTE: y_train is overwritten with its reduced version here, so re-running this
# cell alone would pair the full x_train with the already-reduced labels.
X_train, X_validation, y_train, y_validation = train_test_split(
    x_train,
    y_train,
    random_state=233,
    train_size=0.95,
)
# Callback that prints the ROC AUC on the held-out split after every epoch.
RocAuc = RocAucEvaluation(interval=1, validation_data=(X_validation, y_validation))

In [None]:
# Train the model. Keras evaluates on the held-out split each epoch, and the
# RocAuc callback is attached to print the AUC as well.
hist = model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=2,
    callbacks=[RocAuc],
    validation_data=(X_validation, y_validation),
)

Train on 151592 samples, validate on 7979 samples
Epoch 1/2


In [None]:
# Score the padded test sequences; the model's final Dense layer has 6 sigmoid outputs per input.
y_pred = model.predict(x_test, batch_size=1024)