In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory


# Any results you write to the current directory are saved as output.

from keras.models import Model
from keras.layers import Dense, Embedding, Input, LeakyReLU
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout, CuDNNGRU
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import RMSprop
from keras.backend.tensorflow_backend import set_session
import os
import tensorflow as tf

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Expose only GPU 0 (devices ordered by PCI bus id) and raise TensorFlow's
# C++ log threshold before any session is created.
os.environ.update({
    'CUDA_DEVICE_ORDER': 'PCI_BUS_ID',
    'CUDA_VISIBLE_DEVICES': '0',
    'TF_CPP_MIN_LOG_LEVEL': '2',
})

# Register a session with allow_growth enabled as the Keras backend session.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))

In [5]:
# Preprocessing / embedding hyper-parameters shared by the cells below.
max_features = 20000  # passed to the tokenizer as num_words; also the embedding matrix row count
maxlen = 100  # every comment is padded/truncated to this many token ids
embedding_dim = 300  # width of one embedding vector (the GloVe file loaded below is the 300d one)
# NOTE(review): the two constants below are not referenced by any code visible
# in this notebook -- confirm whether they are still needed.
TIME_STEPS = 100
SINGLE_ATTENTION_VECTOR = False

In [64]:
# Training frame = rows of train.csv followed by rows of train_es.csv.
# (To train on the original data alone, read only "../input/train.csv".)
train_frames = [
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/train_es.csv")
]
train = pd.concat(train_frames)
test = pd.read_csv("../input/test.csv")

In [65]:
# Display the (rows, columns) shape of the combined training frame and of the test frame.
train.shape, test.shape

((319142, 8), (153164, 2))

In [66]:
# Label columns; y holds one column per class, in this order.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

# Raw comment strings; missing comments are replaced by the placeholder "CVxTz".
list_sentences_train = train["comment_text"].fillna("CVxTz").values
list_sentences_test = test["comment_text"].fillna("CVxTz").values

# The vocabulary is fitted on the training comments only.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
word_index = tokenizer.word_index

# Convert comments to integer id sequences, then pad/truncate each to maxlen ids.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t, X_te = [
    sequence.pad_sequences(token_ids, maxlen=maxlen)
    for token_ids in (list_tokenized_train, list_tokenized_test)
]

In [67]:
# Parse the GloVe text file into {word: float32 vector}.
# Each line is expected to be "<word> <v1> <v2> ..." separated by whitespace.
embeddings_index = {}
# `with` closes the handle even if a malformed line makes the parsing raise;
# the original open()/close() pair leaked it in that case.
# NOTE(review): no explicit encoding is passed, so on Python 3 the file is
# decoded with the locale default -- confirm that this is UTF-8.
with open('../input/glove.42B.300d.txt') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. a trailing newline): nothing to parse.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [68]:
# Row i holds the pretrained vector of the word whose tokenizer id is i.
# Rows stay all-zero for ids without a pretrained vector.
embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in word_index.items():
    if i >= max_features:
        # Ids at or above max_features have no row in the matrix.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # Word is absent from the pretrained embeddings.
        continue
    embedding_matrix[i] = embedding_vector

In [35]:
import h5py

# Persist the preprocessed arrays into a single HDF5 file, one dataset per array.
datasets_to_save = (
    ("X_t", X_t),
    ("X_te", X_te),
    ("y", y),
    ("embedding_matrix", embedding_matrix),
)
with h5py.File('../input/toxic_token.h5', 'w') as f:
    for dataset_name, array in datasets_to_save:
        f.create_dataset(dataset_name, data=array)

In [36]:
# cPickle exists only on Python 2; on Python 3 the plain `pickle` module is
# used instead, so fall back to it rather than failing with an ImportError.
try:
    import cPickle as pickle
except ImportError:
    import pickle

# Persist the tokenizer's word -> integer id mapping.
with open('../input/word_index.p', 'wb') as fp:
    pickle.dump(word_index, fp)

In [69]:
def get_model():
    """Build and compile the stacked bidirectional GRU classifier.

    Reads the module-level ``maxlen``, ``max_features``, ``embedding_dim`` and
    ``embedding_matrix``. The embedding layer is loaded with
    ``embedding_matrix`` and frozen; the output layer has six sigmoid units.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, RMSprop
        optimizer with gradient clipping, accuracy metric).
    """
    tokens_in = Input(shape=(maxlen, ))
    embedding_layer = Embedding(max_features, embedding_dim)
    hidden = embedding_layer(tokens_in)

    hidden = Bidirectional(CuDNNGRU(50, return_sequences=True))(hidden)
    hidden = Dropout(0.2)(hidden)
    hidden = Bidirectional(CuDNNGRU(50, return_sequences=False))(hidden)
    hidden = Dense(40)(hidden)
    hidden = LeakyReLU()(hidden)
    hidden = Dropout(0.2)(hidden)
    probabilities = Dense(6, activation="sigmoid")(hidden)

    model = Model(inputs=tokens_in, outputs=probabilities)

    # Load the pretrained vectors and freeze the layer before compiling.
    embedding_layer.set_weights([embedding_matrix])
    embedding_layer.trainable = False

    optimizer = RMSprop(clipvalue=1, clipnorm=1)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

In [70]:
model = get_model()
batch_size = 32
epochs = 10
fname = 'bi-50gru-bi-50gru-300emb-40'
file_path = "weights/" + fname + ".hdf5"

# Nothing else in this notebook creates the checkpoint directory, so make sure
# it exists up front; otherwise the first checkpoint save would fail only
# after a full epoch of training.
weights_dir = os.path.dirname(file_path)
if not os.path.isdir(weights_dir):
    os.makedirs(weights_dir)

# Keep only the weights with the lowest validation loss seen so far.
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# Stop once val_loss has not improved for 4 consecutive epochs.
early = EarlyStopping(monitor="val_loss", mode="min", patience=4)

In [71]:
# Train with checkpointing and early stopping, holding out 10% for validation.
# NOTE(review): Keras takes validation_split from the *last* rows of X_t/y
# before shuffling; given the concat order used to build `train`, those rows
# come from train_es.csv -- confirm that this is the intended validation set.
callbacks_list = [checkpoint, early]
model.fit(
    X_t,
    y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=callbacks_list,
)

# Predictions on the full training set, made with the weights in memory at the
# end of training (the checkpoint file is not reloaded here).
intermetiate_X = model.predict(X_t)

Train on 287227 samples, validate on 31915 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [72]:
# Reload the weights file written by the ModelCheckpoint callback.
model.load_weights(file_path)
sample_submission = pd.read_csv("../input/sample_submission.csv")
y_test = model.predict(X_te)
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# NOTE(review): assumes sample_submission.csv rows are in the same order as
# test.csv -- confirm.
sample_submission[list_classes] = y_test

# Nothing else in this notebook creates the output directory, so create it
# before writing the submission file.
submission_path = "output/" + fname + ".csv"
output_dir = os.path.dirname(submission_path)
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)
sample_submission.to_csv(submission_path, index=False)