In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


In [16]:
# --- Input locations -------------------------------------------------------
glove_file = './dataset/glove.twitter.27B.50d.txt'
train_file = './dataset/train.csv'
test_file = './dataset/test.csv'

# --- Read the train / test tables ------------------------------------------
train = pd.read_csv(train_file)
test = pd.read_csv(test_file)

# --- Targets: one binary column per label ----------------------------------
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[classes].values

# --- Raw text: missing comments are replaced by the literal string "nan" ---
sent_train = train["comment_text"].fillna("nan")
sent_test = test["comment_text"].fillna("nan")

In [17]:
# Vocabulary / sequence hyper-parameters shared by the cells below.
max_words_count = 30000    # passed to Tokenizer(num_words=...) and used as embedding row count
embedding_size = 50        # width of each embedding vector
max_words_length = 150     # every sequence is padded/truncated to this many tokens

# Fit the vocabulary on the training text only, then encode both splits
# with that same vocabulary.
tokenizer = Tokenizer(num_words=max_words_count)
tokenizer.fit_on_texts(sent_train)

tokens_train = tokenizer.texts_to_sequences(sent_train)
tokens_test = tokenizer.texts_to_sequences(sent_test)

# Bring every sequence to a fixed length so it can feed a fixed-shape Input.
x_train = pad_sequences(tokens_train, maxlen=max_words_length)
x_test = pad_sequences(tokens_test, maxlen=max_words_length)

def index_to_embed(word, *embedding):
    """Turn one split embedding-file line into a ``(word, vector)`` pair.

    Args:
        word: The first field of the line (the token).
        *embedding: The remaining fields, convertible to float.

    Returns:
        A tuple ``(word, vector)`` where ``vector`` is a float32 NumPy array
        built from ``embedding``.
    """
    vector = np.asarray(embedding, dtype='float32')
    return word, vector

# Parse the GloVe text file into {token: float32 vector}.
# The file is opened in a `with` block so the handle is closed once parsing
# finishes, and decoded explicitly as UTF-8 instead of relying on the
# platform's default encoding.
with open(glove_file, encoding='utf-8') as glove_fh:
    embed_dict = dict(index_to_embed(*line.strip().split()) for line in glove_fh)

# np.stack requires a real sequence of arrays; a dict view must be
# materialised into a list first.
all_embs = np.stack(list(embed_dict.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

word_idx = tokenizer.word_index

# Start every row from noise drawn with the same mean/std as the pretrained
# vectors, so words missing from embed_dict keep a random vector.
embedding_matrix = np.random.normal(emb_mean, emb_std, (max_words_count, embedding_size))

# Overwrite the rows of words that have a pretrained vector. Indices at or
# beyond max_words_count have no row in the matrix and are skipped.
for word, i in word_idx.items():
    if i < max_words_count:
        vec_temp = embed_dict.get(word)
        if vec_temp is not None:
            embedding_matrix[i] = vec_temp

In [18]:
from keras.layers import GRU
from keras.callbacks import EarlyStopping, ModelCheckpoint
# Layers are instantiated first (in the same order they are applied), then
# wired together below.
embedding_layer = Embedding(max_words_count, embedding_size, weights=[embedding_matrix])
recurrent_layer = Bidirectional(
    GRU(embedding_size, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
)
pooling_layer = GlobalMaxPool1D()
hidden_layer = Dense(50, activation="relu")
dropout_layer = Dropout(0.1)
output_layer = Dense(6, activation="sigmoid")  # six sigmoid outputs

# Input: fixed-length sequences of max_words_length token ids.
inp_1 = Input(shape=(max_words_length,))

x_1 = embedding_layer(inp_1)
x_1 = recurrent_layer(x_1)
x_1 = pooling_layer(x_1)
x_1 = hidden_layer(x_1)
x_1 = dropout_layer(x_1)
x_1 = output_layer(x_1)

model1 = Model(inputs=inp_1, outputs=x_1)
model1.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Where ModelCheckpoint writes the model whenever val_loss improves.
file_path = "gru_wordcount30000_best.hdf5"

checkpoint = ModelCheckpoint(
    file_path,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [checkpoint, early]

# The last 10% of the training data is held out for validation.
model1.fit(
    x_train,
    y,
    batch_size=32,
    epochs=2,
    validation_split=0.1,
    callbacks=callbacks_list,
)

# Restore the weights ModelCheckpoint saved at the best val_loss. Without
# this the predictions come from the last epoch's weights and the checkpoint
# file is never used.
model1.load_weights(file_path)

y_test_1 = model1.predict([x_test], batch_size=1024, verbose=1)

# Use the sample submission as the template and overwrite its label columns
# with the predicted probabilities.
sample_submission = pd.read_csv('./dataset/sample_submission.csv')
sample_submission[classes] = y_test_1

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('submission', exist_ok=True)
sample_submission.to_csv('submission/submission_wordcount30000baseline_GRU_.csv', index=False)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2


In [20]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.special` is loaded. This still binds the name `scipy`.
import scipy.special

file_best = 'submission/submission_wordcount30000baseline_GRU_.csv'
data = pd.read_csv(file_best)

# Shift every prediction by -0.5 in log-odds space and map it back to a
# probability; this lowers all predicted probabilities.
data[classes] = scipy.special.expit(scipy.special.logit(data[classes]) - 0.5)

In [21]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Two independent copies of the submission template, one per variant.
sample_submission1 = pd.read_csv('./dataset/sample_submission.csv')
sample_submission2 = pd.read_csv('./dataset/sample_submission.csv')

# Variant 1: the values currently held in `data`, unchanged.
sample_submission1[label_cols] = data[label_cols]
sample_submission1.to_csv(
    'submission/submission_batchnorm_postprocessing_expit.csv',
    index=False,
)

# Variant 2: the same values divided by 1.4.
sample_submission2[label_cols] = data[label_cols] / 1.4
sample_submission2.to_csv(
    'submission/submission_batchnorm_postprocessing_1.4.csv',
    index=False,
)

In [25]:
sample_submission5 = pd.read_csv('submission_glovetwitter_GRU.csv')
# NOTE(review): the original divided `sample_submission3`, a name that is not
# defined anywhere in this notebook, so the cell raised NameError on a fresh
# kernel. Scaling the frame loaded on the previous line is assumed to be the
# intent (it matches the output filename) - confirm against the source of
# `sample_submission3` if it still exists.
sample_submission5[label_cols] = sample_submission5[label_cols] / 1.4
sample_submission5.to_csv('submission/submission_glovetwitter_postprocessing_1.4v2.csv', index=False)