In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Kaggle template note: input data files are normally available in the "../input/" directory.
# The cells below instead read their data files (CSV/TSV/HDF5/pickle) by bare filename,
# i.e. from the current working directory, and no directory-listing call follows here.

from subprocess import check_output


# Any results you write to the current directory are saved as output.

from keras.models import Model
from keras.layers import Dense, Embedding, Input, LeakyReLU, merge, Conv2D, Conv1D, PReLU,ELU,Concatenate, Convolution1D
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout, CuDNNGRU,MaxPooling1D,MaxPool2D,MaxPooling1D
from keras.preprocessing import text, sequence
from keras.layers.core import *
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.backend.tensorflow_backend import set_session
import os
import tensorflow as tf
import h5py
import cPickle as pickle
# Expose only GPU 0 (devices ordered by PCI bus id) and raise TensorFlow's
# native log threshold.
# NOTE(review): these are set after `import tensorflow`; confirm that
# TF_CPP_MIN_LOG_LEVEL still takes effect at this point.
os.environ.update({
    'CUDA_DEVICE_ORDER': 'PCI_BUS_ID',
    'CUDA_VISIBLE_DEVICES': '0',
    'TF_CPP_MIN_LOG_LEVEL': '2',
})

# Grow GPU memory on demand instead of reserving it all up front, and make
# the resulting session the one Keras uses.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
set_session(tf.Session(config=config))

# Hyper-parameters shared by the cells below.
max_features = 20000  # vocabulary size: Embedding input dim and Tokenizer num_words
maxlen = 100  # sequence length: model Input shape and pad_sequences maxlen
embedding_dim = 300  # embedding vector width (read by get_model)
# The two settings below are not referenced by any cell visible in this
# part of the notebook.
TIME_STEPS = 100
SINGLE_ATTENTION_VECTOR = False
# Load the cached arrays and the embedding matrix from the HDF5 file.
# `dataset[()]` reads the full dataset into memory; it replaces the
# deprecated `Dataset.value` attribute, which h5py 3.0 removed.
with h5py.File('toxic_token_external.h5', 'r') as hf:
    X_t = hf['X_t'][()]
    X_te = hf['X_te'][()]
    y = hf['y'][()]
    embedding_matrix = hf['embedding_matrix'][()]

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load 'word_index_external.p' if it comes from a trusted source.
with open('word_index_external.p', 'rb') as fp:
    word_index = pickle.load(fp)

In [11]:
def get_model():
    """Build and compile the bidirectional-GRU multi-label classifier.

    Reads the module-level ``maxlen``, ``max_features``, ``embedding_dim``
    and ``embedding_matrix`` (none of them is modified, so no ``global``
    declaration is needed).

    Architecture: frozen Embedding -> Bidirectional(CuDNNGRU(50)) ->
    Dropout(0.2) -> GlobalMaxPool1D -> Dense(50) + LeakyReLU ->
    Dropout(0.2) -> Dense(6, sigmoid).

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss, the
        ``rmsprop`` optimizer and the ``accuracy`` metric. It maps a
        ``(maxlen,)`` input to six sigmoid outputs.
    """
    inp = Input(shape=(maxlen, ))

    # Keep a direct handle on the embedding layer so that loading and
    # freezing its weights does not depend on its position in model.layers.
    embedding_layer = Embedding(max_features, embedding_dim)
    x = embedding_layer(inp)

    x = Bidirectional(CuDNNGRU(50, return_sequences=True))(x)
    x = Dropout(0.2)(x)
    # Collapse the per-timestep GRU outputs by taking the max over time.
    x = GlobalMaxPool1D()(x)
    x = Dense(50)(x)
    x = LeakyReLU()(x)
    x = Dropout(0.2)(x)
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)

    # Install the pre-computed embedding matrix and freeze it before
    # compiling, so the optimizer never updates it.
    embedding_layer.set_weights([embedding_matrix])
    embedding_layer.trainable = False

    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model

In [6]:
# Load the train/test CSVs from the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Shuffle the training rows with a fixed seed so that re-running the notebook
# yields the same row order (and therefore the same `y` and tokenizer input
# order) every time; the original unseeded shuffle differed on each run.
SHUFFLE_SEED = 42
train = train.sample(frac=1, random_state=SHUFFLE_SEED)

list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Missing comments are replaced by a placeholder token before tokenizing.
list_sentences_train = train["comment_text"].fillna("CVxTz").values
list_sentences_test = test["comment_text"].fillna("CVxTz").values

# NOTE(review): this rebinds `y`, which the first cell also loads from the
# HDF5 cache alongside `X_t`. `X_t` is not rebuilt from this shuffled frame,
# so confirm that the row order of this `y` matches `X_t` before pairing them.
y = train[list_classes].values

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
# NOTE(review): this rebinds `word_index`, which the first cell also loads
# from 'word_index_external.p'; confirm the two vocabularies agree.
word_index = tokenizer.word_index

# External comment corpora (tab-separated); only the `comment` column is used.
list_sentences_insult = pd.read_csv('insult.tsv', sep='\t')['comment'].fillna('Steeve').values
list_sentences_toxicity = pd.read_csv('toxicity.tsv', sep='\t')['comment'].fillna('Steeve').values
list_sentences_threat = pd.read_csv('threat.tsv', sep='\t')['comment'].fillna('Steeve').values

# The tokenize-and-pad step for the train/test comments is left disabled;
# `X_t` / `X_te` are read from 'toxic_token_external.h5' in the first cell.
# list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
# list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
# X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
# X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

In [8]:
# Convert each external corpus to integer token sequences with the tokenizer
# fitted on the training comments. A generator expression is used so that no
# loop variable is left behind in the notebook namespace.
list_tokenized_insult, list_tokenized_toxicity, list_tokenized_threat = (
    tokenizer.texts_to_sequences(corpus)
    for corpus in (
        list_sentences_insult,
        list_sentences_toxicity,
        list_sentences_threat,
    )
)


In [9]:
# Pad/truncate every external token sequence to `maxlen` entries so the
# arrays match the model's input length.
X_i, X_to, X_th = (
    sequence.pad_sequences(tokenized, maxlen=maxlen)
    for tokenized in (
        list_tokenized_insult,
        list_tokenized_toxicity,
        list_tokenized_threat,
    )
)

In [14]:
# Build a fresh network, then overwrite its initial weights with the
# checkpoint saved on disk.
pretrained_weights_path = 'weights_base.pre_trained.hdf5'
model = get_model()
model.load_weights(pretrained_weights_path)

In [18]:
# Score the cached training sequences with the loaded model. The result is
# bound to a name and then left as the cell's last expression, so it is both
# displayed and reusable without re-running inference.
train_predictions = model.predict(X_t)
train_predictions

array([[3.4521485e-03, 1.5647619e-04, 1.0364254e-03, 1.8919716e-05,
        1.7634697e-03, 1.2914139e-04],
       [2.2444291e-04, 1.8916198e-05, 5.1088315e-05, 2.7056192e-06,
        1.3747807e-04, 1.7198017e-05],
       [4.3097930e-03, 1.6854826e-04, 1.0785871e-03, 2.9034918e-05,
        2.2468795e-03, 1.0373683e-04],
       ...,
       [4.6653749e-05, 1.4479777e-05, 2.2151511e-05, 3.2253174e-06,
        3.9215414e-05, 1.4408163e-06],
       [4.5463443e-02, 2.7929505e-04, 9.1972034e-03, 1.5820175e-05,
        1.6677078e-02, 2.3319499e-04],
       [9.4531009e-05, 6.2098865e-05, 6.0262311e-05, 1.2132859e-05,
        9.2419366e-05, 3.4128993e-06]], dtype=float32)

In [None]:
# Read the submission template and (re)declare the six label columns.
sample_submission = pd.read_csv("sample_submission.csv")
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [19]:
# Preview the first five rows of the external toxicity corpus to inspect its
# columns (the file is re-read here; the earlier cell kept only `comment`).
pd.read_csv('toxicity.tsv', sep='\t').head(n=5)

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split
0,2232.0,This:NEWLINE_TOKEN:One can make an analogy in ...,2002,True,article,random,train
1,4216.0,`NEWLINE_TOKENNEWLINE_TOKEN:Clarification for ...,2002,True,user,random,train
2,8953.0,Elected or Electoral? JHK,2002,False,article,random,test
3,26547.0,`This is such a fun entry. DevotchkaNEWLINE_...,2002,True,article,random,train
4,28959.0,Please relate the ozone hole to increases in c...,2002,True,article,random,test
