In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


In [5]:
glove_file='./dataset/glove.6B.50d.txt'
train_file='./dataset/train.csv'
test_file='./dataset/test.csv'

train=pd.read_csv(train_file)
test=pd.read_csv(test_file)

In [6]:
train.head(2)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0


In [12]:
sent_train=train["comment_text"].fillna("nan")

In [13]:
sent_train[0]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [18]:
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y=train[classes].values

In [21]:
y[0]

array([0, 0, 0, 0, 0, 0])

In [22]:
sent_test=test["comment_text"].fillna("nan")

In [23]:
max_words_count=20000
embedding_size=50
max_words_length=100

# get texts indexed in order to transfer to Glove

In [24]:
tokenizer=Tokenizer(num_words=max_words_count)
tokenizer.fit_on_texts(sent_train)
tokens_train = tokenizer.texts_to_sequences(sent_train)
tokens_test = tokenizer.texts_to_sequences(sent_test)

In [26]:
tokens_train[0][0]

688

In [28]:
x_train=pad_sequences(tokens_train,maxlen=max_words_length)
x_test=pad_sequences(tokens_test,maxlen=max_words_length)

# define a method to transfer index to embedding dictionary

In [29]:
def index_to_embed(word,*embedding):
    return word,np.asarray(embedding,dtype='float32')

In [30]:
embed_dict=dict(index_to_embed(*o.strip().split())for o in open(glove_file))

In [32]:
all_embs = np.stack(embed_dict.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()

In [33]:
word_idx=tokenizer.word_index

In [36]:
word_idx.items()

[("dool's", 44824),
 ('\xca\x8a', 96488),
 ('bailyite', 207103),
 ('sowell', 80221),
 ('tsukino', 182655),
 ('\xca\x84', 96478),
 ('woods', 8642),
 ('spiders', 27486),
 ('gavan', 90396),
 ('dekolb', 52230),
 ('ultimatley', 151764),
 ('woody', 15229),
 ('trawling', 22475),
 ('comically', 44604),
 ('027597675', 184212),
 ('regularize', 198828),
 ('alwiqi', 157876),
 ('pb666', 178108),
 ('sprague', 94901),
 ('\xca\x80', 96465),
 ('jairam', 198046),
 ('acurately', 99062),
 ('supasoldier', 139364),
 ('falseinformation', 124720),
 ('\xe2\x80\x98west\xe2\x80\x99', 154980),
 ('rickman', 195475),
 ('foundation\xe2\x80\x99s', 186269),
 ('riconferma', 64755),
 ('dra\xc5\xbea', 98299),
 ('riconferme', 91972),
 ('naturopathic', 21044),
 ("wood'", 120763),
 ('sidestrand', 51788),
 ('pigment', 40827),
 ('occops', 183357),
 ('infromac\xc3\xad', 174771),
 ('tijfo098', 187418),
 ('bringing', 2804),
 ('raviah', 139245),
 ('jrpg', 53779),
 ('tcby', 173151),
 ('wooded', 64612),
 ('distributerd', 81831),
 (

In [37]:
embed_dict["exact"]

array([ 1.2816    ,  0.53680003,  0.16767   ,  0.041575  ,  1.19280005,
        0.28443   ,  0.38857999,  0.32376999,  0.15879001, -0.45491999,
        0.29056001, -0.10779   ,  0.089968  , -0.43414   ,  0.92761999,
        0.66022003, -0.19475   , -0.81105   , -0.050407  , -0.51450998,
       -0.2437    , -0.45324999,  0.099235  ,  0.19335   ,  0.81345999,
       -0.67914999, -1.56669998,  0.0097384 ,  0.90287   ,  0.17013   ,
        2.31529999, -0.74112999, -0.60159999, -0.79167998,  1.00549996,
       -0.071023  ,  0.79404002,  0.15381999, -0.43011999,  0.35563999,
        0.26311001, -0.43900999,  0.081274  ,  0.38225001, -0.54784   ,
       -0.60224003,  0.98161   ,  1.22510004,  0.53929001, -0.096336  ], dtype=float32)

In [38]:
embed_dict["fakery"]

array([ -1.51960002e-02,  -4.75959986e-01,  -4.81620014e-01,
         2.59920001e-01,  -3.23740005e-01,  -1.83789998e-01,
         1.17960000e+00,  -2.72109985e-01,   5.17220013e-02,
         1.13440001e+00,  -1.01360003e-03,   1.53290004e-01,
         3.10149994e-02,  -8.33539963e-02,   3.67780000e-01,
         1.55420005e-01,  -2.38920003e-01,   1.78540006e-01,
         5.68109989e-01,   2.03270003e-01,  -9.31549966e-02,
        -1.60050005e-01,  -2.30130002e-01,   8.46609995e-02,
        -4.29700017e-02,   1.43360004e-01,  -5.57780027e-01,
         3.93169999e-01,   5.22750020e-01,  -2.11170003e-01,
        -6.48880005e-01,   4.12009992e-02,   4.22960013e-01,
        -1.84560001e-01,  -3.32749993e-01,   5.47469974e-01,
        -2.41699994e-01,   8.66710022e-02,  -3.02410007e-01,
         6.33470016e-03,   1.10540003e-01,   1.78140000e-01,
        -2.40640000e-01,   1.36000001e+00,  -2.70520002e-01,
        -6.21500015e-01,   5.26960015e-01,   2.38820001e-01,
         2.22350001e-01,

In [39]:
len(word_idx)

210554

In [46]:
#we only want top max_words_count frequent number of words to be trained on
#if the word exists in GloVe, then we simply replaces it. If not, stick with the initialization 

In [40]:
embedding_matrix = np.random.normal(emb_mean, emb_std, (max_words_count, embedding_size))

In [44]:
for word,i in word_idx.items():
    if i < max_words_count:
        vec_temp=embed_dict.get(word)
        if vec_temp is not None:
            embedding_matrix[i]=vec_temp

In [45]:
embedding_matrix[0]

array([-0.89690446,  0.07753705,  0.7850366 ,  0.57391319, -0.85204625,
        0.75674168,  0.44376405,  0.13327814, -0.47334658,  0.32604173,
       -0.94512726,  1.09999792,  0.6414134 , -0.15099483,  0.19047679,
        1.45773487, -0.52627395, -1.00130281,  0.04014299,  0.31633026,
        0.97647277,  0.36946742,  0.09132867, -0.56268492,  0.39307874,
        0.7555558 ,  0.52510077,  0.11369801, -0.06185641, -0.56066558,
       -0.24698514,  0.29954995, -0.04037255, -0.77303883, -0.87324753,
       -0.30651435, -0.39139577,  1.03580563, -0.3739567 ,  0.08361749,
        0.8825712 , -0.64445719, -0.48517934, -0.41653875,  0.54661053,
       -0.68894697,  0.34053493,  0.93188955,  0.17374768, -0.74215995])

In [47]:
#Now we can set up the model, first let's use simple bidirectional lstm 

In [50]:
inp=Input(shape=(max_words_length,))
x=Embedding(max_words_count,embedding_size,weights=[embedding_matrix])(inp)
x=Bidirectional(LSTM(embedding_size,return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(x)
x=GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [52]:
model.fit(x_train, y, batch_size=32, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x18196c1b50>

In [55]:
y_test = model.predict([x_test], batch_size=1024, verbose=1)
sample_submission = pd.read_csv('./dataset/sample_submission.csv')



NameError: name 'list_classes' is not defined

In [56]:
sample_submission[classes] = y_test
sample_submission.to_csv('submission.csv', index=False)

In [57]:
model.save_weights("lstm_0.0438.h5")

In [59]:
from keras.layers import GRU
inp_1=Input(shape=(max_words_length,))
x_1=Embedding(max_words_count,embedding_size,weights=[embedding_matrix])(inp_1)
x_1=Bidirectional(GRU(embedding_size,return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(x_1)
x_1=GlobalMaxPool1D()(x_1)
x_1 = Dense(50, activation="relu")(x_1)
x_1 = Dropout(0.1)(x_1)
x_1 = Dense(6, activation="sigmoid")(x_1)
model1 = Model(inputs=inp_1, outputs=x_1)
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [62]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
file_path="gru_best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [checkpoint, early] #early
model1.fit(x_train, y, batch_size=32, epochs=2, validation_split=0.1, callbacks=callbacks_list)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x184a590f50>

In [63]:
y_test_1 = model1.predict([x_test], batch_size=1024, verbose=1)
sample_submission = pd.read_csv('./dataset/sample_submission.csv')
sample_submission[classes] = y_test_1
sample_submission.to_csv('submission_GRU.csv', index=False)



In [64]:
file_lstm='submission.csv'
file_GRU='submission_GRU.csv'
p_lstm = pd.read_csv(file_lstm)
p_gru = pd.read_csv(file_GRU)

In [66]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
p_res_avg = p_lstm.copy()
p_res_max = p_lstm.copy()
p_res_avg[label_cols] = (p_gru[label_cols] + p_lstm[label_cols]) / 2
p_res_max[label_cols] = np.maximum(p_gru[label_cols], p_lstm[label_cols])

In [67]:
p_res_avg.to_csv('submission_lstm+gru_avg.csv', index=False)
p_res_max.to_csv('submission_lstm+gru_max.csv', index=False)

In [75]:
p_res_avg_max=p_lstm.copy()

for label in label_cols:
    for it in xrange(len(p_lstm['toxic'])):
        if p_gru[label][it]>=0.8 and p_lstm[label][it]>=0.8:
            p_res_avg_max[label][it]=max(p_gru[label][it], p_lstm[label][it])
        elif p_gru[label][it]<=0.2 and p_lstm[label_cols]<=0.2:
            p_res_avg_max[label][it]=min(p_gru[label][it], p_lstm[label][it])
        else:
            p_res_avg_max[label][it] = (p_gru[label][it] + p_lstm[label][it]) / 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().