In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers




In [2]:
EMBEDDING_FILE='glove.6B.50d.txt'
TRAIN_DATA_FILE='train_data.csv'
TEST_DATA_FILE='test_data.csv'

In [3]:
embed_size = 50 # how big is each word vector
max_features = 20000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a comment to use


In [4]:
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)



In [5]:
train

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [6]:
train.shape

(159571, 8)

In [7]:
train.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [8]:
test

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
...,...,...
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu..."
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ..."
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the..."


In [9]:
test.shape

(153164, 2)

In [10]:
test.isnull().sum()

id              0
comment_text    0
dtype: int64

In [11]:
list_sentences_train = train["comment_text"].fillna("_na_").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("_na_").values

In [12]:
list_sentences_train

array(["Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
       "D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)",
       "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",
       ...,
       'Spitzer \n\nUmm, theres no actual article for prostitution ring.  - Crunch Captain.',
       'And it looks like it was actually you who put on the speedy to have the first version deleted now that I look at it.',
       '"\nAnd ... I really don\'t think you understand.  I came here and my idea was bad right away.  What kind of communit

In [13]:
list_sentences_test

array(["Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,",
       '== From RfC == \n\n The title is fine as it is, IMO.',
       '" \n\n == Sources == \n\n * Zawe Ashton on Lapland —  /  "', ...,
       '" \n\n == Okinotorishima categories == \n\n I see your changes and agree this is ""more correct.""  I had gotten confused, but then found this: \n :... while acknowledging Japan\'s territorial rights to Okinotorishima itself ... \n However, is there a category for  \n :... did not acknowledge Japan\'s claim to an exclusive economic zone (EEZ) stemming from Okinotorishima. \n That is, is there a category for ""disputed EEZ""s?   "',
       '" \n\n == ""One of the founding n

In [14]:
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [15]:
len(y)

159571

In [16]:
max_features

20000

In [17]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))


In [18]:
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)


In [19]:
list_tokenized_train

[[688,
  75,
  1,
  126,
  130,
  177,
  29,
  672,
  4511,
  12052,
  1116,
  86,
  331,
  51,
  2278,
  11448,
  50,
  6864,
  15,
  60,
  2756,
  148,
  7,
  2937,
  34,
  117,
  1221,
  15190,
  2825,
  4,
  45,
  59,
  244,
  1,
  365,
  31,
  1,
  38,
  27,
  143,
  73,
  3462,
  89,
  3085,
  4583,
  2273,
  985],
 [52,
  2635,
  13,
  555,
  3809,
  73,
  4556,
  2706,
  21,
  94,
  38,
  803,
  2679,
  992,
  589,
  8377,
  182],
 [412,
  437,
  73,
  134,
  14,
  249,
  2,
  71,
  314,
  78,
  50,
  9,
  13,
  626,
  8,
  2284,
  492,
  502,
  102,
  4,
  611,
  2,
  35,
  325,
  126,
  363,
  3,
  29,
  38,
  27,
  52,
  208,
  2,
  434,
  57,
  36,
  1,
  2394,
  93,
  1,
  737,
  468],
 [57,
  7,
  228,
  97,
  54,
  328,
  1436,
  15,
  2133,
  7,
  6024,
  22,
  1,
  123,
  2502,
  56,
  16,
  513,
  15,
  25,
  5,
  4236,
  3,
  1327,
  3,
  9762,
  7,
  67,
  1,
  277,
  85,
  122,
  13503,
  37,
  9,
  51,
  19,
  42,
  10,
  1,
  1460,
  138,
  1257,
  2153,
  426,
 

In [20]:
len(list_tokenized_train)

159571

In [21]:
list_tokenized_test

[[2665,
  655,
  8849,
  656,
  8,
  57,
  16388,
  83,
  884,
  356,
  16,
  3222,
  76,
  21,
  6,
  4,
  6865,
  6,
  1521,
  7,
  56,
  655,
  4942,
  1898,
  682,
  6908,
  4,
  96,
  6,
  2,
  5104,
  29,
  417,
  6,
  726,
  35,
  8849,
  656,
  8,
  36,
  4122,
  10,
  2818,
  660,
  437,
  454,
  19612,
  9,
  333,
  15,
  153,
  4,
  8,
  240,
  49,
  52,
  24,
  5,
  2045,
  162,
  3132,
  682,
  2880,
  96,
  219,
  145,
  493,
  84],
 [31, 1185, 1, 348, 8, 676, 17, 11, 8, 2826],
 [109, 15, 355],
 [22,
  6,
  18,
  5,
  151,
  157,
  34,
  1,
  119,
  1,
  102,
  7,
  1501,
  24,
  1,
  364,
  640,
  7,
  40,
  77,
  645,
  1,
  119,
  3098,
  1501,
  7,
  1002,
  1400,
  1,
  102,
  396,
  125,
  26,
  127,
  6,
  12,
  20,
  349],
 [7, 59, 7516, 71, 80, 34, 42],
 [127, 6, 12, 1085, 7, 67, 101, 1092, 3, 6, 4, 47, 14, 360, 175, 137],
 [45,
  33,
  14,
  149,
  777,
  2,
  28,
  108,
  126,
  19,
  408,
  199,
  4,
  1688,
  4876,
  22,
  6,
  47,
  49,
  2,
  1254,
  45,
  

In [22]:
len(list_tokenized_test)

153164

In [23]:
maxlen

100

In [24]:
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [25]:
X_t

array([[    0,     0,     0, ...,  4583,  2273,   985],
       [    0,     0,     0, ...,   589,  8377,   182],
       [    0,     0,     0, ...,     1,   737,   468],
       ...,
       [    0,     0,     0, ...,  3509, 13675,  4528],
       [    0,     0,     0, ...,   151,    34,    11],
       [    0,     0,     0, ...,  1627,  2056,    88]])

In [26]:
type(X_t)

numpy.ndarray

In [27]:
type(X_te)

numpy.ndarray

In [28]:
len(X_t)

159571

In [29]:
len(X_t[0])

100

In [30]:
X_te

array([[   0,    0,    0, ...,  145,  493,   84],
       [   0,    0,    0, ...,   11,    8, 2826],
       [   0,    0,    0, ...,  109,   15,  355],
       ...,
       [   0,    0,    0, ...,   12, 1652,  358],
       [   0,    0,    0, ..., 9844, 3506,  355],
       [   0,    0,    0, ...,  100, 5220,    6]])

In [31]:
len(X_te)

153164

In [32]:
len(X_te[0])

100

In [33]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE,encoding='utf-8'))

In [34]:
all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

  if await self.run_code(code, result, async_=asy):


(0.020940498, 0.6441043)

In [35]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [36]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])





In [37]:
model.fit(X_t, y, batch_size=32, epochs=2, validation_split=0.1);


Epoch 1/2


Epoch 2/2

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [38]:
y_test = model.predict([X_te], batch_size=1024, verbose=1)




In [39]:
y_test

array([[9.95462060e-01, 2.89433658e-01, 9.44655955e-01, 5.28728291e-02,
        8.51854444e-01, 2.04360038e-01],
       [3.17890401e-04, 8.76637145e-08, 3.43434112e-05, 3.88055014e-07,
        1.38492533e-05, 1.38847292e-06],
       [1.22620910e-03, 9.57488169e-07, 1.42517994e-04, 4.20807555e-06,
        7.50978870e-05, 9.44177373e-06],
       ...,
       [4.74015163e-04, 1.06466516e-07, 3.50664013e-05, 5.21417235e-07,
        2.19056619e-05, 2.44179137e-06],
       [1.72140810e-03, 9.05510205e-07, 1.67072329e-04, 3.83512815e-06,
        1.20938173e-04, 1.18853015e-04],
       [9.83624518e-01, 6.39071167e-02, 8.54529738e-01, 1.60582643e-02,
        6.87466860e-01, 1.34097524e-02]], dtype=float32)

In [40]:
len(y_test)

153164

In [43]:
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission[list_classes] = y_test


In [44]:
sample_submission

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.995462,2.894337e-01,0.944656,5.287283e-02,0.851854,0.204360
1,0000247867823ef7,0.000318,8.766371e-08,0.000034,3.880550e-07,0.000014,0.000001
2,00013b17ad220c46,0.001226,9.574882e-07,0.000143,4.208076e-06,0.000075,0.000009
3,00017563c3f7919a,0.000604,2.089668e-07,0.000061,7.758437e-07,0.000032,0.000003
4,00017695ad8997eb,0.002308,2.083747e-06,0.000270,7.570480e-06,0.000160,0.000016
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,0.793527,2.857133e-03,0.261617,3.477594e-03,0.202521,0.002867
153160,fffd7a9a6eb32c16,0.012073,1.807692e-05,0.001049,9.210721e-05,0.001125,0.000222
153161,fffda9e8d6fafa9e,0.000474,1.064665e-07,0.000035,5.214172e-07,0.000022,0.000002
153162,fffe8f1340a79fc2,0.001721,9.055102e-07,0.000167,3.835128e-06,0.000121,0.000119


In [45]:
sample_submission.to_csv('submission.csv', index=False)