In [1]:
import sys, numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns

In [2]:
path = '../input/'
comp = 'Toxic-comment-classification/'
EMBEDDING_FILE = 'glove.txt'
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
test_data.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


In [6]:
embed_size = 50 # to specify how big is each word vector
max_feature = 20000 # to specify how many unique words to use
maxlen = 100 # maximum number of words to use in a given comment

In [7]:
train_data['comment_text'].isna().sum()

0

In [8]:
train_sentences_list = train_data['comment_text'].values

In [9]:
train_sentences_list

array(["Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
       "D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)",
       "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",
       ...,
       'Spitzer \n\nUmm, theres no actual article for prostitution ring.  - Crunch Captain.',
       'And it looks like it was actually you who put on the speedy to have the first version deleted now that I look at it.',
       '"\nAnd ... I really don\'t think you understand.  I came here and my idea was bad right away.  What kind of communit

In [10]:
y = train_data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 
               'identity_hate']].values

In [11]:
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [12]:
test_sentences_list = test_data['comment_text'].values

In [13]:
tokenizer = Tokenizer(num_words=max_feature)
tokenizer.fit_on_texts(list(train_sentences_list))

In [14]:
tokenized_train_list = tokenizer.texts_to_sequences(train_sentences_list)
tokenized_test_list = tokenizer.texts_to_sequences(test_sentences_list)

In [15]:
X_train =pad_sequences(tokenized_train_list, maxlen=maxlen)
X_test = pad_sequences(tokenized_test_list, maxlen=maxlen)

In [16]:
X_train

array([[    0,     0,     0, ...,  4583,  2273,   985],
       [    0,     0,     0, ...,   589,  8377,   182],
       [    0,     0,     0, ...,     1,   737,   468],
       ...,
       [    0,     0,     0, ...,  3509, 13675,  4528],
       [    0,     0,     0, ...,   151,    34,    11],
       [    0,     0,     0, ...,  1627,  2056,    88]])

In [17]:
X_test

array([[   0,    0,    0, ...,  145,  493,   84],
       [   0,    0,    0, ...,   11,    8, 2826],
       [   0,    0,    0, ...,  109,   15,  355],
       ...,
       [   0,    0,    0, ...,   12, 1652,  358],
       [   0,    0,    0, ..., 9844, 3506,  355],
       [   0,    0,    0, ...,  100, 5220,    6]])

In [18]:
def get_coefficients(word, *arr):
    return word, np.asarray(arr, dtype='float32')

In [19]:
embeddings_index = dict(get_coefficients(*x.strip().split()) for 
                        x in open(EMBEDDING_FILE, encoding="utf8"))

In [20]:
all_embs = np.stack(embeddings_index.values())
emb_mean, emb_std = all_embs.mean(), all_embs.std()

  """Entry point for launching an IPython kernel.


In [21]:
emb_mean,emb_std

(0.020940498, 0.6441043)

In [22]:
word_index = tokenizer.word_index

In [23]:
words_ = min(max_feature, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (words_, embed_size))

In [24]:
for word, i in word_index.items():
    if i>=max_feature:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i]=embedding_vector

In [25]:
embedding_matrix

array([[ 0.96463343,  0.05253674,  0.41968981, ...,  0.33720069,
        -0.8099663 ,  0.52729261],
       [ 0.41800001,  0.24968   , -0.41242   , ..., -0.18411   ,
        -0.11514   , -0.78580999],
       [ 0.68046999, -0.039263  ,  0.30186   , ..., -0.073297  ,
        -0.064699  , -0.26043999],
       ...,
       [-0.13491   , -0.8635    , -0.033898  , ...,  0.29484999,
        -0.24315999,  0.81682003],
       [ 0.1178    ,  0.14624   , -0.28240001, ..., -0.19529   ,
        -0.13610999,  1.04270005],
       [-0.64388001, -0.54152   ,  0.10305   , ..., -0.06732   ,
        -0.73308003,  0.88625002]])

We are going to use simple bi-directional LSTM with 2 fully connected layers with some dropouts.

In [26]:
input = Input(shape = (maxlen, ))
x = Embedding(max_feature, embed_size, weights=[embedding_matrix])(input)
x = Bidirectional(LSTM(50, return_sequences=True, 
                       dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.1)(x)
x = Dense(6, activation='softmax')(x)

model = Model(inputs = input, output = x)
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', 
             metrics = ['accuracy'])

W0815 12:17:27.227001  4152 deprecation_wrapper.py:119] From C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0815 12:17:27.814034  4152 deprecation_wrapper.py:119] From C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0815 12:17:27.925041  4152 deprecation_wrapper.py:119] From C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0815 12:17:28.054048  4152 deprecation_wrapper.py:119] From C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0815 12:17:28.055048  4152 deprecation_wrapper.py:119] 

In [27]:
model.fit(X_train, y, batch_size=124, validation_split=0.1, epochs=2)

W0815 12:17:31.772261  4152 deprecation.py:323] From C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1d8179e8>

In [28]:
y_test = model.predict([X_test], batch_size=1000, verbose = 1)



In [29]:
sample_submission = pd.read_csv('sample_submission.csv')

In [30]:
sample_submission[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 
               'identity_hate']] = y_test

In [31]:
sample_submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.271073,0.099963,0.275962,0.005367,0.231942,0.115692
1,0000247867823ef7,0.739204,0.001602,0.154545,0.000276,0.099893,0.00448
2,00013b17ad220c46,0.785804,0.001787,0.102981,0.000752,0.104218,0.004458
3,00017563c3f7919a,0.850983,0.000827,0.082514,0.000481,0.064491,0.000705
4,00017695ad8997eb,0.80837,0.001336,0.104233,0.000655,0.083432,0.001972


In [32]:
sample_submission.to_csv('toxic_comment_classification.csv', index = False)