# **Toxic Comment Classification with LSTM (Kaggle challenge)**

In [4]:
# Mount Google Drive at /content/drive; later cells read the CSV and
# embedding files from paths under 'drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


Importing Libraries 

In [5]:
import io

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from google.colab import files
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint


Using TensorFlow backend.


Loading data

In [6]:
# Label columns used as prediction targets.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# Text file with the pre-trained word vectors (parsed in a later cell).
embedding_file = 'drive/My Drive/Toxic_Comment/glove.6B.50d.txt'

# Train and test tables, read from the mounted Drive folder.
train = pd.read_csv('drive/My Drive/Toxic_Comment/train.csv')
test = pd.read_csv('drive/My Drive/Toxic_Comment/test.csv')

# Target array: one column per entry of list_classes.
y = train[list_classes].values
train.head()


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [9]:
# Number of missing values in each column. The vectorised form avoids the
# Python-level sum() over every element that the apply/lambda version performed.
train.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [10]:
# Number of rows in the training table.
print(train.shape[0])

116211


In [14]:
print("class distribution of each class")
# One value_counts() table per label column.
for label in list_classes:
    print(train[label].value_counts())

class distribution of each class
0    105077
1     11134
Name: toxic, dtype: int64
0    115055
1      1156
Name: severe_toxic, dtype: int64
0    110048
1      6163
Name: obscene, dtype: int64
0    115853
1       358
Name: threat, dtype: int64
0    110471
1      5740
Name: insult, dtype: int64
0    115220
1       991
Name: identity_hate, dtype: int64


Preparing the embedding dictionary by reading the embedding text file

In [0]:
def get_embedding_coefs(word, *arr):
    """Turn the fields of one embedding line into a (word, vector) pair.

    Args:
        word: first field of the line, returned unchanged.
        *arr: remaining fields of the line, converted to numbers.

    Returns:
        A tuple ``(word, vector)`` where ``vector`` is a float32 NumPy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse each line of the embedding file into a word -> vector entry.
# The context manager closes the file handle once the dictionary is built,
# and blank lines are skipped because they carry no word to unpack.
with open(embedding_file, 'r', encoding="utf8") as embedding_lines:
    embeddings_index = dict(
        get_embedding_coefs(*line.strip().split())
        for line in embedding_lines
        if line.strip()
    )

In [0]:
# Raw comment text for both splits. Missing comments become empty strings so
# that the tokenizer only ever receives str values (a float NaN has no
# .lower() and would raise during tokenisation).
list_sentences_train = train["comment_text"].fillna("")
list_sentences_test = test["comment_text"].fillna("")

Tokenizing the text comments

In [0]:
max_features = 2000  # passed to the tokenizer as num_words
maxlen = 100         # width handed to pad_sequences for every comment

# The vocabulary is fitted on the training comments only.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Comments -> sequences of integer word ids.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

# Id sequences -> arrays of width maxlen, train first and then test.
X_train, X_test = (
    sequence.pad_sequences(token_ids, maxlen=maxlen)
    for token_ids in (list_tokenized_train, list_tokenized_test)
)

Preparing embedding matrix

In [0]:
# Gather all pre-trained vectors into one 2-D array. np.stack expects a real
# sequence, so the dict view is materialised as a list first.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
# Take the vector width from the loaded embeddings instead of hard-coding it,
# so it always matches the file named by embedding_file.
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Keras Tokenizer ids start at 1, so rows 0..len(word_index) are needed when
# the vocabulary is smaller than max_features; otherwise max_features rows.
nb_words = min(max_features, len(word_index) + 1)
# Start from random values drawn with the mean/std of the pre-trained vectors;
# rows for words without a pre-trained vector keep this initialisation.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Bound by the matrix height rather than max_features so that a small
    # vocabulary cannot index past the last row.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Model selection

In [0]:
def get_model():
    """Build and compile the bidirectional LSTM classifier.

    Reads the notebook-level ``maxlen`` (input width) and ``max_features``
    (embedding vocabulary size). The embedding layer is created without
    initial weights and uses a local ``embed_size`` of 128.

    NOTE(review): the ``embedding_matrix`` prepared earlier in the notebook is
    not referenced here.

    Returns:
        A compiled Keras ``Model`` whose final layer is a 6-unit sigmoid Dense.
    """
    embed_size = 128

    tokens = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size)(tokens)
    recurrent = Bidirectional(
        LSTM(128, return_sequences=True, dropout=0.5, recurrent_dropout=0.1)
    )
    encoded = recurrent(embedded)
    # Pool over the sequence returned by the LSTM (return_sequences=True).
    pooled = GlobalMaxPool1D()(encoded)
    pooled = Dropout(0.5)(pooled)
    hidden = Dense(50, activation="relu")(pooled)
    hidden = Dropout(0.5)(hidden)
    outputs = Dense(6, activation="sigmoid")(hidden)

    model = Model(inputs=tokens, outputs=outputs)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

In [23]:
# Settings for this training run.
batch_size = 32
epochs = 2

model = get_model()
# validation_split=0.1 holds out a tenth of the samples for validation.
model.fit(
    X_train,
    y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

Train on 104589 samples, validate on 11622 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f91ea83b8d0>

In [0]:
# Predict scores for the test comments and write them into the submission
# template, one column per entry of list_classes.
y_test = model.predict(X_test)
# The folder name must match the one used for train.csv and test.csv
# ("Toxic_Comment", capital C).
sample_submission = pd.read_csv('drive/My Drive/Toxic_Comment/sample_submission.csv')
sample_submission[list_classes] = y_test
sample_submission.to_csv("baseline.csv", index=False)
# `files` is the google.colab helper brought in by the notebook's import cell.
files.download('baseline.csv')

In [0]:
from keras.models import Sequential
from keras.layers import CuDNNLSTM, Dense, Bidirectional

In [0]:
def get_model():
    """Build and compile the stacked bidirectional CuDNNLSTM classifier.

    This definition reuses the name ``get_model`` and therefore replaces the
    LSTM/max-pooling variant defined earlier in the notebook. It reads the
    notebook-level ``maxlen`` and ``max_features``; the embedding layer is
    created without initial weights and uses a local ``embed_size`` of 300.

    Returns:
        A compiled Keras ``Model`` whose final layer is a 6-unit sigmoid Dense.
    """
    embed_size = 300

    tokens = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size)(tokens)
    # The first recurrent layer returns the full sequence for the second one.
    sequence_features = Bidirectional(CuDNNLSTM(64, return_sequences=True))(embedded)
    summary = Bidirectional(CuDNNLSTM(64))(sequence_features)
    hidden = Dense(50, activation="relu")(summary)
    hidden = Dropout(0.5)(hidden)
    outputs = Dense(6, activation="sigmoid")(hidden)

    model = Model(inputs=tokens, outputs=outputs)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

In [28]:
# Settings for this training run.
batch_size = 32
epochs = 2

# get_model here resolves to the most recently executed definition.
model = get_model()
model.fit(
    X_train,
    y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

Train on 104589 samples, validate on 11622 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f91c76b6da0>