# JIGSAW TOXIC COMMENT CLASSIFICATION CHALLENGE

In [8]:
import os
import numpy as np
import pandas as pd
from keras.layers import Dense, Input, LSTM, Bidirectional, Conv1D
from keras.layers import Dropout, Embedding
from keras.preprocessing import text, sequence
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D
from keras.models import Model


In [9]:
# Import dataset
# Directory holding the preprocessed CSVs and the GloVe vectors. Set the
# JTCC_DATA_DIR environment variable to point somewhere else; when it is unset
# the original author's local directory is used, so existing runs are unchanged.
path = os.environ.get(
    'JTCC_DATA_DIR',
    '/home/kenneth/Documents/MLDM M2/DEEP LEARNING/PROJECT_JTCC/JigSaw-Toxic-Comment-Classification-Challenge/DATASETS/PREPROCESSED',
)

EMBEDDING_FILE = os.path.join(path, 'glove.txt')
# Load both splits; every missing cell (in any column) is replaced by a single space.
train_x = pd.read_csv(os.path.join(path, 'train.csv')).fillna(" ")
test_x = pd.read_csv(os.path.join(path, 'test.csv')).fillna(" ")

In [10]:
# Hyper-parameters shared by the tokenizer, the padding step and the embedding layer.
max_features = 100000  # vocabulary cap handed to the tokenizer as num_words
maxlen = 150           # sequence length used when padding
embed_size = 300       # dimensionality of each word vector

# Missing values were already replaced by " " when the CSVs were loaded. The
# former bare `...['comment_text'].fillna(' ')` statements discarded their
# result and therefore did nothing, so they have been removed.

# Extract the six label columns BEFORE train_x is rebound to the text column.
train_y = train_x[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values
train_x = train_x['comment_text'].str.lower()

test_x = test_x['comment_text'].str.lower()


# Vectorize text + Prepare GloVe Embedding
# The vocabulary is fitted on the training comments only.
tokenizer = text.Tokenizer(num_words = max_features, lower=True)
tokenizer.fit_on_texts(list(train_x))


In [11]:
# Map every comment to its sequence of token ids with the fitted tokenizer,
# then bring all sequences to a common length of `maxlen`.
train_x = sequence.pad_sequences(tokenizer.texts_to_sequences(train_x), maxlen=maxlen)
test_x = sequence.pad_sequences(tokenizer.texts_to_sequences(test_x), maxlen=maxlen)

In [12]:
# Word Embeddings
# Parse the embedding text file into a {token: vector} lookup. Each usable line
# is expected to be "<token> <v1> ... <v_embed_size>".
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        # Split from the right at most `embed_size` times so that a token which
        # itself contains spaces stays in one piece instead of breaking the
        # float conversion below.
        values = line.rstrip().rsplit(' ', embed_size)
        if len(values) != embed_size + 1:
            # Header or malformed line: it cannot supply an embed_size-long vector.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

if not embeddings_index:
    # Fail loudly rather than silently training on an all-zero embedding matrix
    # (e.g. when the file's dimensionality does not match embed_size).
    raise ValueError(
        'No %d-dimensional vectors could be read from %s' % (embed_size, EMBEDDING_FILE)
    )

# Row i of the matrix holds the vector of the token whose tokenizer index is i;
# tokens without a pretrained vector keep an all-zero row.
word_index = tokenizer.word_index
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))
for word, i in word_index.items():
    if i >= num_words:
        # Index falls outside the capped vocabulary.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [13]:
def firstLayer(X, m):
    """Return sqrt(N*(m+2)) + 2*sqrt(N/(m+2)), where N = len(X).

    Presumably a sizing heuristic for the first hidden layer of a network
    with ``m`` outputs -- confirm against the intended reference.

    X: any sized collection; only its length is used.
    m: number added to 2 to form the scaling term.
    """
    n_samples = len(X)
    scale = m + 2
    ratio_term = 2 * np.sqrt(n_samples / scale)
    product_term = np.sqrt(n_samples * scale)
    return ratio_term + product_term
def secondLayer(X, m):
    """Return m * sqrt(N/(m+2)), where N = len(X).

    Presumably a sizing heuristic for the second hidden layer of a network
    with ``m`` outputs -- confirm against the intended reference.

    X: any sized collection; only its length is used.
    m: multiplier, also used (plus 2) as the divisor under the square root.
    """
    n_samples = len(X)
    root = np.sqrt(n_samples / (m + 2))
    return m * root

In [14]:
# Evaluate firstLayer with N = len(train_x) and m = 6 (6 is the number of label
# columns in train_y). The value is only displayed; it is not stored or reused.
firstLayer(train_x, 6)

1412.3163597438077

In [15]:
# Evaluate secondLayer with N = len(train_x) and m = 6 (6 is the number of label
# columns in train_y). The value is only displayed; it is not stored or reused.
secondLayer(train_x, 6)

847.3898158462846

In [19]:
# Build Model
# Input: one padded sequence of `maxlen` token ids per comment.
inputshape = Input(shape=(maxlen,))
#encoder
# The embedding's input_dim is read from the matrix itself so the layer always
# matches the supplied weights, including when the fitted vocabulary is
# smaller than max_features (the matrix then has fewer than max_features rows).
encoder = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix], trainable=True)(inputshape)
encoder = SpatialDropout1D(0.35)(encoder)
# return_sequences=True keeps one output per timestep for the convolution below.
encoder = Bidirectional(LSTM(128, return_sequences=True, dropout=0.15, recurrent_dropout=0.15))(encoder)
encoder = Conv1D(64, kernel_size=3, padding='valid', kernel_initializer='glorot_uniform')(encoder)

# Collapse the time axis two ways (mean and max) and join the results.
avg_pool = GlobalAveragePooling1D()(encoder)
max_pool = GlobalMaxPooling1D()(encoder)
encoder = concatenate([avg_pool, max_pool])
#decoder
# Six independent sigmoid outputs, one per label column in train_y.
decoder = Dense(6, activation='sigmoid')(encoder)

model = Model(inputshape, decoder)
model.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=['accuracy'])
model.summary()


Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 150, 300)     30000000    input_3[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_2 (SpatialDro (None, 150, 300)     0           embedding_3[0][0]                
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 150, 256)     439296      spatial_dropout1d_2[0][0]        
____________________________________________________________________________________________

In [20]:
# Training: fit the compiled model on the padded training sequences.
# batch_size is reused later when predicting on the test set.
batch_size, epochs = 32, 1
model.fit(x=train_x, y=train_y, epochs=epochs, batch_size=batch_size, verbose=1)

Epoch 1/1


<keras.callbacks.callbacks.History at 0x7fd7bebe0dd8>

In [21]:
# Score the padded test sequences with the fitted model.
predictions = model.predict(x=test_x, verbose=1, batch_size=batch_size)



In [23]:
# Display the model outputs for the test set (the output layer has six sigmoid units).
predictions

array([[9.8534811e-01, 4.3440899e-01, 9.8330569e-01, 5.2334607e-02,
        8.9475262e-01, 1.5655231e-01],
       [1.7772913e-03, 6.7263842e-05, 2.6789308e-04, 3.0097365e-04,
        2.7653575e-04, 2.0280480e-04],
       [3.5466254e-03, 3.0556321e-04, 6.9993734e-04, 1.3738871e-03,
        5.5083632e-04, 1.3113618e-03],
       ...,
       [9.3677640e-04, 1.9192696e-05, 1.3113022e-04, 6.6518784e-05,
        1.3068318e-04, 1.0955334e-04],
       [3.4124553e-03, 3.0755997e-05, 2.3522973e-04, 1.1944771e-04,
        9.1484189e-04, 2.1167696e-03],
       [9.3823326e-01, 2.8718054e-02, 8.2268226e-01, 1.0962993e-02,
        5.2494091e-01, 2.3588002e-02]], dtype=float32)

In [24]:
# Load the sample submission, overwrite its six label columns with the model's
# predictions, and save the result next to the input data.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
sample_path = os.path.join(path, 'sample_submission.csv')
output_path = os.path.join(path, 'submission2.csv')

submission = pd.read_csv(sample_path)
submission[label_columns] = predictions
submission.to_csv(output_path, index=False)