# Comment Toxicity Model

## Setup

Data available at https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge


In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import tensorflow as tf
from tensorflow import keras

In [3]:
data = pd.read_csv("./data/train.csv")

data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


## Data preprocessing

In [4]:
X = data['comment_text']
y = data.iloc[:,2:].values

### Adding a text vectorizer layer

In [5]:
MAX_FEATURES = 200000 # Number of words in the vocabulary

In [6]:
vectorizer = keras.layers.TextVectorization(
    max_tokens=MAX_FEATURES, output_sequence_length=1800, output_mode="int"
)

In [7]:
vectorizer.adapt(X.values)

In [8]:
# Look at the first 10 words in the vocabulary
vectorizer.get_vocabulary()[:10]

['', '[UNK]', 'the', 'to', 'of', 'and', 'a', 'you', 'i', 'is']

In [9]:
vectorizer('Hello world!')

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([288, 263,   0, ...,   0,   0,   0], dtype=int64)>

In [10]:
vectorized_text = vectorizer(X.values)

In [11]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

### Create dataset

In [12]:
# MCSHBAP
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache().shuffle(160000).batch(16).prefetch(8)

In [13]:
batch_x, batch_y = dataset.as_numpy_iterator().next()

batch_x.shape, batch_y.shape

((16, 1800), (16, 6))

In [14]:
train = dataset.take(int(len(dataset) * 0.7))
val = dataset.skip(int(len(dataset) * 0.7)).take(int(len(dataset) * 0.2))
test = dataset.skip(int(len(dataset) * 0.9)).take(int(len(dataset) * 0.1))

In [15]:
len(train), len(val), len(test)

(6981, 1994, 997)

In [16]:
train_generator = train.as_numpy_iterator()

In [32]:
train_generator.next()

(array([[  289,     4,     6, ...,     0,     0,     0],
        [60309,  1084,  6613, ...,     0,     0,     0],
        [   49,    14,     9, ...,     0,     0,     0],
        ...,
        [  451, 27492, 19386, ...,     0,     0,     0],
        [ 1053,  1109,  1048, ...,     0,     0,     0],
        [   10,  3510,     5, ...,     0,     0,     0]], dtype=int64),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]], dtype=int64))

## Creating Sequential Model

In [34]:
from keras import Sequential
from keras.layers import Bidirectional, LSTM, Dense, Dropout, Embedding

In [35]:
model = Sequential()
#Creating the embedding layer, we are adding 1 because we want to reserve the 0 index
model.add(Embedding(MAX_FEATURES+1, 32))
# Bidirectional LSTM layer
model.add(Bidirectional(LSTM(32, activation='tanh')))
# Feature extractor fully connected layers
model.add(Dense(128,activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128,activation='relu'))
#Final layer
model.add(Dense(6, activation='sigmoid'))

In [36]:
model.compile(loss='binary_crossentropy', optimizer='adam')

In [37]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [38]:
history = model.fit(train, epochs=1, validation_data=val)

   9/6981 [..............................] - ETA: 1:25:58 - loss: 0.6535

KeyboardInterrupt: 