<a href="https://colab.research.google.com/github/iamnikhilpal/deeplearning-practice/blob/main/toxic_comment_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Data source (Jigsaw toxic comment classification challenge, train.csv):
# https://raw.githubusercontent.com/nicknochnack/CommentToxicity/main/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv

In [2]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [3]:
# Ask TensorFlow to grow GPU memory on demand instead of reserving it all
# up front, which helps avoid out-of-memory errors.
gpus = tf.config.experimental.list_physical_devices('GPU')
for device in gpus:
    tf.config.experimental.set_memory_growth(device, True)

# Read the training CSV straight from the public GitHub mirror.
df = pd.read_csv(
    'https://raw.githubusercontent.com/nicknochnack/CommentToxicity/main/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv'
)
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Show one labelled comment (row 42) together with its tag columns.
example_row = df.iloc[42]
example_tags = df[df.columns[2:]].iloc[42]
print(f'Test {example_row["comment_text"]}\n\rTags:\n\r{example_tags}')

# To browse every toxic comment instead: df[df['toxic']==1]

Test You are gay or antisemmitian? 

Archangel WHite Tiger

Meow! Greetingshhh!

Uh, there are two ways, why you do erased my comment about WW2, that holocaust was brutally slaying of Jews and not gays/Gypsys/Slavs/anyone...

1 - If you are anti-semitian, than shave your head bald and go to the skinhead meetings!

2 - If you doubt words of the Bible, that homosexuality is a deadly sin, make a pentagram tatoo on your forehead go to the satanistic masses with your gay pals!


Beware of the Dark Side!
Tags:
toxic            1
severe_toxic     0
obscene          1
threat           0
insult           1
identity_hate    1
Name: 42, dtype: int64


In [5]:
# preprocess the data
# 1. Tokenize the data
from tensorflow.keras.layers import TextVectorization

In [6]:
# Inputs: the raw comment strings.
X = df['comment_text']
# Targets: every column from position 2 onward (the 0/1 tag columns),
# as a NumPy array with one row per comment.
label_columns = df.columns[2:]
y = df[label_columns].values

In [7]:
MAX_FEATURE = 200000  # upper bound on the vocabulary size (number of words kept)

# Turn each comment into a sequence of integer token ids of length 1800.
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURE,
    output_mode='int',
    output_sequence_length=1800,
)

In [8]:
# Build the vocabulary from all comments, then encode the whole corpus.
comment_array = X.values
vectorizer.adapt(comment_array)
vectorizer_text = vectorizer(comment_array)

In [9]:
# tf.data input pipeline. Steps used here, in order:
#   from_tensor_slices -> cache -> shuffle -> batch -> prefetch
dataset = tf.data.Dataset.from_tensor_slices((vectorizer_text, y))
dataset = dataset.cache()
# Shuffle once with a fixed seed and keep that order for every later pass.
# By default shuffle() draws a new order on each iteration, so the
# take()/skip() train/val/test splits built from this dataset would contain
# different rows every epoch and validation/test examples would leak into
# training. The trade-off is that the training order no longer changes
# between epochs.
dataset = dataset.shuffle(160000, seed=42, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
# Keep up to 8 batches prepared ahead of the consumer.
dataset = dataset.prefetch(8)

In [10]:
# Pull a single (features, labels) batch out of the pipeline as NumPy arrays.
batch_iterator = dataset.as_numpy_iterator()
batch_x, batch_y = batch_iterator.next()

In [11]:
# Partition the batched dataset by position: first 70% of batches for
# training, the next 20% for validation, and batches from the 90% mark on
# for testing.
n_batches = len(dataset)
train_batches = int(n_batches*.7)
val_batches = int(n_batches*.2)
test_batches = int(n_batches*.1)

train = dataset.take(train_batches)
val = dataset.skip(train_batches).take(val_batches)
test = dataset.skip(int(n_batches*.9)).take(test_batches)

In [12]:
# Report how many batches ended up in each split.
split_sizes = (len(train), len(val), len(test))
print('Train dataset size(in batches):{}, Validation:{}, Test:{}'.format(*split_sizes))

Train dataset size(in batches):6981, Validation:1994, Test:997


In [13]:
# Look at the first training batch: (token-id matrix, label matrix).
train_generator = train.as_numpy_iterator()
next(train_generator)

(array([[567,  39,  54, ...,   0,   0,   0],
        [ 46,  33,  15, ...,   0,   0,   0],
        [ 56,  28,   2, ...,   0,   0,   0],
        ...,
        [202,   8,  51, ...,   0,   0,   0],
        [600, 112, 433, ...,   0,   0,   0],
        [  8, 617, 798, ...,   0,   0,   0]]), array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 0, 0]]))

In [14]:
# build model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [15]:
# Same stack as a single declarative list: embedding -> BiLSTM -> dense
# feature extractor -> six independent sigmoid outputs (one per tag).
model = Sequential([
    # Token-id embedding; one extra row on top of the vocabulary cap.
    Embedding(MAX_FEATURE+1, 32),
    # Bidirectional LSTM reading the sequence in both directions.
    Bidirectional(LSTM(32, activation='tanh')),
    # Fully connected feature extractor.
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Output layer: one sigmoid unit per label.
    Dense(6, activation='sigmoid'),
])


In [16]:
# Binary cross-entropy pairs with the sigmoid output units; Adam is used
# with its default settings.
model.compile(
    optimizer='Adam',
    loss='BinaryCrossentropy',
)

In [17]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [None]:
# Train for 5 epochs on the training split, scoring the validation split
# after each epoch; the returned History object is kept for inspection.
history = model.fit(
    train,
    validation_data=val,
    epochs=5,
)

Epoch 1/5
Epoch 2/5

In [None]:
# Display the values recorded on the object returned by model.fit
# (presumably one entry per epoch for each tracked quantity).
history.history

In [None]:
from matplotlib import pyplot as plt

# Draw the training curves on an explicit axes. Calling plt.figure() first
# and then DataFrame.plot() leaves the first figure empty, because pandas
# opens its own figure unless it is handed an axes, so the requested size
# was never applied.
fig, ax = plt.subplots(figsize=(8, 5))
pd.DataFrame(history.history).plot(ax=ax)
ax.set(title='Training history', xlabel='Epoch', ylabel='Loss')
plt.show()

In [None]:
# Make a prediction: first encode a hand-written comment with the fitted vectorizer.
sample_comment = 'you freaking suck!'
input_text = vectorizer(sample_comment)

In [None]:
# Add a leading batch axis so the single encoded comment becomes a batch of one.
sample_batch = np.expand_dims(input_text, axis=0)
model.predict(sample_batch)

In [None]:
# Take the first batch of the test split as NumPy arrays.
batch_X, batch_y = next(test.as_numpy_iterator())

In [None]:
# Convert predicted probabilities to 0/1 tags with a 0.5 cut-off.
batch_probabilities = model.predict(batch_X)
(batch_probabilities > 0.5).astype(int)

In [None]:
# Evaluate model
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
# Streaming metrics that are updated batch by batch over the test split.
pre = Precision()
re = Recall()
# The targets are independent 0/1 flags that are flattened before scoring,
# so accuracy has to be computed per element after thresholding.
# CategoricalAccuracy compares argmax positions, which is meaningless for
# multi-label data; BinaryAccuracy (default threshold 0.5) is the right one.
acc = tf.keras.metrics.BinaryAccuracy()

In [None]:
# Accumulate precision, recall and accuracy over every batch of the test split.
for X_true, y_true in test.as_numpy_iterator():
    # verbose=0 stops Keras printing a progress bar for each batch, which
    # would otherwise flood the cell output.
    yhat = model.predict(X_true, verbose=0)

    # Collapse (batch, labels) into one flat vector so each label counts as
    # a separate binary decision.
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

In [None]:
# Read the accumulated metric values and report them on one line.
precision_value = pre.result().numpy()
recall_value = re.result().numpy()
accuracy_value = acc.result().numpy()
print(f'Precision: {precision_value}, Recall: {recall_value}, Accuracy: {accuracy_value}')

In [None]:
# Persist the trained model in HDF5 format, then load it back from the same file.
model_path = 'comment_toxicity_tagger.h5'
model.save(model_path)
model_new = tf.keras.models.load_model(model_path)