## 01. Setup & Data Loading

In [1]:
!pip install tensorflow

Collecting tensorflow
  Using cached tensorflow-2.15.0-cp311-cp311-win_amd64.whl (2.1 kB)
Collecting tensorflow-intel==2.15.0 (from tensorflow)
  Using cached tensorflow_intel-2.15.0-cp311-cp311-win_amd64.whl (300.9 MB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.15.0->tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow-intel==2.15.0->tensorflow)
  Using cached gast-0.5.4-py3-none-any.whl (19 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-intel==2.15.0->tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting libclang>=13.0.0 (from tensorflow-intel==2.15.0->tensorflow)
  Using cached libclang-16.0.6-py2.py3-none-win_amd64.whl (24.4 MB)
Collecting ml-dtypes~=0.2.0 (from tensorflow-intel==2.15.0->tensorflow)
  Using cached ml_dtypes-0.2.0-cp311-cp311-win_amd64.whl (938 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow-intel==2.15.0->tensorflow)
  Using cached opt_

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf




In [6]:
df = pd.read_csv('train.csv') #pandas dataframe containing only the training file

In [7]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [8]:
df.iloc[0]['comment_text'] #check the comment of the first row of data

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [9]:
df[df.columns[2:]] #get the toxicity labels of all comments
df[df.columns[2:]].iloc[0] #get the toxicity labels of the first commment

toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 0, dtype: int64

## 02. Data Preprocessing

### *Tokenizing sentences data*

In [10]:
from tensorflow.keras.layers import TextVectorization

In [11]:
X = df['comment_text']
y = df[df.columns[2:]].values # values helps return a numpy array instead of pd dataframe

In [12]:
max_features = 200000 # number of words in the dictionary

In [13]:
# initializing vectorizing layer
vectorizer = TextVectorization(max_tokens=max_features,
                               output_sequence_length=1800,
                               output_mode='int')




In [14]:
# learn the words in the dictionary, training the vectorizer
vectorizer.adapt(X.values)




In [15]:
vectorized_txt = vectorizer(X.values)

In [16]:
vectorized_txt.shape

TensorShape([159571, 1800])

the input data has 159571 samples with 1800 words (specified above)









In [17]:
# create a tensorflow data pipeline
ds = tf.data.Dataset.from_tensor_slices((vectorized_txt, y)) #
ds = ds.cache()
ds = ds.shuffle(160000)
ds = ds.batch(16)
ds = ds.prefetch(8)

In [18]:
# split data
train = ds.take(int(len(ds)*.7))
val = ds.skip(int(len(ds)*.7)).take(int(len(ds)*.2))
test = ds.skip(int(len(ds)*.9)).take(int(len(ds)*.1))

## 03. Create Deep Learning Sequential Model

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [20]:
model = Sequential()
# embedding layer
model.add(Embedding(max_features+1, 32))
# biderctional LSTM layer
model.add(Bidirectional(LSTM(32, activation='tanh')))
# dense layers; feature extractor
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# final layer
model.add(Dense(6, activation='sigmoid'))

In [21]:
# compile model
model.compile(loss='BinaryCrossentropy', # Binary for an effective classification in this case of zeros and ones
              optimizer='Adam')




In [22]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [23]:
hist = model.fit(train, epochs=10, validation_data=val)

   8/6981 [..............................] - ETA: 2:52:55 - loss: 0.6712

KeyboardInterrupt: 

In [24]:
# upload trained model 
from tensorflow.keras.models import load_model
model = load_model('toxicity.h5')

## 04. Make Prediction 

In [41]:
input_txt = vectorizer('you freaking suck! i am going to hurt you.')

In [42]:
pred = model.predict(np.expand_dims(input_txt, 0))
pred



array([[0.05002171, 0.00021458, 0.02343961, 0.00127284, 0.01057333,
        0.00277017]], dtype=float32)

In [43]:
batch_X, batch_y = test.as_numpy_iterator().next()

In [48]:
(model.predict(batch_X)>0.5).astype(int)



array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

## 05. Model Evaluation

In [49]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [50]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [52]:
for batch in test.as_numpy_iterator():
    # unpack the batch
    X_true, y_true = batch 
    # make prediction
    y_pred = model.predict(X_true) 
    # flatten values; making it into one vector
    y_true = y_true.flatten() 
    y_pred = y_pred.flatten()
    
    # evaluate predictions
    pre.update_state(y_true, y_pred)
    re.update_state(y_true, y_pred)
    acc.update_state(y_true, y_pred)  



In [53]:
print(f'Precision: {pre.result().numpy()}, Recall: {re.result().numpy()}, Accuracy: {acc.result().numpy()}')

Precision: 0.3501606285572052, Recall: 0.22414806485176086, Accuracy: 0.16950853168964386
