In [1]:
# !pip install tensorflow opencv-python matplotlib
# !pip install opencv-contrib-python
# !pip list


In [2]:
import keras;
import numpy as np
import pandas as pd
import tensorflow as tf
import os

In [3]:
gpus=tf.config.experimental.list_physical_devices('GPU')
gpus

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
df=pd.read_csv('/content/train.csv')

In [5]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
df.iloc[0]['comment_text']

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [7]:
df[df.columns[2:]]

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
...,...,...,...,...,...,...
159566,0,0,0,0,0,0
159567,0,0,0,0,0,0
159568,0,0,0,0,0,0
159569,0,0,0,0,0,0


In [8]:
df[df['toxic']==1].head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
12,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0
16,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0


## Text Preprocessing

In [9]:
from tensorflow.keras.layers import TextVectorization

In [10]:
X=df['comment_text']
y=df[df.columns[2:]].values

In [11]:
X.head()

Unnamed: 0,comment_text
0,Explanation\nWhy the edits made under my usern...
1,D'aww! He matches this background colour I'm s...
2,"Hey man, I'm really not trying to edit war. It..."
3,"""\nMore\nI can't make any real suggestions on ..."
4,"You, sir, are my hero. Any chance you remember..."


In [12]:
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [13]:
df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [14]:
MAX_FEATURES=20000 ## no of words in vocab

In [15]:
type(X.values)

numpy.ndarray

In [16]:
vectorize=TextVectorization(max_tokens=MAX_FEATURES,
                            output_sequence_length=1800,
                            output_mode='int')
vectorize.adapt(X.values)

In [17]:
vectorize('hello world, life is great')[:5]

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([288, 263, 306,   9, 275])>

In [18]:
print(vectorize.get_vocabulary())



In [19]:
vectorize_text=vectorize(X.values)
vectorize_text.shape

TensorShape([159571, 1800])

Together, this pipeline:

Loads the data from memory/tensors,

Caches it for faster reuse,

Randomizes the sample order,

Batches it into small chunks,

Prefetches batches to keep training fast and efficient.

In [20]:
dataset = tf.data.Dataset.from_tensor_slices((vectorize_text, y))
dataset=dataset.cache()
dataset=dataset.shuffle(160000)
dataset=dataset.batch(16)
dataset=dataset.prefetch(8)

In [21]:
batch_x,batch_y=dataset.as_numpy_iterator().next()

In [22]:
print(batch_x.shape,batch_y.shape)

(16, 1800) (16, 6)


In [23]:
train=dataset.take(int(len(dataset)*.7))  #70%
val=dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2)) # skip 70% take 20%
test=dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1)) # skip 90% take 10%

In [24]:
train_generator=train.as_numpy_iterator()

In [25]:
train_generator.next()

(array([[  35,    2, 1619, ...,    0,    0,    0],
        [  64,   17, 7899, ...,    0,    0,    0],
        [5808, 1114,    4, ...,    0,    0,    0],
        ...,
        [   8,   25,  273, ...,    0,    0,    0],
        [  15,    3,  385, ...,    0,    0,    0],
        [ 425,    1,    5, ...,    0,    0,    0]]),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 1, 0]]))

## Create Sequential Data

In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dropout ,Bidirectional,Dense,Embedding


In [27]:
model = Sequential()
model.add(Embedding(MAX_FEATURES+1, 32))  # Embedding layer
model.add(Bidirectional(LSTM(32, activation='tanh')))  # LSTM in both directions
model.add(Dense(128, activation='relu'))  # Dense layer 1
model.add(Dense(256, activation='relu'))  # Dense layer 2
model.add(Dense(128, activation='relu'))  # Dense layer 3
model.add(Dense(6, activation='sigmoid'))  # Output layer


In [28]:
model.compile(optimizer='Adam',loss='BinaryCrossentropy',metrics=['acc'])

In [29]:
model.summary()

In [30]:
history=model.fit(
    train,
    validation_data=val,
    epochs=1,
)

[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m707s[0m 100ms/step - acc: 0.9537 - loss: 0.0850 - val_acc: 0.9940 - val_loss: 0.0531


In [31]:
batch=test.as_numpy_iterator().next()

In [32]:
input_text=vectorize('You freaking suck!');


In [33]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [41]:
model.predict(np.expand_dims(input_text,0))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step


array([[0.9804536 , 0.09886777, 0.8622591 , 0.01477769, 0.64873904,
        0.09457114]], dtype=float32)

In [34]:
res=model.predict(np.expand_dims(input_text,0))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 377ms/step


In [35]:
batch_x,batch_y=test.as_numpy_iterator().next()

In [42]:
batch_y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [36]:
(model.predict(batch_x)>0.5).astype(int)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 274ms/step


## Evaluate Model

In [37]:
from tensorflow.keras.metrics import Precision,Recall,CategoricalAccuracy

In [38]:
pre=Precision()
re=Recall()
acc=CategoricalAccuracy()

In [39]:
for batch in test.as_numpy_iterator():
    X_true,y_true=batch
    yhat=model.predict(X_true)

    y_true=y_true.flatten()
    yhat=yhat.flatten()

    pre.update_state(y_true,yhat)
    re.update_state(y_true,yhat)
    acc.update_state(y_true,yhat)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77

In [40]:
print(f'Precision:{pre.result().numpy()}')
print(f'Recall:{re.result().numpy()}')
print(f'Accuracy:{acc.result().numpy()}')

Precision:0.8728191256523132
Recall:0.5581327080726624
Accuracy:0.454363077878952
