<a href="https://colab.research.google.com/github/iamharkirat/NLP/blob/main/toxic_comments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [4]:
df = pd.read_csv('/content/drive/MyDrive/ML Projects/toxic_comments/data/train.csv')
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
df.iloc[2]['comment_text']

"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info."

In [12]:
df[df.columns[2:]].iloc[2]

toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 2, dtype: int64

# Processing the Data

In [10]:
from tensorflow.keras.layers import TextVectorization

In [13]:
x = df['comment_text']
y = df[df.columns[2:]].values

In [14]:
MAX_FEATURES = 200000 # no of words in the vocab

In [15]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=2000,
                               output_mode='int')

In [16]:
vectorizer.adapt(x.values)

In [17]:
vectorizer('hello world, life is great')

<tf.Tensor: shape=(2000,), dtype=int64, numpy=array([288, 263, 306, ...,   0,   0,   0])>

In [18]:
vectorized_text = vectorizer(x.values)

In [19]:
vectorized_text

<tf.Tensor: shape=(159571, 2000), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]])>

In [21]:
# MCSHBAP - map, cache, shuffle, batch, prefetch
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps prevent bottlenecks

In [24]:
batch_x, batch_y = dataset.as_numpy_iterator().next()

In [25]:
batch_x.shape

(16, 2000)

In [26]:
batch_y.shape

(16, 6)

In [27]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

# Build the Neural Network

In [34]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [35]:
model = Sequential()

# create embedding layer
model.add(Embedding(MAX_FEATURES+1, 32))
model.add(Bidirectional(LSTM(32, activation ='tanh')))

# feature extractor fully connected layers
model.add(Dense(128, activation ='relu'))
model.add(Dense(256, activation ='relu'))
model.add(Dense(128, activation ='relu'))

# final layer
model.add(Dense(6, activation ='sigmoid'))


In [36]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [38]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [39]:
history = model.fit(train, epochs=1, validation_data=val)



# Make Predictions

In [40]:
input_text=vectorizer('You freaking suck!')

In [42]:
model.predict(np.expand_dims(input_text,0))



array([[0.996236  , 0.17105964, 0.9524967 , 0.02551672, 0.78477395,
        0.12287587]], dtype=float32)

In [43]:
res=model.predict(np.expand_dims(input_text, 0))



In [44]:
batch_x, batch_y = test.as_numpy_iterator().next()
model.predict(batch_x)



array([[1.33049674e-04, 4.55413584e-07, 1.08961205e-04, 2.15230875e-06,
        5.80446467e-05, 1.18346707e-05],
       [7.94720650e-03, 3.48976246e-05, 3.90823372e-03, 1.20037083e-04,
        1.70258794e-03, 3.16705526e-04],
       [6.97100710e-04, 5.28332566e-06, 5.19452908e-04, 1.98313628e-05,
        3.11668031e-04, 7.75714652e-05],
       [5.70736825e-03, 1.87804344e-05, 2.83265347e-03, 6.99084849e-05,
        1.14262046e-03, 1.92163177e-04],
       [3.78295709e-03, 3.95693605e-05, 2.15905998e-03, 1.22457306e-04,
        1.30960252e-03, 3.53056559e-04],
       [1.61430463e-02, 1.11396155e-04, 7.90422037e-03, 3.17821308e-04,
        3.77767743e-03, 7.55977759e-04],
       [3.06276940e-02, 3.65301181e-04, 1.35231521e-02, 9.30752372e-04,
        7.88562186e-03, 1.93844305e-03],
       [5.23028243e-03, 1.58826060e-05, 2.70634238e-03, 5.83982037e-05,
        1.05492061e-03, 1.69158302e-04],
       [9.93081834e-04, 8.55088638e-06, 7.20464799e-04, 3.07084338e-05,
        4.34369751e-04, 

In [45]:
(model.predict(batch_x)>0.5).astype(int)



array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0]])

# Evaluating the Model

In [46]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [47]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [48]:
for batch in test.as_numpy_iterator():
    # Unpack the batch
    X_true, y_true = batch
    # Make a prediction
    yhat = model.predict(X_true)

    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)



In [49]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')


Precision: 0.8580375909805298, Recall:0.6158226132392883, Accuracy:0.49047142267227173
