# Importing the necessary Libraries

In [1]:
import numpy as np 
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

## Data Loading and Pre-processing


In [3]:
df = pd.read_csv("C:/Users/hp/Desktop/ELECT/Heavy AI Projects/Toxic-Comment/toxic_train.csv")

In [4]:
df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [5]:
X = df["comment_text"]
y = df[df.columns[2:]].values

In [6]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
MAX_FEATURES = 20000 # Number of words in the vocabulary

In [None]:
vectorizer = TextVectorization(
    max_tokens = MAX_FEATURES,
    output_sequence_length = 1800,
    output_mode = 'int'
)

In [73]:
# Let the model Learn Vocabularies in our dataset
vectorizer.adapt(X.values)

In [74]:
vectorized_text = vectorizer(X.values)

#### MCSHBAP Pipeline
-map\
-cache\
-shuffle\
-batch\
-prefetch

In [75]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps prevent bottlenecks

In [76]:
batch_X, batch_y = dataset.as_numpy_iterator().next()

#### Train_test_val Split
- Train 70%
- Test 20%
- Validation 10%

In [77]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

## Create Sequential Model

In [78]:
from tensorflow.keras.layers import LSTM, Dropout, Dense, Bidirectional, Embedding
from tensorflow.keras.models import Sequential

In [79]:
model = Sequential()

# create embedding layer
model.add(Embedding(MAX_FEATURES+1, 32))

# 'tanh' is for the gpu acceleration
# Bidirectional LSTM layer
model.add(Bidirectional(LSTM(32, activation ='tanh')))

# Fully connected feature extractors
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))

# Output layer
model.add(Dense(6, activation='sigmoid'))

In [80]:
model.compile(loss='BinaryCrossentropy', optimizer= 'Adam')

In [None]:
model.summary()

In [None]:
history = model.fit(train, epochs=1, validation_data = val)

In [None]:
import os.path
if os.path.isfile('toxic_comment_classifier_modified.h5') is False:
    model.save('toxic_comment_classifier_modified.h5')

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8,5))
pd.DataFrame(history.history).plot()
plt.show()

# Make Predictions

In [25]:
text = vectorizer("You sick or something")
results = model.predict(np.expand_dims(text, 0))



array([[0.6637224 , 0.0323654 , 0.26499566, 0.03708137, 0.32203284,
        0.09373499]], dtype=float32)

In [27]:
batch_X, batch_y = test.as_numpy_iterator().next()

# Model Evaluation

In [28]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [29]:
pre = Precision()
rec = Recall()
cat = CategoricalAccuracy()

In [30]:
for batch in test.as_numpy_iterator():
    # Unpack the batch
    X_true, y_true = batch
    # Make a prediction
    yhat = model.predict(X_true)
    
    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    rec.update_state(y_true, yhat)
    cat.update_state(y_true, yhat)









In [32]:
print(f'Precision :{pre.result().numpy()},Recall :{rec.result().numpy()},Accuracy :{cat.result().numpy()}')

Precision :0.813725471496582,Recall :0.6016169786453247,Accuracy :0.4824473559856415


# Test with Gradio

In [None]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text = '{}: {}\n'.format(col, results[0][idx]>0.5)
        
    return text

In [37]:
import gradio as gr
interface = gr.Interface(fn=score_comment,
                         inputs=gr.inputs.Textbox(lines=2, placeholder='comment to score'),
                         outputs='text'
                        )
interface.launch(share=False)

  inputs=gr.inputs.Textbox(lines=2, placeholder='comment to score'),
  inputs=gr.inputs.Textbox(lines=2, placeholder='comment to score'),
  inputs=gr.inputs.Textbox(lines=2, placeholder='comment to score'),


Running on local URL:  http://127.0.0.1:7861

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


