In [30]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import TextVectorization, LSTM, Dropout, Bidirectional, Dense, Embedding
from tensorflow.keras.models import Sequential
import gradio

In [2]:
df = pd.read_csv("data/train.csv")
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
X = df["comment_text"]
y = df[df.columns[2:]].values
MAX_FEATURES = 200000 # number of words in the vocabulary

In [4]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=1800, # max length of a comment
                               output_mode="int") 

vectorizer.adapt(X.values)

In [5]:
vectorized_text = vectorizer(X.values)

In [6]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

In [7]:
train = dataset.take(int(len(dataset)* 0.7))    # 70% of the dataset
val = dataset.skip(int(len(dataset)* 0.7)).take(int(len(dataset)* 0.2)) # 20% of the dataset
test = dataset.skip(int(len(dataset)* 0.9)).take(int(len(dataset)* 0.1)) # 10% of the dataset

In [8]:
train_generator = train.as_numpy_iterator()

In [9]:
model = Sequential()
model.add(Embedding(MAX_FEATURES+1, 32))
model.add(Bidirectional(LSTM(32, activation='tanh')))
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(6, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [10]:
history = model.fit(train, validation_data=val, epochs=1)



In [24]:
text = vectorizer("Hello")
res = model.predict(np.expand_dims(text, 0))

res = pd.DataFrame(res, columns=df.columns[2:])
res

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.008465,7.926457e-07,0.00037,6.7e-05,0.000633,0.000122


In [25]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

pre = Precision()
rec = Recall()
acc = CategoricalAccuracy()

In [26]:
for batch in test.as_numpy_iterator():
    X_true, y_true = batch
    yhat = model.predict(X_true)
    
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    rec.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

In [27]:
print(f'Precision: {pre.result().numpy()}, Recall: {rec.result().numpy()}, Accuracy: {acc.result().numpy()}')

Precision: 0.8438818454742432, Recall: 0.6620689630508423, Accuracy: 0.47943830490112305


In [28]:
model.save("toxicity_detector.h5")

In [32]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ""
    for idx, col in enumerate(df.columns[2:]):
        text += f"{col}: {results[0][idx]>0.5}\n"
        
    return text

In [33]:
interface = gradio.Interface(fn=score_comment,
                             inputs= gradio.inputs.Textbox(lines=2, placeholder="Enter your comment here..."),
                             outputs="text")



In [35]:
interface.launch()

Rerunning server... use `close()` to stop if you need to change `launch()` parameters.
----
Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


(<gradio.routes.App at 0x216fe043040>, 'http://127.0.0.1:7860/', None)