In [None]:
import os                                   #imports
import tensorflow as tf
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.metrics import Precision,Recall,CategoricalAccuracy
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping
from tensorflow.keras.layers import Dropout,Dense,Embedding,LSTM,TextVectorization,Bidirectional,GRU
from tensorflow.keras.models import Sequential

In [None]:
# Load the training dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="train.csv")

In [None]:
df  # bare last expression: renders the loaded frame as the cell output

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [None]:
# Remove exact duplicate rows, then discard the `id` column.
# The guard makes the cell safe to re-run: once `id` is gone a second execution
# is a no-op, whereas the unguarded in-place version de-duplicated again on the
# remaining columns and then raised KeyError on the missing `id` column.
# NOTE(review): de-duplication runs while `id` is still present, so only rows
# identical in every column (id included) are removed — confirm that is the
# intended notion of "duplicate".
if "id" in df.columns:
    df = df.drop_duplicates().drop(columns="id")

In [None]:
# Features: the raw comment strings.
X = df.loc[:, "comment_text"]
# Targets: every column after the first (selected by position), as a NumPy array.
y = df.iloc[:, 1:].to_numpy()

In [None]:
# Two-stage split with a fixed seed: 30% of the rows are held out first, and
# that hold-out is then divided 60/40 into validation and test parts.
x_train, x_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)
x_val, x_test, y_val, y_test = train_test_split(
    x_temp, y_temp,
    test_size=0.4,
    random_state=42,
)

In [None]:
y  # inspect the target array built from the columns after `comment_text`

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [None]:
# Upper bound on the vocabulary size handed to the vectorizer.
max_features = 100000
# Text -> integer token ids, with a fixed output length of 200 per comment.
vectorizer = TextVectorization(
    output_mode='int',
    output_sequence_length=200,
    max_tokens=max_features,
)

In [None]:
# Fit the vectorizer's vocabulary on the training comments only.
vectorizer.adapt(x_train.to_numpy())

In [None]:
# Convert the comments of each split (train, validation, test — in that order)
# into integer sequences with the adapted vectorizer.
train_vectorized_text, val_vectorized_text, test_vectorized_text = (
    vectorizer(split.values) for split in (x_train, x_val, x_test)
)

In [None]:
# Training pipeline: cache -> shuffle (buffer of 160000 elements) -> batches of
# 64 -> prefetch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_vectorized_text, y_train))
    .cache()
    .shuffle(160000)
    .batch(64)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation and test pipelines are only batched and prefetched (no shuffling).
val_dataset = (
    tf.data.Dataset.from_tensor_slices((val_vectorized_text, y_val))
    .batch(64)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = (
    tf.data.Dataset.from_tensor_slices((test_vectorized_text, y_test))
    .batch(64)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
def build_model(rnn_layer, vocab_size=None, embedding_dim=128, rnn_units=64,
                dense_units=128, dropout_rate=0.2, num_labels=6):
    """Build and compile a bidirectional recurrent multi-label classifier.

    Called with only ``rnn_layer`` this produces exactly the architecture that
    was previously hard-coded; the extra keyword arguments expose those
    constants for experimentation.

    Args:
        rnn_layer: Recurrent layer class (e.g. ``LSTM`` or ``GRU``). It is
            instantiated as ``rnn_layer(rnn_units)`` and wrapped in
            ``Bidirectional``.
        vocab_size: Input dimension of the embedding layer. When ``None``
            (default) it is ``max_features + 1``, read from the notebook
            namespace at call time.
        embedding_dim: Size of each embedding vector.
        rnn_units: Units passed to ``rnn_layer``.
        dense_units: Width of each of the two hidden ``Dense`` layers.
        dropout_rate: Rate of the ``Dropout`` layer between the hidden layers.
        num_labels: Number of sigmoid outputs (one per label).

    Returns:
        A compiled ``Sequential`` model using the Adam optimizer, binary
        cross-entropy loss, and the metrics ``auc`` (multi-label),
        ``precision`` and ``recall``.
    """
    if vocab_size is None:
        vocab_size = max_features + 1

    # Reset Keras' global state before constructing a new model. Note this runs
    # at build time (each call to build_model), not after a training run.
    tf.keras.backend.clear_session()

    model = Sequential([
        # mask_zero=True: token id 0 is treated as padding by downstream layers.
        Embedding(vocab_size, embedding_dim, mask_zero=True),
        Bidirectional(rnn_layer(rnn_units)),
        Dense(dense_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(dense_units, activation='relu'),
        # Independent sigmoid per label: a comment may carry several labels.
        Dense(num_labels, activation='sigmoid')
    ])
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.AUC(name = "auc",multi_label=True),
                 tf.keras.metrics.Precision(name = "precision"),
                 tf.keras.metrics.Recall(name = "recall")]
    )

    return model

In [None]:
# One model per recurrent cell type (LSTM first, then GRU); everything else in
# the architecture is shared through build_model.
model_lstm, model_gru = (build_model(cell) for cell in (LSTM, GRU))

In [None]:
# Training callbacks, both driven by validation AUC (higher is better).
# Any fit call that reuses this list shares these callback instances and the
# single checkpoint path below.
callbacks = [
    # Stop once val_auc has failed to improve for 2 epochs and roll the model
    # back to the weights of its best epoch.
    EarlyStopping(
        restore_best_weights=True,
        patience=2,
        mode="max",
        monitor="val_auc",
    ),
    # Write the model to disk only when val_auc improves.
    ModelCheckpoint(
        "best_model.keras",
        save_best_only=True,
        mode="max",
        monitor="val_auc",
    ),
]

In [None]:
# Train the GRU model for at most 5 epochs and report the wall-clock duration.
start = time.time()
history_gru = model_gru.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=5,
    callbacks=callbacks,
)
print("Time taken for training by GRU:", time.time() - start)

Epoch 1/5
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 31ms/step - auc: 0.8764 - loss: 0.1016 - precision: 0.6962 - recall: 0.4521 - val_auc: 0.9651 - val_loss: 0.0475 - val_precision: 0.8222 - val_recall: 0.6564
Epoch 2/5
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 37ms/step - auc: 0.9785 - loss: 0.0400 - precision: 0.8344 - recall: 0.7207 - val_auc: 0.9580 - val_loss: 0.0497 - val_precision: 0.7468 - val_recall: 0.7495
Epoch 3/5
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 31ms/step - auc: 0.9883 - loss: 0.0302 - precision: 0.8582 - recall: 0.7995 - val_auc: 0.9464 - val_loss: 0.0552 - val_precision: 0.7639 - val_recall: 0.7327
Time taken for training by GRU: 181.2048463821411


In [None]:
# Train the LSTM model for at most 5 epochs and report the wall-clock duration.
#
# Fresh callbacks are built for this run instead of reusing the shared
# `callbacks` list: a ModelCheckpoint instance remembers the best monitored
# value it has seen, so a list already used for another fit would only save
# this model if it beat that earlier run's val_auc — and would then overwrite
# the other model's file. A dedicated path keeps the LSTM checkpoint separate.
lstm_callbacks = [
    EarlyStopping(
        monitor="val_auc",
        patience=2,
        mode="max",
        restore_best_weights=True
    ),
    ModelCheckpoint(
        "best_model_lstm.keras",
        monitor="val_auc",
        mode="max",
        save_best_only=True
    )
]

start = time.time()
history_lstm = model_lstm.fit(
    train_dataset,
    epochs=5,
    validation_data=val_dataset,
    callbacks=lstm_callbacks,
)
print("Time taken for training by LSTM:", time.time() - start)

Epoch 1/5
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 32ms/step - auc: 0.8689 - loss: 0.1033 - precision: 0.7006 - recall: 0.4308 - val_auc: 0.9650 - val_loss: 0.0489 - val_precision: 0.8368 - val_recall: 0.6279
Epoch 2/5
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 32ms/step - auc: 0.9754 - loss: 0.0407 - precision: 0.8324 - recall: 0.7144 - val_auc: 0.9628 - val_loss: 0.0479 - val_precision: 0.8092 - val_recall: 0.6795
Epoch 3/5
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 32ms/step - auc: 0.9847 - loss: 0.0314 - precision: 0.8548 - recall: 0.7889 - val_auc: 0.9539 - val_loss: 0.0512 - val_precision: 0.8105 - val_recall: 0.6746
Time taken for training by LSTM: 171.47623252868652


In [None]:
history_lstm.history.keys()  # names of the per-epoch metrics recorded by the LSTM fit

dict_keys(['auc', 'loss', 'precision', 'recall', 'val_auc', 'val_loss', 'val_precision', 'val_recall'])

In [None]:
# Highest validation AUC reached in any epoch, per model.
best_val_auc_gru = max(history_gru.history['val_auc'])
best_val_auc_lstm = max(history_lstm.history['val_auc'])
print(f"Max AUC for GRU: {best_val_auc_gru}")
print(f"Max AUC for LSTM: {best_val_auc_lstm}")

Max AUC for GRU: 0.9651054739952087
Max AUC for LSTM: 0.9650406241416931


In [None]:
# Highest validation precision reached in any epoch, per model. The maximum is
# taken independently of the other metrics, so it may come from a different
# epoch than the best AUC or recall.
best_val_precision_gru = max(history_gru.history['val_precision'])
best_val_precision_lstm = max(history_lstm.history['val_precision'])
print(f"Max Precision for GRU: {best_val_precision_gru}")
print(f"Max Precision for LSTM: {best_val_precision_lstm}")

Max Precision for GRU: 0.8221603631973267
Max Precision for LSTM: 0.8367908596992493


In [None]:
# Highest validation recall reached in any epoch, per model. The maximum is
# taken independently of the other metrics, so it may come from a different
# epoch than the best AUC or precision.
best_val_recall_gru = max(history_gru.history['val_recall'])
best_val_recall_lstm = max(history_lstm.history['val_recall'])
print(f"Max Recall for GRU: {best_val_recall_gru}")
print(f"Max Recall for LSTM: {best_val_recall_lstm}")

Max Recall for GRU: 0.7495235204696655
Max Recall for LSTM: 0.6794790625572205


In [None]:
def predictor(ip_text, model_name, threshold=0.5):
    """Print the labels a model assigns to one vectorized comment.

    Args:
        ip_text: A single, already vectorized comment without a batch axis;
            a leading axis of size 1 is added before calling the model.
        model_name: Object exposing ``predict``. Row 0 of its output is read
            as one score per label, in the order of ``labels`` below.
        threshold: A label is reported when its score is strictly greater than
            this value. Defaults to 0.5, the value that used to be hard-coded.

    Returns:
        None. The header line ``This comment is: `` is printed, followed by one
        line per reported label, or ``safe`` when no label is reported.
    """
    labels = ["toxic", 'severe_toxic', "obscene", "threat", "insult", "identity_hate"]
    pred = model_name.predict(np.expand_dims(ip_text, 0))
    scores = pred[0]

    # Index by label position so an output with fewer scores than labels still
    # fails loudly instead of being silently truncated.
    flagged_labels = [
        label for idx, label in enumerate(labels) if scores[idx] > threshold
    ]

    print("This comment is: ")
    for label in flagged_labels:
        print(label)
    if not flagged_labels:
        print("safe")

In [None]:
# Vectorize a sample sentence and print the LSTM model's labels for it.
ip_text = vectorizer("I absolutely hate you!")
predictor(model_name=model_lstm, ip_text=ip_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 347ms/step
This comment is: 
toxic


In [None]:
# Same sample sentence, scored by the GRU model.
ip_text = vectorizer("I absolutely hate you!")
predictor(model_name=model_gru, ip_text=ip_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 206ms/step
This comment is: 
toxic


In [None]:
# Vectorize a threatening sample sentence and print the LSTM model's labels.
ip3_text = vectorizer("I will kill you!")
predictor(model_name=model_lstm, ip_text=ip3_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
This comment is: 
toxic
obscene
insult


In [None]:
# Same threatening sample sentence, scored by the GRU model.
ip3_text = vectorizer("I will kill you!")
predictor(model_name=model_gru, ip_text=ip3_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
This comment is: 
toxic
obscene
insult


In [None]:
# Vectorize a friendly sample sentence and print the LSTM model's labels.
ip2_text = vectorizer("I love you!")
predictor(model_name=model_lstm, ip_text=ip2_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
This comment is: 
safe


In [None]:
# Same friendly sample sentence, scored by the GRU model.
ip3_text = vectorizer("I love you!")
predictor(model_name=model_gru, ip_text=ip3_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
This comment is: 
safe


In [None]:
# Stand-alone metric objects. Nothing in the visible notebook reads them;
# evaluation_of_model creates its own Precision/Recall instances.
pre, rec = Precision(), Recall()

In [None]:
def evaluation_of_model(model_name, dataset=None, threshold=0.3):
    """Print precision, recall and multi-label AUC of a model on a dataset.

    Args:
        model_name: Model exposing ``predict``; its outputs are used as
            per-label scores.
        dataset: Iterable of ``(inputs, labels)`` batches. When ``None``
            (default) the notebook-level ``test_dataset`` is used.
        threshold: Scores strictly greater than this are counted as positive
            predictions for precision and recall. Defaults to 0.3, the value
            that used to be hard-coded. AUC is computed from the raw scores.

    Returns:
        None. The three metric values are printed.
    """
    if dataset is None:
        dataset = test_dataset

    precision = Precision()
    recall = Recall()
    auc = tf.keras.metrics.AUC(multi_label=True)

    for x_true, y_true in dataset:
        # verbose=0: predict() is called once per batch, and its default
        # progress bar would otherwise be printed for every batch, burying the
        # metric lines printed at the end.
        yhat = model_name.predict(x_true, verbose=0)
        positives = yhat > threshold

        precision.update_state(y_true, positives)
        recall.update_state(y_true, positives)
        auc.update_state(y_true, yhat)

    print("Precision:", precision.result().numpy())
    print("Recall:", recall.result().numpy())
    print("AUC:", auc.result().numpy())

In [None]:
evaluation_of_model(model_lstm)  # print precision / recall / AUC for the LSTM model

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27

In [None]:
evaluation_of_model(model_gru)  # print precision / recall / AUC for the GRU model

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2

In [None]:
def score_comment(comment, model_name, threshold=0.3):
    """Score one raw comment and summarise which labels exceed the threshold.

    The comment is vectorized with the notebook-level ``vectorizer``; label
    names are taken from ``df.columns[1:]`` (every column after the first of
    the notebook-level frame).

    Args:
        comment: Raw comment text.
        model_name: Model exposing ``predict``; row 0 of its output is read as
            one score per label column.
        threshold: A label counts as flagged when its score is strictly
            greater than this value. Defaults to 0.3, the value that used to
            be hard-coded.

    Returns:
        A string starting with a space, followed by ``"<label>: <True|False>  "``
        for each label. When no label is flagged, ``Safe`` is additionally
        printed.
    """
    input_str = vectorizer(comment)
    res = model_name.predict(np.expand_dims(input_str, 0))
    scores = res[0]

    text = ' '
    # Start from "nothing flagged" and only flip when a label really exceeds
    # the threshold. Previously the flag was set on every iteration (and never
    # initialised), so the "Safe" branch could not run.
    flagged = False
    for idx, cols in enumerate(df.columns[1:]):
        is_hit = scores[idx] > threshold
        text += '{}: {}  '.format(cols, is_hit)
        flagged = flagged or bool(is_hit)

    if not flagged:
        print("\n\n\n Safe")

    return text