In [2]:
import os                                   #imports
import tensorflow as tf
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.layers import Dropout,Dense,Embedding,LSTM,TextVectorization,Bidirectional,GRU
from tensorflow.keras.models import Sequential

In [7]:
# Load the raw training data from the CSV file in the working directory.
train_csv_path = "train.csv"
df = pd.read_csv(train_csv_path)

In [35]:
# Bare expression: the notebook renders `df` as this cell's output.
# NOTE(review): this cell's execution count (35) is higher than that of the cells
# below it, so the saved output was presumably produced after they ran — re-run
# top to bottom to confirm what the frame looks like at this point.
df

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...
159566,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [8]:
# Remove the identifier column, then drop exact duplicate rows.
#
# Order matters: presumably `id` is unique per row (TODO confirm against the CSV),
# in which case de-duplicating while it is still present can never remove a
# repeated comment. Dropping it first lets duplicates of (comment_text + labels)
# actually be detected.
#
# Written as a reassignment with errors="ignore" instead of in-place mutation so
# the cell can be re-run without raising KeyError once "id" is already gone.
df = df.drop(columns="id", errors="ignore").drop_duplicates()

In [9]:
# Names of the six binary target columns, in the order the model outputs them.
LABEL_COLUMNS = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Model input: the raw comment strings.
X = df["comment_text"]
# Targets: selected by name rather than by position, so the label matrix does not
# silently change if the column layout of `df` does (e.g. an extra column survives
# the cleaning step). Result is a 2-D array with one column per label.
y = df[LABEL_COLUMNS].values

In [10]:
# Two-stage split with a fixed seed:
#   1) hold out 30 % of the data (the rest is the training set);
#   2) divide that hold-out 60/40 into validation and test.
SPLIT_SEED = 42
x_train, x_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED
)
x_val, x_test, y_val, y_test = train_test_split(
    x_temp, y_temp, test_size=0.4, random_state=SPLIT_SEED
)

In [11]:
# Bare expression so the notebook displays the label matrix built from `df`
# (one row per comment, one column per label).
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [12]:
# Text vectorizer configuration.
# `max_features` is the upper bound on the vocabulary size; it is also read later
# when sizing the embedding layer.
max_features = 100000
vectorizer = TextVectorization(
    output_mode='int',            # emit integer token ids
    output_sequence_length=200,   # every comment becomes a sequence of length 200
    max_tokens=max_features,
)

In [13]:
# Fit the vectorizer's vocabulary on the training comments only; the validation
# and test comments are not passed here.
vectorizer.adapt(x_train.values)

In [14]:
# Turn each split's comments into integer token sequences with the adapted
# vectorizer (order: train, validation, test).
train_vectorized_text, val_vectorized_text, test_vectorized_text = (
    vectorizer(split.values) for split in (x_train, x_val, x_test)
)

In [15]:
# Wrap each (token ids, labels) split in a tf.data input pipeline.
BATCH_SIZE = 64

# Training: cache the tensors, shuffle with a 160000-element buffer,
# then batch and prefetch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_vectorized_text, y_train))
    .cache()
    .shuffle(160000)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation and test: no caching or shuffling, only batching and prefetching.
val_dataset = (
    tf.data.Dataset.from_tensor_slices((val_vectorized_text, y_val))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = (
    tf.data.Dataset.from_tensor_slices((test_vectorized_text, y_test))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [16]:
# Creates an empty Sequential model bound to the module-level name `model`.
# NOTE(review): no later cell visible here reads this variable (build_model
# defines its own local `model`), so this looks like a leftover — confirm
# before removing.
model = Sequential()

In [17]:
def build_model(rnn_layer, rnn_units=64, num_labels=6):
    """Build and compile a bidirectional recurrent multi-label classifier.

    The network is: embedding -> Bidirectional(rnn_layer) -> three dense ReLU
    layers (with dropout after the first) -> one sigmoid unit per label.

    Args:
        rnn_layer: Recurrent layer class such as ``LSTM`` or ``GRU``. It is
            instantiated as ``rnn_layer(rnn_units)`` and wrapped in
            ``Bidirectional``.
        rnn_units: Number of units passed to ``rnn_layer`` (default 64).
        num_labels: Number of sigmoid outputs (default 6).

    Returns:
        A compiled ``Sequential`` model using the Adam optimizer,
        binary cross-entropy loss and a multi-label AUC metric named ``auc``.
    """
    # Reset Keras' global state before building a fresh model. This runs at
    # build time, i.e. before (not after) the model is trained.
    tf.keras.backend.clear_session()
    model = Sequential([
        # Vocabulary of max_features (+1 spare index); id 0 is treated as padding.
        Embedding(max_features + 1, 128, mask_zero=True),
        Bidirectional(rnn_layer(rnn_units)),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(256, activation='relu'),
        Dense(128, activation='relu'),
        # Independent sigmoid per label: a comment may carry several labels at once.
        Dense(num_labels, activation='sigmoid')
    ])
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        # Name the metric explicitly so the training history always exposes
        # 'auc' / 'val_auc', instead of relying on Keras' auto-generated names
        # (which would become 'auc_1', ... if the global state were not reset).
        metrics=[tf.keras.metrics.AUC(multi_label=True, name='auc')]
    )

    return model

In [18]:
# Build the same architecture twice, differing only in the recurrent layer
# class; the LSTM variant is built first, then the GRU variant.
model_lstm, model_gru = (build_model(cell) for cell in (LSTM, GRU))

In [19]:
# Train the GRU model for 2 epochs and report the elapsed time.
# perf_counter() is a monotonic clock meant for measuring intervals; time.time()
# follows the wall clock and can jump if the system time is adjusted mid-run.
start = time.perf_counter()
history_gru = model_gru.fit(train_dataset, epochs=2, validation_data=val_dataset)
print("Time taken for training by GRU:", time.perf_counter() - start)

Epoch 1/2
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 28ms/step - auc: 0.8900 - loss: 0.0958 - val_auc: 0.9633 - val_loss: 0.0502
Epoch 2/2
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 28ms/step - auc: 0.9767 - loss: 0.0397 - val_auc: 0.9615 - val_loss: 0.0488
Time taken for training by GRU: 107.61555433273315


In [20]:
# Train the LSTM model for 2 epochs and report the elapsed time.
# perf_counter() is a monotonic clock meant for measuring intervals; time.time()
# follows the wall clock and can jump if the system time is adjusted mid-run.
start = time.perf_counter()
history_lstm = model_lstm.fit(train_dataset, epochs=2, validation_data=val_dataset)
print("Time taken for training by LSTM:", time.perf_counter() - start)

Epoch 1/2
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 31ms/step - auc: 0.8847 - loss: 0.0984 - val_auc: 0.9661 - val_loss: 0.0489
Epoch 2/2
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 29ms/step - auc: 0.9774 - loss: 0.0406 - val_auc: 0.9622 - val_loss: 0.0489
Time taken for training by LSTM: 108.20164608955383


In [21]:
# Best validation AUC reached over the recorded epochs, per model.
best_gru_val_auc = max(history_gru.history['val_auc'])
best_lstm_val_auc = max(history_lstm.history['val_auc'])

print(f"Max AUC for GRU: {best_gru_val_auc}")
print(f"Max AUC for LSTM: {best_lstm_val_auc}")

Max AUC for GRU: 0.9633195996284485
Max AUC for LSTM: 0.9660558104515076


In [44]:
def predictor(ip_text, model=None, threshold=0.5):
    """Print which toxicity labels a single vectorized comment is flagged with.

    Args:
        ip_text: One already-vectorized comment (a 1-D sequence of token ids).
            A leading batch axis is added before it is passed to the model.
        model: Object with a ``predict`` method returning one score per label
            for each batch row. Defaults to the notebook-level ``model_lstm``.
        threshold: A label is reported when its score is strictly greater
            than this value (default 0.5).

    Returns:
        None. The result is printed: the header line, then either one line per
        flagged label or ``Comment is safe.`` when no label is flagged.
    """
    if model is None:
        # Fall back to the model trained earlier in the notebook.
        model = model_lstm

    # Label names, in the same order as the model's output columns.
    labels = ["toxic", 'severe_toxic', "obscene", "threat", "insult", "identity_hate"]

    # The model expects a batch, so wrap the single comment in a batch of one.
    pred = model.predict(np.expand_dims(ip_text, 0))
    flagged_labels = [name for name, score in zip(labels, pred[0]) if score > threshold]

    print("This comment is: ")
    if flagged_labels:
        for name in flagged_labels:
            print(name)
    else:
        print("Comment is safe.")

In [45]:
# Demo: vectorize one raw comment and print the labels predicted for it.
sample_comment = "I absolutely hate you!"
ip_text = vectorizer(sample_comment)
predictor(ip_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
This comment is: 
toxic


In [46]:
# Demo: vectorize one raw comment and print the labels predicted for it.
sample_comment = "I will kill you!"
ip3_text = vectorizer(sample_comment)
predictor(ip3_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
This comment is: 
toxic
insult


In [47]:
# Demo: vectorize one raw comment and print the labels predicted for it.
sample_comment = "I love you!"
ip2_text = vectorizer(sample_comment)
predictor(ip2_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
This comment is: 
Comment is safe.
