In [1]:
from google.colab import files
files.upload()
import os

# Create kaggle folder
os.makedirs('/root/.kaggle', exist_ok=True)

# Move kaggle.json to the folder
!mv kaggle.json /root/.kaggle/

# Set permissions (very important)
!chmod 600 /root/.kaggle/kaggle.json
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge
!unzip jigsaw-toxic-comment-classification-challenge.zip # unzipping the dataset's parent folder
!unzip train.csv.zip #unzipping traning data
!unzip test.csv.zip #unzipping test data and its labels
!unzip test_labels.csv.zip
!pip install gensim

Saving kaggle.json to kaggle.json
Downloading jigsaw-toxic-comment-classification-challenge.zip to /content
  0% 0.00/52.6M [00:00<?, ?B/s]
100% 52.6M/52.6M [00:00<00:00, 1.67GB/s]
Archive:  jigsaw-toxic-comment-classification-challenge.zip
  inflating: sample_submission.csv.zip  
  inflating: test.csv.zip            
  inflating: test_labels.csv.zip     
  inflating: train.csv.zip           
Archive:  train.csv.zip
  inflating: train.csv               
Archive:  test.csv.zip
  inflating: test.csv                
Archive:  test_labels.csv.zip
  inflating: test_labels.csv         
Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.

In [15]:
import os                                   #imports
import tensorflow as tf
import time
import pandas as pd
import gensim.downloader as api
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow import keras
from sklearn.metrics import f1_score
from tensorflow.keras.metrics import Precision,Recall,CategoricalAccuracy
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping
from tensorflow.keras.layers import Dropout,Dense,Embedding,LSTM,TextVectorization,Bidirectional,GRU,Convolution1D,GlobalMaxPooling1D
from tensorflow.keras.models import Sequential

In [3]:
# Load the training CSV from the current working directory.
train_csv_path = "train.csv"
df = pd.read_csv(train_csv_path)

In [4]:
# Display the loaded frame as the cell output to eyeball the columns and row count.
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [5]:
# Reassign instead of mutating in place so this cell can be re-run safely:
# on a second run the `id` column is already gone and errors="ignore" skips it
# rather than raising KeyError.
# NOTE(review): duplicates are detected while `id` is still a column, so two
# rows only count as duplicates if their ids match as well - confirm intent.
df = df.drop_duplicates().drop(columns="id", errors="ignore")

In [6]:
# Model input: the comment text column.
X = df.loc[:, "comment_text"]
# Targets: every column after the first one, as a NumPy array.
y = df.iloc[:, 1:].to_numpy()

In [16]:
# First cut: 70% of the rows for training, 30% held out.
x_train, x_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
# Second cut: the held-out part is divided 60/40 into validation and test.
x_val, x_test, y_val, y_test = train_test_split(
    x_temp,
    y_temp,
    test_size=0.4,
    random_state=42,
)

In [17]:
# Display the label array built from the columns after `comment_text`.
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [18]:
# Upper bound on the vocabulary size kept by the vectorizer.
max_features = 100000
# Maps each comment to a fixed-length (200) sequence of integer token ids.
vectorizer = TextVectorization(
    output_mode='int',
    max_tokens=max_features,
    output_sequence_length=200,
)

In [19]:
# Learn the vocabulary from the training comments only.
training_comments = x_train.values
vectorizer.adapt(training_comments)

In [20]:
# Convert the comments of each split into integer token sequences.
train_vectorized_text, val_vectorized_text, test_vectorized_text = (
    vectorizer(split.values) for split in (x_train, x_val, x_test)
)

In [21]:
# Training pipeline: cache, shuffle with a buffer as large as the training
# set, batch by 64, then prefetch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_vectorized_text, y_train))
    .cache()
    .shuffle(len(x_train))
    .batch(64)
    .prefetch(tf.data.AUTOTUNE)
)

# Evaluation pipelines keep their order: batch by 64 and prefetch only.
val_dataset = (
    tf.data.Dataset.from_tensor_slices((val_vectorized_text, y_val))
    .batch(64)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = (
    tf.data.Dataset.from_tensor_slices((test_vectorized_text, y_test))
    .batch(64)
    .prefetch(tf.data.AUTOTUNE)
)

In [22]:
# Pretrained GloVe vectors, used to initialise the embedding layer.
glove = api.load("glove-wiki-gigaword-100")
vocab = vectorizer.get_vocabulary()
embedding_dim = glove.vector_size

# One row per vocabulary entry; entries GloVe does not contain keep an
# all-zero row.
embedding_matrix = np.zeros((len(vocab), embedding_dim))
for row, token in enumerate(vocab):
  if token in glove:
    embedding_matrix[row] = glove[token]



In [23]:
# Count the vocabulary entries that have a pretrained GloVe vector.
hits = sum(1 for token in vocab if token in glove)

print("Coverage:", hits / len(vocab))
# Coverage is only about 58%, so the embedding layer is kept trainable
# (trainable=True) during training.

Coverage: 0.58313


In [24]:
# Per-label positive weight: total_samples / (2 * number of positives).
# NOTE(review): the counts come from all of `y`, not just `y_train` - confirm
# that using the validation/test rows here is acceptable.
total_samples = len(y)
label_count = y.sum(axis=0)
pos_weights = total_samples / (label_count * 2)

print("Positive class weights per label:")
for label_name, label_weight in zip(df.columns[1:], pos_weights):
  print(label_name, ":", label_weight)

Positive class weights per label:
toxic : 5.216784359879691
severe_toxic : 50.02225705329153
obscene : 9.443188543022844
threat : 166.9152719665272
insult : 10.128919639456646
identity_hate : 56.78683274021353


In [42]:
def weighted_bce(pos_weights):
  """Build a binary cross-entropy loss whose positive term is re-weighted.

  Args:
    pos_weights: Per-label multipliers applied to the positive
      (`y_true * log(y_pred)`) term. Converted to a float32 constant and
      broadcast against the targets.

  Returns:
    A `loss(y_true, y_pred)` callable that returns the scalar mean of the
    weighted cross-entropy over every element of the batch.
  """
  pos_weights_tensor = tf.constant(pos_weights, dtype=tf.float32)

  def loss(y_true, y_pred):
    eps = 1e-7
    # Cast both inputs to float32 explicitly: labels can arrive as integers
    # when the loss is called directly, and mixing integer and float tensors
    # in the arithmetic below is an error.
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)
    # Keep predictions away from 0 and 1 so the logs stay finite.
    y_pred = tf.clip_by_value(y_pred, eps, 1 - eps)
    positive_term = pos_weights_tensor * y_true * tf.math.log(y_pred)
    negative_term = (1 - y_true) * tf.math.log(1 - y_pred)
    # Average over all labels and samples; this scalar is what backprop uses.
    return tf.reduce_mean(-(positive_term + negative_term))

  return loss

In [26]:
def build_model(layer = None,cnn = False):
  """Build and compile one of the multi-label comment classifiers.

  Args:
    layer: Recurrent layer class (e.g. `LSTM` or `GRU`). It is called as
      `layer(64)` and wrapped in `Bidirectional`. Required unless `cnn` is
      truthy.
    cnn: When truthy, build the 1D-convolutional variant and ignore `layer`.

  Returns:
    A compiled `Sequential` model with 6 sigmoid outputs, trained with
    `weighted_bce(pos_weights)` and tracking AUC plus precision/recall at a
    0.3 threshold.

  Raises:
    TypeError: If the recurrent variant is requested but `layer` is not
      callable.
  """
  # Fail early with a clear message instead of "'NoneType' is not callable"
  # from the middle of model construction.
  if not cnn and not callable(layer):
    raise TypeError(
        "build_model needs a recurrent layer class (e.g. LSTM or GRU) when cnn is False"
    )

  # Reset the global state Keras has accumulated from previously built models.
  tf.keras.backend.clear_session()

  # Both variants start from the same GloVe-initialised, trainable embedding.
  embedding_kwargs = dict(
      input_dim = len(vocab),
      output_dim = embedding_dim,
      weights = [embedding_matrix],
      trainable = True,
  )

  if cnn:
    feature_layers = [
        Embedding(**embedding_kwargs),
        Convolution1D(kernel_size = 5,activation = 'relu',filters=64),
        Dropout(0.2),
        Convolution1D(kernel_size = 5,activation = 'relu',filters=128),
        GlobalMaxPooling1D(),
    ]
  else:
    feature_layers = [
        # mask_zero=True is only set for the recurrent variant.
        Embedding(mask_zero = True,**embedding_kwargs),
        Bidirectional(layer(64)),
    ]

  # Classification head shared by both variants.
  classifier_head = [
      Dense(128, activation='relu'),
      Dropout(0.2),
      Dense(128, activation='relu'),
      Dense(6, activation='sigmoid'),
  ]

  model = Sequential(feature_layers + classifier_head)
  model.compile(
      optimizer='adam',
      loss=weighted_bce(pos_weights),
      metrics=[tf.keras.metrics.AUC(name = "auc",multi_label=True),
                tf.keras.metrics.Precision(name = "precision",thresholds=0.3),
                tf.keras.metrics.Recall(name = "recall",thresholds=0.3)]
  )
  return model


In [27]:
# One model per architecture under comparison.
model_lstm = build_model(layer=LSTM)
model_gru = build_model(layer=GRU)
cnn_model = build_model(layer=None, cnn=True)

In [28]:
# Stop when validation AUC has not improved for 2 epochs and restore the
# best weights seen.
early_stopping = EarlyStopping(
    monitor="val_auc",
    mode="max",
    patience=2,
    restore_best_weights=True,
)

# Write the model to best_model.keras whenever validation AUC improves.
best_checkpoint = ModelCheckpoint(
    "best_model.keras",
    monitor="val_auc",
    mode="max",
    save_best_only=True,
)

# NOTE(review): this single list is passed to the GRU, LSTM and CNN fit()
# calls, so all three share one checkpoint callback and one output file -
# confirm that is intended.
callbacks = [early_stopping, best_checkpoint]

In [29]:
# Train the bidirectional GRU model and report the wall-clock training time.
train_start = time.time()
history_gru = model_gru.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,
    callbacks=callbacks,
)
print("Time taken for training by GRU:", time.time() - train_start)

Epoch 1/10
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 28ms/step - auc: 0.9243 - loss: 0.4410 - precision: 0.1737 - recall: 0.8947 - val_auc: 0.9804 - val_loss: 0.2619 - val_precision: 0.3204 - val_recall: 0.9319
Epoch 2/10
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 28ms/step - auc: 0.9872 - loss: 0.1862 - precision: 0.3629 - recall: 0.9600 - val_auc: 0.9812 - val_loss: 0.2453 - val_precision: 0.3902 - val_recall: 0.9355
Epoch 3/10
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 27ms/step - auc: 0.9918 - loss: 0.1256 - precision: 0.4648 - recall: 0.9793 - val_auc: 0.9686 - val_loss: 0.3888 - val_precision: 0.4907 - val_recall: 0.8945
Epoch 4/10
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 27ms/step - auc: 0.9943 - loss: 0.0999 - precision: 0.5296 - recall: 0.9864 - val_auc: 0.9658 - val_loss: 0.4011 - val_precision: 0.4350 - val_recall: 0.9057
Time taken for training by GRU: 201.872797489166

In [30]:
# Train the bidirectional LSTM model and report the wall-clock training time.
train_start = time.time()
history_lstm = model_lstm.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,
    callbacks=callbacks,
)
print("Time taken for training by LSTM:", time.time() - train_start)

Epoch 1/10
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 29ms/step - auc: 0.9251 - loss: 0.4390 - precision: 0.1794 - recall: 0.8765 - val_auc: 0.9808 - val_loss: 0.2398 - val_precision: 0.2972 - val_recall: 0.9492
Epoch 2/10
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 28ms/step - auc: 0.9861 - loss: 0.1902 - precision: 0.3540 - recall: 0.9619 - val_auc: 0.9790 - val_loss: 0.2481 - val_precision: 0.3457 - val_recall: 0.9355
Epoch 3/10
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 28ms/step - auc: 0.9916 - loss: 0.1285 - precision: 0.4496 - recall: 0.9780 - val_auc: 0.9739 - val_loss: 0.3336 - val_precision: 0.4155 - val_recall: 0.9333
Time taken for training by LSTM: 152.35443234443665


In [31]:
# Train the convolutional model and report the wall-clock training time.
train_start = time.time()
history_cnn = cnn_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,
    callbacks=callbacks,
)
print("Time taken for training by CNN:", time.time() - train_start)

Epoch 1/10
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 11ms/step - auc: 0.9114 - loss: 0.4759 - precision: 0.1582 - recall: 0.8609 - val_auc: 0.9765 - val_loss: 0.2740 - val_precision: 0.2501 - val_recall: 0.9273
Epoch 2/10
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - auc: 0.9801 - loss: 0.2337 - precision: 0.3073 - recall: 0.9457 - val_auc: 0.9774 - val_loss: 0.2575 - val_precision: 0.2925 - val_recall: 0.9547
Epoch 3/10
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - auc: 0.9858 - loss: 0.1874 - precision: 0.3536 - recall: 0.9632 - val_auc: 0.9747 - val_loss: 0.2983 - val_precision: 0.1972 - val_recall: 0.9717
Epoch 4/10
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - auc: 0.9870 - loss: 0.1729 - precision: 0.3731 - recall: 0.9693 - val_auc: 0.9673 - val_loss: 0.3924 - val_precision: 0.4148 - val_recall: 0.9061
Time taken for training by CNN: 69.81211161613464


In [32]:
# Highest validation AUC each architecture reached across its epochs.
for arch_name, arch_history in (
    ("GRU", history_gru),
    ("LSTM", history_lstm),
    ("CNN", history_cnn),
):
  best_val_auc = max(arch_history.history['val_auc'])
  print(f"Max AUC for {arch_name}: {best_val_auc}")

Max AUC for GRU: 0.9811680912971497
Max AUC for LSTM: 0.9808200001716614
Max AUC for CNN: 0.9773985743522644


In [33]:
# Highest validation precision per architecture. The maximum is taken over
# epochs for this metric alone, so it need not come from the best-AUC epoch.
for arch_name, arch_history in (
    ("GRU", history_gru),
    ("LSTM", history_lstm),
    ("CNN", history_cnn),
):
  best_val_precision = max(arch_history.history['val_precision'])
  print(f"Max Precision for {arch_name}: {best_val_precision}")

Max Precision for GRU: 0.4906778037548065
Max Precision for LSTM: 0.4154999256134033
Max Precision for CNN: 0.4147884249687195


In [34]:
# Highest validation recall per architecture. The maximum is taken over
# epochs for this metric alone, so it need not come from the best-AUC epoch.
for arch_name, arch_history in (
    ("GRU", history_gru),
    ("LSTM", history_lstm),
    ("CNN", history_cnn),
):
  best_val_recall = max(arch_history.history['val_recall'])
  print(f"Max Recall for {arch_name}: {best_val_recall}")

Max Recall for GRU: 0.9355146288871765
Max Recall for LSTM: 0.9491741061210632
Max Recall for CNN: 0.9717280864715576


In [35]:
def predictor(ip_text,model_name,threshold = 0.3):
  """Print the toxicity labels a model assigns to a single comment.

  Args:
    ip_text: The raw comment string to classify.
    model_name: Callable model; invoked as `model_name(batch, training=False)`
      on the vectorized comment and expected to return one score per label.
    threshold: A label is reported when its score is strictly greater than
      this value. Defaults to 0.3.

  Returns:
    None. Prints the header "This comment is: " followed by one line per
    flagged label, or "safe" when no label is flagged.
  """
  labels = ["toxic",'severe_toxic',"obscene","threat","insult","identity_hate"]
  # The batch holds a single comment, so only the first row of scores is used.
  scores = model_name(vectorizer([ip_text]),training = False).numpy()[0]
  flagged_labels = [name for name,score in zip(labels,scores) if score > threshold]
  print("This comment is: ")
  for name in flagged_labels:
    print(name)
  if not flagged_labels:
    print("safe")

In [40]:
def macro_f1(model_name,threshold = 0.3,label = None):
  """Compute the macro-averaged F1 score of a model on `test_dataset`.

  Args:
    model_name: Callable model; invoked as `model_name(x_batch, training=False)`
      on every batch of the module-level `test_dataset`.
    threshold: Scores strictly greater than this value count as positive
      predictions. Defaults to 0.3.
    label: Optional display name used in the returned text. When omitted the
      model object itself is formatted into the string.

  Returns:
    A string of the form "Macro F1 of <name>: <score>".
  """
  batch_preds = []
  batch_true = []
  for x_batch,y_batch in test_dataset:
    batch_preds.append(model_name(x_batch,training = False).numpy())
    batch_true.append(y_batch.numpy())
  # Stack the per-batch arrays into one array for predictions and one for labels.
  all_preds = np.vstack(batch_preds)
  all_true = np.vstack(batch_true)
  preds_binary = (all_preds > threshold).astype(int)
  score = f1_score(all_true,preds_binary,average = 'macro')
  shown_name = model_name if label is None else label
  return f"Macro F1 of {shown_name}: {score}"

In [41]:
# Test-set macro F1 for each trained model, in LSTM / GRU / CNN order.
for trained_model in (model_lstm, model_gru, cnn_model):
  print(macro_f1(trained_model))

Macro F1 of <Sequential name=sequential, built=True>: 0.36947522358502755
Macro F1 of <Sequential name=sequential, built=True>: 0.43937298234365413
Macro F1 of <Sequential name=sequential, built=True>: 0.3596000148860969


In [38]:
def score_comment(comment,model_name,threshold = 0.3):
  """Summarise, label by label, whether a comment is flagged by a model.

  Args:
    comment: The raw comment string to classify.
    model_name: Callable model; invoked as `model_name(batch, training=False)`
      on the vectorized comment.
    threshold: A label is flagged when its score is strictly greater than this
      value. Defaults to 0.3.

  Returns:
    A string starting with one space, followed by "<label>: <True|False>  "
    for every label column in `df.columns[1:]`. When no label is flagged,
    "Safe" is also printed (preceded by three newlines and a space).
  """
  # The batch holds a single comment, so only the first row of scores is used.
  scores = model_name(vectorizer([comment]),training = False).numpy()[0]
  text = ' '
  flagged = False
  for idx,cols in enumerate(df.columns[1:]):
    is_toxic = bool(scores[idx] > threshold)
    text += '{}: {}  '.format(cols,is_toxic)
    flagged = flagged or is_toxic
  if not flagged:
    print("\n\n\n Safe")
  return text