# Preparation

In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
!pip install transformers
!pip install pytorch-lightning==0.7.5
!pip install nltk
!pip install sentencepiece



In [21]:
import pandas as pd

In [46]:
!ls "/content/drive/MyDrive/nlp/final/results/"

'checkpointepoch=0.ckpt'


In [23]:
train_path = '/content/drive/MyDrive/nlp/final/train.csv'
test_path = '/content/drive/MyDrive/nlp/final/test.csv'
test_labels_path = '/content/drive/MyDrive/nlp/final/test_labels.csv'
subm_path = '/content/drive/MyDrive/nlp/final/sample_submission.csv'

In [24]:
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)
test_labels_data = pd.read_csv(test_labels_path)
subm_path_data = pd.read_csv(subm_path)

#Analysing the dataset

In [25]:
train_data.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [26]:
test_data.head(10)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
5,0001ea8717f6de06,Thank you for understanding. I think very high...
6,00024115d4cbde0f,Please do not add nonsense to Wikipedia. Such ...
7,000247e83dcc1211,:Dear god this site is horrible.
8,00025358d4737918,""" \n Only a fool can believe in such numbers. ..."
9,00026d1092fe71cc,== Double Redirects == \n\n When fixing double...


In [27]:
test_labels_data.head(10)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1
5,0001ea8717f6de06,0,0,0,0,0,0
6,00024115d4cbde0f,-1,-1,-1,-1,-1,-1
7,000247e83dcc1211,0,0,0,0,0,0
8,00025358d4737918,-1,-1,-1,-1,-1,-1
9,00026d1092fe71cc,-1,-1,-1,-1,-1,-1


In [28]:
subm_path_data.head(10)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5
5,0001ea8717f6de06,0.5,0.5,0.5,0.5,0.5,0.5
6,00024115d4cbde0f,0.5,0.5,0.5,0.5,0.5,0.5
7,000247e83dcc1211,0.5,0.5,0.5,0.5,0.5,0.5
8,00025358d4737918,0.5,0.5,0.5,0.5,0.5,0.5
9,00026d1092fe71cc,0.5,0.5,0.5,0.5,0.5,0.5


In [157]:
column_labels = train_data.columns.tolist()[2:]
train_data[column_labels].sum().sort_values()

threat             478
identity_hate     1405
severe_toxic      1595
insult            7877
obscene           8449
toxic            15294
dtype: int64

In [158]:
toxic = train_data[train_data[column_labels].sum(axis=1) > 0]
clean = train_data[train_data[column_labels].sum(axis=1) == 0]
print(toxic.shape)
print(clean.shape)

(16225, 8)
(143346, 8)


# T5 fine tunning

In [30]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [31]:
def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(42)

In [32]:
class T5FineTuner(pl.LightningModule):
  def __init__(self, hparams):
    super(T5FineTuner, self).__init__()
    self.hparams = hparams

    self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
    self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)

  def is_logger(self):
    return self.trainer.proc_rank <= 0

  def configure_optimizers(self):
    model = self.model
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": self.hparams.weight_decay},
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)

    t_total = (
        (len(self.train_dataloader().dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
        // self.hparams.gradient_accumulation_steps
        * float(self.hparams.num_train_epochs)
    )
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total)

    return [optimizer], [scheduler]

  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None
  ):
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        lm_labels=lm_labels,
    )

  def _step(self, batch):
    input_ids = batch["source_ids"]
    attention_mask = batch["source_mask"]
    labels = batch["target_ids"]
    decoder_attention_mask = batch["target_mask"]

    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels,
        decoder_attention_mask=decoder_attention_mask
    )
    loss = outputs[0]

    return loss


  def training_step(self, batch, batch_idx):
    loss = self._step(batch)

    tensorboard_logs = {"train_loss": loss}
    return {"loss": loss, "log": tensorboard_logs}

  def training_epoch_end(self, outputs):
    avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
    tensorboard_logs = {"avg_train_loss": avg_train_loss}
    return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def validation_step(self, batch, batch_idx):
    loss = self._step(batch)
    return {"val_loss": loss}

  def validation_epoch_end(self, outputs):
    avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
    tensorboard_logs = {"val_loss": avg_loss}
    return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def configure_optimizers(self):
    "Prepare optimizer and schedule (linear warmup and decay)"

    model = self.model
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    self.opt = optimizer
    return [optimizer]

  def get_tqdm_dict(self):
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict

  def train_dataloader(self):
    train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
    dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)
    t_total = (
        (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
        // self.hparams.gradient_accumulation_steps
        * float(self.hparams.num_train_epochs)
    )
    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="val", args=self.hparams)
    return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

In [33]:
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
  def on_validation_end(self, trainer, pl_module):
    logger.info("***** Validation results *****")
    if pl_module.is_logger():
      metrics = trainer.callback_metrics
      # Log results
      for key in sorted(metrics):
        if key not in ["log", "progress_bar"]:
          logger.info("{} = {}\n".format(key, str(metrics[key])))

  def on_test_end(self, trainer, pl_module):
    logger.info("***** Test results *****")

    if pl_module.is_logger():
      metrics = trainer.callback_metrics

      # Log and save results to file
      output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
      with open(output_test_results_file, "w") as writer:
        for key in sorted(metrics):
          if key not in ["log", "progress_bar"]:
            logger.info("{} = {}\n".format(key, str(metrics[key])))
            writer.write("{} = {}\n".format(key, str(metrics[key])))

In [34]:
args_dict = dict(
    data_dir='/content/drive/MyDrive/nlp/final/', # path for data files
    output_dir='/content/drive/MyDrive/nlp/final/results', # path to save the checkpoints
    model_name_or_path='t5-base',
    tokenizer_name_or_path='t5-base',
    max_seq_length=256,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=4,
    eval_batch_size=4,
    num_train_epochs=2,
    gradient_accumulation_steps=16,
    n_gpu=1,
    early_stop_callback=False,
    fp_16=True,
    opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    max_grad_norm=0.5, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    seed=42,
)

In [35]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [56]:
from sklearn.model_selection import train_test_split
class ToxicCommentDataset(Dataset):

  def __init__(self, tokenizer, data_file, max_len=256, subset="train", val_split=0.1):
    self.data = pd.read_csv(data_file)
    self.max_len = max_len
    self.tokenizer = tokenizer
    self.subset = subset

    if self.subset == "train" or self.subset == "val":
        train_data, val_data = train_test_split(self.data, test_size=val_split, random_state=42)
        self.data = train_data if self.subset == "train" else val_data

    self.inputs = []
    self.targets = []
    self._build()

  def __len__(self):
    return len(self.inputs)

  def __getitem__(self, index):
    source_ids = self.inputs[index]["input_ids"].squeeze()
    src_mask = self.inputs[index]["attention_mask"].squeeze()

    item = {"source_ids": source_ids, "source_mask": src_mask}


    if hasattr(self, 'targets') and len(self.targets) > index:
        target_ids = self.targets[index]["input_ids"].squeeze()
        target_mask = self.targets[index]["attention_mask"].squeeze()
        item["target_ids"] = target_ids
        item["target_mask"] = target_mask

    return item


  def _build(self):
    if self.subset == "train" or self.subset == "val":
        labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    else:  # For 'test' subset or any subset without labels
        labels = []

    for _, row in self.data.iterrows():
        input_text = "toxicity classification: " + row['comment_text']
        if labels:  # If there are labels to process
            target_text = ' '.join([label for label, value in zip(labels, row[labels].values) if value == 1])
            if not target_text:
                target_text = "not_toxic"
        else:
            target_text = None  # Or a default value for test data without labels

        tokenized_inputs = self.tokenizer(input_text, max_length=self.max_len, padding='max_length', truncation=True, return_tensors="pt")

        if target_text is not None:
            tokenized_targets = self.tokenizer(target_text, max_length=self.max_len, padding='max_length', truncation=True, return_tensors="pt")
            self.targets.append(tokenized_targets)

        self.inputs.append(tokenized_inputs)



In [37]:
dataset = ToxicCommentDataset(tokenizer, train_path, max_len=256, subset = "val")

In [38]:
len(dataset)

15958

In [39]:
data = dataset[8]
print(tokenizer.decode(data['source_ids']))
print(tokenizer.decode(data['target_ids']))

toxicity classification: List of My Three Sons episodes I saw you redirected two of the season episode lists back to the article a month ago, which removed them from both articles, since it was transcluded. I substituted the list back into the main article, and was about to remove the links back to the season lists, but the other seasons still have separate pages. I thought I'd just ask for your thoughts. It might be easier to just undo the substitution and redirects. -XT+</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad

In [40]:
def get_dataset(tokenizer, type_path, args):
    data_file = os.path.join(args.data_dir, "train.csv")
    return ToxicCommentDataset(tokenizer=tokenizer, data_file=data_file, max_len=args.max_seq_length, subset=type_path)

In [42]:
args_dict.update({'num_train_epochs':2})

args = argparse.Namespace(**args_dict)

In [43]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=5
)
train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    early_stop_callback=False,
    precision= 16,
    amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    checkpoint_callback=checkpoint_callback,
    callbacks=[LoggingCallback()],
)



In [44]:
model = T5FineTuner(args)

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [45]:
trainer = pl.Trainer(**train_params)

In [26]:
trainer.fit(model)



Validation sanity check: 0it [00:00, ?it/s]

  self.pid = os.fork()
  self.pid = os.fork()


Training: 0it [00:00, ?it/s]

  self.pid = os.fork()
  self.pid = os.fork()


Validating: 0it [00:00, ?it/s]

1

In [30]:
torch.save(model.state_dict(), 'model_state_dict.pth')

In [47]:
model = T5FineTuner.load_from_checkpoint(checkpoint_path="/content/drive/MyDrive/nlp/final/results/checkpointepoch=0.ckpt")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [48]:
trainer.fit(model)



Validation sanity check: 0it [00:00, ?it/s]

  self.pid = os.fork()


Training: 0it [00:00, ?it/s]

  self.pid = os.fork()


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

1

#Testing and Evaluating

In [127]:
test_dataset = ToxicCommentDataset(tokenizer=model.tokenizer, data_file=test_path, max_len=model.hparams.max_seq_length, subset="test")

In [126]:
len(test_dataset)

153164

In [58]:
test_loader = DataLoader(test_dataset, batch_size=model.hparams.eval_batch_size, shuffle=False, num_workers=0)

In [130]:
model.model.eval()

predictions = []
device = next(model.parameters()).device

In [132]:
from tqdm import tqdm

for batch in tqdm(test_loader):
    # Generate text outputs
    output_sequences = model.model.generate(
        input_ids=batch['source_ids'].to(device),
        attention_mask=batch['source_mask'].to(device),
        max_length=50
    )

    decoded_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in output_sequences]
    predictions.extend(decoded_texts)

100%|██████████| 38291/38291 [1:20:26<00:00,  7.93it/s]


In [133]:
len(predictions)

153164

In [134]:
predictions_df = pd.DataFrame({
    'id': test_data['id'],
    'prediction': predictions
})

In [135]:
print(f"Number of entries in test_data: {len(test_data['id'])}")
print(f"Number of predictions generated: {len(corrected_predictions)}")

Number of entries in test_data: 153164
Number of predictions generated: 153164


In [136]:
processed_preds = {label: [] for label in ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]}

for pred in predictions:
    pred_labels = pred.split()
    for label in processed_preds.keys():
        processed_preds[label].append(1 if label in pred_labels else 0)


preds_df = pd.DataFrame(processed_preds)


preds_df['id'] = test_data['id'].values


In [137]:
len(preds_df)

153164

In [138]:
aligned_df = preds_df.merge(test_labels_data, on='id', suffixes=('_pred', '_true'))

In [145]:
len(aligned_df)

153164

In [147]:
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score, roc_auc_score

labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]


metrics = {label: {'accuracy': None, 'precision': None, 'recall': None, 'f1': None, 'roc_auc': None} for label in labels}

for label in labels:
    valid_indices = aligned_df[f'{label}_true'] != -1
    y_true = aligned_df.loc[valid_indices, f'{label}_true']
    y_pred = aligned_df.loc[valid_indices, f'{label}_pred']

    metrics[label]['accuracy'] = accuracy_score(y_true, y_pred)
    metrics[label]['precision'] = precision_score(y_true, y_pred)
    metrics[label]['recall'] = recall_score(y_true, y_pred)
    metrics[label]['f1'] = f1_score(y_true, y_pred)
    metrics[label]['roc_auc'] = roc_auc_score(y_true, y_pred)



for label, scores in metrics.items():
    print(f"Metrics for {label}:")
    for metric, score in scores.items():
        print(f"  {metric}: {score}")
    print()


Metrics for toxic:
  accuracy: 0.899574853856013
  precision: 0.48551914930405465
  recall: 0.9221674876847291
  f1: 0.6361216514696721
  roc_auc: 0.9096827626372788

Metrics for severe_toxic:
  accuracy: 0.9943105442495858
  precision: 0.5365853658536586
  recall: 0.05994550408719346
  f1: 0.10784313725490197
  roc_auc: 0.5298234068045657

Metrics for obscene:
  accuracy: 0.9399793679077183
  precision: 0.48816896935048437
  recall: 0.8328366296396641
  f1: 0.6155386463756508
  roc_auc: 0.8896878422469723

Metrics for threat:
  accuracy: 0.9961392978836475
  precision: 0.4343065693430657
  recall: 0.5639810426540285
  f1: 0.4907216494845361
  roc_auc: 0.7807751591490852

Metrics for insult:
  accuracy: 0.9347588233455251
  precision: 0.44340051522958024
  recall: 0.8538079953311934
  f1: 0.5836824256931977
  roc_auc: 0.8965741930380927

Metrics for identity_hate:
  accuracy: 0.9889180655850449
  precision: 0.5016685205784205
  recall: 0.6334269662921348
  f1: 0.5599006828057108
  roc_