### Use pretrained LLMs / transformer models

\
Use Google Colab or Kaggle Notebooks with a GPU-enabled runtime for faster model training.

In [59]:
import pandas as pd
import torch
import re
from pathlib import Path 

In [6]:
# Load data

# Every dataset file lives under one directory, given relative to the notebook.
DATASET_PATH = Path('../data/processed/jigsaw-toxic-comment-classification-challenge/')
TRAIN_PATH = DATASET_PATH.joinpath("train.csv")
TEST_PATH = DATASET_PATH.joinpath("test.csv")
TEST_LABELS = DATASET_PATH.joinpath("test_labels.csv")

# Only the training CSV is read here; the last expression previews it.
df = pd.read_csv(TRAIN_PATH)
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [45]:
# Patterns are compiled once and reused for every comment by `process_text`.

# E-mail addresses such as "name@example.com".
email_pattern = re.compile(r"([a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})")
# Links that start with "http://", "https://" or "www.".
url_pattern = re.compile(r"(http[s]?://\S+|www\.\S+)")
# "@handle" mentions. The lookbehind demands start-of-text or whitespace before
# the "@" WITHOUT consuming it, so a substitution keeps the separating space
# ("hi @bob" -> "hi @user", not "hi@user") and a leading mention is matched too.
username_pattern = re.compile(r"((?<!\S)@[a-zA-Z0-9_-]+)")
# Any run of whitespace characters (spaces, tabs, newlines).
extra_spaces = re.compile(r"\s+")

In [63]:
# Replace usernames and links for placeholders: "@user" and "http"

def process_text(text: str):
    """Mask links, e-mails and mentions in ``text`` and squeeze whitespace.

    Text matched by ``url_pattern`` becomes ``http``, matches of
    ``email_pattern`` become ``<email>``, matches of ``username_pattern``
    become ``@user``, and finally every match of ``extra_spaces`` is
    replaced by a single space.

    Args:
        text: The raw comment string.

    Returns:
        The comment with all four substitutions applied.
    """
    # Applied strictly in this order; whitespace collapsing runs last.
    substitutions = (
        (url_pattern, 'http'),
        (email_pattern, '<email>'),
        (username_pattern, '@user'),
        (extra_spaces, ' '),
    )
    for pattern, placeholder in substitutions:
        text = pattern.sub(placeholder, text)
    return text

In [49]:
# Cleaned comment strings are the model inputs; the `toxic` column is the target.
texts = [process_text(comment) for comment in df['comment_text']]
labels = df['toxic'].to_numpy()

In [10]:
# Split data into training and validation sets

# Imported in this cell because no earlier cell brings `train_test_split`
# into scope; without it the call below raises NameError on a fresh kernel.
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed so the partition is reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

In [24]:
# Device
# Prefer CUDA, then Apple-Silicon MPS, and fall back to the CPU. The MPS check
# goes through getattr so torch builds without `torch.backends.mps` still work.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: mps


In [66]:
# Load the pretrained tokenizer - we will use `DistilBertTokenizer`

from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
)

# Checkpoint used for both the tokenizer and the model. Alternatives to try:
#   - the fine-tuned 'distilbert-base-uncased-finetuned-sst-2-english'
#   - any model based on RoBERTa-base (e.g. 'twitter-roberta-base-sentiment')
checkpoint = 'distilbert-base-uncased'

tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Tokenize and encode texts

def tokenize_data(texts, tokenizer, max_length=128):
    """Run ``tokenizer`` over ``texts`` and return its output.

    Args:
        texts: The texts handed to the tokenizer as its first argument.
        tokenizer: Callable invoked with the texts and the options below.
        max_length: Forwarded to the tokenizer as ``max_length``
            (defaults to 128).

    Returns:
        Whatever ``tokenizer`` returns when called with padding and
        truncation switched on and ``return_tensors='pt'``.
    """
    options = {
        'max_length': max_length,
        'padding': True,
        'truncation': True,
        'return_tensors': 'pt',  # PyTorch tensors
    }
    return tokenizer(texts, **options)

# Encode each split up front, training texts first and validation texts second.
train_encodings, val_encodings = (
    tokenize_data(split_texts, tokenizer) for split_texts in (train_texts, val_texts)
)

In [13]:
# Wrap encodings and labels in a Dataset

from torch.utils.data import Dataset, DataLoader

class ToxicCommentsDataset(Dataset):
    """Dataset pairing tokenizer encodings with their labels."""

    def __init__(self, encodings, labels):
        """Keep references to the encodings mapping and the label sequence."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, taken from the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return entry ``idx`` of every encoding together with its label.

        The label is stored under the ``'labels'`` key as a ``torch.long``
        tensor.
        """
        sample = dict((name, values[idx]) for name, values in self.encodings.items())
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# One dataset per split, each pairing a split's encodings with its labels.
train_dataset = ToxicCommentsDataset(encodings=train_encodings, labels=train_labels)
val_dataset = ToxicCommentsDataset(encodings=val_encodings, labels=val_labels)

In [15]:
# Batch dataset for training with DataLoader

# Training batches are reshuffled every epoch; validation keeps dataset order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

In [27]:
# Model

from transformers import DistilBertForSequenceClassification

# Sequence-classification model loaded from the same checkpoint as the
# tokenizer, configured with two output labels.
model = DistilBertForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)
# Move the model to the selected device; the last expression shows the architecture.
model = model.to(device)
model

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [20]:
# Optimizer

from torch.optim import AdamW

# Fine-tuning a pretrained transformer needs a small step size: the previous
# 1e-3 is large enough to wreck the pretrained weights within a few updates.
# 2e-5 lies in the range commonly recommended for BERT-style models (2e-5 to 5e-5).
optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
# Model training

from tqdm import tqdm

n_epochs = 3

for epoch in range(n_epochs):

    # ---- training pass ----
    model.train()
    train_loss = 0.0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
        # reset gradients accumulated by the previous step
        optimizer.zero_grad()
        # move the batch to the same device as the model
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # named `batch_labels` so the notebook-level `labels` array is not overwritten
        batch_labels = batch['labels'].to(device)
        # forward pass; passing `labels` makes the model compute the loss itself
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        # back-propagate and update the weights
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # average over the number of batches of the loader that was iterated
    print(f"Epoch {epoch + 1} Loss: {train_loss / len(train_dataloader)}")

    # ---- validation pass (no gradient tracking, no weight updates) ----
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            batch_labels = batch['labels'].to(device)
            # obtain the loss the same way as in training, instead of relying on
            # a separate `loss_fn` that is never defined in the notebook
            outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
            val_loss += outputs.loss.item()
    print(f"Validation Loss: {val_loss / len(val_dataloader)}")

In [None]:
# Evaluation

from sklearn.metrics import classification_report

# Collect predictions for the whole validation set, then report per-class metrics.
model.eval()
predictions = []
true_labels = []
with torch.no_grad():
    # iterate the loader actually defined in the notebook (`val_dataloader`)
    for batch in val_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # predicted class = index of the largest logit
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        predictions.extend(preds)
        # labels are only compared on the host, so they are never bound to the
        # notebook-level name `labels` nor moved to the device
        true_labels.extend(batch['labels'].cpu().numpy())

print(classification_report(true_labels, predictions, target_names=['non-toxic', 'toxic']))

Using a pretrained transformer model, we can expect an f1-score of about 0.80 - 0.85.