# Finetuning DistilBERT Model for Toxic Comment Classification

### Importing Libraries
*NOTE: Transformers version>4.20 necessary for use of Trainer, TrainingArguments, and DistilBertTokenizerFast*

In [19]:
%pip install transformers==4.20
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader 
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


### Setting Up For GPU Usage

In [20]:
from torch import cuda
device = torch.device('cuda' if cuda.is_available() else 'cpu')

print(f"Current device: {device}")

Current device: cpu


## Preprocessing and Cleaning Domain Data
*Preprocessing assumes that both csv files are downloaded, unzipped, and saved in data/input/~.csv*

1. Read the csv files into dataframes using Pandas.
2. Drop the ID column from the data.
3. Take all the values of individual categories of toxicity and combine into a new column: 'labels'.
4. Drop all the old columns of individual categories.
5. Change all comment text to lower case.
6. Replace non-breaking spaces with regular spaces and ensure there is only one space between each word.

In [21]:
train_path = 'data/input/train.csv'
test_path = 'data/input/test.csv'

df = pd.read_csv(train_path)

df_test = pd.read_csv(test_path)

print(f"Total Training Records : {len(df)}")

df.drop(['id'], inplace=True, axis=1)
df['labels'] = df.iloc[:, 1:].values.tolist()
df.drop(df.columns.values[1:-1].tolist(), inplace=True, axis=1)

df["comment_text"] = df["comment_text"].str.lower()
df["comment_text"] = df["comment_text"].str.replace("\xa0", " ", regex=False).str.split().str.join(" ")

df.head()

df_train, df_val = train_test_split(df, test_size=0.1)

Total Training Records : 159571


In [26]:
df.head()

Unnamed: 0,comment_text,labels
0,explanation why the edits made under my userna...,"[0, 0, 0, 0, 0, 0]"
1,d'aww! he matches this background colour i'm s...,"[0, 0, 0, 0, 0, 0]"
2,"hey man, i'm really not trying to edit war. it...","[0, 0, 0, 0, 0, 0]"
3,""" more i can't make any real suggestions on im...","[0, 0, 0, 0, 0, 0]"
4,"you, sir, are my hero. any chance you remember...","[0, 0, 0, 0, 0, 0]"


### Training Parameters

In [22]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory for model predictions and checkpoints
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

MAX_LEN = 200

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


## Defining the Model

In [23]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\kyleb/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": 

## Preparing the Dataset and Dataloader

### ToxicCommentsDataset Dataset Class
* This class accepts the tokenizer, dataframe, max length, and evaluation mode and generates tokenized output that will be used by the DistilBERT model.
* The DistilBERT tokenizer will be used to tokenize the data from the "comment_text" dataframe column.
* This class is used to create 2 datasets, one for training and one for validation. We will be doing an 80-20 split of the data for training and validation, respectively.

In [27]:
class ToxicCommentsDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.comment_text
        self.targets = self.data['labels'].values
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(mask, dtype=torch.long),
            'labels': torch.tensor(self.targets[index], dtype=torch.float)
}

### Loading Tokenizer and Generating Training Set

In [28]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)
training_set = ToxicCommentsDataset(df_train, tokenizer, MAX_LEN)
validation_set = ToxicCommentsDataset(df_val, tokenizer, MAX_LEN)

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at C:\Users\kyleb/.cache\huggingface\transformers\0e1bbfda7f63a99bb52e3915dcf10c3c92122b827d92eb2d34ce94ee79ba486c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json from cache at C:\Users\kyleb/.cache\huggingface\transformers\75abb59d7a06f4f640158a9bfcde005264e59e8d566781ab1415b139d2e4c603.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer_config.json from cache at C:\Users\kyleb/.cache\huggingface\transformers\8c8624b8ac8aa99c60c912161f8332de003484428c47906d7ff7eb7f73eecdbb.

### Defining the Trainer

In [29]:
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=training_set,         # training dataset
    eval_dataset=validation_set             # evaluation dataset
)

### Training the Model

In [30]:
trainer.train()

***** Running training *****
  Num examples = 143613
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 53856


  0%|          | 0/53856 [00:00<?, ?it/s]

KeyError: 66199

### Saving the Model

In [None]:
model.save_pretrained("./toxic_comment_model")
tokenizer.save_pretrained("./toxic_comment_model")