[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/khetansarvesh/NLP/blob/main/unitask_downstream_nlp/Sentence-Level-Classification/BERT_FineTuned_Toxic_Comment_Classification.ipynb)




In [None]:
# Installing the transformers library
!pip install -q transformers

[K     |████████████████████████████████| 2.5MB 8.1MB/s 
[K     |████████████████████████████████| 901kB 49.0MB/s 
[K     |████████████████████████████████| 3.3MB 46.6MB/s 
[?25h

In [None]:
# Importing libraries
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import transformers

# import tokenizer for the model
from transformers import BertTokenizer

# import the model from huggingface
from transformers import BertModel

# Pick the compute device: use the GPU when PyTorch can see one, else the CPU.
from torch import cuda
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# **Dataset**
We are using the [Jigsaw Toxic Comment Dataset](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge). Refer only to the first csv file from the data dump: `train.csv`

### Reading Dataset

In [None]:
df = pd.read_csv("./data/train.csv")

# Every column from the third one onward holds a category flag; collapse them
# row-wise into a single Python list per comment.
label_columns = df.columns[2:]
df['list'] = df[label_columns].values.tolist()

# Keep only the raw text and the combined label list for modelling.
new_df = df[['comment_text', 'list']].copy()

new_df.head()

Unnamed: 0,comment_text,list
0,Explanation\nWhy the edits made under my usern...,"[0, 0, 0, 0, 0, 0]"
1,D'aww! He matches this background colour I'm s...,"[0, 0, 0, 0, 0, 0]"
2,"Hey man, I'm really not trying to edit war. It...","[0, 0, 0, 0, 0, 0]"
3,"""\nMore\nI can't make any real suggestions on ...","[0, 0, 0, 0, 0, 0]"
4,"You, sir, are my hero. Any chance you remember...","[0, 0, 0, 0, 0, 0]"


### Train Test Split

In [None]:
# Draw a reproducible 80% sample for training; whatever is left becomes the test set.
train_rows = new_df.sample(frac=0.8, random_state=200)
test_dataset = new_df.drop(index=train_rows.index).reset_index(drop=True)
train_dataset = train_rows.reset_index(drop=True)

# Report the shape of each split.
for template, frame in (
    ("FULL Dataset: {}", new_df),
    ("TRAIN Dataset: {}", train_dataset),
    ("TEST Dataset: {}", test_dataset),
):
    print(template.format(frame.shape))

### Text2Numeric Conversion

In [None]:
# Load the pretrained tokenizer that matches the `bert-base-uncased` checkpoint.
# Tokenizer reference: https://huggingface.co/transformers/model_doc/bert.html#berttokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

sample_sentence = "Hello, I'm candlelight"
encoded_input = tokenizer(sample_sentence)

# Numeric representation of the sentence (ids, segment ids, attention mask).
print(encoded_input)

# Map the ids back to tokens to see which subwords the sentence was split into.
sample_ids = encoded_input['input_ids']
tokenizer.convert_ids_to_tokens(sample_ids)

{'input_ids': [101, 8667, 117, 146, 112, 182, 170, 1423, 5650, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        df: DataFrame with a ``comment_text`` column (raw text) and a ``list``
            column (one list of label values per row).
        tokenizer: Callable tokenizer used for the text-to-ids conversion.
        max_len: Fixed number of tokens every comment is padded or truncated
            to. Defaults to 200.
    """

    def __init__(self, df, tokenizer, max_len=200):
        self.tokenizer = tokenizer
        self.df = df
        self.max_len = max_len

    def __len__(self):
        # One item per DataFrame row.
        return len(self.df)

    def __getitem__(self, index):
        """Return the tokenized comment and its label vector at position ``index``."""
        # Positional lookup: samplers hand out positions 0..len-1, which may
        # not match the DataFrame's index labels.
        comment_text = str(self.df['comment_text'].iloc[index])
        # Collapse every run of whitespace (newlines, tabs, ...) into one space.
        comment_text = " ".join(comment_text.split())

        # Text => numeric conversion. Padding and truncation are requested
        # explicitly so that every item has exactly `max_len` tokens and the
        # default collate function can stack items into a batch.
        inputs = self.tokenizer(
            comment_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),                  # token ids
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),            # attention mask
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),  # segment ids
            'targets': torch.tensor(self.df['list'].iloc[index], dtype=torch.float),     # label vector
        }

In [None]:
# Wrap each split in a dataset that tokenizes comments on access.
training_set = CustomDataset(train_dataset, tokenizer)
testing_set = CustomDataset(test_dataset, tokenizer)

# Batch both datasets in shuffled order: 8 comments per training batch,
# 4 per test batch, loaded in the main process.
training_loader = DataLoader(dataset=training_set, batch_size=8, shuffle=True, num_workers=0)
testing_loader = DataLoader(dataset=testing_set, batch_size=4, shuffle=True, num_workers=0)

# **Modelling**


In [None]:
# Customized model: a dropout and a dense layer on top of pretrained BERT
# produce the final output of the model.
class BERTClass(torch.nn.Module):
    """Pretrained ``bert-base-uncased`` encoder with a 6-way classification head.

    ``forward`` returns raw logits of shape ``(batch, 6)``; no sigmoid is
    applied inside the model.
    """

    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)    # for regularization
        self.l3 = torch.nn.Linear(768, 6)  # 768-wide BERT vector -> 6 class logits

    def forward(self, ids, mask, token_type_ids):
        """Compute class logits for a batch of tokenized comments.

        Args:
            ids: Token ids.
            mask: Attention mask.
            token_type_ids: Segment ids.

        Returns:
            Tensor of logits produced by the final linear layer.
        """
        encoder_outputs = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # The encoder returns two outputs; the second one is the pooled output.
        # Index it instead of tuple-unpacking: unpacking a dict-like output
        # object yields its keys, whereas integer indexing works for both a
        # plain tuple and an output object.
        pooled_output = encoder_outputs[1]
        dropped = self.l2(pooled_output)
        return self.l3(dropped)

In [None]:
# Build the classifier, then move its parameters onto the selected device.
# The final expression also displays the module structure as the cell output.
model = BERTClass()
model.to(device=device)

# **Training**

In [None]:
# Put the model in training mode (this enables dropout).
model.train()

# Adam over all model parameters with a learning rate of 1e-5.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [None]:
# Binary cross-entropy on raw logits, applied to the 6 model outputs.
# Built once here instead of on every step.
loss_fn = torch.nn.BCEWithLogitsLoss()

# Make sure dropout is active even if this cell is re-run after validation(),
# which switches the model to eval mode.
model.train()

for epoch in range(4):

    # The dataloader passes data to the model one batch at a time.
    for step, data in enumerate(training_loader):

        # inputs
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)

        # actuals
        targets = data['targets'].to(device, dtype=torch.float)

        # predictions (raw logits)
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)

        # printing loss after every 5000 steps
        if step % 5000 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Clear the gradients of the previous step exactly once, then
        # backpropagate and update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Epoch: 0, Loss:  0.8253790140151978
Epoch: 0, Loss:  0.1364113688468933
Epoch: 0, Loss:  0.06799022853374481
Epoch: 0, Loss:  0.022630181163549423


# **Inference**



In [None]:
from sklearn import metrics

In [None]:
def validation(epoch):
    """Run the model over ``testing_loader`` and collect predictions.

    Args:
        epoch: Not used by the function body.

    Returns:
        A ``(outputs, targets)`` pair of nested Python lists: the
        sigmoid-activated model outputs and the matching target vectors,
        in the order the loader produced them.
    """
    model.eval()
    collected_targets, collected_outputs = [], []

    # No gradients are tracked during this pass.
    with torch.no_grad():
        for batch in testing_loader:
            ids, mask, token_type_ids = (
                batch[key].to(device, dtype=torch.long)
                for key in ('ids', 'mask', 'token_type_ids')
            )
            targets = batch['targets'].to(device, dtype=torch.float)

            logits = model(ids, mask, token_type_ids)

            # Bring results back to the host and store them as plain lists.
            collected_targets += targets.cpu().detach().numpy().tolist()
            collected_outputs += torch.sigmoid(logits).cpu().detach().numpy().tolist()

    return collected_outputs, collected_targets

In [None]:
# The model is not updated between evaluation passes, so repeating the pass
# only re-computes the same metrics; evaluate once. validation() ignores its
# argument, so any value works here.
outputs, targets = validation(0)

# Turn per-label probabilities into hard 0/1 decisions at a 0.5 threshold.
outputs = np.array(outputs) >= 0.5

accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.9354828601867519
F1 Score (Micro) = 0.8104458787743897
F1 Score (Macro) = 0.6943681099377335
