[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/khetansarvesh/NLP/blob/main/unitask_downstream_nlp/Sentence-Level-Classification/BERT_FineTuned_Toxic_Comment_Classification.ipynb)




In [1]:
# Installing the transformers library
!pip install -q transformers

In [2]:
# Importing libraries
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import transformers

# import tokenizer for the model
from transformers import BertTokenizer

# import the model from huggingface
from transformers import BertModel

# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

# **Dataset**


In [4]:
!wget https://github.com/khetansarvesh/NLP/blob/main/unitask_downstream_nlp/Sentence-Level-Classification/SST_Dataset.csv

--2025-01-04 22:20:22--  https://github.com/khetansarvesh/NLP/blob/main/unitask_downstream_nlp/Sentence-Level-Classification/SST_Dataset.csv
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘SST_Dataset.csv’

SST_Dataset.csv         [ <=>                ] 223.93K  --.-KB/s    in 0.03s   

2025-01-04 22:20:23 (7.68 MB/s) - ‘SST_Dataset.csv’ saved [229307]



In [6]:
# reading the dataset
df = pd.read_csv("SST_Dataset.csv")
df.dropna(inplace=True)
df

Unnamed: 0,review,label
0,bromwell high is a cartoon comedy . it ran at ...,1.0
1,story of a man who has unnatural feelings for ...,0.0
2,homelessness or houselessness as george carli...,1.0
3,airport starts as a brand new luxury pla...,0.0
4,brilliant over acting by lesley ann warren . ...,1.0
...,...,...
10980,i watched this movie a couple of days ago in a...,1.0
10981,rajinikanth becomes born again after getting a...,0.0
10982,it s not easy making a movie with different...,1.0
10983,movie goers avoid watching this movie . if yo...,0.0


In [8]:
# performing train test split
train_dataset = df.sample(frac=0.8,random_state=200)
test_dataset = df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)
print("Full Dataset: {}".format(df.shape))
print("Train Dataset: {}".format(train_dataset.shape))
print("Test Dataset: {}".format(test_dataset.shape))

Full Dataset: (10985, 2)
Train Dataset: (8788, 2)
Test Dataset: (2197, 2)


### Text2Numeric Conversion

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # - To read further into the tokenizer, [refer to this document](https://huggingface.co/transformers/model_doc/bert.html#berttokenizer)
encoded_input = tokenizer("Hello, I'm candlelight")

# to get the text to numeric representation
print(encoded_input)

#to get what are the subwords it broke the original sentence into use following
tokenizer.convert_ids_to_tokens(encoded_input['input_ids'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

{'input_ids': [101, 7592, 1010, 1045, 1005, 1049, 13541, 7138, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}


['[CLS]', 'hello', ',', 'i', "'", 'm', 'candle', '##light', '[SEP]']

In [14]:
class CustomDataset(Dataset):

    def __init__(self, df, tokenizer):
        self.tokenizer = tokenizer
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):

        comment_text = str(self.df['review'][index]) # extracting the sentence at a particular index
        comment_text = " ".join(comment_text.split()) # splitting the extracted sentence using ' '

        # performing text => numeric conversion using tokenizer
        inputs = self.tokenizer.encode_plus(comment_text,
                                              None,
                                              add_special_tokens=True,
                                              max_length = 200,
                                              pad_to_max_length=True,
                                              return_token_type_ids=True)

        return {'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),                 # numeric representation
                'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),           # masks
                'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
                'targets': torch.tensor(self.df['label'][index], dtype=torch.float)}          # target

In [15]:
# training
training_set = CustomDataset(train_dataset, tokenizer)
training_loader = DataLoader(training_set, batch_size = 8, shuffle = True, num_workers = 0)

# testing
testing_set = CustomDataset(test_dataset, tokenizer)
testing_loader = DataLoader(testing_set, batch_size = 4, shuffle = True, num_workers = 0)

# **Modelling**


In [74]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model.
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Linear(768, 1) # cause 1 class classification and bert output a 768 size vector
        self.sig = torch.nn.Sigmoid()

    def forward(self, ids, mask, token_type_ids):
        batch_size = ids.shape[0]
        out = self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, output_hidden_states=True) #note: there are two outputs from the bert model #output 1 is also called the pooled output
        out = out.hidden_states[-1][:, 0, :] # Taking the first token's embedding from last hidden layer
        out = self.sig(self.l2(out))

        out = out.view(batch_size, -1)
        return out[:, -1]

In [75]:
model = BERTClass()
model.to(device)

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

# **Training**

In [76]:
model.train()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=1e-05)

In [81]:
for epoch in range(4):

    for _,data in enumerate(training_loader, 0):

        # predictions
        outputs = model(data['ids'].to(device, dtype = torch.long),
                        data['mask'].to(device, dtype = torch.long),
                        data['token_type_ids'].to(device, dtype = torch.long))

        # actuals
        targets = data['targets'].to(device, dtype = torch.float)

        optimizer.zero_grad()
        loss = torch.nn.BCELoss()(outputs, targets)

        # printing loss after every 100 steps
        if _%100==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()



Epoch: 0, Loss:  0.24486786127090454
Epoch: 0, Loss:  0.18887081742286682
Epoch: 0, Loss:  0.04634963721036911
Epoch: 0, Loss:  0.27185314893722534
Epoch: 0, Loss:  0.23766538500785828
Epoch: 0, Loss:  0.36826038360595703
Epoch: 0, Loss:  0.16217420995235443
Epoch: 0, Loss:  0.07892346382141113
Epoch: 0, Loss:  0.0802040696144104
Epoch: 0, Loss:  0.05158868432044983
Epoch: 0, Loss:  0.19988760352134705
Epoch: 1, Loss:  0.4631763994693756
Epoch: 1, Loss:  0.37885037064552307
Epoch: 1, Loss:  0.06528477370738983
Epoch: 1, Loss:  0.026637200266122818
Epoch: 1, Loss:  0.09496569633483887
Epoch: 1, Loss:  0.30154386162757874
Epoch: 1, Loss:  0.07963477075099945
Epoch: 1, Loss:  0.023516539484262466
Epoch: 1, Loss:  0.18341918289661407
Epoch: 1, Loss:  0.14741434156894684
Epoch: 1, Loss:  0.014229665510356426
Epoch: 2, Loss:  0.04519607126712799
Epoch: 2, Loss:  0.003545056562870741
Epoch: 2, Loss:  0.10749687254428864
Epoch: 2, Loss:  0.002310708397999406
Epoch: 2, Loss:  0.0909830108284950

# **Inference**



In [82]:
from sklearn import metrics

In [83]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [84]:
for epoch in range(4):
    outputs, targets = validation(epoch)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.5029585798816568
F1 Score (Micro) = 0.5029585798816568
F1 Score (Macro) = 0.3346456692913386




Accuracy Score = 0.5029585798816568
F1 Score (Micro) = 0.5029585798816568
F1 Score (Macro) = 0.3346456692913386




Accuracy Score = 0.5029585798816568
F1 Score (Micro) = 0.5029585798816568
F1 Score (Macro) = 0.3346456692913386




Accuracy Score = 0.5029585798816568
F1 Score (Micro) = 0.5029585798816568
F1 Score (Macro) = 0.3346456692913386
