# BERT Implementation for Text Classification in different Noise Levels

### Install dependencies + connect drive for dataset

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
#!pip install transformers

In [None]:
#standard imports
import numpy as np
import pandas as pd

#Pytorch imports
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch import nn
from torch.optim import Adam

#bert imports
from transformers import BertTokenizer
from transformers import BertModel

#for precision, recall, fscore
from sklearn.metrics import precision_recall_fscore_support as prfs

#for train loop
from tqdm import tqdm

#for noise injection (file needs to be uploaded)
from utils import noisify

#set params: seed numpy AND pytorch
#(DataLoader shuffling, dropout and the init of the linear head draw from torch's RNG,
# so seeding numpy alone does not make runs repeatable)
np.random.seed(112)
torch.manual_seed(112)



### Import Dataset

In [None]:
#load the BBC news csv from the mounted drive (the 'category' and 'text' columns are used below)
df = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/Colab_Notebooks/bbc-text.csv")

In [None]:
#label mapping: category name -> integer class id (ids follow the order listed here)
labels = {
    category: class_id
    for class_id, category in enumerate(
        ('business', 'entertainment', 'sport', 'tech', 'politics')
    )
}

### Prepare Data

In [None]:
#init bert tokenizer from the pretrained 'bert-base-cased' checkpoint
#(module-level global: BertDataset.__init__ calls it to tokenize every text)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
#shuffle once with a fixed seed, then split dataset into 80/10/10
#(plain iloc slices give the same three frames as np.split, which on a DataFrame
# goes through the deprecated DataFrame.swapaxes)
df_shuffled = df.sample(frac=1, random_state=42)
train_end = int(.8*len(df))
val_end = int(.9*len(df))

df_train = df_shuffled.iloc[:train_end]
df_val = df_shuffled.iloc[train_end:val_end]
df_test = df_shuffled.iloc[val_end:]

print(len(df_train),len(df_val), len(df_test))

1780 222 223


### Bert Dataset and Bert Model Definition

In [None]:
#custom Pytorch Dataset
class BertDataset(Dataset):
    """Pytorch dataset that pairs tokenized texts (X) with class ids (Y).

    Uses the module-level ``labels`` mapping and ``tokenizer``. When
    ``noise_type`` is not 'clean', the labels served by ``__getitem__`` are
    the noisy ones produced by ``noisify``; the clean labels stay available
    in ``self.labels``.
    """

    def __init__(self, df, noise_rate=0.2, noise_type='clean', random_state=0):
        """Tokenize ``df['text']``, encode ``df['category']`` and optionally inject label noise.

        Args:
            df: frame with a 'category' column (keys of ``labels``) and a 'text' column.
            noise_rate: forwarded to ``noisify``; unused when ``noise_type`` is 'clean'.
            noise_type: 'clean' skips noise injection, anything else is forwarded to ``noisify``.
            random_state: forwarded to ``noisify``.
        """
        #Y: category names -> integer class ids
        self.labels = [labels[category] for category in df['category']]

        #X: one tokenizer output per text, padded/truncated to 512 tokens
        encoded_texts = []
        for text in df['text']:
            encoded_texts.append(
                tokenizer(text,
                          padding='max_length', max_length=512, truncation=True,
                          return_tensors="pt"))
        self.texts = encoded_texts

        #params for the noisify function
        self.noise_type = noise_type
        self.noise_rate = noise_rate
        self.dataset = 'bbc'

        #nothing more to do for clean labels
        if noise_type == 'clean':
            return

        #noisify is handed the clean labels as a numpy array with one label per row
        self.labels = np.asarray([[label] for label in self.labels])

        #keep the noisy labels in a separate attribute so clean and noisy labels coexist
        noisy_rows, self.actual_noise_rate = noisify(
            dataset=self.dataset,
            train_labels=self.labels,
            noise_type=self.noise_type,
            noise_rate=self.noise_rate,
            random_state=random_state)
        self.noisy_labels = [row[0] for row in noisy_rows]

    def __len__(self):
        """Number of samples."""
        return len(self.labels)

    def __get_batch_labels__(self, idx):
        """Return the label at ``idx`` as a numpy array (noisy label unless the dataset is clean)."""
        source = self.labels if self.noise_type == 'clean' else self.noisy_labels
        return np.array(source[idx])

    def __get_batch_texts__(self, idx):
        """Return the tokenizer output stored at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the data pair (X, Y) at ``idx``."""
        return self.__get_batch_texts__(idx), self.__get_batch_labels__(idx)

In [None]:
#custom bert model for pytorch from huggingface
class BertClassifier(nn.Module):
    """Pretrained 'bert-base-cased' encoder followed by dropout and a linear head.

    The head maps the 768-dimensional pooled output to 5 class logits.
    ``forward`` returns raw logits; apply softmax outside the model if
    probabilities are needed.
    """

    def __init__(self, dropout=0.5):
        """Build the encoder, the dropout layer and the linear head.

        Args:
            dropout: drop probability applied to the pooled bert output.
        """
        super().__init__()

        #bert + dropout layer + linear layer
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 5)

    def forward(self, input_ids, attention_mask):
        """Return the class logits for a batch.

        Args:
            input_ids: token ids from the huggingface bert tokenizer.
            attention_mask: attention mask from the same tokenizer.
        """
        #return_dict=False makes bert return a plain tuple; only the pooled output is used
        _sequence_output, pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        return self.linear(self.dropout(pooled))

### Train Definitions

In [None]:
def train(model, train_data, val_data, learning_rate, epochs, noise_rate, noise_type='clean'):
    """Fine-tune ``model`` and print train/validation metrics after every epoch.

    Both frames are wrapped in ``BertDataset`` with the same noise settings,
    so validation labels are noisy as well when ``noise_type`` is not 'clean'.
    Training uses batches of 8, cross-entropy loss and Adam with
    ``weight_decay=1e-1``.

    Args:
        model: classifier called as ``model(input_ids, attention_mask)`` and returning logits.
        train_data: frame with 'text' and 'category' columns used for training.
        val_data: frame with 'text' and 'category' columns used for validation.
        learning_rate: Adam learning rate.
        epochs: number of passes over ``train_data``.
        noise_rate: label-noise rate forwarded to ``BertDataset``.
        noise_type: label-noise type forwarded to ``BertDataset``.

    Returns:
        None. The model is updated in place, moved to the GPU when one is
        available, and left in eval mode after the last validation pass.
    """
    #load data (local names must not shadow this function)
    train_set = BertDataset(train_data, noise_type=noise_type, noise_rate=noise_rate)
    val_set = BertDataset(val_data, noise_type=noise_type, noise_rate=noise_rate)

    train_dataloader = DataLoader(train_set, batch_size=8, shuffle=True)
    val_dataloader = DataLoader(val_set, batch_size=8)

    #use gpu if available; move the model before the optimizer is built on its parameters
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    #set hyperparams
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate, weight_decay=1e-1)

    #START TRAIN LOOP
    for epoch_num in range(epochs):

        #### Training ####
        #dropout must be active while training (validation below switches it off)
        model.train()

        total_acc_train = 0
        total_loss_train = 0.0

        predicted_labels_train = []
        real_labels_train = []

        #batch loop
        for train_input, train_label in tqdm(train_dataloader):

            #move data to the device; each sample was tokenized on its own with
            #return_tensors="pt", so batches carry an extra axis at dim 1 that is dropped here
            train_label = train_label.to(device)
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            #let model predict
            output = model(input_id, mask)

            #compute train loss of batch
            batch_loss = criterion(output, train_label)

            #the criterion returns the batch MEAN, so weight it by the batch size
            #to be able to report a true per-sample average for the epoch
            total_loss_train += batch_loss.item() * train_label.size(0)

            #argmax over logits equals argmax over softmax probabilities
            predicted_label = output.argmax(dim=1)

            #accuracy
            total_acc_train += (predicted_label == train_label).sum().item()

            #other metrics (plain python ints)
            predicted_labels_train.extend(predicted_label.tolist())
            real_labels_train.extend(train_label.tolist())

            #backprop, optimizer step
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        #### Validation ####
        #same as above but without backprop and with dropout disabled
        model.eval()

        total_acc_val = 0
        total_loss_val = 0.0

        predicted_labels_val = []
        real_labels_val = []

        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                #to device
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].squeeze(1).to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                #predict
                output = model(input_id, mask)

                #calc loss (weighted by batch size, see above)
                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item() * val_label.size(0)

                #calc accuracy
                predicted_label = output.argmax(dim=1)
                total_acc_val += (predicted_label == val_label).sum().item()

                predicted_labels_val.extend(predicted_label.tolist())
                real_labels_val.extend(val_label.tolist())

        #### Metrics ####

        #per-sample averages for the epoch
        train_loss = total_loss_train / len(train_set)
        train_acc = total_acc_train / len(train_set)
        train_prec, train_rec, train_f, _ = prfs(real_labels_train, predicted_labels_train, average='weighted')

        val_loss = total_loss_val / len(val_set)
        val_acc = total_acc_val / len(val_set)
        val_prec, val_rec, val_f, _ = prfs(real_labels_val, predicted_labels_val, average='weighted')

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {train_loss: .3f} \
                | Train Accuracy: {train_acc: .3f} \
                | Val Loss: {val_loss: .3f} \
                | Val Accuracy: {val_acc: .3f}')
        #weighted precision / recall / f-score, previously computed but never reported
        print(
            f'    Train P/R/F1: {train_prec: .3f} /{train_rec: .3f} /{train_f: .3f}'
            f' | Val P/R/F1: {val_prec: .3f} /{val_rec: .3f} /{val_f: .3f}')

In [None]:
#evaluate on test data
def evaluate(model, test_data, noise_type, noise_rate):
    """Print accuracy and weighted precision/recall/f-score of ``model`` on ``test_data``.

    The model is switched to eval mode for the pass so dropout does not
    perturb the predictions, and its previous train/eval mode is restored
    afterwards.

    Args:
        model: classifier called as ``model(input_ids, attention_mask)`` and returning logits.
        test_data: frame with 'text' and 'category' columns.
        noise_type: label-noise type forwarded to ``BertDataset`` ('clean' for clean labels).
        noise_rate: label-noise rate forwarded to ``BertDataset``.

    Returns:
        None. Results are printed; the model is moved to the GPU when one is available.
    """
    #init data
    test_set = BertDataset(test_data, noise_type=noise_type, noise_rate=noise_rate)
    test_dataloader = DataLoader(test_set, batch_size=8)

    #use gpu if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    #set params
    total_acc_test = 0

    predicted_labels_test = []
    real_labels_test = []

    #evaluate model with dropout disabled; restore the caller's mode afterwards
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():

            for test_input, test_label in test_dataloader:

                #each sample was tokenized on its own with return_tensors="pt",
                #so batches carry an extra axis at dim 1 that is dropped here
                test_label = test_label.to(device)
                mask = test_input['attention_mask'].squeeze(1).to(device)
                input_id = test_input['input_ids'].squeeze(1).to(device)

                #argmax over logits equals argmax over softmax probabilities
                output = model(input_id, mask)
                predicted_label = output.argmax(dim=1)

                total_acc_test += (predicted_label == test_label).sum().item()

                predicted_labels_test.extend(predicted_label.tolist())
                real_labels_test.extend(test_label.tolist())
    finally:
        model.train(was_training)

    #metrics
    test_acc = total_acc_test / len(test_set)
    #additional metrics
    test_prec, test_rec, test_f, _ = prfs(real_labels_test, predicted_labels_test, average='weighted')

    print(f'Test Accuracy: {test_acc: .3f}')
    #weighted precision / recall / f-score, previously computed but never reported
    print(f'Test P/R/F1: {test_prec: .3f} /{test_rec: .3f} /{test_f: .3f}')

### Training

In [None]:
EPOCHS = 25
LR = 1e-6
NOISE_RATES = [0.2,0.4,0.5]
NOISE_TYPE = 'symmetric'


#baseline: train on the clean dataset
model = BertClassifier()
train(model, df_train, df_val, LR, EPOCHS, 0.0, 'clean')
#evaluate model on clean test data
evaluate(model, df_test, 'clean', 0.0)


#one independent run per noise rate
for NOISE_RATE in NOISE_RATES:
    print('#########################################################################')
    print('NOISE RATE:',NOISE_RATE)
    #restart from the pretrained checkpoint: reusing the already fine-tuned model would
    #carry over what it learned in the previous runs and make the noise levels incomparable
    model = BertClassifier()
    train(model, df_train, df_val, LR, EPOCHS, NOISE_RATE, NOISE_TYPE)
    #evaluate model on clean test data
    evaluate(model, df_test, 'clean', 0.0)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


4 5
1780
Actual noise 0.51
[[0.5   0.125 0.125 0.125 0.125]
 [0.125 0.5   0.125 0.125 0.125]
 [0.125 0.125 0.5   0.125 0.125]
 [0.125 0.125 0.125 0.5   0.125]
 [0.125 0.125 0.125 0.125 0.5  ]]
4 5
222
Actual noise 0.49
[[0.5   0.125 0.125 0.125 0.125]
 [0.125 0.5   0.125 0.125 0.125]
 [0.125 0.125 0.5   0.125 0.125]
 [0.125 0.125 0.125 0.5   0.125]
 [0.125 0.125 0.125 0.125 0.5  ]]
True


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 1 | Train Loss:  0.213                 | Train Accuracy:  0.220                 | Val Loss:  0.212                 | Val Accuracy:  0.239


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 2 | Train Loss:  0.210                 | Train Accuracy:  0.220                 | Val Loss:  0.205                 | Val Accuracy:  0.261


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 3 | Train Loss:  0.208                 | Train Accuracy:  0.220                 | Val Loss:  0.209                 | Val Accuracy:  0.185


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 4 | Train Loss:  0.206                 | Train Accuracy:  0.225                 | Val Loss:  0.204                 | Val Accuracy:  0.239


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 5 | Train Loss:  0.205                 | Train Accuracy:  0.225                 | Val Loss:  0.206                 | Val Accuracy:  0.225


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 6 | Train Loss:  0.202                 | Train Accuracy:  0.244                 | Val Loss:  0.204                 | Val Accuracy:  0.239


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 7 | Train Loss:  0.200                 | Train Accuracy:  0.274                 | Val Loss:  0.203                 | Val Accuracy:  0.284


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 8 | Train Loss:  0.199                 | Train Accuracy:  0.272                 | Val Loss:  0.198                 | Val Accuracy:  0.279


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 9 | Train Loss:  0.196                 | Train Accuracy:  0.306                 | Val Loss:  0.199                 | Val Accuracy:  0.279


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 10 | Train Loss:  0.197                 | Train Accuracy:  0.292                 | Val Loss:  0.195                 | Val Accuracy:  0.293


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 11 | Train Loss:  0.195                 | Train Accuracy:  0.321                 | Val Loss:  0.200                 | Val Accuracy:  0.324


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 12 | Train Loss:  0.194                 | Train Accuracy:  0.320                 | Val Loss:  0.197                 | Val Accuracy:  0.297


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 13 | Train Loss:  0.191                 | Train Accuracy:  0.330                 | Val Loss:  0.194                 | Val Accuracy:  0.338


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 14 | Train Loss:  0.191                 | Train Accuracy:  0.335                 | Val Loss:  0.200                 | Val Accuracy:  0.275


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 15 | Train Loss:  0.188                 | Train Accuracy:  0.356                 | Val Loss:  0.194                 | Val Accuracy:  0.333


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 16 | Train Loss:  0.188                 | Train Accuracy:  0.366                 | Val Loss:  0.196                 | Val Accuracy:  0.329


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 17 | Train Loss:  0.187                 | Train Accuracy:  0.376                 | Val Loss:  0.190                 | Val Accuracy:  0.378


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 18 | Train Loss:  0.185                 | Train Accuracy:  0.385                 | Val Loss:  0.189                 | Val Accuracy:  0.369


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 19 | Train Loss:  0.183                 | Train Accuracy:  0.400                 | Val Loss:  0.188                 | Val Accuracy:  0.392


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 20 | Train Loss:  0.182                 | Train Accuracy:  0.418                 | Val Loss:  0.189                 | Val Accuracy:  0.392


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 21 | Train Loss:  0.180                 | Train Accuracy:  0.417                 | Val Loss:  0.191                 | Val Accuracy:  0.410


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 22 | Train Loss:  0.179                 | Train Accuracy:  0.442                 | Val Loss:  0.188                 | Val Accuracy:  0.432


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 23 | Train Loss:  0.176                 | Train Accuracy:  0.469                 | Val Loss:  0.189                 | Val Accuracy:  0.387


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 24 | Train Loss:  0.175                 | Train Accuracy:  0.452                 | Val Loss:  0.190                 | Val Accuracy:  0.378


100%|██████████| 223/223 [01:37<00:00,  2.28it/s]


Epochs: 25 | Train Loss:  0.172                 | Train Accuracy:  0.469                 | Val Loss:  0.190                 | Val Accuracy:  0.441
