# Implementing BERT

## 1. Loading Libraries

In [1]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from sklearn.model_selection import train_test_split




# os.chdir('c:\\Users\\Owner\\Desktop\\Sem4\\Capstone\\Project\\')
# os.getcwd()

## 2. Setting up GPU (in case of GoogleColab)

In [2]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


No GPU available, using the CPU instead.


## 3. Dataset

## 3.1. Training data 

Loading the file that was created in the notebook *Training Data.ipynb*. We have columns *Tweet ID*, *Text* and *label*.

Training data is already pre-processed i.e. minimum cleaned

In [3]:
ann_tweets = pd.read_csv("/Users/nikhiljoshi/Capstone Project/ML Models/minimum_cleaned_balanced training data.csv")
ann_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3831 entries, 0 to 3830
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    3831 non-null   object
 1   label   3831 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 60.0+ KB


In [4]:
ann_tweets.label

0       0
1       0
2       0
3       1
4       0
       ..
3826    2
3827    2
3828    2
3829    2
3830    2
Name: label, Length: 3831, dtype: int64

In [5]:
ann_tweets.Text

0       are we still allowed to quote ancient chinese ...
1        more power to you! this chinese virus thing h...
2       cnbc: who, tedros reiterated that the virus co...
3       "the heightened racism experienced by asian co...
4       coronavirus and nepali in china: kp oli has di...
                              ...                        
3826    sea shepherd suppoers are racist! antiracism s...
3827     no it does not, germans, even fascists rape t...
3828    "nigger?" lifelessons of white kid growing up ...
3829     you do not have the balls to hashtag me as a ...
3830     kevin macdonald: ethnocentrism is normal and ...
Name: Text, Length: 3831, dtype: object

In [6]:
ann_tweets.index.values

array([   0,    1,    2, ..., 3828, 3829, 3830])

In [7]:
ann_tweets.label.values

array([0, 0, 0, ..., 2, 2, 2])

## 3.2. Split training data

We will randomly split the training data into two sets with a ratio of **85:15**. A train set with 85% data and a validation set with 15% data. We will perform hyperparameter tuning using cross-validation on the train set and evaluate the performance on the remaining 15%.

We are also using stratify to split it in a way that preserves the same proportions of instances in each label.

## Fine tuning BERT with minimum cleaned tweets

In [8]:
#train test split
X=ann_tweets.Text.values
y=ann_tweets.label.values
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                   test_size = 0.15,
                                                   random_state = 17,
                                                   stratify = ann_tweets.label.values)

## 3.3. Max Tweet Length

In [9]:
max_len = max([len(sent) for sent in ann_tweets.Text])
print('Max length: ', max_len)

Max length:  291


## 4. Fine tuning BERT with minimum cleaned tweets

### 4.1. Install HuggingFace Library

The transformer library of Hugging Face contains PyTorch implementation of NLP models like BERT.

In [10]:
#!pip install transformers

We are using 'bert-base-uncased' for this project. uncased means it will not make a difference between **english** and **English**.
### 4.2. BERT Tokenizer
Since the training data is already partially preprocessed, we can directly tokenize our text using Berttokenizer provided by the library.

We are using 'bert-base-uncased' for this project. uncased means it will not make a difference between **english** and **English**. 

In [11]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                         do_lower_case = True)

The encode_plus method of BERT tokenizer will:

- split our text into tokens,

- add the special [CLS] and [SEP] tokens, [SEP] token helps the model to understand the end of one input and the start of another input in the same sequence input. [CLS] is a special classification token (CLS stands for classification) what lets BERT know that we are doing a classification problem.

- convert these tokens into indexes of the tokenizer vocabulary,

- pad or truncate sentences to max length, and

- create attention mask.

In [12]:
def tokenize_text(data,MAX_LEN) :
    input_ids = []
    attention_mask = []
    for i in range(len(data)):
        encoded = tokenizer.encode_plus(
            data[i],
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids.append(encoded.get('input_ids'))
        attention_mask.append(encoded.get('attention_mask'))

    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_mask)
    return input_ids, attention_masks

### 4.3. Maximum Token Length

In [13]:
encoded_tweets = [tokenizer.encode(sent, add_special_tokens=True) for sent in ann_tweets.Text]

# Find the maximum length
max_len = max([len(sent) for sent in encoded_tweets])
print('Max length: ', max_len)

Max length:  129


In [14]:
MAX_LEN = max_len

# Print sentence 0 and its encoded token ids
token_ids,masks = tokenize_text(ann_tweets.Text[0],MAX_LEN)
print('Original: ', ann_tweets.Text[0])
print('Token IDs: ', token_ids)



Original:  are we still allowed to quote ancient chinese proverbs, or is that racist? racismisavirus
Token IDs:  tensor([[ 101, 1037,  102,  ...,    0,    0,    0],
        [ 101, 1054,  102,  ...,    0,    0,    0],
        [ 101, 1041,  102,  ...,    0,    0,    0],
        ...,
        [ 101, 1054,  102,  ...,    0,    0,    0],
        [ 101, 1057,  102,  ...,    0,    0,    0],
        [ 101, 1055,  102,  ...,    0,    0,    0]])


In [15]:
# Run function `preprocessing_for_bert` on the train set and the validation set
print('Tokenizing data...')
train_inputs, train_masks = tokenize_text(X_train,MAX_LEN)
val_inputs, val_masks = tokenize_text(X_val,MAX_LEN)

Tokenizing data...


### 4.4. Tensors & DataLoaders

Creating an iterator for each of our dataset i.e train and validation using the Pytorch DataLoader class. 

In [16]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

train_labels = torch.tensor(y_train)
val_labels = torch.tensor(y_val)

# For fine-tuning BERT, batch size of 16 or 32 is recommended.
batch_size = 32


In [17]:
train_labels

tensor([1, 1, 0,  ..., 2, 1, 2])

In [18]:
# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [19]:
temp=len(ann_tweets.label.unique())
label_dict={0:'neutral',2:'negative',1:'positive'}


### 4.5. Set up BERT Pre-trained Model

In [20]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels = temp,
                                                       output_attentions = False,
                                                      output_hidden_states = False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [21]:
model.config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

### 4.6. Setting up Optimizer and scheduler

We are creating the optimizer with the following hyper-parameters:

- Learning rate (Adam): 1e-5,
- Number of epochs: 10
- default epsilon value 1e-8

and AdamW optimizer.

In [22]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                 lr = 1e-5,
                 eps = 1e-8) #2e-5 > 5e-5
                 
epochs = 10

scheduler = get_linear_schedule_with_warmup(optimizer,
                                           num_warmup_steps = 0,
                                           num_training_steps = len(train_dataloader)*epochs)



## 5. Training & Evaluation


### 5.1. Evaluation
To evaluate model, put it in validation mode by using .eval() which deactivates Dropout layer.
- Perform a forward pass to compute logits and loss
- Compute loss and f1-score using val set.


In [23]:
import tqdm

def evaluate(val_dataloader):

    #evaluation mode 
    model.eval()
    
    #tracking variables
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in tqdm(val_dataloader):
        
        #load into GPU
        batch = tuple(b.to(device) for b in batch)
        
        #define inputs
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2]}

        #compute logits
        with torch.no_grad():        
            outputs = model(**inputs)
        
        #compute loss
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        #compute accuracy
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    #compute average loss
    loss_val_avg = loss_val_total/len(val_dataloader) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

### 5.2. F1-Score, Precision & Recall

In [24]:
from sklearn.metrics import f1_score
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis = 1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average = 'macro')


In [25]:
from sklearn.metrics import precision_score
def precision_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis = 1).flatten()
    labels_flat = labels.flatten()
    return precision_score(labels_flat, preds_flat, average = 'macro')


In [26]:
from sklearn.metrics import recall_score
def recall_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis = 1).flatten()
    labels_flat = labels.flatten()
    return recall_score(labels_flat, preds_flat, average = 'macro')


### 5.3. Accuracy 

In [27]:
from sklearn.metrics import accuracy_score
def accuracy_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis = 1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(labels_flat, preds_flat)
    

### 5.4. Setting seed

In [28]:
import random
from tqdm import tqdm as tqdm
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

### 5.5. Training
The model is put into training mode by using .train()
- tqdm to track progress
- to(device) unpacks our data from the DataLoader and load the data onto the GPU.
- zero_grad to set the gradient to 0.
- input will take in 3 values input_id, attention_mask, and labels.
- We will train the model forward to get loss and logits.
- loss.backward() performs backward pass to compute gradients.
- optimizer.step() updates the model’s parameters.
- scheduler.step() updates learning rate.
- evaluate and f1_score_func to evaluate and compute loss and f1-score on the val set

In [29]:

for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0
    
    progress_bar = tqdm(train_dataloader, 
                        desc = 'Epoch {:1d}'.format(epoch), 
                        leave = False, 
                        disable = False)
    
    for batch in progress_bar:
        
        model.zero_grad() #set gradient to 0
    
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids': batch[0], 
                  'attention_mask': batch[1], 
                  'labels': batch[2]}
        
        outputs = model(**inputs) #unpack the dict straight into inputs
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        scheduler.step()
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})
        
#    torch.save(model.state_dict(), 'c:\\Users\\Owner\\Desktop\\Sem4\\Capstone\\Project\\BERT_ft_epoch{0}.pth'.format(epoch))
    
    print('\n Epoch {0}' .format(epoch))
    loss_train_avg = loss_train_total / len(train_dataloader)
    val_loss, predictions, true_vals = evaluate(val_dataloader)
    print('Training loss:{0}' .format(loss_train_avg))
    val_loss, predictions, true_vals = evaluate(val_dataloader)
    val_f1 = f1_score_func(predictions, true_vals)
    val_precision=precision_score_func(predictions,true_vals)
    val_recall=recall_score_func(predictions,true_vals)
    print('Validation loss:{0}'.format(val_loss))
    print('F1 Score (macro):{0}' .format(val_f1))
    print('Precision Score (macro):{0}' .format(val_precision))
    print('Recall Score (macro):{0}' .format(val_recall))
  
   


  0%|          | 0/10 [00:00<?, ?it/s]


 Epoch 1


100%|██████████| 18/18 [00:42<00:00,  2.33s/it]


Training loss:0.9221487127098382


100%|██████████| 18/18 [00:41<00:00,  2.30s/it]
 10%|█         | 1/10 [15:06<2:15:54, 906.05s/it]

Validation loss:0.7439585361215804
F1 Score (macro):0.6699732945817957
Precision Score (macro):0.6873921547983842
Recall Score (macro):0.679074574124079





 Epoch 2


100%|██████████| 18/18 [00:33<00:00,  1.88s/it]


Training loss:0.6133209189363554


100%|██████████| 18/18 [00:37<00:00,  2.08s/it]
 20%|██        | 2/10 [30:14<2:01:01, 907.67s/it]

Validation loss:0.5734858993026946
F1 Score (macro):0.7712670321365973
Precision Score (macro):0.7787870966732755
Recall Score (macro):0.7670379418894271





 Epoch 3


100%|██████████| 18/18 [00:30<00:00,  1.69s/it]


Training loss:0.4507456627838752


100%|██████████| 18/18 [00:31<00:00,  1.73s/it]
 30%|███       | 3/10 [49:43<1:59:49, 1027.05s/it]

Validation loss:0.5388726078801684
F1 Score (macro):0.7704907967006468
Precision Score (macro):0.7914671141670753
Recall Score (macro):0.7684517975607085





 Epoch 4


100%|██████████| 18/18 [00:32<00:00,  1.78s/it]


Training loss:0.340995467468804


100%|██████████| 18/18 [00:34<00:00,  1.89s/it]
 40%|████      | 4/10 [1:08:23<1:46:20, 1063.50s/it]

Validation loss:0.5442996025085449
F1 Score (macro):0.7890056167048768
Precision Score (macro):0.8113727335466466
Recall Score (macro):0.7845230237309445





 Epoch 5


100%|██████████| 18/18 [00:33<00:00,  1.84s/it]


Training loss:0.2613297649020073


100%|██████████| 18/18 [00:33<00:00,  1.84s/it]
 50%|█████     | 5/10 [1:22:39<1:22:23, 988.79s/it] 

Validation loss:0.5479411052332984
F1 Score (macro):0.7996928490384807
Precision Score (macro):0.8010190222336652
Recall Score (macro):0.7991535344010591





 Epoch 6


100%|██████████| 18/18 [00:30<00:00,  1.70s/it]


Training loss:0.19880528569075406


100%|██████████| 18/18 [00:34<00:00,  1.91s/it]
 60%|██████    | 6/10 [1:38:57<1:05:40, 985.00s/it]

Validation loss:0.567880156967375
F1 Score (macro):0.8075541543975678
Precision Score (macro):0.810762987012987
Recall Score (macro):0.8075256097038276





 Epoch 7


100%|██████████| 18/18 [00:32<00:00,  1.81s/it]


Training loss:0.16125697631608038


100%|██████████| 18/18 [00:32<00:00,  1.82s/it]
 70%|███████   | 7/10 [2:00:36<54:22, 1087.59s/it] 

Validation loss:0.5793773002094693
F1 Score (macro):0.8088768849921562
Precision Score (macro):0.8134984897637603
Recall Score (macro):0.8064526452645264





 Epoch 8


100%|██████████| 18/18 [00:55<00:00,  3.06s/it]


Training loss:0.12766081503793306


100%|██████████| 18/18 [00:55<00:00,  3.07s/it]
 80%|████████  | 8/10 [2:19:36<36:48, 1104.26s/it]

Validation loss:0.6124436524179246
F1 Score (macro):0.8101233029891876
Precision Score (macro):0.817986561713293
Recall Score (macro):0.8088211678310687





 Epoch 9


100%|██████████| 18/18 [00:57<00:00,  3.21s/it]


Training loss:0.10991187676714331


100%|██████████| 18/18 [01:04<00:00,  3.58s/it]
 90%|█████████ | 9/10 [2:42:15<19:44, 1184.13s/it]

Validation loss:0.6205306781662835
F1 Score (macro):0.8061233105792692
Precision Score (macro):0.8068528607749803
Recall Score (macro):0.8060421280223261





 Epoch 10


100%|██████████| 18/18 [00:45<00:00,  2.51s/it]


Training loss:0.08892649448677606


100%|██████████| 18/18 [00:45<00:00,  2.53s/it]
100%|██████████| 10/10 [3:01:30<00:00, 1089.02s/it]

Validation loss:0.6277898136112425
F1 Score (macro):0.8080011561318261
Precision Score (macro):0.8096558933403982
Recall Score (macro):0.8076256197048277





### 5.6. Prediction

 We will perform a forward pass to compute logits and apply softmax to calculate probabilities.

In [30]:
import torch.nn.functional as F

def bert_predict(model, test_dataloader):
    """Perform a forward pass on the trained BERT model to predict probabilities
    on the test set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    all_logits = []

    # For each batch in our test set...
    for batch in test_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch)[:2]

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask).logits
        all_logits.append(logits)
    
    
    # Concatenate logits from each batch
    all_logits = torch.cat(all_logits, dim=0)

    # Apply softmax to calculate probabilities
    probs = F.softmax(all_logits, dim=1).cpu().numpy()   
    return probs

In [47]:
from sklearn.metrics import classification_report

def creport(preds, labels):
    preds_flat = np.argmax(preds, axis = 1).flatten()
    labels_flat = labels.flatten()
    print(classification_report(labels_flat, preds_flat))

creport(predictions,true_vals)

              precision    recall  f1-score   support

           0       0.76      0.81      0.78       202
           1       0.88      0.87      0.87       175
           2       0.79      0.74      0.77       198

    accuracy                           0.81       575
   macro avg       0.81      0.81      0.81       575
weighted avg       0.81      0.81      0.81       575



In [30]:
test_data = pd.read_csv('/Users/nikhiljoshi/Documents/GitHub/Project/data/input3.csv')
sample_test = test_data.sample(frac=1.0, replace=False, random_state=17)

In [44]:
sample_test.reset_index(inplace=True)
sample_test.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126275 entries, 0 to 126274
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   level_0       126275 non-null  int64  
 1   index         126275 non-null  int64  
 2   Datetime      126275 non-null  object 
 3   Tweet Id      126275 non-null  float64
 4   Text          126275 non-null  object 
 5   Location      95072 non-null   object 
 6   Retweet       126275 non-null  int64  
 7   Like          126275 non-null  int64  
 8   new_Hashtags  126240 non-null  object 
dtypes: float64(1), int64(4), object(4)
memory usage: 8.7+ MB


In [33]:
encoded_test_tweets = [tokenizer.encode(sent, add_special_tokens=True) for sent in sample_test.Text]

max_len = max([len(sent) for sent in encoded_test_tweets])
print('Max length: ', max_len)


Max length:  167


In [34]:
test_inputs, test_masks = tokenize_text(sample_test.Text,max_len)

# Create the DataLoader for our test set
test_dataset = TensorDataset(test_inputs, test_masks)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=32)

In [35]:
prob=bert_predict(model,test_dataloader)

In [36]:
probs=[]
for x in range(len(test_data)):
    probs.append(prob[x].argmax())


Final=pd.DataFrame(probs)
Final.columns=['Pred']