In [35]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
from transformers import BertTokenizer, BertConfig,AdamW, BertForSequenceClassification,get_linear_schedule_with_warmup


import pandas as pd
import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
# Import and evaluate each test batch using Matthew's correlation coefficient
from sklearn.metrics import accuracy_score,matthews_corrcoef

from tqdm.notebook import trange, tqdm
from ipywidgets import FloatProgress
import random
import os
import io

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [20]:
data = pd.read_csv('processed_emotions_dataset_2.csv',index_col=0)
# data['preprocessed_text_split'] = data['preprocessed_text'].str.split()
data = data.dropna()

In [21]:
# Function to retrieve top few number of each category
def get_top_data(data, top_n):
    top_data_0 = data[data['label'] == 0].head(top_n)
    top_data_1 = data[data['label'] == 1].head(top_n)
    top_data_2 = data[data['label'] == 2].head(top_n)
    top_data_3 = data[data['label'] == 3].head(top_n)
    top_data_4 = data[data['label'] == 4].head(top_n)
    top_data_5 = data[data['label'] == 5].head(top_n)
    data_equal_size_per_label = pd.concat([top_data_0, top_data_1, top_data_2, top_data_3, top_data_4, top_data_5])
    return data_equal_size_per_label

# Function call to get the top 15000 from each sentiment
data_equal_size_per_label = get_top_data(data, top_n=3000)

# After selecting top few samples of each sentiment
print("After segregating and taking equal number of rows for each sentiment:")
print(data_equal_size_per_label['label'].value_counts())
data_equal_size_per_label.head(10)

After segregating and taking equal number of rows for each sentiment:
label
0    3000
1    3000
2    3000
3    3000
4    3000
5    3000
Name: count, dtype: int64


Unnamed: 0,text,label,preprocessed_text
1,ive enjoyed being able to slouch about relax a...,0,ive enjoy abl slouch relax unwind frankli need...
3,i dont know i feel so lost,0,dont know feel lost
5,i was beginning to feel quite disheartened,0,begin feel quit dishearten
9,i can still lose the weight without feeling de...,0,still lose weight without feel depriv
11,im feeling a little like a damaged tree and th...,0,im feel littl like damag tree root littl wack
18,i feel so stupid that i realise it so late,0,feel stupid realis late
23,i don t feel submissive and for the time being...,0,feel submiss time lost interest bdsm stuff
24,i would imagine this is just one of the reason...,0,would imagin one reason marriag hard theyll se...
25,i feel like a real fan not that i was ever a f...,0,feel like real fan not ever fake fan
31,i needed to show me that i really am integrati...,0,need show realli integr make connect sinc feel...


In [None]:
MAX_LEN = 256 # can prob shorten to like 45
text = data_equal_size_per_label.text.values
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)
input_ids = [tokenizer.encode(sent, add_special_tokens=True,max_length=MAX_LEN,pad_to_max_length=True) for sent in text]
labels = data_equal_size_per_label.label.values

# print("Actual sentence before tokenization: ",text[2])
# print("Encoded Input from dataset: ",input_ids[2])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [23]:
attention_masks = []
## Create a mask of 1 for all input tokens and 0 for all padding tokens
attention_masks = [[float(i>0) for i in seq] for seq in input_ids]
# print(attention_masks[2])

In [24]:
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(input_ids, labels, test_size=0.2, random_state=42)
train_masks,validation_masks,_,_ = train_test_split(attention_masks,input_ids, test_size=0.2, random_state=42)

In [25]:
# convert all our data into torch tensors, required data type for our model
train_inputs = torch.tensor(X_train)
validation_inputs = torch.tensor(X_test)
train_labels = torch.tensor(y_train)
validation_labels = torch.tensor(y_test)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 32

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory
train_data = TensorDataset(train_inputs,train_masks,train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data,sampler=train_sampler,batch_size=batch_size)

validation_data = TensorDataset(validation_inputs,validation_masks,validation_labels)
validation_sampler = RandomSampler(validation_data)
validation_dataloader = DataLoader(validation_data,sampler=validation_sampler,batch_size=batch_size)


In [26]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6).to(device)

# Parameters:
lr = 2e-5
adam_epsilon = 1e-8

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

num_warmup_steps = 0
num_training_steps = len(train_dataloader)*epochs

### In Transformers, optimizer and schedules are splitted and instantiated like this:
optimizer = AdamW(model.parameters(), lr=lr,eps=adam_epsilon,correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
## Store our loss and accuracy for plotting
train_loss_set = []
learning_rate = []

# Gradients gets accumulated by default
# model.zero_grad()

# tnrange is a tqdm wrapper around the normal python range
for _ in trange(1,epochs+1,desc='Epoch'):
  print("<" + "="*22 + F" Epoch {_} "+ "="*22 + ">")
  # Calculate total loss for this epoch
  batch_loss = 0

  for step, batch in enumerate(train_dataloader):
    # Set our model to training mode (as opposed to evaluation mode)
    model.train()
    
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch

    # Forward pass
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    loss = outputs[0]
    print(loss)
    # Backward pass
    loss.backward()
    
    # Clip the norm of the gradients to 1.0
    # Gradient clipping is not in AdamW anymore
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    
    # Update learning rate schedule
    scheduler.step()

    # Clear the previous accumulated gradients
    optimizer.zero_grad()
    
    # Update tracking variables
    batch_loss += loss.item()

  # Calculate the average loss over the training data.
  avg_train_loss = batch_loss / len(train_dataloader)

  #store the current learning rate
  for param_group in optimizer.param_groups:
    print("\n\tCurrent Learning rate: ",param_group['lr'])
    learning_rate.append(param_group['lr'])
    
  train_loss_set.append(avg_train_loss)
  print(F'\n\tAverage Training loss: {avg_train_loss}')
    
  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Tracking variables 
  eval_accuracy,eval_mcc_accuracy,nb_eval_steps = 0, 0, 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    
    # Move logits and labels to CPU
    logits = logits[0].to('cpu').numpy()
    label_ids = b_labels.to('cpu').numpy()

    pred_flat = np.argmax(logits, axis=1).flatten()
    labels_flat = label_ids.flatten()
    
    df_metrics=pd.DataFrame({'Epoch':epochs,'Actual_class':labels_flat,'Predicted_class':pred_flat})
    
    tmp_eval_accuracy = accuracy_score(labels_flat,pred_flat)
    tmp_eval_mcc_accuracy = matthews_corrcoef(labels_flat, pred_flat)
    
    eval_accuracy += tmp_eval_accuracy
    eval_mcc_accuracy += tmp_eval_mcc_accuracy
    nb_eval_steps += 1

  print(F'\n\tValidation Accuracy: {eval_accuracy/nb_eval_steps}')
  print(F'\n\tValidation MCC Accuracy: {eval_mcc_accuracy/nb_eval_steps}')

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

tensor(1.8101, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8706, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7646, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8388, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9293, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7680, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7741, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8355, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7721, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8929, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8365, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7787, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8400, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7830, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8150, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7964, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7517, device='cuda:0', grad_fn=

In [None]:
## Now let's save our model and tokenizer to a directory
model.save_pretrained('bertModel')
tokenizer.save_pretrained('bertModelToken')

# path = 'bertModel_state.pt'
# torch.save(model.state_dict(),path)

In [53]:
# Test model, only do 100 samples due to computational power

model = BertForSequenceClassification.from_pretrained("bertModel", local_files_only=True)
tokenizer = BertTokenizer.from_pretrained("bertModelToken", local_files_only=True)

with torch.no_grad():
      # Forward pass, calculate logit predictions
  logits = model(validation_inputs[0:500], token_type_ids=None, attention_mask=validation_masks[0:500])

  # Move logits and labels to CPU
  logits = logits[0].to('cpu').numpy()
  label_ids = validation_labels[0:500].to('cpu').numpy()

  pred_flat = np.argmax(logits, axis=1).flatten()
  labels_flat = label_ids.flatten()

label_mapping = {
  'sadness': 0,
  'joy': 1,
  'love': 2,
  'anger': 3,
  'fear': 4,
  'surprise': 5
}
print(classification_report(labels_flat, pred_flat, target_names=label_mapping.keys(), digits=len(label_mapping)))

              precision    recall  f1-score   support

     sadness   0.973684  0.936709  0.954839        79
         joy   1.000000  0.880435  0.936416        92
        love   0.922330  1.000000  0.959596        95
       anger   0.945946  0.972222  0.958904        72
        fear   0.945946  0.897436  0.921053        78
    surprise   0.913043  1.000000  0.954545        84

    accuracy                       0.948000       500
   macro avg   0.950158  0.947800  0.947559       500
weighted avg   0.950260  0.948000  0.947618       500

