In [1]:
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from transformers import BertForSequenceClassification
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
import matplotlib.pyplot as plt
from torch import Tensor
from torch.nn import Transformer
import math
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from typing import Iterable, List
from tqdm.auto import tqdm
import csv
from nltk.translate.bleu_score import sentence_bleu

In [2]:
train_path = "/kaggle/input/nlp-a4-data/train_file.json"
val_path = "/kaggle/input/nlp-a4-data/val_file.json"

with open(train_path, "r") as file:
    trainingData = json.load(file)

with open(val_path, "r") as file:
    valData = json.load(file)


In [3]:
print(type(trainingData))
print(len(trainingData))
print(trainingData[0])
print(type(valData))
print(len(valData))
print(valData[0])

<class 'list'>
6740
{'episode': 'utterance_3492', 'speakers': ['Phoebe', 'Eric', 'Phoebe', 'Eric', 'Phoebe'], 'emotions': ['surprise', 'fear', 'surprise', 'sadness', 'disgust'], 'utterances': ['You-you\x85you had sex with Ursula?!', 'Uh, a little bit. She-she-she walked in and I thought she was you and I kissed her and', "You didn't notice she was wearing different clothes?!", 'Well I was just so excited to see you.', "Oh. Ew! Ew! Ew! Ugh! Y'know what? This is too weird."], 'triggers': [1.0, 1.0, 0.0, 0.0, 0.0]}
<class 'list'>
843
{'episode': 'utterance_3421', 'speakers': ['Chandler', 'Joey', 'Chandler', 'Joey', 'Joey', 'Chandler', 'Joey', 'Joey', 'Joey', 'Chandler', 'Joey', 'Chandler', 'Joey', 'Chandler', 'Joey', 'Chandler', 'Joey'], 'emotions': ['anger', 'neutral', 'neutral', 'surprise', 'anger', 'disgust', 'neutral', 'neutral', 'neutral', 'anger', 'fear', 'surprise', 'neutral', 'sadness', 'sadness', 'surprise', 'neutral'], 'utterances': ['Hey! Hold on a minute, hold on a second. Do 

In [4]:
trainingEmotions = []
trainingSpeakers = []
trainingUtterances = []
valEmotions = []
valSpeakers = []
valUtterances = []
for example in trainingData:
  for emotion in example['emotions']:
    trainingEmotions.append(emotion)
  for speaker in example['speakers']:
    trainingSpeakers.append(speaker)
  for utterance in example['utterances']:
    trainingUtterances.append(utterance)

for example in valData:
  for emotion in example['emotions']:
    valEmotions.append(emotion)
  for speaker in example['speakers']:
    valSpeakers.append(speaker)
  for utterance in example['utterances']:
    valUtterances.append(utterance)

# for example in valData:
#   valEmotions.append(example['emotions'])
#   valSpeakers.append(example['speakers'])
#   valUtterances.append(example['utterances'])

print(len(valSpeakers))
print(len(trainingUtterances))
print(trainingEmotions[1])
print(len(trainingEmotions))
print(type(trainingEmotions[58956]))
print(valUtterances[1])

7293
58957
fear
58957
<class 'str'>
I'd really prefer a mountain bike.


In [5]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
from transformers import GPT2Tokenizer, GPT2Model

trainInputs = [utterance for example in trainingData for utterance in example['utterances']]
trainLabels = [emotion for example in trainingData for emotion in example['emotions']]
valInputs = [utterance for example in valData for utterance in example['utterances']]
valLabels = [emotion for example in valData for emotion in example['emotions']]

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

model = GPT2Model.from_pretrained('gpt2')

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class CustomDataset(Dataset):
    def __init__(self, inputs, labels, tokenizer, max_length=128):
        self.inputs = inputs
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        label = self.labels[idx]
        inputs = self.tokenizer.encode_plus(input_text, add_special_tokens=True, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        return inputs, torch.tensor(label)

trainDataset = CustomDataset(trainInputs, trainLabels, tokenizer)
valDataset = CustomDataset(valInputs, valLabels, tokenizer)

trainLoader = DataLoader(trainDataset, batch_size=8, shuffle=True)
valLoader = DataLoader(valDataset, batch_size=8, shuffle=False)

class CustomClassifier(nn.Module):
    def __init__(self, pretrained_model, num_labels):
        super(CustomClassifier, self).__init__()
        self.pretrained_model = pretrained_model
        self.classifier = nn.Linear(self.pretrained_model.config.hidden_size, num_labels)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state.mean(dim=1)  # Using mean pooling over the last hidden state
        logits = self.classifier(pooled_output)
        return logits

model = CustomClassifier(model, num_labels=7).to(DEVICE)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [7]:
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder

trainInputs = tokenizer(trainingUtterances, padding = True, truncation = True, return_tensors = "pt")
tempEncoder = LabelEncoder()
trainLabelsEncoded = tempEncoder.fit_transform(trainingEmotions)
valLabelsEncoded = tempEncoder.fit_transform(valEmotions)
trainLabels = torch.tensor(trainLabelsEncoded)

valInputs = tokenizer(valUtterances, padding = True, truncation = True, return_tensors = "pt")
valLabels = torch.tensor(valLabelsEncoded)

trainDataset = TensorDataset(trainInputs['input_ids'], trainInputs['attention_mask'], trainLabels)
valDataset = TensorDataset(valInputs['input_ids'], valInputs['attention_mask'], valLabels)

trainLoader = DataLoader(trainDataset, batch_size = 8, shuffle = True)
valLoader = DataLoader(valDataset, batch_size = 8, shuffle = True)

In [None]:
from tqdm import tqdm

def train(model, epochCount, lossFn, opt, loader):
    epochLoss = []
    for epoch in range(epochCount):
        model.train()
        running_loss = 0.0
        with tqdm(loader, desc=f"Epoch {epoch+1}/{epochCount}", unit="batch") as tepoch:
            for i, batch in enumerate(tepoch):
                input_ids, attention_mask, labels = batch
                input_ids = input_ids.to(DEVICE)
                attention_mask = attention_mask.to(DEVICE)
                labels = labels.to(DEVICE)

                opt.zero_grad()

                # Forward pass
                logits = model(input_ids=input_ids, attention_mask=attention_mask)

                # Calculate loss
                loss = lossFn(logits, labels)
                running_loss += loss.item()

                # Backward pass
                loss.backward()
                opt.step()

                # Print statistics every 1000 mini-batches
                if i % 1000 == 999:  
                    tepoch.set_postfix(loss=running_loss / 1000)
                    running_loss = 0.0

            # Calculate average loss for the epoch
            epoch_loss = running_loss / len(loader)
            epochLoss.append(epoch_loss)
            tepoch.set_postfix(loss=epoch_loss)

            # Save model checkpoint
            torch.save({'epoch': epoch,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': opt.state_dict(),
                        'loss': epoch_loss}, f'gpt2_classifier_epoch{epoch+1}.pth')

    print('Training finished!')
    return epochLoss

# Define the number of epochs and the optimizer
numEpochs = 5
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
lossFn = torch.nn.CrossEntropyLoss()

# Train the model
trainingLossesList = train(model, numEpochs, lossFn, optimizer, trainLoader)

Epoch 1/5: 100%|██████████| 7370/7370 [12:17<00:00,  9.99batch/s, loss=0.298]
Epoch 2/5: 100%|██████████| 7370/7370 [12:16<00:00, 10.00batch/s, loss=0.16] 
Epoch 3/5: 100%|██████████| 7370/7370 [12:16<00:00, 10.01batch/s, loss=0.127]
Epoch 4/5: 100%|██████████| 7370/7370 [12:16<00:00, 10.00batch/s, loss=0.101]
Epoch 5/5:  96%|█████████▌| 7071/7370 [11:47<00:29,  9.99batch/s, loss=0.0936]

In [None]:
print(trainingLossesList)

In [None]:
import matplotlib.pyplot as plt

# Plot the training loss versus epoch
plt.plot(range(1, numEpochs + 1), trainingLossesList, marker='o', linestyle='-')
plt.title('Training Loss vs Epoch')
plt.xlabel('Epoch')
plt.ylabel('Training Loss')
plt.grid(True)
plt.show()

In [None]:
# finalTrainLoss = [1.6081185510637317, 1.6056980307907716, 1.6046199815972042, 1.6033065124719406, 1.6029525319303877]
finalTrainLoss = trainingLossesList
for i in range(len(finalTrainLoss)):
  finalTrainLoss[i] = finalTrainLoss[i]*len(trainLoader)
  finalTrainLoss[i] = finalTrainLoss[i]/(i+1)

print(finalTrainLoss)

In [None]:
reportTrainLoss = [11851.833721339703, 5916.997243463993, 3942.016421457132, 2954.0922492295504, 2362.7520320653916]

In [None]:
def load_model(model, optim, file_name):
    checkpoint = torch.load(file_name, map_location=torch.device(DEVICE))
    model.load_state_dict(checkpoint["model_state_dict"])
    optim.load_state_dict(checkpoint["optimizer_state_dict"])
    return checkpoint["epoch"]

def computeValLoss(model, loader, optimizer):
    valLoss = []
    for ep in range(5):
        in_mod = '/kaggle/input/nlp-a4/bert_ERC_ep_val'+str(ep+1)+'.pth'  # Corrected the filename
        checkpoint = torch.load(in_mod, map_location=torch.device(DEVICE))
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        model.eval()
        curLoss = 0
        with torch.no_grad():
            for batch in loader:
                input_ids, attention_mask, labels = batch
                input_ids = input_ids.to(DEVICE)
                attention_mask = attention_mask.to(DEVICE)
                labels = labels.to(DEVICE)
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                curLoss += loss.item()

        avgLoss = curLoss / len(loader)
        valLoss.append(avgLoss)
    return valLoss

# Assuming you already have the model, optimizer, and loaders defined
# Load the model and optimizer state from the last checkpoint
epoch_resumed = load_model(model, optimizer, '/kaggle/input/nlp-a4/bert_ERC_ep5.pth')

# Compute validation loss
valLosses = computeValLoss(model, valLoader, optimizer)

# Plot the validation loss
plt.plot(range(1, 6), valLosses, marker='o', linestyle='-')
plt.title('Validation Loss vs Epoch')
plt.xlabel('Epoch')
plt.ylabel('Validation Loss')
plt.grid(True)
plt.show()


In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
finalValLoss = computeValLoss(model, valLoader, optimizer)
print(finalValLoss)

In [None]:
reportValLoss = [1463.82704859972, 730.9797016084194, 486.44409054517746, 365.1842586994171, 291.8854038953781]

In [None]:
model_loc = '/kaggle/input/nlp-a4/bert_ERC_ep4.pth'
load_model(model, torch.optim.Adam(model.parameters(), lr=0.001), model_loc)
# predictions = []
# model.eval()
# with torch.inference_mode():
#   for batch in valLoader:
#     input_ids, attention_mask, labels = batch
#     input_ids = input_ids.to(DEVICE)
#     attention_mask = attention_mask.to(DEVICE)
#     labels = labels.to(DEVICE)
#     outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
#     for output in outputs:
#       predictions.append(output)

# print(predictions)

############### INFERENCE CODE #######################
with open("/content/drive/MyDrive/NLP_A4/MELD_test_efr.json", "r") as file:
  testData = json.load(file)

testEmotions = []
testSpeakers = []
testUtterances = []
testConvoLength = []
for example in testData:
  testConvoLength.append(len(example['emotions']))
  for emotion in example['emotions']:
    testEmotions.append(emotion)
  for speaker in example['speakers']:
    testSpeakers.append(speaker)
  for utterance in example['utterances']:
    testUtterances.append(utterance)

testInputs = tokenizer(testUtterances, padding = True, truncation = True, return_tensors = "pt")
testLabelsEncoded = tempEncoder.fit_transform(testEmotions)
testLabels = torch.tensor(testLabelsEncoded)

testDataset = TensorDataset(testInputs['input_ids'], testInputs['attention_mask'], testLabels)
testLoader = DataLoader(testDataset, batch_size = 8, shuffle = False)




In [None]:
def evaluate(model, loader):
    model.eval()
    predictions = []
    true_labels = []
    for batch in loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(DEVICE)
        attention_mask = attention_mask.to(DEVICE)
        labels = labels.to(DEVICE)
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    return predictions, true_labels

In [None]:

from sklearn.metrics import f1_score
finalPredictions, finalLabels = evaluate(model.to(DEVICE), testLoader)

macro_f1 = f1_score(finalLabels, finalPredictions, average='macro')
print(f'Macro F1 Score: {macro_f1:.4f}')
