In [22]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
train_path = "/content/drive/MyDrive/NLP_A4/train_file.json"
val_path = "/content/drive/MyDrive/NLP_A4/val_file.json"

In [24]:
import json

def read_json_file(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

train_data = read_json_file(train_path)
val_data = read_json_file(val_path)

for i in range(5):
    print(train_data[i])

{'episode': 'utterance_3492', 'speakers': ['Phoebe', 'Eric', 'Phoebe', 'Eric', 'Phoebe'], 'emotions': ['surprise', 'fear', 'surprise', 'sadness', 'disgust'], 'utterances': ['You-you\x85you had sex with Ursula?!', 'Uh, a little bit. She-she-she walked in and I thought she was you and I kissed her and', "You didn't notice she was wearing different clothes?!", 'Well I was just so excited to see you.', "Oh. Ew! Ew! Ew! Ugh! Y'know what? This is too weird."], 'triggers': [1.0, 1.0, 0.0, 0.0, 0.0]}
{'episode': 'utterance_3952', 'speakers': ['Monica', 'Monica', 'Phoebe', 'Joey', 'Joey', 'Joey', 'Rachel', 'Joey', 'Rachel', 'Rachel', 'Rachel', 'Rachel', 'Rachel', 'Joey', 'Monica'], 'emotions': ['disgust', 'disgust', 'anger', 'sadness', 'surprise', 'anger', 'neutral', 'anger', 'anger', 'anger', 'anger', 'fear', 'neutral', 'joy', 'anger'], 'utterances': ["Dad, please don't pick your teeth out here!", "Alright, and if you're gonna put your feet up, why don't you sit on the-", 'Monica, leave him al

In [25]:
for i in range(5):
    print(val_data[i])

{'episode': 'utterance_3421', 'speakers': ['Chandler', 'Joey', 'Chandler', 'Joey', 'Joey', 'Chandler', 'Joey', 'Joey', 'Joey', 'Chandler', 'Joey', 'Chandler', 'Joey', 'Chandler', 'Joey', 'Chandler', 'Joey'], 'emotions': ['anger', 'neutral', 'neutral', 'surprise', 'anger', 'disgust', 'neutral', 'neutral', 'neutral', 'anger', 'fear', 'surprise', 'neutral', 'sadness', 'sadness', 'surprise', 'neutral'], 'utterances': ['Hey! Hold on a minute, hold on a second. Do you think these pearls are nice?', "I'd really prefer a mountain bike.", "Janice's birthday is coming up, I want to get her something speacial. Come in here with me.", 'Whoa, whoa, whoa, wait, whoa.', 'Do you ah, want to get her something speacial, get her flowers, get her candy, get her gum, girls love gum.', "That's a good idea, \x91Dear Janice have a Hubba-Bubba birthday'. I would like to get her something serious.", 'Oh, you want something serious.', "Y'know what you should do, you should get her one of those um, barium enemas.

In [26]:
train_data[0].keys()

dict_keys(['episode', 'speakers', 'emotions', 'utterances', 'triggers'])

In [27]:
print(type(train_data))

<class 'list'>


In [28]:
# Initialize lists for each feature in the training data
train_episodes = []
train_speakers = []
train_emotions = []
train_utterances = []
train_triggers = []

# Iterate over the training data
for sample in train_data:
    # Extract each feature
    train_episodes.append(sample['episode'])
    train_speakers.append(sample['speakers'])
    train_emotions.append(sample['emotions'])
    train_utterances.append(sample['utterances'])
    train_triggers.append(sample['triggers'])

# Initialize lists for each feature in the testing data
val_episodes = []
val_speakers = []
val_emotions = []
val_utterances = []
val_triggers = []

# Iterate over the validation data
for sample in val_data:
    # Extract each feature
    val_episodes.append(sample['episode'])
    val_speakers.append(sample['speakers'])
    val_emotions.append(sample['emotions'])
    val_utterances.append(sample['utterances'])
    val_triggers.append(sample['triggers'])

In [29]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import numpy as np

class CustomDataset(Dataset):
    def __init__(self, utterances, speakers, emotions, episode, triggers):
        self.utterances = utterances
        self.speakers = speakers
        self.emotions = emotions
        self.episode = episode
        self.triggers = triggers
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bert_model = BertModel.from_pretrained('bert-base-uncased').to('cuda')

    def generate_embeddings(self, utterances):
        embedding_list = []
        for utterance in utterances:
            tokens = self.tokenizer.encode(utterance, add_special_tokens=True)
            tokens_tensor = torch.tensor([tokens]).to('cuda')
            with torch.no_grad():
                outputs = self.bert_model(tokens_tensor)
                last_hidden_state = outputs[0].squeeze(0)  # Take the last hidden state
            embedding_list.append(last_hidden_state.cpu().numpy())
        return np.vstack(embedding_list)

    def __len__(self):
        return len(self.utterances)

    def __getitem__(self, idx):
        utterances = self.utterances[idx]
        speakers = self.speakers[idx]
        emotions = self.emotions[idx]
        triggers = self.triggers[idx]
        z = [i+' '+j+' '+k for i, j, k in zip(utterances, emotions, speakers)]
        # Generate embeddings for each utterance
        utterance_embeddings = self.generate_embeddings(z)
        # Pad embeddings to a fixed length
        # print(torch.tensor(triggers).shape)
        if len(triggers)>=30:
          triggers=triggers[0:30]
        else:
          triggers=np.array(np.pad(triggers, ((30 - len(triggers)), ( 0)), mode='constant', constant_values=0) ,dtype=np.float32)
        triggers=np.array(triggers)
        triggers[np.isnan(triggers)] = 0

        if  utterance_embeddings.shape[0]<=250:
          padded_embeddings =np.array( np.pad(utterance_embeddings, ((0, 250 - utterance_embeddings.shape[0]), (0, 0)), mode='constant', constant_values=0) ,dtype=np.float32)
        else:
          padded_embeddings =utterance_embeddings[0:250,:]


        return {
            'utterance_embeddings': torch.tensor(padded_embeddings),
            'triggers': torch.tensor(triggers)
        }

In [30]:
# Assuming train_episodes, train_speakers, train_emotions, train_utterances, train_triggers are lists of lists
train_dataset = CustomDataset(train_utterances, train_speakers, train_emotions, train_episodes, train_triggers)
val_dataset = CustomDataset(val_utterances, val_speakers, val_emotions, val_episodes, val_triggers)

# Define batch size
batch_size = 32

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [None]:
print(type(train_loader))
print("Number of batches in train_loader:", len(train_loader))
print("Batch size in train_loader:", batch_size)


In [None]:
print(len(train_loader))
# Get a single batch from the train_loader
sample_batch = next(iter(train_loader))

# Extract input dimensions from the sample batch
inputs, labels = sample_batch
print("Input shape:", len(inputs))
print("Labels shape:", len(labels))


In [None]:
# import torch
# import torch.nn as nn
# from transformers import LongformerModel, LongformerTokenizer

# class LongformerModel(nn.Module):
#     def __init__(self, input_size, hidden_size, num_layers, num_classes):
#         super(LongformerModel, self).__init__()
#         self.tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
#         self.longformer = LongformerModel.from_pretrained('allenai/longformer-base-4096')
#         self.fc = nn.Linear(hidden_size, num_classes)

#     def forward(self, x):
#         input_ids = self.tokenizer(x, padding=True, truncation=True, return_tensors="pt").input_ids
#         outputs = self.longformer(input_ids)
#         pooled_output = outputs.last_hidden_state.mean(dim=1)  # Global average pooling
#         output = self.fc(pooled_output)
#         return output

# # Define hyperparameters
# hidden_size = 768  # Longformer's hidden size
# num_layers = 1  # Number of Longformer layers
# num_classes = 2  # Number of output classes

# # Instantiate the model
# model = LongformerModel(input_size[2], hidden_size, num_layers, num_classes)

# # Move model to device
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)

# # Print model summary
# print(model)


In [None]:
import torch
import torch.nn as nn
# Get a single batch from the train_loader
sample_batch = next(iter(train_loader))

# Extract input dimensions from the sample batch
input_size = sample_batch['utterance_embeddings'].shape

num_classes =sample_batch['triggers'].shape  # Assuming each target is one-hot encoded
print(num_classes)
# Define BiLSTM model
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(GRUModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, num_classes)  # *2 for bidirectional

    def forward(self, x):
        h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(x.device)  # *2 for bidirectional
        out, _ = self.gru(x, h0)
        out = self.fc(out[:, -1, :])  # Use last timestep's output
        return out
hidden_size = 256
num_layers = 1

model = GRUModel(input_size[2], hidden_size, num_layers, num_classes[1])
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Print model summary
print(model)

In [None]:
# print(len(train_loader.dataset))
# for i in range(len(train_loader.dataset)):
#     sample = train_loader.dataset[i]
#     inputs, labels = sample['utterances'], sample['triggers']
#     print("Sample:", i)
#     print("Input length:", len(inputs))
#     print("Label:", labels)
#     print()
#     if i == 5:  # Print only a few samples for inspection
#         break


In [None]:
# def custom_collate(batch):
#     max_seq_len = max(len(sample['utterance_embeddings']) for sample in batch)

#     # Pad utterance embeddings to the same sequence length
#     padded_utterance_embeddings = []
#     emotions_list = []
#     triggers_list = []
#     for sample in batch:
#         # Pad embeddings
#         padded_utterance_embeddings.append(
#             torch.nn.functional.pad(
#                 torch.stack(sample['utterance_embeddings']),
#                 pad=(0, 0, 0, max_seq_len - len(sample['utterance_embeddings'])),
#                 mode='constant',
#                 value=0
#             )
#         )
#         emotions_list.append(sample['emotions'])
#         triggers_list.append(sample['triggers'])

#     return {
#         'utterance_embeddings': torch.stack(padded_utterance_embeddings),
#         'emotions': emotions_list,
#         'triggers': triggers_list
#     }

# # Assuming train_episodes, train_speakers, train_emotions, train_utterances, train_triggers are lists of lists
# train_dataset = CustomDataset(train_utterances, train_speakers, train_emotions, train_episodes, train_triggers)
# val_dataset = CustomDataset(val_utterances, val_speakers, val_emotions, val_episodes, val_triggers)

# train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=custom_collate)
# val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=custom_collate)

In [None]:
import os
import torch
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score
import numpy as np
import matplotlib.pyplot as plt

# Move model to device
model.to(device)

# Define number of classes
num_classes = 8  # Assuming there are 8 emotion classes
num_epochs = 5
epoch = 0
# Initialize lists to store losses, accuracies and f1 scores
train_losses = []
train_accuracies = []
train_f1_scores = []

# Set model to training mode
model.train()

# Define the model parameters and learning rate
learning_rate = 0.001
params = model.parameters()  # Assuming 'model' is your neural network

# Choose an optimizer and specify the learning rate and other parameters
optimizer = optim.Adam(params, lr=learning_rate)

# Choose a suitable loss function based on your task
criterion = nn.BCEWithLogitsLoss()  # Binary Cross Entropy with Logistic Loss
for epoch in range(num_epochs):
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    true_labels = []
    pred_labels = []
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} (Train)"):
        inputs, labels = batch['utterance_embeddings'], batch['triggers']  # Give one hot encoding for label
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels.float())  # Use BCEWithLogitsLoss

        loss.backward()
        optimizer.step()

        # Calculate total loss
        total_loss += loss.item()

        # Threshold predicted labels
        predicted_labels = (outputs >= 0.5).float()

        # Calculate accuracy
        corrects = (predicted_labels == labels).all(dim=-1)
        correct = corrects.sum().item()
        correct_predictions += correct
        total_predictions += labels.numel()

        true_labels.extend(labels.cpu().numpy())
        pred_labels.extend(predicted_labels.cpu().numpy())

    # Calculate average loss and accuracy
    average_loss = total_loss / len(train_loader)
    train_accuracy = correct_predictions / total_predictions * 100

    # Calculate F1 score
    train_f1 = f1_score(true_labels, pred_labels, average='macro')
    train_f1_scores.append(train_f1)

    # Append results to lists
    train_losses.append(average_loss)
    train_accuracies.append(train_accuracy)

    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {average_loss:.4f}, Training Accuracy: {train_accuracy:.2f}%, Training F1 Score: {train_f1:.4f}")

    # Save the model checkpoint for training
    model_checkpoint_dir = "/content/drive/MyDrive/NLP_A4/M4/train_checkpoints"
    os.makedirs(model_checkpoint_dir, exist_ok=True)
    model_checkpoint_path = os.path.join(model_checkpoint_dir, f"epoch_{epoch+1}.pth")
    torch.save(model.state_dict(), model_checkpoint_path)

# Plotting the Training Losses vs Epochs
plt.figure(figsize=(10, 5))
plt.plot(range(1, num_epochs + 1), train_losses, label='Train Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss vs Epochs')
plt.legend()
plt.grid(True)
plt.savefig("/content/drive/MyDrive/NLP_A4/M4/train_losses.png")
plt.show()

# Save train F1 scores to a file
train_f1_file = "/content/drive/MyDrive/NLP_A4/M4/train_f1_scores.txt"
with open(train_f1_file, "w") as f:
    for f1 in train_f1_scores:
        f.write(str(f1) + "\n")

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt

# Move model to device
model.to(device)

# Define number of classes
num_classes = 8  # Assuming there are 8 emotion classes
num_epochs = 5
epoch = 0
# Initialize lists to store losses, accuracies and f1 scores
val_losses = []
val_accuracies = []
val_f1_scores = []

# Set model to evaluation mode for validation
model.eval()

# Choose a suitable loss function based on your task
criterion = nn.BCEWithLogitsLoss()  # Binary Cross Entropy with Logistic Loss
for epoch in range(num_epochs):
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    true_labels = []
    pred_labels = []
    with torch.no_grad():  # No need to calculate gradients during validation
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} (Validation)"):
            inputs, labels = batch['utterance_embeddings'], batch['triggers']  # Give one hot encoding for label
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels.float())  # Use BCEWithLogitsLoss

            # Calculate total loss
            total_loss += loss.item()

            # Threshold predicted labels
            predicted_labels = (outputs >= 0.5).float()

            # Calculate accuracy
            corrects = (predicted_labels == labels).all(dim=-1)
            correct = corrects.sum().item()
            correct_predictions += correct
            total_predictions += labels.numel()

            true_labels.extend(labels.cpu().numpy())
            pred_labels.extend(predicted_labels.cpu().numpy())

    # Calculate average loss and accuracy
    average_loss = total_loss / len(val_loader)
    val_accuracy = correct_predictions / total_predictions * 100

    # Calculate F1 score
    val_f1 = f1_score(true_labels, pred_labels, average='macro')
    val_f1_scores.append(val_f1)

    # Append results to lists
    val_losses.append(average_loss)
    val_accuracies.append(val_accuracy)

    print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {average_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%, Validation F1 Score: {val_f1:.4f}")

    # Save the model checkpoint for validation
    model_checkpoint_dir = "/content/drive/MyDrive/NLP_A4/M4/val_checkpoints"
    os.makedirs(model_checkpoint_dir, exist_ok=True)
    model_checkpoint_path = os.path.join(model_checkpoint_dir, f"epoch_{epoch+1}.pth")
    torch.save(model.state_dict(), model_checkpoint_path)

# Plotting the Validation Losses vs Epochs
plt.figure(figsize=(10, 5))
plt.plot(range(1, num_epochs + 1), val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Validation Loss vs Epochs')
plt.legend()
plt.grid(True)
plt.savefig("/content/drive/MyDrive/NLP_A4/M4/val_losses.png")
plt.show()

# Save validation F1 scores to a file
val_f1_file = "/content/drive/MyDrive/NLP_A4/M4/val_f1_scores.txt"
with open(val_f1_file, "w") as f:
    for f1 in val_f1_scores:
        f.write(str(f1) + "\n")

In [None]:
average_loss

In [None]:
train_accuracy

In [None]:
train_losses

In [None]:
# model_path = 'M4.pth'

# # Save the model
# torch.save(model.state_dict(), model_path)