In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pack_padded_sequence
from torchtext.vocab import build_vocab_from_iterator, Vectors
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
import matplotlib.pyplot as plt
import math
import time
import json
import copy
import numpy as np
from tqdm import tqdm
from sklearn.metrics import f1_score


#sadness (0), joy (1), love (2), anger (3), fear (4), surprise (5)

mapping = {
    0: "sadness",
    1: "joy",
    2: "love",
    3: "anger",
    4: "fear",
    5: "surprise"
}


X_train_filename = "./drive/MyDrive/CSCI467Data/X_train.tsv"
X_dev_filename = "./drive/MyDrive/CSCI467Data/X_dev.tsv"
X_test_filename = "./drive/MyDrive/CSCI467Data/X_test.tsv"
y_train_filename = "./drive/MyDrive/CSCI467Data/y_train.npy"
y_dev_filename = "./drive/MyDrive/CSCI467Data/y_dev.npy"
y_test_filename = "./drive/MyDrive/CSCI467Data/y_test.npy"

data_filename = "./drive/MyDrive/CSCI467Data/data.jsonl"
# word_vectors_filename = "./drive/MyDrive/CSCI467Data/glove.6B.50d.txt"
word_vectors_filename = "./drive/MyDrive/CSCI467Data/glove.twitter.27B.100d.txt"




embed_size = 100
hidden_dim = 32
n_layers = 1
num_of_classes = 6
dropout = 0.2
batch_size = 2048
epochs = 100
freeze = False
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)



if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU activated")
else:
    device = torch.device("cpu")
    print("CPU activated")



class EmotionDataset(Dataset):
    def __init__(self, X, y,seq_len, transform=None, target_transform=None):
        self.X = X
        self.y = y
        self.seq_len = seq_len
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx], self.seq_len[idx]



class MyLSTM(nn.Module):
    def __init__(self, vocab_size, weights, hidden_size=50, n_layers=1, dropout=0, embed_size=50):
        super().__init__()
        self.embedding_layer = nn.Embedding.from_pretrained(embeddings=weights, freeze=freeze, padding_idx=0)
        # self.rnn = nn.RNN(input_size = embed_size, hidden_size = hidden_dim, num_layers = n_layers, batch_first = True)
        self.lstm = nn.LSTM(input_size=embed_size, hidden_size=hidden_dim, num_layers=n_layers, batch_first=True)
        self.linear1 = nn.Linear(hidden_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(hidden_dim, num_of_classes)


    def forward(self, x, seq_lengths):
        embeddings = self.embedding_layer(x)
        packed_embeddings = pack_padded_sequence(embeddings, seq_lengths, batch_first=True, enforce_sorted=False)
        # output, hidden = self.rnn(packed_embeddings, torch.randn(n_layers, len(x), hidden_dim))
        packed_output,(hidden,cell) = self.lstm(packed_embeddings, (torch.randn(n_layers, len(x), hidden_dim).to(device), torch.randn(n_layers, len(x), hidden_dim).to(device)))
        # return self.linear(hidden[-1])
        a1 = F.relu(self.linear1(hidden[-1]))
        d1 = self.dropout(a1)
        output = self.linear2(d1)
        return output




def read_file(filename):
    data=[]

    with open(filename) as file:
        for line in file:
            words = line.strip().split('\t')
            data.append(words)
    return data



def get_vocabulary(data):
    return list(set(word for words in data for word in words))

def evaluate(model, dataset, name, class_weights, print_confusion_matrix = False):
    """Measure and print accuracy of a predictor on a dataset."""
    confusion_counts = Counter()
    loss_func = nn.CrossEntropyLoss()
    model.eval()  # Set model to "eval mode", e.g. turns dropout off if you have dropout layers.
    with torch.no_grad():  # Don't allocate memory for storing gradients, more efficient when not training
      for batch in DataLoader(dataset, batch_size=len(dataset), shuffle=True):
        X, y, seq_len_batch = batch
        X, y, seq_len_batch = X.to(device), y.to(device), seq_len_batch
        logits = model(X, seq_len_batch)  # tensor of size (N, 10)
        # Choose argmax for each row (i.e., collapse dimension 1, hence dim=1)
        y_preds = torch.argmax(logits, dim=1)
        loss = loss_func(logits, y)

        acc = torch.mean((y_preds == y).float()).item()


        f1s = f1_score(y.tolist(), y_preds.tolist(), average=None)
        f1 = np.dot(class_weights, f1s)

        for label, pred_label in zip(y.tolist(), y_preds.tolist()):
            confusion_counts[(label, pred_label)] += 1

    print(f'Accuracy on {name} data: {acc:.5f}')
    print(f'Weighted F1 on {name} data: {f1.item():.5f}')
    print(f'Loss on {name} data: {loss:.5f}')
    for i,class_f1 in enumerate(f1s):
      print(f'{mapping[i]} F1: {class_f1}')

    if print_confusion_matrix:
      print(''.join(['actual\\predicted'] + [str(label).rjust(12) for label in range(6)]))
      for true_label in range(6):
          print(''.join([str(true_label).rjust(16)] + [
                  str(confusion_counts[true_label, pred_label]).rjust(12)
                  for pred_label in range(6)]))

    return acc


def train(model, train_set, dev_set, class_weights, lr=1e-1, batch_size=32, num_epochs=30):

    start_time = time.time()
    # Cross-entropy loss is just softmax regression loss
    loss_func = nn.CrossEntropyLoss()
    # Stochastic gradient descent optimizer
    optimizer = optim.SGD(model.parameters(), lr=lr)

    # Simple version of early stopping: save the best model checkpoint based on dev accuracy
    best_dev_acc = -1
    best_dev_f1 = -1
    best_checkpoint_acc = None
    best_checkpoint_f1 = None
    best_epoch_acc = -1
    best_epoch_f1 = -1
    best_dev_loss = 1000
    best_checkpoint_loss = None
    best_epoch_loss = -1
    # total_num_of_batches =  math.ceil(len(train_set)/batch_size)

    epoch_train_f1 = []
    epoch_dev_f1 = []
    epoch_dev_loss = []


    for t in range(num_epochs):
        train_num_correct = 0

        y_pred = []
        y_true = []

        # Training loop
        model.train()  # Set model to "training mode", e.g. turns dropout on if you have dropout layers
        # for batch in DataLoader(train_set, batch_size=batch_size, shuffle=True, collate_fn=lambda batch: preprocess_batch(batch, max_words)):
        for batch in DataLoader(train_set, batch_size=batch_size, shuffle=True):
            # DataLoader automatically groups the data into batchse of roughly batch_size
            # shuffle=True makes it so that the batches are randomly chosen in each epoch
            # unpack batch, which is a tuple (x_batch, y_batch)


            x_batch, y_batch, seq_len_batch = batch
            x_batch, y_batch, seq_len_batch = x_batch.to(device), y_batch.to(device), seq_len_batch


            optimizer.zero_grad()  # Reset the gradients to zero
            # Recall how backpropagation works---gradients are initialized to zero and then accumulated
            # So we need to reset to zero before running on a new batch!
            # tensor of size (B, C), each row is the logits (pre-softmax scores) for the C classes
            logits = model(x_batch, seq_len_batch)

            # For MNIST, C=10
            # Compute the loss of the model output compared to true labels
            loss = loss_func(logits, y_batch)
            loss.backward()  # Run backpropagation to compute gradients
            optimizer.step()  # Take a SGD step
            # Note that when we created the optimizer, we passed in model.parameters()
            # This is a list of all parameters of all layers of the model
            # optimizer.step() iterates over this list and does an SGD update to each parameter

            # Compute running count of number of training examples correct
            # Choose argmax for each row (i.e., collapse dimension 1, hence dim=1)
            preds = torch.argmax(logits, dim=1)
            train_num_correct += torch.sum(preds == y_batch).item()

            y_pred += preds.tolist()
            y_true += y_batch.tolist()

        # import pdb
        # pdb.set_trace()
        # Evaluate train and dev accuracy at the end of each epoch
        train_acc = train_num_correct / len(train_set)


        train_f1s = f1_score(y_true, y_pred, average=None)
        train_f1 = np.dot(class_weights, np.array(train_f1s))
        epoch_train_f1.append(train_f1.item())

        # Set model to "eval mode", e.g. turns dropout off if you have dropout layers.
        model.eval()
        dev_loss=[]
        with torch.no_grad():  # Don't allocate memory for storing gradients, more efficient when not training
            for batch in DataLoader(dev_set, batch_size=len(dev_set), shuffle=True):
                X_dev, y_dev, seq_len_batch = batch
                X_dev, y_dev, seq_len_batch = X_dev.to(device), y_dev.to(device), seq_len_batch
                dev_logits = model(X_dev, seq_len_batch)
                dev_preds = torch.argmax(dev_logits, dim=1)

                loss = loss_func(dev_logits, y_dev) #average loss

                dev_acc = torch.mean((dev_preds == y_dev).float()).item()
                dev_f1s = f1_score(y_dev.tolist(), dev_preds.tolist(), average=None)
                dev_f1 = np.dot(class_weights, dev_f1s)


                epoch_dev_f1.append(dev_f1.item())

                if loss < best_dev_loss:
                  best_dev_loss = loss
                  best_checkpoint_loss = copy.deepcopy(model.state_dict())
                  best_epoch_loss = t

                if dev_acc > best_dev_acc:
                    # Save this checkpoint if it has best dev accuracy so far
                    best_dev_acc = dev_acc
                    best_checkpoint_acc = copy.deepcopy(model.state_dict())
                    best_epoch_acc = t

                if dev_f1 > best_dev_f1:
                  best_dev_f1 = dev_f1
                  best_checkpoint_f1 = copy.deepcopy(model.state_dict())
                  best_epoch_f1 = t


        print(
            f'Epoch {t: <2}: train_acc={train_acc:.5f}, dev_acc={dev_acc:.5f}, train_f1={train_f1:.5f}, dev_f1={dev_f1:.5f} dev_loss={loss:.5f}')

    # Set the model parameters to the best checkpoint across all epochs
    model.load_state_dict(best_checkpoint_f1)
    end_time = time.time()
    print(f'Training took {end_time - start_time:.2f} seconds')
    print(f'\nBest epoch in terms of accuracy was {best_epoch_acc}, dev_acc={best_dev_acc:.5f}')
    print(f'Best epoch in terms of dev loss {best_epoch_loss}, dev_f1={best_dev_loss:.5f}')
    print(f'Best epoch in terms of f1 was {best_epoch_f1}, dev_f1={best_dev_f1:.5f}\n')

    evaluate(model, dev_set, "dev", class_weights, print_confusion_matrix=True)
    return epoch_train_f1, epoch_dev_f1




def analyze_mistakes(dataset, model, vocabulary):

    #  dataset = dataset[] #get a random split here
     with torch.no_grad():  # Don't allocate memory for storing gradients, more efficient when not training
      for batch in DataLoader(dataset, batch_size=len(dataset), shuffle=False):
        X_test, y_test, seq_len_batch = batch
        X_test, y_test, seq_len_batch = X_test.to(device), y_test.to(device), seq_len_batch
        logits = model(X_test, seq_len_batch)  # tensor of size (N, 10)
        # Choose argmax for each row (i.e., collapse dimension 1, hence dim=1)
        y_preds = torch.argmax(logits, dim=1)
        acc = torch.mean((y_preds == y_test).float()).item()
        zipped = zip(y_test.tolist(), y_preds.tolist())
        count = 0
        for i, pair in enumerate(zipped):
            if pair[0] == pair[1] or count > 10:
                continue
            words_list = vocabulary.lookup_tokens(X_test[i].tolist())
            words = []
            for word in words_list:
                if word != "<UNK>":
                    words.append(word)
            sentence = " ".join(words)
            print(f'Text: {sentence}')
            print(f'True label: {mapping[pair[0]]}')
            print(f'Pred label: {mapping[pair[1]]}\n')
            count +=1

def preprocess_data(X, y, vocab, name):
    # map to indices
    # padding
    X_list = []
    y_list = []
    seq_len = []
    counter = 0
    for words, label in tqdm(zip(X, y), total=len(y), desc=f'Preprocessing {name} data'):
        counter+=1
        # if(counter % 50000 == 0):
        #     print(f'Example {counter}/{len(y)}')
        indices = vocab(words)
        seq_len.append(len(indices))
        X_list.append(torch.tensor(indices, dtype=torch.int32))
        y_list.append(torch.tensor(label))

    padded_X = pad_sequence(X_list, batch_first=True, padding_value = 0)

    return padded_X, y_list, seq_len



def build_vocabulary(dataset):

    for text in dataset:
        yield text


def map_vocab_to_embeddings(vocabulary, word_vectors):
    itos = vocabulary.get_itos()
    vocab_size = len(vocabulary)
    weights = torch.zeros((vocab_size, embed_size), dtype=torch.float32)
    for i, word in enumerate(itos):
        weights[i] = word_vectors[word]
    return weights


def get_class_weights(y,a):
    c = Counter(y)
    denom = sum([c[w]**a for w in range(6)])
    # class_logits = [(c[w]/c.total())**a for w in range(6)]
    class_weights = [((c[w]**a)/denom) for w in range(6)]
    # class_weights = np.array([w/sum(class_logits) for w in class_logits])
    return class_weights


def main():

    #load data
    X_train = read_file(X_train_filename)
    X_dev = read_file(X_dev_filename)
    X_test = read_file(X_test_filename)
    y_train = np.load(y_train_filename)
    y_dev = np.load(y_dev_filename)
    y_test = np.load(y_test_filename)


    class_weights = get_class_weights(y_train, a=0.25)



    print(f'Creating Vocabulary from dataset...')
    vocabulary = build_vocab_from_iterator(build_vocabulary(X_train + X_dev + X_test), min_freq=2, specials=["<UNK>"])
    vocabulary.set_default_index(vocabulary["<UNK>"])

    print(f'Reading GloVe...')
    word_vectors = Vectors(name=word_vectors_filename)
    print(f'Getting weights...')
    weights = map_vocab_to_embeddings(vocabulary=vocabulary, word_vectors=word_vectors)


    X_train, y_train, seq_len_train = preprocess_data(X_train, y_train, vocabulary, name="train") #[ ([indices], label) ]
    X_dev, y_dev, seq_len_dev = preprocess_data(X_dev, y_dev, vocabulary, name="dev")
    X_test, y_test, seq_len_test = preprocess_data(X_test, y_test, vocabulary, name="test")
    train_set = EmotionDataset(X_train, y_train, seq_len_train)
    dev_set = EmotionDataset(X_dev, y_dev, seq_len_dev)
    test_set = EmotionDataset(X_test, y_test, seq_len_test)

    print("Training...")


    model = MyLSTM(vocab_size=len(vocabulary), weights=weights, hidden_size=hidden_dim, n_layers=n_layers, dropout = dropout, embed_size=embed_size).to(device)

    train_accuracies, dev_accuracies = train(model, train_set, dev_set, lr=1e-1,
          batch_size=batch_size, num_epochs=epochs, class_weights=class_weights)


    evaluate(model, test_set, "test", class_weights, print_confusion_matrix=True)
    analyze_mistakes(dev_set, model, vocabulary)

    # Create a line plot of accuracy over epochs
    plt.plot(range(1, epochs+1), train_accuracies, linestyle='-', label="Train")
    plt.plot(range(1, epochs+1), dev_accuracies, linestyle='-', label="Dev")
    plt.title('Weighted F1 per Epoch')
    plt.xlabel('Epoch')
    plt.ylabel('Weighted F1')
    plt.legend(loc="lower right")
    txt=f'Embedding Size: {embed_size} - Hidden Size: {hidden_dim} - Batch Size: {batch_size} - Num Layers: {n_layers}\n Dropout = {dropout}'
    plt.figtext(0.5, -0.035, txt, wrap=True, horizontalalignment='center', fontsize=9)


    # Display the plot
    plt.show()



if __name__ == "__main__":
    main()




GPU activated
Creating Vocabulary from dataset...
Reading GloVe...


100%|█████████▉| 1193513/1193514 [01:07<00:00, 17635.96it/s]


Getting weights...


Preprocessing train data: 100%|██████████| 291766/291766 [00:09<00:00, 31501.06it/s]
Preprocessing dev data: 100%|██████████| 83778/83778 [00:01<00:00, 44566.74it/s]
Preprocessing test data: 100%|██████████| 41265/41265 [00:01<00:00, 31542.70it/s]


Training...
Epoch 0 : train_acc=0.34987, dev_acc=0.37601, train_f1=0.14295, dev_f1=0.16400 dev_loss=1.55977
Epoch 1 : train_acc=0.39537, dev_acc=0.39506, train_f1=0.19138, dev_f1=0.17961 dev_loss=1.53259


KeyboardInterrupt: ignored