In [1]:
from torch.utils.data import Dataset
import torch
import pandas as pd
import numpy as np
from random import sample
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from transformers import AutoTokenizer, AutoModel
import gensim.downloader as api
from nltk.tokenize import word_tokenize
import nltk
import warnings


# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings from torch/transformers.
warnings.filterwarnings("ignore")
# Download the NLTK 'punkt' package (presumably the tokenizer data word_tokenize relies on).
nltk.download('punkt')
# Load the 'glove-wiki-gigaword-50' vectors through gensim's downloader; `glove` is
# used later for vocabulary lookups and to initialise the LSTM embedding layer.
glove = api.load('glove-wiki-gigaword-50')
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joowa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def train(model, optimizer, criterion, train_loader, valid_loader, num_epochs, device, accuracy_fn):
    """Train a single-logit classifier on flattened inputs, validating after every epoch.

    Every batch is reshaped to ``(batch_size, -1)`` before it is passed to ``model``.
    The logits are scored with ``criterion`` and, for the accuracy figure, pushed
    through a sigmoid and rounded to 0/1 predictions.

    Args:
        model: Module mapping a ``(batch_size, features)`` tensor to one logit per sample.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        valid_loader: Iterable of ``(inputs, labels)`` validation batches.
        num_epochs: Number of passes over ``train_loader``.
        device: Device every batch is moved to before the forward pass.
        accuracy_fn: Callable ``accuracy_fn(labels, predictions)`` returning a percentage.

    Returns:
        tuple[list[float], list[float]]: Mean training loss and mean validation loss
        for each epoch, in epoch order.
    """
    train_losses, test_losses = [], []
    for epoch in range(num_epochs):
        print(f"Epoch: {epoch} \n ==========")
        ### Training
        train_loss, train_acc, train_batches = 0.0, 0.0, 0
        model.train()
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            flattened_inputs = inputs.view(inputs.size(0), -1)
            # Forward pass. Only the trailing logit dimension is squeezed so that a
            # batch holding a single sample keeps its batch axis.
            logits = model(flattened_inputs).squeeze(-1)
            # Calculate the loss
            loss = criterion(logits, labels)
            # Zero the gradient
            optimizer.zero_grad()
            # Perform backpropagation
            loss.backward()
            # Perform gradient descent
            optimizer.step()
            # Accumulate plain floats: summing the loss tensor itself would keep each
            # batch's autograd graph alive until the end of the epoch.
            train_loss += loss.item()
            pred = torch.round(torch.sigmoid(logits.detach()))
            train_acc += accuracy_fn(labels, pred)
            train_batches += 1
        # Guard against an empty loader instead of dividing by zero.
        train_loss /= max(train_batches, 1)
        train_acc /= max(train_batches, 1)
        train_losses.append(train_loss)
        ### Testing
        test_loss, test_acc, test_batches = 0.0, 0.0, 0
        model.eval()
        with torch.inference_mode():
            for inputs, labels in valid_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                flattened_inputs = inputs.view(inputs.size(0), -1)
                # Forward pass
                test_logits = model(flattened_inputs).squeeze(-1)
                test_pred = torch.round(torch.sigmoid(test_logits))
                # Calculate the loss and accuracy
                test_loss += criterion(test_logits, labels).item()
                test_acc += accuracy_fn(labels, test_pred)
                test_batches += 1
        test_loss /= max(test_batches, 1)
        test_acc /= max(test_batches, 1)
        test_losses.append(test_loss)
        print(f"\nTrain loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}% | Test loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}%")
    return train_losses, test_losses

In [3]:
def train_glove(model, optimizer, criterion, train_loader, valid_loader, num_epochs, device, accuracy_fn):
    """Train a single-logit sequence classifier, validating after every epoch.

    Batches are ``(inputs, labels, seq_lengths)`` triples and the model is called as
    ``model(inputs, seq_lengths)``. The logits are scored with ``criterion`` and,
    for the accuracy figure, pushed through a sigmoid and rounded to 0/1 predictions.

    Args:
        model: Module called as ``model(inputs, seq_lengths)`` returning one logit per sample.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        train_loader: Iterable of ``(inputs, labels, seq_lengths)`` training batches.
        valid_loader: Iterable of ``(inputs, labels, seq_lengths)`` validation batches.
        num_epochs: Number of passes over ``train_loader``.
        device: Device every batch is moved to before the forward pass.
        accuracy_fn: Callable ``accuracy_fn(labels, predictions)`` returning a percentage.

    Returns:
        tuple[list[float], list[float]]: Mean training loss and mean validation loss
        for each epoch, in epoch order.
    """
    train_losses, test_losses = [], []
    for epoch in range(num_epochs):
        print(f"Epoch: {epoch} \n ==========")
        ### Training
        train_loss, train_acc, train_batches = 0.0, 0.0, 0
        model.train()
        for inputs, labels, seq_lengths in train_loader:
            inputs, labels, seq_lengths = inputs.to(device), labels.to(device), seq_lengths.to(device)
            # Forward pass. Only the trailing logit dimension is squeezed so that a
            # batch holding a single sample keeps its batch axis.
            logits = model(inputs, seq_lengths).squeeze(-1)
            # Calculate the loss
            loss = criterion(logits, labels)
            # Zero the gradient
            optimizer.zero_grad()
            # Perform backpropagation
            loss.backward()
            # Perform gradient descent
            optimizer.step()
            # Accumulate plain floats: summing the loss tensor itself would keep each
            # batch's autograd graph alive until the end of the epoch.
            train_loss += loss.item()
            pred = torch.round(torch.sigmoid(logits.detach()))
            train_acc += accuracy_fn(labels, pred)
            train_batches += 1
        # Guard against an empty loader instead of dividing by zero.
        train_loss /= max(train_batches, 1)
        train_acc /= max(train_batches, 1)
        train_losses.append(train_loss)
        ### Testing
        test_loss, test_acc, test_batches = 0.0, 0.0, 0
        model.eval()
        with torch.inference_mode():
            for inputs, labels, seq_lengths in valid_loader:
                inputs, labels, seq_lengths = inputs.to(device), labels.to(device), seq_lengths.to(device)
                # Forward pass
                test_logits = model(inputs, seq_lengths).squeeze(-1)
                test_pred = torch.round(torch.sigmoid(test_logits))
                # Calculate the loss and accuracy
                test_loss += criterion(test_logits, labels).item()
                test_acc += accuracy_fn(labels, test_pred)
                test_batches += 1
        test_loss /= max(test_batches, 1)
        test_acc /= max(test_batches, 1)
        test_losses.append(test_loss)
        print(f"\nTrain loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}% | Test loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}%")
    return train_losses, test_losses

In [4]:
# Calculate accuracy (a classification metric)
def accuracy_fn(y_true, y_pred):
    """Return the percentage of predictions that equal their truth label.

    Args:
        y_true (torch.Tensor): Ground-truth labels.
        y_pred (torch.Tensor): Predicted labels, compared element-wise with ``y_true``.

    Returns:
        float: Share of matching positions as a percentage, e.g. 78.45.
    """
    num_matches = y_true.eq(y_pred).sum().item()
    fraction_correct = num_matches / len(y_pred)
    return fraction_correct * 100

In [5]:
def collate_fn(batch):
    """Collate variable-length ``(sequence, label)`` pairs into one padded batch.

    The batch is ordered by decreasing sequence length. The padded inputs and the
    labels are moved to the notebook-level ``device``; the lengths are returned
    as produced by the sort, without a device move.

    Args:
        batch: Sequence of ``(sequence_tensor, label)`` pairs.

    Returns:
        tuple: ``(padded_inputs, labels, lengths)``, all in decreasing-length order.
    """
    sequences, targets = zip(*batch)

    # Original lengths, then the permutation that orders them longest-first.
    lengths = torch.LongTensor([len(item) for item in sequences])
    lengths, order = lengths.sort(descending=True)

    # Zero-pad to the longest sequence, then apply the same permutation to both
    # inputs and labels so they stay aligned.
    padded = rnn_utils.pad_sequence(sequences, batch_first=True)
    padded = padded[order].to(device)
    targets = torch.tensor(targets, dtype=torch.float32)[order].to(device)

    return padded, targets, lengths

In [6]:
def chunk_list(lst, chunk_size):
    """Split ``lst`` into consecutive slices of ``chunk_size`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``chunk_size``.
    """
    chunks = []
    for start in range(0, len(lst), chunk_size):
        stop = start + chunk_size
        chunks.append(lst[start:stop])
    return chunks

In [7]:
def find_maxLength(sentence_list, tokenizer):
    """Return the largest padded token length seen across chunks of 200 sentences.

    Args:
        sentence_list: List of sentences; it is split into chunks with ``chunk_list``.
        tokenizer: Callable invoked as ``tokenizer(chunk, padding=True, return_tensors='pt')``
            whose result exposes an ``'input_ids'`` tensor.

    Returns:
        int: The maximum second dimension of ``input_ids`` over all chunks.
    """
    widths = []
    for chunk in chunk_list(sentence_list, 200):
        torch.cuda.empty_cache()
        encoded = tokenizer(chunk, padding=True, return_tensors='pt')
        # With padding=True the second dimension is the padded length of this chunk.
        widths.append(encoded['input_ids'].shape[1])
    return max(widths)

In [8]:
def pad_sequences(sequences, max_length=65):
    """Force every sequence to exactly ``max_length`` rows and stack the results.

    Shorter sequences are zero-padded at the end of their first dimension; longer
    ones are truncated to their first ``max_length`` rows. Previously an over-long
    sequence was only printed and the result of the *previous* iteration was
    appended in its place (or an ``UnboundLocalError`` was raised when it came first).

    Args:
        sequences: Iterable of tensors, each expected to be 2-D ``(length, features)``
            with the same feature size (a 3-D tensor iterates over its first axis).
        max_length: Target length of the first dimension. Defaults to 65, the value
            that was hard-coded before.

    Returns:
        torch.Tensor: Stacked tensor of shape ``(num_sequences, max_length, features)``.
    """
    padded_sequences = []
    for seq in sequences:
        missing = max_length - seq.size(0)
        if missing >= 0:
            # The pad tuple is (last-dim left, last-dim right, first-dim top, first-dim bottom).
            padded_seq = torch.nn.functional.pad(seq, (0, 0, 0, missing), mode='constant', value=0)
        else:
            # Truncate so every item has the same shape and torch.stack succeeds.
            padded_seq = seq[:max_length]
        padded_sequences.append(padded_seq)
    return torch.stack(padded_sequences)

In [9]:
from transformers import GPT2Tokenizer, GPT2Model
import torch.nn.utils.rnn as rnn_utils
class WiCDataset(Dataset):
    """Word-in-Context sentence pairs served as features for one of three encoders.

    The two context sentences of every example are joined with a space. Depending
    on ``mode`` an item is:

    * ``"gpt"``   - frozen GPT-2 hidden states, padded to ``GPT_MAX_TOKENS`` rows;
    * ``"bert"``  - frozen ``bert-base-uncased`` hidden states for ``BERT_MAX_TOKENS`` tokens;
    * ``"glove"`` - a 1-D tensor of GloVe vocabulary indices (unknown words are dropped).

    Args:
        path: Prefix of the split's files; ``"data.txt"`` and ``"gold.txt"`` are
            appended to it to locate the tab-separated examples and labels.
        mode: One of ``"gpt"``, ``"bert"`` or ``"glove"``.

    Raises:
        ValueError: If ``mode`` is not one of the supported values. Previously such
            a dataset was created and ``__getitem__`` silently returned ``None``.
    """

    # Fixed token budgets so every item of a mode has the same shape and the default
    # DataLoader collation can stack a batch. GPT_MAX_TOKENS must stay in step with
    # the default length that pad_sequences pads to.
    GPT_MAX_TOKENS = 65
    BERT_MAX_TOKENS = 68

    def __init__(self, path, mode):
        if mode == "gpt":
            self.mode = 'gpt2'
            self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
            self.model = GPT2Model.from_pretrained('gpt2').to(device)
        elif mode == "bert":
            self.mode = 'bert-base-uncased'
            self.tokenizer = AutoTokenizer.from_pretrained(self.mode)
            self.model = AutoModel.from_pretrained(self.mode).to(device)
        elif mode == "glove":
            self.mode = mode
        else:
            raise ValueError(f"Unsupported mode {mode!r}; expected 'gpt', 'bert' or 'glove'.")

        df_data = pd.read_csv(path+"data.txt",
                              delimiter='\t',
                              names=['Target Word', 'PoS', 'Index', 'Context1', 'Context2'])
        df_label = pd.read_csv(path+'gold.txt',
                               delimiter='\t',
                               names=['label'])
        self.data = pd.concat([df_data, df_label], axis=1)
        self.data['Joined'] = self.data['Context1'] + " " + self.data['Context2']
        # 'F' becomes 0; every other value (including a missing label) becomes 1.
        self.data['label'] = self.data['label'].map(lambda x: 0 if x == 'F' else 1)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        if self.mode == 'gpt2':
            gpt_token = self.tokenizer(row['Joined'],
                                       truncation=True,
                                       max_length=self.GPT_MAX_TOKENS,
                                       return_tensors='pt').to(device)
            # The encoder is a frozen feature extractor: run it without autograd so no
            # graph is stored per item and backward() never walks through GPT-2.
            with torch.inference_mode():
                gpt_outputs = self.model(gpt_token['input_ids'])[0]
                padded_outputs = pad_sequences(gpt_outputs)
            return (padded_outputs, torch.tensor(row['label'], dtype=torch.float32))

        elif self.mode == 'bert-base-uncased':
            # truncation=True keeps over-long inputs at BERT_MAX_TOKENS; without it a
            # single long example would break the fixed-size batch stacking.
            bert_token = self.tokenizer(row['Joined'],
                                        padding='max_length',
                                        truncation=True,
                                        max_length=self.BERT_MAX_TOKENS,
                                        return_tensors='pt').to(device)
            with torch.inference_mode():
                bert_outputs = self.model(**bert_token)
            return (bert_outputs[0], torch.tensor(row['label'], dtype=torch.float32))

        elif self.mode == 'glove':
            words = word_tokenize(row['Joined'].lower())

            # Words without a GloVe entry are skipped rather than mapped to an UNK index.
            indices = [glove.get_index(w) for w in words if glove.has_index_for(w)]
            indices_tensor = torch.tensor(indices, dtype=torch.long)

            return indices_tensor, torch.tensor(row['label'], dtype=torch.long)

        # Only reachable if self.mode was changed after construction.
        raise ValueError(f"Unsupported mode {self.mode!r}.")
        
# Path prefixes of the three WiC splits. WiCDataset appends "data.txt" and "gold.txt"
# to a prefix to locate that split's examples and labels.
# NOTE(review): these are absolute paths on one machine; the notebook will not run
# elsewhere until they are pointed at a local copy of the dataset.
train_path = r"C:\Users\joowa\OneDrive\Spring 2023\CS577\Project\WiC_dataset\train\train."
valid_path = r"C:\Users\joowa\OneDrive\Spring 2023\CS577\Project\WiC_dataset\dev\dev."
test_path = r"C:\Users\joowa\OneDrive\Spring 2023\CS577\Project\WiC_dataset\test\test."

# GloVe

In [13]:
# GloVe datasets and loaders. collate_fn pads every batch and also returns the
# per-example lengths, so each batch is an (inputs, labels, lengths) triple.
train_data = WiCDataset(train_path, "glove")
train_dataloader = torch.utils.data.DataLoader(train_data,
                                          batch_size=32,
                                          shuffle=True,  # new batch composition every epoch
                                          drop_last=True,
                                          collate_fn=collate_fn)
# Evaluation loaders keep the file order.
# NOTE(review): drop_last=True discards the final partial batch, so validation and
# test metrics do not cover those examples.
valid_data = WiCDataset(valid_path, "glove")
valid_dataloader = torch.utils.data.DataLoader(valid_data,
                                          batch_size=32,
                                          drop_last=True,
                                          collate_fn=collate_fn)
test_data = WiCDataset(test_path, "glove")
test_dataloader = torch.utils.data.DataLoader(test_data,
                                          batch_size=32,
                                          drop_last=True,
                                          collate_fn=collate_fn)

In [14]:
class LSTM(nn.Module):
    """Bidirectional LSTM classifier on top of the pretrained GloVe embedding table.

    Args:
        input_dim: Size of each embedded token fed to the LSTM.
        hidden_dim: Hidden size of each LSTM direction.
        output_dim: Number of outputs of the final linear layer.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self,
                 input_dim: int,
                 hidden_dim: int,
                 output_dim: int,
                 num_layers: int):
        super().__init__()
        # Embedding weights come from the notebook-level `glove` vectors.
        pretrained_vectors = torch.FloatTensor(glove.vectors)
        self.emb = nn.Embedding.from_pretrained(pretrained_vectors)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size=input_dim,
                            hidden_size=hidden_dim,
                            num_layers=num_layers,
                            bidirectional=True,
                            batch_first=True)
        # Forward and backward states are concatenated, hence 2 * hidden_dim inputs.
        self.fc = nn.Linear(2 * hidden_dim, output_dim)

    def forward(self, seq, seq_length):
        """Return the linear-layer output for a padded batch of token indices.

        Args:
            seq: Padded batch of token indices, batch first.
            seq_length: True length of every sequence; packing uses the default
                sorted mode, so lengths must be in decreasing order.
        """
        lengths = seq_length.cpu()
        embedded = self.emb(seq)
        packed = rnn_utils.pack_padded_sequence(embedded, lengths, batch_first=True)
        packed_states, _ = self.lstm(packed)
        states, _ = rnn_utils.pad_packed_sequence(packed_states, batch_first=True)

        # Forward direction: state at each sequence's last real token.
        # Backward direction: state at position 0.
        rows = range(len(states))
        last_forward = states[rows, lengths - 1, :self.hidden_dim]
        first_backward = states[:, 0, self.hidden_dim:]
        features = torch.cat((last_forward, first_backward), 1)
        return self.fc(features)

# 50-dimensional inputs (the loaded GloVe vectors are 'glove-wiki-gigaword-50'),
# 128 hidden units per direction, one output logit, two stacked layers.
glove_model = LSTM(input_dim=50, hidden_dim=128, output_dim=1, num_layers=2).to(device)

In [19]:
# Optimisation settings for the GloVe LSTM run.
num_epochs = 100
lr = 0.001
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(glove_model.parameters(), lr=lr)

In [20]:
# Train the GloVe LSTM and report train/validation metrics after every epoch.
train_glove(
    model=glove_model,
    optimizer=optimizer,
    criterion=criterion,
    train_loader=train_dataloader,
    valid_loader=valid_dataloader,
    num_epochs=num_epochs,
    device=device,
    accuracy_fn=accuracy_fn,
)

Epoch: 0 

Train loss: 0.0549, Train Acc: 98.1509% | Test loss: 2.3807, Test Acc: 54.1118%
Epoch: 1 

Train loss: 0.0406, Train Acc: 98.6317% | Test loss: 2.6156, Test Acc: 52.1382%
Epoch: 2 

Train loss: 0.0468, Train Acc: 98.4837% | Test loss: 2.4443, Test Acc: 51.8092%
Epoch: 3 

Train loss: 0.0369, Train Acc: 98.8166% | Test loss: 2.6459, Test Acc: 53.7829%
Epoch: 4 

Train loss: 0.0337, Train Acc: 98.8905% | Test loss: 2.7209, Test Acc: 54.7697%
Epoch: 5 

Train loss: 0.0333, Train Acc: 98.9460% | Test loss: 2.5958, Test Acc: 53.6184%
Epoch: 6 

Train loss: 0.0309, Train Acc: 99.1309% | Test loss: 2.9188, Test Acc: 51.8092%
Epoch: 7 

Train loss: 0.0227, Train Acc: 99.3528% | Test loss: 2.8814, Test Acc: 53.4539%
Epoch: 8 

Train loss: 0.0192, Train Acc: 99.4822% | Test loss: 2.8821, Test Acc: 54.1118%
Epoch: 9 

Train loss: 0.0101, Train Acc: 99.7411% | Test loss: 3.0779, Test Acc: 53.6184%
Epoch: 10 

Train loss: 0.0100, Train Acc: 99.7411% | Test loss: 3.1492, Test Acc: 55.0987

# BERT

In [10]:
# BERT datasets and loaders. Items are fixed-size tensors, so the default collation
# can stack them and no collate_fn is needed.
train_data = WiCDataset(train_path, "bert")
train_dataloader = torch.utils.data.DataLoader(train_data,
                                          batch_size=32,
                                          shuffle=True,  # new batch composition every epoch
                                          drop_last=True)
# Evaluation loaders keep the file order.
# NOTE(review): drop_last=True discards the final partial batch, so validation and
# test metrics do not cover those examples.
valid_data = WiCDataset(valid_path, "bert")
valid_dataloader = torch.utils.data.DataLoader(valid_data,
                                          batch_size=32,
                                          drop_last=True)
test_data = WiCDataset(test_path, "bert")
test_dataloader = torch.utils.data.DataLoader(test_data,
                                          batch_size=32,
                                          drop_last=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_re

In [10]:
class DNN(nn.Module):
    """Feed-forward classifier with a single hidden ReLU layer.

    Args:
        input_size: Number of features in each flattened input.
        hidden_size: Width of the hidden layer.
        num_classes: Number of outputs of the final linear layer.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Apply linear -> ReLU -> linear and return the raw outputs (no activation)."""
        hidden = self.relu1(self.fc1(x))
        return self.fc2(hidden)



In [12]:
# Classifier over flattened BERT features: 68 token positions (the max_length used by
# the dataset's BERT branch) x 768 values per token (presumably the encoder's hidden
# size) = 52224 inputs.
bert_model = DNN(input_size=68 * 768, hidden_size=64, num_classes=1).to(device)

# Optimisation settings for the BERT run.
num_epochs = 100
lr = 0.001
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(bert_model.parameters(), lr=lr)

In [None]:
# Train the BERT-feature classifier and report train/validation metrics after every epoch.
train(
    model=bert_model,
    optimizer=optimizer,
    criterion=criterion,
    train_loader=train_dataloader,
    valid_loader=valid_dataloader,
    num_epochs=num_epochs,
    device=device,
    accuracy_fn=accuracy_fn,
)

Epoch: 0 

Train loss: 0.6675, Train Acc: 62.8328% | Test loss: 0.7042, Test Acc: 60.6908%
Epoch: 1 

Train loss: 0.4787, Train Acc: 76.9601% | Test loss: 0.7606, Test Acc: 60.3618%
Epoch: 2 

Train loss: 0.3525, Train Acc: 85.5584% | Test loss: 0.8406, Test Acc: 59.2105%
Epoch: 3 

Train loss: 0.2663, Train Acc: 89.3861% | Test loss: 0.9444, Test Acc: 58.7171%
Epoch: 4 


In [15]:
# Persist only the classifier's parameters (its state_dict); loading them back requires
# building a DNN with the same layer sizes first.
torch.save(bert_model.state_dict(), 'bert_model_wic_small.pth')

# GPT

In [11]:
# GPT-2 datasets and loaders. Items are fixed-size tensors, so the default collation
# can stack them and no collate_fn is needed.
train_data = WiCDataset(train_path, "gpt")
train_dataloader = torch.utils.data.DataLoader(train_data,
                                          batch_size=32,
                                          shuffle=True,  # new batch composition every epoch
                                          drop_last=True)
# Evaluation loaders keep the file order.
# NOTE(review): drop_last=True discards the final partial batch, so validation and
# test metrics do not cover those examples.
valid_data = WiCDataset(valid_path, "gpt")
valid_dataloader = torch.utils.data.DataLoader(valid_data,
                                          batch_size=32,
                                          drop_last=True)
test_data = WiCDataset(test_path, "gpt")
test_dataloader = torch.utils.data.DataLoader(test_data,
                                          batch_size=32,
                                          drop_last=True)

In [12]:
# Classifier over flattened GPT-2 features: 65 token positions (the length that
# pad_sequences pads to) x 768 values per token (presumably the encoder's hidden
# size) = 49920 inputs.
gpt_model = DNN(input_size=65 * 768, hidden_size=64, num_classes=1).to(device)
# Optional: resume from an earlier checkpoint.
#gpt_model.load_state_dict(torch.load("gpt_model_wic_1.pth"))

# Optimisation settings for the GPT-2 run.
num_epochs = 40
lr = 0.001
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(gpt_model.parameters(), lr=lr)

In [13]:
# Train the GPT-2-feature classifier and report train/validation metrics after every epoch.
train(
    model=gpt_model,
    optimizer=optimizer,
    criterion=criterion,
    train_loader=train_dataloader,
    valid_loader=valid_dataloader,
    num_epochs=num_epochs,
    device=device,
    accuracy_fn=accuracy_fn,
)

Epoch: 0 

Train loss: 0.7276, Train Acc: 49.8706% | Test loss: 0.6927, Test Acc: 50.8224%
Epoch: 1 

Train loss: 0.6832, Train Acc: 53.2729% | Test loss: 0.6934, Test Acc: 49.1776%
Epoch: 2 

Train loss: 0.6932, Train Acc: 49.3343% | Test loss: 0.6934, Test Acc: 49.0132%
Epoch: 3 

Train loss: 0.6839, Train Acc: 56.3609% | Test loss: 0.6922, Test Acc: 50.9868%
Epoch: 4 

Train loss: 0.6689, Train Acc: 60.1516% | Test loss: 0.6920, Test Acc: 51.4803%
Epoch: 5 

Train loss: 0.6414, Train Acc: 64.2567% | Test loss: 0.6945, Test Acc: 52.1382%
Epoch: 6 

Train loss: 0.6030, Train Acc: 67.9734% | Test loss: 0.7034, Test Acc: 54.1118%
Epoch: 7 

Train loss: 0.5688, Train Acc: 71.5052% | Test loss: 0.7139, Test Acc: 54.6053%
Epoch: 8 

Train loss: 0.5610, Train Acc: 70.7655% | Test loss: 0.7123, Test Acc: 52.4671%
Epoch: 9 

Train loss: 0.5335, Train Acc: 73.2433% | Test loss: 0.7332, Test Acc: 53.6184%
Epoch: 10 

Train loss: 0.5280, Train Acc: 73.9090% | Test loss: 0.7277, Test Acc: 53.1250

In [15]:
# Save the model: only the classifier's parameters (its state_dict) are written, so
# loading them back requires building a DNN with the same layer sizes first.
torch.save(gpt_model.state_dict(), 'gpt_model_wic_small_1.pth')