In [1]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joowa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def collate_fn(batch):
    inputs, labels = zip(*batch)
    # pad the inputs with zeros to make them the same length
    inputs_padded = rnn_utils.pad_sequence(inputs, batch_first=True)
    # get the sequence lenghts of the inputs
    seq_length = torch.LongTensor([len(seq) for seq in inputs])
    
    # sort the inputs and labels by the sequence lengths
    seq_length, sort_idx = seq_length.sort(descending=True)
    inputs_padded = inputs_padded[sort_idx].to(device)
    labels_sorted = torch.tensor(labels, dtype=torch.float32)[sort_idx].to(device)

    return inputs_padded, labels_sorted, seq_length

In [3]:
class Vocabulary:
    def __init__(self):
        self.index2str = {0:"<PAD>", 1:"<SOS>", 2:"<EOS>", 3:"<UNK>", 4:"<SEP>"}
        self.str2index = {"<PAD>":0, "<SOS>":1, "<EOS>":2, "<UNK>":3, "<SEP>":4}
        
    def __len__(self):
        return len(self.index2str)
    
    @staticmethod
    def tokenizer_eng(text):
        return word_tokenize(text.lower())
    
    def build_vocabulary(self, sentence_list):
        index = 5
        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                if word not in self.str2index:
                    self.index2str[index] = word
                    self.str2index[word] = index
                    index += 1
                    
    def encode(self, text):
        tokenized_text = self.tokenizer_eng(text)
        result = []
        for token in tokenized_text:
            if token in self.str2index:
                result.append(self.str2index[token])
            else:
                result.append(self.str2index['<UNK>'])
        return result
    
    def decode(self, tokens):
        return [self.index2str[token] for token in tokens]
    
    

In [4]:
class newDataset(Dataset):
    def __init__(self, df, vocab):
        self.data = df
        self.vocab = vocab
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        inputs = self.vocab.encode(self.data['Joined'].iloc[idx])
        label = self.data['Label'].iloc[idx]
        
        return (torch.tensor(inputs, dtype=torch.int64), torch.tensor(label, dtype=torch.float32) )

In [5]:
class student(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc1 = nn.Linear(2*hidden_dim, 2*hidden_dim)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(2*hidden_dim, 2*hidden_dim)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(2*hidden_dim, output_dim)
        
    def forward(self, text, text_length):
        text_embedded = self.embedding(text.long())
        packed_text = rnn_utils.pack_padded_sequence(text_embedded, text_length, batch_first=True)
        packed_output, _ = self.lstm(packed_text)
        output, _ = rnn_utils.pad_packed_sequence(packed_output, batch_first=True)

        
        out_forward = output[range(len(output)), seq_length - 1, :self.hidden_dim]
        out_reverse = output[:, 0, self.hidden_dim:]
        out_reduced = torch.cat((out_forward, out_reverse), 1)
        out = self.fc1(out_reduced)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.relu2(out)
        out = self.fc3(out)
        return out

In [6]:
df = pd.read_csv('new_data.csv')

In [7]:
vocab = Vocabulary()
vocab.build_vocabulary(df['Joined'].tolist())

In [8]:
train_data = newDataset(df,vocab)
train_dataloader = torch.utils.data.DataLoader(train_data,
                                         batch_size=32,
                                         drop_last=True,
                                         collate_fn = collate_fn
                                         )

In [56]:
model = student(vocab_size = len(train_data.vocab),
               embedding_dim=768,
               hidden_dim=1024,
               output_dim=1).to(device)
lr = 0.001
num_epochs = 200
optimizer = torch.optim.Adam(model.parameters(), lr)
criterion = nn.BCEWithLogitsLoss()
# criterion = nn.MSELoss()

In [48]:
# inputs, labels, seq_length = next(iter(train_dataloader))
# for inputs, labels, seq_length in train_dataloader:
#     print(f"inputs shape is: {inputs.shape}" )

In [49]:
# model(inputs, seq_length)

In [51]:
for epoch in range(num_epochs):
    print(f"Epoch: {epoch}\n=========")
    ### Training
    train_loss, train_acc = 0, 0
    for inputs, labels, seq_length in train_dataloader:
        model.train()
        # Forward pass
        logits = model(inputs, seq_length).squeeze()
        print(labels)
        # Calculate the loss
        loss = criterion(logits, labels)
        train_loss += loss
        # Zero the gradient
        optimizer.zero_grad()
        ## Perform backpropagation
        loss.backward()
        # Perform gradient descent
        optimizer.step()
    train_loss /= len(train_dataloader)
    print(f"Train Loss: {train_loss:.4f}")

Epoch: 0
text_embedded shape is : torch.Size([32, 32, 768])
out shape is: torch.Size([534, 1])
tensor([3.6301e-13, 6.6667e-01, 6.6667e-01, 1.0000e+00, 3.5847e-01, 2.5141e-02,
        6.6666e-01, 3.3334e-01, 3.3335e-01, 3.3333e-01, 1.0000e+00, 7.3573e-06,
        6.6667e-01, 6.6667e-01, 3.3334e-01, 3.3334e-01, 6.6667e-01, 3.3448e-01,
        3.3333e-01, 6.6667e-01, 5.3753e-07, 4.5793e-03, 9.9965e-01, 5.3456e-05,
        1.5328e-06, 1.7839e-05, 1.0327e-06, 9.9990e-01, 3.2431e-09, 6.6667e-01,
        1.0000e+00, 1.4634e-03], device='cuda:0')


ValueError: Target size (torch.Size([32])) must be the same as input size (torch.Size([534]))

In [49]:
test_path = r"C:\Users\joowa\OneDrive\Spring 2023\CS577\Project\WiC_dataset\test\test."
test_data = pd.read_csv(test_path+'data.txt', delimiter='\t', names=['Target Word', 'PoS', 'Index', 'Context1', 'Context2'])
test_label = pd.read_csv(test_path+'gold.txt', delimiter='\t', names=['Label'])
test = pd.concat([test_data, test_label], axis=1)
test['Label'] = test['Label'].apply(lambda label: 1 if label == 'T' else 0)
test['Joined'] = test['Context1'] + " " + test['Context2']

In [50]:
test_data = newDataset(test, vocab)
test_dataloader = torch.utils.data.DataLoader(test_data,
                                             batch_size=32,
                                             drop_last=True,
                                             collate_fn=collate_fn)

In [51]:
logits_list, labels_lists = [], []
for inputs, labels, seq_length in test_dataloader:
    logits = model(inputs, seq_length).squeeze()
    logits_list.append(logits)
    labels_lists.append(labels)
len(logits_list), len(labels_lists)

(43, 43)

In [52]:
pred_lists = [torch.round(torch.sigmoid(logit)) for logit in logits_list]
pred_vector = torch.cat(pred_lists, dim=0).cpu().detach().numpy()
label_vector = torch.cat(labels_lists, dim=0).cpu().detach().numpy() 

In [53]:
np.mean(pred_vector == label_vector)

0.4992732558139535