In [1]:
import pandas as pd
import torch 
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset
import torch.nn.utils.rnn as rnn_utils
from transformers import AutoTokenizer, AutoModel

# Select the compute device once; every later cell moves models and tensors onto it.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [2]:
# Calculate accuracy (a classification metric)
def accuracy_fn(y_true, y_pred):
    """Calculates the percentage of predictions that equal the truth labels.

    Args:
        y_true (torch.Tensor): Truth labels.
        y_pred (torch.Tensor): Predictions to be compared to the truth labels.

    Returns:
        float: Accuracy as a percentage in [0, 100], e.g. 78.45. Returns 0.0
            when there are no elements to compare.
    """
    matches = torch.eq(y_true, y_pred)
    # Divide by the number of compared elements (not just the first dimension)
    # so multi-dimensional inputs cannot yield more than 100%.
    total = matches.numel()
    if total == 0:
        # Empty input: avoid a ZeroDivisionError.
        return 0.0
    correct = matches.sum().item()
    acc = (correct / total) * 100
    return acc

In [3]:
class DNN(nn.Module):
    """Fully connected network: Linear -> ReLU -> Linear -> ReLU -> Linear.

    Args:
        input_size: Number of input features of the first linear layer.
        hidden_size: Width of both hidden layers.
        num_classes: Number of outputs of the final linear layer.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # Attribute names and creation order are kept as they are: they define
        # the state_dict keys and the order in which weights are initialised.
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return the outputs of the final linear layer (no activation applied)."""
        hidden = self.relu1(self.fc1(x))
        hidden = self.relu2(self.fc2(hidden))
        return self.fc3(hidden)

In [4]:
def pad_sequences(sequences, max_len=65):
    """Zero-pad (or truncate) every sequence to a fixed length and stack them.

    Args:
        sequences: Iterable of tensors; iterating a (batch, seq_len, features)
            tensor along dim 0 works. Each item is expected to be 2-D
            (seq_len, features), because the padding is applied to the
            second-to-last dimension.
        max_len (int): Target sequence length. Defaults to 65.

    Returns:
        torch.Tensor: The stacked tensor of shape (num_sequences, max_len, features).

    Raises:
        RuntimeError: From ``torch.stack`` if ``sequences`` is empty.
    """
    padded_sequences = []
    for seq in sequences:
        seq_len = seq.size(0)
        if seq_len > max_len:
            # Over-long sequences are cut to max_len. Previously this branch only
            # printed and then appended an unbound (or stale) tensor.
            padded_seq = seq[:max_len]
        else:
            # (0, 0) leaves the feature dim alone; (0, k) appends k zero rows.
            padded_seq = torch.nn.functional.pad(
                seq, (0, 0, 0, max_len - seq_len), mode='constant', value=0
            )
        padded_sequences.append(padded_seq)
    return torch.stack(padded_sequences)

In [9]:
class customDataset(Dataset):
    """Dataset that turns each CSV row into GPT-2 features plus a float label.

    The CSV at ``path`` must contain a 'Joined' text column and a 'Label' column.
    GPT-2 is used purely as a feature extractor inside ``__getitem__``.

    Args:
        path: Path of the CSV file, passed to ``pandas.read_csv``.
    """

    # Token budget per example; must equal the fixed length pad_sequences pads to.
    MAX_TOKENS = 65

    def __init__(self, path):
        self.tokenizer = AutoTokenizer.from_pretrained('gpt2')
        self.model = AutoModel.from_pretrained('gpt2').to(device)
        # Feature extraction only: make sure dropout is disabled.
        self.model.eval()
        self.data = pd.read_csv(path)

    def __len__(self):
        """Number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row ``idx``.

        ``features`` is the padded first element of the GPT-2 output for the
        'Joined' text; ``label`` is a float32 scalar tensor built from 'Label'.
        """
        # Truncate to MAX_TOKENS so longer texts still fit the fixed-size padding.
        gpt_token = self.tokenizer(
            self.data['Joined'].iloc[idx],
            return_tensors='pt',
            truncation=True,
            max_length=self.MAX_TOKENS,
        ).to(device)
        # no_grad: the extractor is never trained, so do not build an autograd
        # graph per sample. (inference_mode is avoided on purpose: its tensors
        # cannot be saved for backward by the downstream classifier.)
        with torch.no_grad():
            gpt_outputs = self.model(gpt_token['input_ids'])[0]
        padded_outputs = pad_sequences(gpt_outputs)
        label = torch.tensor(self.data['Label'].iloc[idx], dtype=torch.float32)
        return (padded_outputs, label)



In [10]:
# Training set and loader. shuffle=True re-orders the samples every epoch so the
# batches are not always presented in CSV order; drop_last=True keeps every
# batch at exactly 32 samples.
data = customDataset("new_data.csv")
train_dataloader = torch.utils.data.DataLoader(dataset=data,
                                               batch_size=32,
                                               shuffle=True,
                                               drop_last=True)

In [11]:
# Optimisation hyperparameters.
lr = 0.001
num_epochs = 100

# Classifier over the flattened features: 65 padded positions x 768 values each.
gpt_model = DNN(input_size=768*65, hidden_size=52, num_classes=1)
gpt_model = gpt_model.to(device)

# Adam on the classifier's parameters only; the loss expects raw logits.
optimizer = torch.optim.Adam(params=gpt_model.parameters(), lr=lr)
criterion = nn.BCEWithLogitsLoss()
device

device(type='cuda')

In [12]:
# Train the classifier for num_epochs passes over train_dataloader, printing the
# mean batch loss and mean batch accuracy after every epoch.
for epoch in range(num_epochs):
    print(f"Epoch: {epoch} \n =====================")
    # Plain Python floats: accumulating the loss tensor itself would keep
    # autograd history alive for the whole epoch.
    train_loss, train_acc = 0.0, 0.0
    # Training mode only needs to be set once per epoch, not once per batch.
    gpt_model.train()
    for inputs, labels in train_dataloader:
        # detach(): the input features are fixed, so gradients must not flow
        # back into whatever produced them.
        inputs, labels = inputs.to(device).detach(), labels.to(device)
        flattened_inputs = inputs.view(inputs.size(0), -1)
        # Forward pass; squeeze only the trailing singleton so a batch of one
        # keeps its batch dimension.
        logits = gpt_model(flattened_inputs).squeeze(-1)
        # Calculate the loss
        loss = criterion(logits, labels)
        # Zero the gradient
        optimizer.zero_grad()
        # Perform backpropagation
        loss.backward()
        # Perform gradient descent
        optimizer.step()
        # Metrics are bookkeeping only, so compute them without autograd.
        with torch.no_grad():
            rounded_labels = torch.round(labels)
            pred = torch.round(torch.sigmoid(logits))
        train_loss += loss.item()
        train_acc += accuracy_fn(rounded_labels, pred)
    train_loss /= len(train_dataloader)
    train_acc /= len(train_dataloader)
    print(f"\nTrain Loss: {train_loss}, Train Acc: {train_acc}")

Epoch: 0 

Train Loss: 0.6934838891029358, Train Acc: 56.601331360946745
Epoch: 1 

Train Loss: 0.634870171546936, Train Acc: 63.99778106508876
Epoch: 2 

Train Loss: 0.5645986199378967, Train Acc: 69.8224852071006
Epoch: 3 

Train Loss: 0.4749314785003662, Train Acc: 77.1819526627219
Epoch: 4 

Train Loss: 0.3949655294418335, Train Acc: 82.3594674556213
Epoch: 5 

Train Loss: 0.42069733142852783, Train Acc: 80.45488165680473
Epoch: 6 

Train Loss: 0.324857622385025, Train Acc: 85.70636094674556
Epoch: 7 

Train Loss: 0.2626688480377197, Train Acc: 89.42307692307692
Epoch: 8 

Train Loss: 0.2818368673324585, Train Acc: 88.09171597633136
Epoch: 9 

Train Loss: 0.4335102438926697, Train Acc: 77.2189349112426
Epoch: 10 

Train Loss: 0.3279739022254944, Train Acc: 86.48298816568047
Epoch: 11 

Train Loss: 0.25826701521873474, Train Acc: 89.82988165680473
Epoch: 12 

Train Loss: 0.20079736411571503, Train Acc: 92.3076923076923
Epoch: 13 

Train Loss: 0.1791241616010666, Train Acc: 93.380177

In [14]:
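# NOTE(review): 'student_model_wic_1.pth' is loaded by a later cell and was presumably
# written by the line below. While it stays commented out, Restart & Run All only works
# if that file already exists on disk.
# torch.save(gpt_model.state_dict(), 'student_model_wic_1.pth')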

In [18]:
class testDataset(Dataset):
    """Test-set dataset yielding GPT-2 features and a 0/1 float label per example.

    Reads ``path + "data.txt"`` (tab-separated: target word, PoS, index and two
    contexts) and ``path + "gold.txt"`` (one label per line). The two contexts
    are joined with a space to form the text fed to GPT-2. Label 'F' maps to 0,
    anything else to 1.

    Args:
        path: Path prefix to which "data.txt" and "gold.txt" are appended.

    Raises:
        ValueError: If the data and gold files have different numbers of rows.
    """

    # Token budget per example; must equal the fixed length pad_sequences pads to.
    MAX_TOKENS = 65

    def __init__(self, path):
        df_data = pd.read_csv(path+"data.txt",
                              delimiter='\t',
                              names=['Target Word', 'PoS', 'Index', 'Context1', 'Context2'])
        df_label = pd.read_csv(path+'gold.txt',
                               delimiter='\t',
                               names=['label'])
        # Rows are matched by position, so a length mismatch would silently
        # misalign (or NaN-fill) examples and labels.
        if len(df_data) != len(df_label):
            raise ValueError(
                f"data/gold row count mismatch: {len(df_data)} vs {len(df_label)}"
            )
        self.data = pd.concat([df_data, df_label], axis=1)
        self.data['Joined'] = self.data['Context1'] + " " + self.data['Context2']
        self.data['label'] = self.data['label'].map(lambda x: 0 if x == 'F' else 1)
        self.tokenizer = AutoTokenizer.from_pretrained('gpt2')
        self.model = AutoModel.from_pretrained('gpt2').to(device)
        # Feature extraction only: make sure dropout is disabled.
        self.model.eval()

    def __len__(self):
        """Number of test examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for example ``idx``.

        ``features`` is the padded first element of the GPT-2 output for the
        joined contexts; ``label`` is a float32 scalar tensor.
        """
        # Truncate to MAX_TOKENS so longer texts still fit the fixed-size padding.
        gpt_token = self.tokenizer(
            self.data['Joined'].iloc[idx],
            return_tensors='pt',
            truncation=True,
            max_length=self.MAX_TOKENS,
        ).to(device)
        # The extractor is never trained; skip building an autograd graph.
        with torch.no_grad():
            gpt_outputs = self.model(gpt_token['input_ids'])[0]
        padded_outputs = pad_sequences(gpt_outputs)
        # Column-first indexing avoids materialising a mixed-dtype row Series.
        label = torch.tensor(self.data['label'].iloc[idx], dtype=torch.float32)
        return (padded_outputs, label)
        
# Path *prefix* of the test split: testDataset appends "data.txt" and "gold.txt".
# NOTE(review): absolute, machine-specific location; adjust when running elsewhere.
test_path = r"C:\Users\joowa\OneDrive\Spring 2023\CS577\Project\WiC_dataset\test\test."
test_data = testDataset(test_path)

# Keep the final partial batch so every test example is evaluated.
test_dataloader = torch.utils.data.DataLoader(
    test_data,
    batch_size=32,
    drop_last=False,
)

In [21]:
# Rebuild the classifier with the training-time architecture and restore the
# saved weights.
model = DNN(input_size=768*65, hidden_size=52, num_classes=1).to(device)
# map_location keeps the checkpoint loadable when it was saved from a GPU but
# this session runs on a different device (e.g. CPU only).
model.load_state_dict(torch.load('student_model_wic_1.pth', map_location=device))

<All keys matched successfully>

In [27]:
# Run the restored classifier over the test loader, keeping each batch's
# thresholded predictions and labels as NumPy arrays.
pred_list, label_list = [], []
model.eval()
with torch.inference_mode():
    for inputs, labels in test_dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        # One flat feature vector per example.
        flattened_inputs = inputs.view(len(inputs), -1)
        test_logits = model(flattened_inputs)
        # Sigmoid probability rounded to a hard 0/1 decision.
        pred = torch.sigmoid(test_logits).round()
        pred_list.append(pred.cpu().numpy())
        label_list.append(labels.cpu().numpy())

In [33]:
# Join the per-batch arrays; predictions are flattened to 1-D so they can be
# compared element-wise with the labels.
label_array = np.concatenate(label_list)
pred_array = np.concatenate(pred_list).ravel()

In [36]:
# Fraction of positions where the prediction equals the label (accuracy in [0, 1]).
np.mean(pred_array == label_array)

0.5328571428571428

In [34]:
# Display the concatenated labels collected from test_dataloader.
label_array

array([1., 1., 1., ..., 0., 1., 1.], dtype=float32)

In [35]:
# Display the flattened 0/1 predictions collected from test_dataloader.
pred_array

array([0., 1., 0., ..., 1., 1., 1.], dtype=float32)