In [1]:
import pandas as pd
import torch 
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset
import torch.nn.utils.rnn as rnn_utils
from transformers import AutoTokenizer, AutoModel

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
# Calculate accuracy (a classification metric)
def accuracy_fn(y_true, y_pred):
    """Calculates accuracy between truth labels and predictions.

    Args:
        y_true (torch.Tensor): Truth labels for predictions.
        y_pred (torch.Tensor): Predictions to be compared to the truth labels.

    Returns:
        float: Percentage of elements of y_pred that equal y_true, e.g. 78.45.
            Returns 0.0 when there are no predictions to score.
    """
    total = y_pred.numel()
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    if y_true.shape != y_pred.shape and y_true.numel() == total:
        # Same number of elements but different layout, e.g. (N,) vs (N, 1).
        # Comparing these directly would broadcast to an N x N matrix and
        # inflate the count, so compare them element by element instead.
        y_true, y_pred = y_true.reshape(-1), y_pred.reshape(-1)
    correct = torch.eq(y_true, y_pred).sum().item()
    # Divide by the element count (not len(), which is only the first dim).
    return (correct / total) * 100

In [2]:
class DNN(nn.Module):
    """Small feed-forward network: Linear -> ReLU -> Linear."""

    def __init__(self, input_size, hidden_size, num_classes):
        """Build the two linear layers and the activation between them.

        Args:
            input_size (int): Number of features in each input vector.
            hidden_size (int): Width of the hidden layer.
            num_classes (int): Number of output values produced per input.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the raw (un-activated) outputs of the final layer for x."""
        hidden = self.relu1(self.fc1(x))
        return self.fc2(hidden)

In [3]:
def pad_sequences(sequences, max_len=65):
    """Zero-pad or truncate each sequence to ``max_len`` rows, then stack them.

    Args:
        sequences: Iterable of tensors (a list, or a tensor iterated along its
            first dimension). Each item is expected to be 2-D,
            (seq_len, feature_dim): the pad spec below pads the second-to-last
            dimension, which is dim 0 only for a 2-D item.
        max_len (int): Number of rows every output sequence has. Defaults to
            65, the length the rest of the notebook is sized for.

    Returns:
        torch.Tensor: Stacked result of shape
            (num_sequences, max_len, feature_dim).
    """
    import warnings

    padded_sequences = []
    for seq in sequences:
        seq_len = seq.size(0)
        if seq_len > max_len:
            # Previously this branch fell through and appended an unbound (or
            # stale, from the prior iteration) `padded_seq`. Truncate instead
            # so the output keeps a fixed length.
            warnings.warn(
                f"Sequence of length {seq_len} truncated to {max_len}.",
                stacklevel=2,
            )
            padded_seq = seq[:max_len]
        else:
            # (0, 0) leaves the feature dim alone; (0, k) appends k zero rows.
            padded_seq = torch.nn.functional.pad(
                seq, (0, 0, 0, max_len - seq_len), mode='constant', value=0
            )
        padded_sequences.append(padded_seq)
    return torch.stack(padded_sequences)

In [4]:
class customDataset(Dataset):
    """Dataset yielding (padded GPT-2 hidden states, label) for each CSV row.

    The CSV at ``path`` must provide a 'Joined' text column and a 'Label'
    column. GPT-2 is used here only as a frozen feature extractor.
    """

    def __init__(self, path):
        """Load the GPT-2 tokenizer/model and read the CSV file at ``path``."""
        self.tokenizer = AutoTokenizer.from_pretrained('gpt2')
        self.model = AutoModel.from_pretrained('gpt2').to(device)
        # Feature extraction only: make sure dropout is disabled.
        self.model.eval()
        self.data = pd.read_csv(path)

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the padded GPT-2 features and float32 label for row ``idx``."""
        gpt_token = self.tokenizer(self.data['Joined'].iloc[idx], return_tensors='pt').to(device)
        # no_grad keeps the features free of an autograd graph; otherwise each
        # sample holds GPT-2's graph and the classifier's backward pass walks
        # through GPT-2 as well. (no_grad rather than inference_mode, because
        # inference tensors cannot be saved for the classifier's backward.)
        with torch.no_grad():
            # [0] is the first element of the model output — presumably the
            # last hidden state, shaped (1, seq_len, 768) for GPT-2.
            gpt_outputs = self.model(gpt_token['input_ids'])[0]
        padded_outputs = pad_sequences(gpt_outputs)
        return (padded_outputs, torch.tensor(self.data['Label'].iloc[idx], dtype=torch.float32))



In [5]:
# Training set: each item is (padded GPT-2 features, label).
data = customDataset("small_data.csv")
# Shuffle every epoch so batches differ between epochs and, with
# drop_last=True, it is not always the same trailing samples that get dropped.
train_dataloader = torch.utils.data.DataLoader(dataset=data,
                                               batch_size=32,
                                               shuffle=True,
                                               drop_last=True)

In [6]:
# Optimisation hyperparameters.
lr = 0.001
num_epochs = 50

# Classifier over the flattened features: 65 positions x 768 values in,
# a single logit out.
gpt_model = DNN(input_size=768*65, hidden_size=62, num_classes=1)
gpt_model = gpt_model.to(device)

# The loss is applied to raw logits, so the model has no final sigmoid.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(gpt_model.parameters(), lr=lr)

In [9]:
for epoch in range(num_epochs):
    print(f"Epoch: {epoch} \n =====================")
    # Plain Python floats: accumulating the loss tensor itself would keep
    # every batch's autograd graph referenced for the whole epoch.
    train_loss, train_acc = 0.0, 0.0
    gpt_model.train()
    for inputs, labels in train_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        # Collapse everything after the batch dimension into one vector.
        flattened_inputs = inputs.view(inputs.size(0), -1)
        # Forward pass; squeeze only the trailing size-1 dim so a batch of
        # one stays 1-D and still matches the shape of `labels`.
        logits = gpt_model(flattened_inputs).squeeze(-1)
        # Calculate the loss
        loss = criterion(logits, labels)
        # Zero the gradient
        optimizer.zero_grad()
        # Perform backpropagation
        loss.backward()
        # Perform gradient descent
        optimizer.step()
        # Metrics are bookkeeping only, so compute them outside autograd.
        train_loss += loss.item()
        with torch.no_grad():
            rounded_labels = torch.round(labels)
            pred = torch.round(torch.sigmoid(logits.detach()))
        train_acc += accuracy_fn(rounded_labels, pred)
    # Average the per-batch metrics over the epoch.
    train_loss /= len(train_dataloader)
    train_acc /= len(train_dataloader)
    print(f"\nTrain Loss: {train_loss}, Train Acc: {train_acc}")

Epoch: 0 

Train Loss: 0.7259482145309448, Train Acc: 58.45044378698225
Epoch: 1 

Train Loss: 0.592014729976654, Train Acc: 67.89940828402366
Epoch: 2 

Train Loss: 0.5279016494750977, Train Acc: 73.79807692307692
Epoch: 3 

Train Loss: 0.4704250991344452, Train Acc: 79.25295857988165
Epoch: 4 

Train Loss: 0.4236723780632019, Train Acc: 83.52440828402366
Epoch: 5 

Train Loss: 0.3975466191768646, Train Acc: 86.5939349112426
Epoch: 6 

Train Loss: 0.443880558013916, Train Acc: 82.91420118343196
Epoch: 7 

Train Loss: 0.47273707389831543, Train Acc: 81.63831360946746
Epoch: 8 

Train Loss: 0.45010557770729065, Train Acc: 82.50739644970415
Epoch: 9 

Train Loss: 0.45523035526275635, Train Acc: 82.30399408284023
Epoch: 10 

Train Loss: 0.4355543255805969, Train Acc: 84.41198224852072
Epoch: 11 

Train Loss: 0.4308825135231018, Train Acc: 84.13461538461539
Epoch: 12 

Train Loss: 0.41202419996261597, Train Acc: 85.72485207100591
Epoch: 13 

Train Loss: 0.3858277499675751, Train Acc: 87.70

KeyboardInterrupt: 

In [10]:
# Persist only the classifier's parameters (its state_dict), not the module object.
torch.save(gpt_model.state_dict(), 'student_model_wic_1.pth')

In [11]:
class testDataset(Dataset):
    """Test split yielding (padded GPT-2 hidden states, label) per example.

    Reads two tab-separated files that share the prefix ``path``:
    ``path + "data.txt"`` (target word, PoS, index, two contexts) and
    ``path + "gold.txt"`` (one label per row). The two contexts are joined
    with a space into a 'Joined' column; the label 'F' becomes 0 and any
    other value becomes 1.
    """

    def __init__(self, path):
        """Load both files, build the 'Joined'/'label' columns, load GPT-2."""
        df_data = pd.read_csv(path+"data.txt",
                              delimiter='\t',
                              names=['Target Word', 'PoS', 'Index', 'Context1', 'Context2'])
        df_label = pd.read_csv(path+'gold.txt',
                               delimiter='\t',
                               names=['label'])
        # Side-by-side concat: rows are paired by position in the two files.
        self.data = pd.concat([df_data, df_label], axis=1)
        self.data['Joined'] = self.data['Context1'] + " " + self.data['Context2']
        self.data['label'] = self.data['label'].map(lambda x: 0 if x == 'F' else 1)
        self.tokenizer = AutoTokenizer.from_pretrained('gpt2')
        self.model = AutoModel.from_pretrained('gpt2').to(device)
        # Feature extraction only: make sure dropout is disabled.
        self.model.eval()

    def __len__(self):
        """Return the number of test examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the padded GPT-2 features and float32 label for row ``idx``."""
        gpt_token = self.tokenizer(self.data['Joined'].iloc[idx], return_tensors='pt').to(device)
        # GPT-2 is a frozen feature extractor: do not build an autograd graph
        # for it, regardless of the context the dataset is iterated in.
        with torch.no_grad():
            gpt_outputs = self.model(gpt_token['input_ids'])[0]
        padded_outputs = pad_sequences(gpt_outputs)
        # Column-first lookup (as in customDataset) avoids materialising the
        # whole mixed-dtype row just to read one value.
        return (padded_outputs, torch.tensor(self.data['label'].iloc[idx], dtype=torch.float32))
        
# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere as-is; consider a configurable data directory.
# This is a file-name prefix: testDataset appends "data.txt" and "gold.txt".
test_path = r"C:\Users\joowa\OneDrive\Spring 2023\CS577\Project\WiC_dataset\test\test."
test_data = testDataset(test_path)
# drop_last=False keeps the final partial batch so every test example is scored.
test_dataloader = torch.utils.data.DataLoader(dataset=test_data,
                                             batch_size=32,
                                             drop_last=False)

In [None]:
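# Optional: evaluate the saved checkpoint instead of the in-memory model.
# hidden_size must match the value used for training (62), otherwise
# load_state_dict fails with a shape mismatch.
# model = DNN(input_size=768*65, hidden_size=62, num_classes=1).to(device)
# model.load_state_dict(torch.load('student_model_wic_1.pth'))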

In [13]:
# Collect per-batch predictions and labels as NumPy arrays for scoring later.
pred_list, label_list = [], []
gpt_model.eval()
with torch.inference_mode():
    for inputs, labels in test_dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Collapse everything after the batch dimension into one vector.
        flattened_inputs = inputs.view(inputs.shape[0], -1)
        test_logits = gpt_model(flattened_inputs)
        # Sigmoid turns logits into probabilities; rounding gives hard 0/1 labels.
        pred = test_logits.sigmoid().round()
        label_list.append(labels.cpu().numpy())
        pred_list.append(pred.cpu().numpy())

In [14]:
# Merge the per-batch arrays; predictions are flattened to 1-D so the
# element-wise comparison with the labels does not broadcast.
label_array = np.concatenate(label_list)
pred_array = np.concatenate(pred_list).ravel()

In [15]:
# Fraction of test examples whose predicted label equals the gold label.
(pred_array == label_array).mean()

0.545