In [8]:
## from torch.utils.data import Dataset
import os
import warnings
from random import sample

import gensim.downloader as api
import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [19]:
def pad_sequences(sequences, max_len=65):
    """Force every sequence to exactly ``max_len`` steps and stack the results.

    Each tensor is zero-padded at the end of its first dimension when it is
    shorter than ``max_len`` and cut to its first ``max_len`` steps when it is
    longer. All other dimensions are left untouched.

    Args:
        sequences: Iterable of tensors with at least one dimension. After
            padding/truncation they must share a shape, otherwise
            ``torch.stack`` raises.
        max_len: Target length of the first dimension. Defaults to 65, the
            value that used to be hard-coded here.

    Returns:
        A tensor of shape ``(len(sequences), max_len, *trailing_dims)``.

    Raises:
        RuntimeError: If ``sequences`` is empty or the resulting shapes differ
            (raised by ``torch.stack``).
    """
    fixed_length = []
    for seq in sequences:
        missing = max_len - seq.size(0)
        if missing >= 0:
            # F.pad lists (left, right) pairs starting from the LAST dimension,
            # so emit a (0, 0) pair for every trailing dimension and finish
            # with the pair for dim 0, which grows only at its end.
            pad_spec = [0, 0] * (seq.dim() - 1) + [0, missing]
            resized = torch.nn.functional.pad(seq, pad_spec, mode='constant', value=0)
        else:
            resized = seq[:max_len]
        fixed_length.append(resized)
    return torch.stack(fixed_length)

In [13]:
class DNN(nn.Module):
    """Feed-forward classifier: Linear -> ReLU -> Linear.

    The attribute names ``fc1``, ``relu1`` and ``fc2`` are also the prefixes
    of the state_dict keys, so they must stay in sync with saved checkpoints.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        """Create the two linear layers and the activation between them.

        Args:
            input_size: Number of features in each input vector.
            hidden_size: Width of the hidden layer.
            num_classes: Number of output units.
        """
        super(DNN, self).__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return the raw output of ``fc2``; no final activation is applied."""
        hidden = self.relu1(self.fc1(x))
        return self.fc2(hidden)



In [14]:
from transformers import GPT2Tokenizer, GPT2Model
import torch.nn.utils.rnn as rnn_utils
class WiCDataset(Dataset):
    """WiC examples encoded on the fly by a frozen GPT-2 or BERT model.

    Each item is ``(features, label)`` where ``features`` is the first element
    of the encoder output for the two contexts joined by a space, and
    ``label`` is a float32 scalar tensor (0.0 for gold label ``'F'``, 1.0 for
    anything else).
    """

    def __init__(self, path, mode):
        """Load the encoder for ``mode`` and read one WiC split.

        Args:
            path: Filename prefix of the split; ``"data.txt"`` and
                ``"gold.txt"`` are appended to it directly.
            mode: ``"gpt"`` (GPT-2) or ``"bert"`` (bert-base-uncased). Any
                other value loads the data without an encoder, and indexing
                the dataset then raises ``ValueError``.
        """
        self.mode = mode
        if mode == "gpt":
            self.mode = 'gpt2'
            self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
            self.model = GPT2Model.from_pretrained('gpt2').to(device)
        elif mode == "bert":
            self.mode = 'bert-base-uncased'
            self.tokenizer = AutoTokenizer.from_pretrained(self.mode)
            self.model = AutoModel.from_pretrained(self.mode).to(device)

        df_data = pd.read_csv(path+"data.txt",
                              delimiter='\t',
                              names=['Target Word', 'PoS', 'Index', 'Context1', 'Context2'])
        df_label = pd.read_csv(path+'gold.txt',
                               delimiter='\t',
                               names=['label'])
        # Rows of the two files are paired purely by position.
        self.data = pd.concat([df_data, df_label], axis=1)
        self.data['Joined'] = self.data['Context1'] + " " + self.data['Context2']
        self.data['label'] = self.data['label'].map(lambda x: 0 if x == 'F' else 1)

    def __len__(self):
        """Return the number of examples in the split."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode example ``idx`` and return ``(features, label)``.

        Raises:
            ValueError: If the dataset was built with an unsupported mode
                (previously this silently returned ``None``).
        """
        if self.mode not in ('gpt2', 'bert-base-uncased'):
            raise ValueError(
                f"Unsupported mode {self.mode!r}: WiCDataset must be created with 'gpt' or 'bert'"
            )

        text = self.data['Joined'].iloc[idx]
        label = torch.tensor(self.data['label'].iloc[idx], dtype=torch.float32)

        if self.mode == 'gpt2':
            gpt_token = self.tokenizer(text, return_tensors='pt').to(device)
            # The encoder is only a feature extractor, so skip graph
            # construction; otherwise every sample keeps the whole GPT-2
            # autograd graph alive. no_grad (not inference_mode) keeps the
            # returned tensor usable in later autograd-recorded operations.
            with torch.no_grad():
                gpt_outputs = self.model(gpt_token['input_ids'])[0]
            # Pad/truncate along the token axis so every item has one shape.
            return (pad_sequences(gpt_outputs), label)

        # truncation=True is required together with padding='max_length':
        # without it an input longer than 68 tokens is returned at full
        # length, which breaks batching and the fixed classifier input size.
        bert_token = self.tokenizer(text,
                                    padding='max_length',
                                    truncation=True,
                                    max_length=68,
                                    return_tensors='pt').to(device)
        with torch.inference_mode():
            bert_outputs = self.model(**bert_token)
        return (bert_outputs[0], label)
        
        
# Root folder of the WiC dataset. Set the WIC_DATASET_DIR environment variable
# to run the notebook on another machine; the fallback is the original local
# folder, so existing behaviour is unchanged when the variable is not set.
WIC_ROOT = os.environ.get(
    "WIC_DATASET_DIR",
    r"C:\Users\joowa\OneDrive\Spring 2023\CS577\Project\WiC_dataset",
)

# These are filename prefixes, not directories: WiCDataset appends "data.txt"
# and "gold.txt" directly to them (e.g. ".../train/train." + "data.txt").
train_path = os.path.join(WIC_ROOT, "train", "train.")
valid_path = os.path.join(WIC_ROOT, "dev", "dev.")
test_path = os.path.join(WIC_ROOT, "test", "test.")

In [15]:
# Both loaders walk the training split in file order (no shuffling, final
# partial batch kept); the two prediction vectors are later averaged
# element-wise, so their row order has to match.
loader_options = dict(batch_size=32, shuffle=False, drop_last=False)

bert_data = WiCDataset(train_path, "bert")
bert_dataloader = torch.utils.data.DataLoader(bert_data, **loader_options)

gpt_data = WiCDataset(train_path, "gpt")
gpt_dataloader = torch.utils.data.DataLoader(gpt_data, **loader_options)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# input_size is the flattened encoder output per example (tokens * 768):
#   BERT : 68 tokens (max_length used in WiCDataset)  -> 68 * 768 = 52224
#   GPT-2: 65 tokens (pad_sequences default length)   -> 65 * 768 = 49920
# map_location=device remaps the stored tensors, so a checkpoint saved on a
# GPU still loads when `device` has fallen back to the CPU.
# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
bert_model = DNN(input_size=52224, hidden_size=64, num_classes=1).to(device)
bert_model.load_state_dict(torch.load("bert_model_wic_small.pth", map_location=device))
gpt_model = DNN(input_size=49920, hidden_size=64, num_classes=1).to(device)
gpt_model.load_state_dict(torch.load("gpt_model_wic_small_1.pth", map_location=device))

<All keys matched successfully>

In [17]:
def predict(dataloader, model, mode, device):
    """Run ``model`` over ``dataloader`` and collect sigmoid outputs per batch.

    The model is switched to eval mode and every batch is processed under
    ``torch.inference_mode()``.

    Args:
        dataloader: Iterable of batches. For ``mode == "glove"`` each batch is
            ``(inputs, target, seq_lengths)``; otherwise ``(inputs, target)``.
            The target is ignored.
        model: Called as ``model(inputs, seq_lengths)`` in glove mode and as
            ``model(flattened_inputs)`` otherwise.
        mode: ``"glove"`` selects the sequence-length path; any other value
            flattens each example to one feature vector first.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A list with one tensor of sigmoid-activated outputs per batch.
    """
    probabilities = []
    model.eval()
    with torch.inference_mode():
        if mode == "glove":
            for batch_inputs, _, batch_lengths in dataloader:
                batch_inputs, batch_lengths = batch_inputs.to(device), batch_lengths.to(device)
                logits = model(batch_inputs, batch_lengths)
                probabilities.append(torch.sigmoid(logits))
        else:
            for batch_inputs, _ in dataloader:
                batch_inputs = batch_inputs.to(device)
                # Collapse everything after the batch axis into one vector.
                logits = model(batch_inputs.view(batch_inputs.size(0), -1))
                probabilities.append(torch.sigmoid(logits))
    return probabilities
        

In [20]:
# Score the training split with both classifiers (GPT-2 features first, then BERT).
gpt_output = predict(gpt_dataloader, gpt_model, "gpt", device)
bert_output = predict(bert_dataloader, bert_model, "bert", device)

# Join the per-batch outputs into one vector per model.
gpt_vector = torch.cat(gpt_output).squeeze()
bert_vector = torch.cat(bert_output).squeeze()

# Ensemble score: element-wise mean of the two models' probabilities, as a NumPy array.
stacked_scores = torch.stack([gpt_vector, bert_vector])
average_vector = stacked_scores.mean(dim=0).cpu().numpy()

In [21]:
# Re-read the training sentences, attach the averaged model score as `Label`,
# keep only the target word and the joined contexts, and export to CSV.
df = (
    pd.read_csv(train_path + 'data.txt',
                sep='\t',
                names=['Target Word', 'PoS', 'Index', 'Context1', 'Context2'])
    .assign(Joined=lambda frame: frame['Context1'] + " " + frame['Context2'],
            Label=average_vector)
    .drop(columns=['PoS', 'Index', 'Context1', 'Context2'])
)
df.to_csv('small_data.csv', index=False)

In [22]:
# Show the exported frame (target word, joined contexts, averaged score) as the cell output.
df

Unnamed: 0,Target Word,Joined,Label
0,carry,You must carry your camping gear . Sound carri...,3.909200e-01
1,go,Messages must go through diplomatic channels ....,3.783608e-08
2,break,Break an alibi . The wholesaler broke the cont...,1.034690e-08
3,cup,He wore a jock strap with a metal cup . Bees f...,8.908972e-01
4,academy,The Academy of Music . The French Academy .,1.423036e-03
...,...,...,...
5423,krona,Piecas kronas — five krona . Kronas kurss — th...,8.909018e-01
5424,conflict,The harder the conflict the more glorious the ...,4.503391e-01
5425,answer,Answer the riddle . Answer a question .,7.800645e-01
5426,play,Play the casinos in Trouville . Play the races .,7.776666e-02
