In [None]:
## from torch.utils.data import Dataset
import torch
import pandas as pd
import numpy as np
from random import sample
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from transformers import AutoTokenizer, AutoModel
import gensim.downloader as api
from nltk.tokenize import word_tokenize
import nltk
import warnings

# Suppress every warning for the rest of the session (this also hides
# deprecation notices raised by the libraries imported above).
warnings.filterwarnings("ignore")
# Download the NLTK 'punkt' resource; word_tokenize is used by the glove
# branch of the dataset class defined later in this notebook.
nltk.download('punkt')
# Load the 'glove-wiki-gigaword-50' vectors through gensim's downloader.
glove = api.load('glove-wiki-gigaword-50')
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
def pad_sequences(sequences, max_len=65):
    """Pad or truncate every tensor along dim 0 to ``max_len`` and stack them.

    Args:
        sequences: iterable of tensors with at least one dimension. All
            trailing dimensions must agree, otherwise ``torch.stack`` fails.
        max_len: target size of dim 0 for every element. Defaults to 65, the
            length that used to be hard-coded.

    Returns:
        Tensor of shape ``(len(sequences), max_len, *trailing_dims)``. Inputs
        shorter than ``max_len`` are zero-padded at the end; longer inputs are
        cut off after ``max_len`` entries.

    Raises:
        RuntimeError: if ``sequences`` is empty (raised by ``torch.stack``).
    """
    padded_sequences = []
    for seq in sequences:
        missing = max_len - seq.size(0)
        if missing >= 0:
            # F.pad takes (before, after) pairs starting from the LAST dim, so
            # emit a (0, 0) pair for each trailing dim and put the padding on
            # dim 0. For 2-D input this is (0, 0, 0, missing), as before.
            pad_spec = (0, 0) * (seq.dim() - 1) + (0, missing)
            padded_seq = torch.nn.functional.pad(seq, pad_spec, mode='constant', value=0)
        else:
            padded_seq = seq[:max_len]
        padded_sequences.append(padded_seq)
    return torch.stack(padded_sequences)

In [3]:
from transformers import GPT2Tokenizer, GPT2Model
import torch.nn.utils.rnn as rnn_utils
class WiCDataset(torch.utils.data.Dataset):
    """Word-in-Context (WiC) split exposed as ``(features, label)`` pairs.

    ``path`` is a filename prefix: examples are read from ``path + "data.txt"``
    and gold labels from ``path + "gold.txt"`` (both tab-separated, no header).
    The two context sentences are joined with a single space into the
    ``Joined`` column, and the gold label is mapped to 0 for ``'F'`` and to 1
    for anything else.

    Supported ``mode`` values:
        * ``"gpt"``   - GPT-2 hidden states of the joined text, zero-padded or
          truncated by ``pad_sequences``; label is float32.
        * ``"bert"``  - ``bert-base-uncased`` hidden states of the joined text
          padded/truncated to 68 tokens; label is float32.
        * ``"glove"`` - 1-D long tensor of GloVe vocabulary indices for the
          lower-cased, NLTK-tokenised text. Tokens missing from the GloVe
          vocabulary are skipped, so the tensor can be empty; label is long.

    The class relies on the module-level ``device``, ``glove`` and
    ``pad_sequences`` objects. The base class is spelled
    ``torch.utils.data.Dataset`` because the import of the bare ``Dataset``
    name is commented out at the top of the notebook.

    Raises:
        ValueError: if ``mode`` is not one of the supported values.
    """

    _SUPPORTED_MODES = ("gpt", "bert", "glove")

    def __init__(self, path, mode):
        if mode not in self._SUPPORTED_MODES:
            raise ValueError(
                f"Unsupported mode {mode!r}; expected one of {self._SUPPORTED_MODES}"
            )
        self.mode = mode
        if mode == "gpt":
            # Internally the mode is stored under the checkpoint name.
            self.mode = 'gpt2'
            self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
            self.model = GPT2Model.from_pretrained('gpt2').to(device)
        elif mode == "bert":
            self.mode = 'bert-base-uncased'
            self.tokenizer = AutoTokenizer.from_pretrained(self.mode)
            self.model = AutoModel.from_pretrained(self.mode).to(device)

        df_data = pd.read_csv(path+"data.txt",
                              delimiter='\t',
                              names=['Target Word', 'PoS', 'Index', 'Context1', 'Context2'])
        df_label = pd.read_csv(path+'gold.txt',
                               delimiter='\t',
                               names=['label'])
        # Examples and labels are aligned purely by row position.
        self.data = pd.concat([df_data, df_label], axis=1)
        self.data['Joined'] = self.data['Context1'] + " " + self.data['Context2']
        self.data['label'] = self.data['label'].map(lambda x: 0 if x == 'F' else 1)

    def __len__(self):
        return len(self.data)

    def _label(self, idx, dtype):
        """Return the 0/1 label of row ``idx`` as a scalar tensor of ``dtype``."""
        return torch.tensor(self.data['label'].iloc[idx], dtype=dtype)

    def __getitem__(self, idx):
        if self.mode == 'gpt2':
            text = self.data['Joined'].iloc[idx]
            gpt_token = self.tokenizer(text, return_tensors='pt').to(device)
            # The encoder is only a frozen feature extractor, so skip autograd
            # bookkeeping exactly like the BERT branch does.
            with torch.inference_mode():
                gpt_outputs = self.model(gpt_token['input_ids'])[0]
            # Iterates over the leading (batch-of-one) dim and pads/truncates
            # the token axis to the fixed length expected by the classifier.
            padded_outputs = pad_sequences(gpt_outputs)
            return (padded_outputs, self._label(idx, torch.float32))

        elif self.mode == 'bert-base-uncased':
            text = self.data['Joined'].iloc[idx]
            # truncation=True keeps every item at exactly 68 tokens; without
            # it a longer text would yield a longer tensor and break batching.
            bert_token = self.tokenizer(text,
                                        padding='max_length',
                                        truncation=True,
                                        return_tensors='pt',
                                        max_length=68).to(device)
            with torch.inference_mode():
                bert_outputs = self.model(**bert_token)
            return (bert_outputs[0], self._label(idx, torch.float32))

        elif self.mode == 'glove':
            row = self.data.iloc[idx]
            words = word_tokenize(row.Joined.lower())

            # Out-of-vocabulary tokens are dropped rather than mapped to UNK.
            indices = [glove.get_index(w) for w in words if glove.has_index_for(w)]
            indices_tensor = torch.tensor(indices, dtype=torch.long)

            return indices_tensor, self._label(idx, torch.long)

        else:
            # Previously an unexpected mode fell through and returned None.
            raise ValueError(f"Unsupported mode {self.mode!r}")
        
# Filename prefixes for each WiC split; "data.txt" / "gold.txt" are appended
# to them when a split is loaded.
# NOTE: this is an absolute, machine-specific location -- change the root
# before running the notebook anywhere else.
_WIC_ROOT = r"C:\Users\joowa\OneDrive\Spring 2023\CS577\Project\WiC_dataset"
train_path, valid_path, test_path = (
    _WIC_ROOT + "\\" + split + "\\" + split + "."
    for split in ("train", "dev", "test")
)

In [4]:
class DNN(nn.Module):
    """Fully connected classifier: Linear -> ReLU -> Linear -> ReLU -> Linear.

    Args:
        input_size: width of the (already flattened) input features.
        hidden_size: width shared by both hidden layers.
        num_classes: width of the output layer (raw logits, no activation).
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # Attribute names double as state_dict keys, so saved checkpoints
        # only load if these names stay exactly as they are.
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return the logits for a batch ``x`` whose last dim is ``input_size``."""
        hidden = self.relu1(self.fc1(x))
        hidden = self.relu2(self.fc2(hidden))
        return self.fc3(hidden)



In [5]:
class LSTM(nn.Module):
    """Bidirectional LSTM classifier on top of the pretrained GloVe embeddings.

    The embedding table is built from the module-level ``glove`` vectors via
    ``nn.Embedding.from_pretrained``, so ``input_dim`` has to equal the width
    of those vectors.

    Args:
        input_dim: size of each embedded token fed to the LSTM.
        hidden_dim: hidden size of each LSTM direction.
        output_dim: number of output logits.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self,
                 input_dim: int,
                 hidden_dim: int,
                 output_dim: int,
                 num_layers: int):
        super().__init__()
        self.emb = nn.Embedding.from_pretrained(torch.FloatTensor(glove.vectors))
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_dim,
                          hidden_dim,
                          num_layers,
                          bidirectional = True,
                          batch_first=True)
        # Forward and backward states are concatenated before the classifier.
        self.fc = nn.Linear(2*hidden_dim, output_dim)

    def forward(self, seq, seq_length):
        """Return logits of shape ``(batch, output_dim)``.

        Args:
            seq: ``(batch, max_len)`` long tensor of padded token indices.
            seq_length: ``(batch,)`` tensor with the true length of each row.
                The rows may come in any order; they no longer have to be
                sorted by decreasing length.
        """
        inputs_embedded = self.emb(seq)
        # pack_padded_sequence requires the lengths to live on the CPU.
        seq_length = seq_length.cpu()
        # enforce_sorted=False lets PyTorch sort internally and restore the
        # caller's order afterwards; already-sorted batches behave as before.
        packed_input = rnn_utils.pack_padded_sequence(inputs_embedded,
                                                      seq_length,
                                                      batch_first=True,
                                                      enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output, _ = rnn_utils.pad_packed_sequence(packed_output, batch_first=True)

        # Forward direction: state at each sequence's last real token.
        out_forward = output[range(len(output)), seq_length - 1, :self.hidden_dim]
        # Backward direction: its final state sits at the first position.
        out_reverse = output[:, 0, self.hidden_dim:]
        out_reduced = torch.cat((out_forward, out_reverse), 1)
        output = self.fc(out_reduced)
        return output

In [6]:
def collate_fn(batch):
    """Collate variable-length index tensors into a length-sorted padded batch.

    Args:
        batch: list of ``(sequence, label)`` pairs.

    Returns:
        ``(padded, labels, lengths)`` where ``padded`` is the zero-padded
        batch-first stack of the sequences, ``labels`` is a float32 tensor and
        ``lengths`` holds the original sequence lengths. All three are ordered
        by decreasing length; ``padded`` and ``labels`` are moved to the
        module-level ``device`` while ``lengths`` is returned as created.
    """
    sequences, targets = zip(*batch)

    # True lengths before any padding is applied.
    lengths = torch.LongTensor([len(item) for item in sequences])
    # Zero-pad on the right so every row matches the longest sequence.
    padded = rnn_utils.pad_sequence(sequences, batch_first=True)

    # Longest-first ordering, applied consistently to inputs and labels.
    lengths_desc, order = lengths.sort(descending=True)
    padded_desc = padded[order].to(device)
    targets_desc = torch.tensor(targets, dtype=torch.float32)[order].to(device)

    return padded_desc, targets_desc, lengths_desc

In [7]:
# Make each dataloader
# glove_data = WiCDataset(train_path, "glove")
# glove_dataloader = torch.utils.data.DataLoader(glove_data,
#                                               batch_size=32,
#                                               shuffle = False, 
#                                               drop_last=False,
#                                               collate_fn=collate_fn)
# Both loaders share the same settings. shuffle=False and drop_last=False keep
# the two prediction streams row-aligned so they can be averaged later.
_LOADER_KWARGS = dict(batch_size=32, shuffle=False, drop_last=False)

bert_data = WiCDataset(train_path, "bert")
bert_dataloader = torch.utils.data.DataLoader(bert_data, **_LOADER_KWARGS)

gpt_data = WiCDataset(train_path, "gpt")
gpt_dataloader = torch.utils.data.DataLoader(gpt_data, **_LOADER_KWARGS)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Load the models
# glove_model = LSTM(50, 128, 1, 2).to(device)
# glove_model.load_state_dict(torch.load("glove_model_wic_1.pth"))
# Input widths are the flattened encoder outputs: 52224 = 68 * 768 for BERT
# and 49920 = 65 * 768 for GPT-2 (fixed token length times per-token width).
# map_location=device lets checkpoints saved from a GPU session load on a
# CPU-only machine as well.
bert_model = DNN(input_size = 52224, hidden_size=512, num_classes=1).to(device)
bert_model.load_state_dict(torch.load("bert_model_wic_1.pth", map_location=device))
gpt_model = DNN(input_size = 49920, hidden_size=512, num_classes=1).to(device)
gpt_model.load_state_dict(torch.load("gpt_model_wic_2.path", map_location=device))

<All keys matched successfully>

In [11]:
def predict(dataloader, model, mode, device):
    """Run ``model`` over ``dataloader`` and collect sigmoid probabilities.

    Args:
        dataloader: iterable of batches. With ``mode == "glove"`` each batch
            is ``(inputs, labels, seq_lengths)``; otherwise ``(inputs, labels)``.
            Labels are ignored.
        model: module to evaluate; it is switched to eval mode.
        mode: ``"glove"`` calls ``model(inputs, seq_lengths)``; any other value
            flattens each sample to one vector and calls ``model(inputs)``.
        device: device the batch tensors are moved to.

    Returns:
        List with one tensor of sigmoid outputs per batch.
    """
    model.eval()
    probabilities = []
    with torch.inference_mode():
        for batch in dataloader:
            if mode == "glove":
                tokens, _, lengths = batch
                logits = model(tokens.to(device), lengths.to(device))
            else:
                features, _ = batch
                features = features.to(device)
                # Collapse everything after the batch dim into one vector.
                logits = model(features.view(features.size(0), -1))
            probabilities.append(torch.sigmoid(logits))
    return probabilities
        

In [12]:
# Per-batch sigmoid probabilities from the classifier fed with GPT-2 features.
gpt_output = predict(gpt_dataloader, gpt_model, "gpt", device)

In [13]:
# Per-batch sigmoid probabilities from the classifier fed with BERT features.
bert_output = predict(bert_dataloader, bert_model, "bert", device)

In [14]:
# glove_output = predict(glove_dataloader, glove_model, "glove", device)

In [15]:
# Join the per-batch (batch, 1) outputs into one 1-D vector per model.
# squeeze(-1) drops only the trailing logit dim; a bare squeeze() would also
# collapse the sample dim to a 0-d tensor if there were a single prediction.
gpt_vector = torch.cat(gpt_output, dim=0).squeeze(-1)
bert_vector = torch.cat(bert_output, dim=0).squeeze(-1)
# glove_vector = torch.cat(glove_output, dim=0).squeeze()

In [16]:
# average_vector = torch.mean(torch.stack([gpt_vector, bert_vector, glove_vector]), dim=0).cpu().numpy()
# Ensemble by averaging the two models' probabilities element-wise, then
# hand the result to NumPy for use with pandas.
average_vector = (
    torch.stack([gpt_vector, bert_vector])
    .mean(dim=0)
    .cpu()
    .numpy()
)

In [17]:
# Display the averaged probabilities as the cell output.
average_vector

array([6.8690046e-03, 2.3604396e-05, 3.6915534e-08, ..., 6.9479299e-01,
       5.9548412e-02, 9.9967980e-01], dtype=float32)

In [18]:
# Re-read the raw training examples, attach the joined context text and the
# ensemble probability as 'Label', and keep only the columns to export.
df = (
    pd.read_csv(train_path+'data.txt',
                delimiter='\t',
                names=['Target Word', 'PoS', 'Index', 'Context1', 'Context2'])
    .assign(Joined=lambda frame: frame['Context1'] + " " + frame['Context2'])
    .assign(Label=average_vector)
    .drop(['PoS', 'Index', 'Context1', 'Context2'], axis=1)
)

In [19]:
# Display the resulting frame as the cell output.
df

Unnamed: 0,Target Word,Joined,Label
0,carry,You must carry your camping gear . Sound carri...,6.869005e-03
1,go,Messages must go through diplomatic channels ....,2.360440e-05
2,break,Break an alibi . The wholesaler broke the cont...,3.691553e-08
3,cup,He wore a jock strap with a metal cup . Bees f...,9.999992e-01
4,academy,The Academy of Music . The French Academy .,4.864547e-09
...,...,...,...
5423,krona,Piecas kronas — five krona . Kronas kurss — th...,9.999934e-01
5424,conflict,The harder the conflict the more glorious the ...,3.771217e-02
5425,answer,Answer the riddle . Answer a question .,6.947930e-01
5426,play,Play the casinos in Trouville . Play the races .,5.954841e-02


In [20]:
# Write the frame to 'new_data.csv' in the working directory, without the
# row index column.
df.to_csv('new_data.csv', index=False)