# Hypernym Discovery using XLNet language model
Project work for the Natural Language Processing course, Fall 2023.

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install simpletransformers
!pip install pytorch-transformers

In [None]:
import pandas as pd
from transformers import XLNetTokenizer, XLNetLMHeadModel
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup
from keras.preprocessing.sequence import pad_sequences
import random
import numpy as np
import time
import datetime
import os



In [None]:
# Basic parameters for training: file locations, batch size, epochs and checkpoint naming.

# Root directory for all relative paths: the working directory at the time this cell runs.
PROJECT_ROOT = os.getcwd()
# File names of the preprocessed inputs (hyponyms) and gold labels (hypernyms).
TRAINING_DATA_FILE="combined-preprocessed.data.txt"
TRAINING_GOLD_FILE="combined-preprocessed.gold.txt"
# Directory containing the two files above (note the trailing slash: paths are built by concatenation).
PREPROCESSED_TRAINING_DATA_PATH=PROJECT_ROOT+"/SemEval2018-Task9/preprocessed/training/"
# Directory the checkpoints and the final model are written to (also concatenated, hence the trailing slash).
SAVE_MODEL_PATH=PROJECT_ROOT+"/saved-models/"
# Number of examples per batch for both the training and the validation dataloader.
BATCH_SIZE=4
# Number of fine-tuning epochs, keyed by the number of tokens in the label (hypernym).
EPOCH_PER_TOKENSIZE={1:15,2:15,3:5}
# Prefix of every saved model file name.
# NOTE(review): the prefix contains "b32" while BATCH_SIZE is 4 - confirm the intended naming.
MODEL_SAVE_PREFIX="r1-all-xlnet-large-b32"


In [None]:
def format_time(elapsed):
    '''
    Render a duration given in seconds as a human-readable string.

    The value is rounded to the nearest whole second and formatted by
    ``datetime.timedelta``, i.e. ``h:mm:ss`` (prefixed with a day count
    once the duration reaches 24 hours).
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
def load_dataframe(data_file, gold_file):
    '''
    Load the input data and the gold-standard labels into pandas dataframes.

    Both files are read as header-less delimited text (pandas' default
    separator). Every field is kept as the literal string found in the file:
    the data consists of words, so terms such as "null", "NA" or "nan" must
    not be converted to missing values, and numeric-looking terms must not be
    re-formatted as numbers.

    Args:
        data_file: path (or file-like object) of the input terms.
        gold_file: path (or file-like object) of the gold labels.

    Returns:
        Tuple ``(data_df, label_df)`` of dataframes with integer column names.
    '''
    # dtype=str disables numeric inference; keep_default_na=False disables the
    # built-in list of strings ("NA", "null", "nan", ...) that are parsed as NaN.
    read_options = dict(header=None, index_col=False, dtype=str, keep_default_na=False)
    data_df = pd.read_csv(data_file, **read_options)
    label_df = pd.read_csv(gold_file, **read_options)
    return data_df, label_df

In [None]:
def tokenize_and_pad(input_data,output_data):
  '''
  Tokenize hyponym/hypernym pairs for training with the XLNet model and group them by label length.

  For every row, the first column of ``output_data`` (the hypernym label) is
  tokenized without special tokens. With ``n`` being the number of label tokens,
  the first column of ``input_data`` (the hyponym) is turned into the prompt
  ``PADDING_TEXT + " <hyponym> is a " + "<mask>" * n`` and tokenized with special
  tokens. Pairs whose label has no tokens or more than 3 tokens are dropped.

  Args:
    input_data: dataframe whose first column holds the hyponyms.
    output_data: dataframe whose first column holds the gold hypernyms,
      row-aligned with ``input_data``.

  Returns:
    Three dictionaries keyed by the label token count ``n``:
      * input ids, left-padded with the value 5 to a common length (tensor),
      * label token ids (tensor with ``n`` columns),
      * attention masks, 1 where the input id is not 5 and 0 elsewhere (tensor).

  Raises:
    ValueError: if ``input_data`` and ``output_data`` have different row counts,
      since the pairs could not be aligned reliably.
  '''
  if len(input_data) != len(output_data):
    raise ValueError(
        "input_data and output_data must have the same number of rows, got "
        + str(len(input_data)) + " and " + str(len(output_data)))

  # We use the tokenizer that was used in the pretraining of the XLNet model to tokenize our data as well.
  tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')

  # Fixed text placed in front of every prompt.
  # NOTE(review): presumably there to give XLNet more left context for short inputs - confirm.
  PADDING_TEXT="""The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the remainder of the story.A pamphlet with photos and comments from the journals kept by the students added to the display. <eod> <eos>"""

  # Value used both for left-padding and for deriving the attention mask.
  # NOTE(review): presumably the id of XLNet's <pad> token - confirm against the tokenizer.
  pad_value=5
  max_label_tokens=3

  label_list_dictionary={}
  input_ids_dictionary={}

  # Single pass over the aligned pairs (no in-place deletion from the lists).
  for hyponym_row, hypernym_row in zip(input_data.values, output_data.values):
    label_ids=tokenizer.encode(str(hypernym_row[0]), add_special_tokens=False)
    token_count=len(label_ids)
    # Skip labels that are too long, and labels that produced no tokens at all
    # (a group of size 0 has nothing to predict).
    if token_count==0 or token_count>max_label_tokens:
      continue
    # One "<mask>" placeholder per label token, appended right after the prompt.
    prompt=PADDING_TEXT +" "+ str(hyponym_row[0]) +" is a " + token_count*"<mask>"
    prompt_ids=tokenizer.encode(prompt, add_special_tokens=True)
    label_list_dictionary.setdefault(token_count, []).append(label_ids)
    input_ids_dictionary.setdefault(token_count, []).append(prompt_ids)

  input_ids_attention_mask_dictionary={}
  for token_count in label_list_dictionary:
    # Left-pad ('pre') every prompt of this group to the length of the longest one.
    padded_ids=pad_sequences(input_ids_dictionary[token_count],  dtype='long', value=pad_value, padding='pre')
    padded_ids=torch.tensor(padded_ids)
    input_ids_dictionary[token_count]=padded_ids
    # 1 for real tokens, 0 for padding positions.
    input_ids_attention_mask_dictionary[token_count]=(padded_ids != pad_value).long()
    # All labels in a group have the same length, so they form a rectangular tensor.
    label_list_dictionary[token_count]=torch.tensor(label_list_dictionary[token_count])
  return input_ids_dictionary,label_list_dictionary,input_ids_attention_mask_dictionary

In [None]:
def create_dataloader(input_id_list,label_list,attention_mask_list,batch_size):
  '''
  Build the training and validation dataloaders for the input ids, attention masks and labels.

  The three inputs are split together with a fixed 90:10 training:validation
  ratio and a fixed random state, so row i of the ids, masks and labels always
  ends up in the same split.

  Args:
    input_id_list: input ids, one row per example.
    label_list: label token ids, one row per example.
    attention_mask_list: attention masks, one row per example.
    batch_size: number of examples per batch in both dataloaders.

  Returns:
    Tuple ``(train_dataloader, validation_dataloader)``. Each batch is a tuple
    ``(input ids, attention masks, labels)``. Training batches are drawn in
    random order, validation batches sequentially.
  '''

  # Splitting all three arrays in one call guarantees that they share the same
  # shuffled indices (instead of relying on two calls with an equal random_state).
  (train_inputs, validation_inputs,
   train_labels, validation_labels,
   train_masks, validation_masks) = train_test_split(input_id_list, label_list, attention_mask_list,
                                                     random_state=2018, test_size=0.1)

  # as_tensor converts lists/arrays but passes existing tensors through without
  # the extra copy (and copy-construct warning) that torch.tensor would cause.
  train_data = TensorDataset(torch.as_tensor(train_inputs),
                             torch.as_tensor(train_masks),
                             torch.as_tensor(train_labels))
  train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)

  validation_data = TensorDataset(torch.as_tensor(validation_inputs),
                                  torch.as_tensor(validation_masks),
                                  torch.as_tensor(validation_labels))
  validation_dataloader = DataLoader(validation_data, sampler=SequentialSampler(validation_data), batch_size=batch_size)

  return train_dataloader,validation_dataloader


## A). Loading the preprocessed dataset.

*  We load the preprocessed dataset from our folder, then tokenize, pad, and generate attention masks for the input hyponyms. We also split the data by the number of tokens in each label hypernym so that each group can be trained separately.




In [None]:
# Read the preprocessed hyponym inputs and their gold hypernym labels into dataframes.
input_hyponym_df, output_hypernym_df = load_dataframe(
    PREPROCESSED_TRAINING_DATA_PATH + TRAINING_DATA_FILE,
    PREPROCESSED_TRAINING_DATA_PATH + TRAINING_GOLD_FILE,
)

In [None]:
# Tokenize the data into input ids, labels and attention masks.
# Each of the three results is a dictionary keyed by the number of tokens in the label (hypernym).
(
    input_ids_dictionary,
    label_list_dictionary,
    input_ids_attention_mask_dictionary,
) = tokenize_and_pad(input_data=input_hyponym_df, output_data=output_hypernym_df)

## B). Download and load the pretrained XLNet-Large model from HuggingFace

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained weights and move the model to the selected device.
model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')
model.to(device)

# A single optimizer instance is shared by all fine-tuning rounds below.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)

## C). Fine Tuning the model for our dataset

In [None]:
# 
def _build_prediction_masks(batch_size, seq_len, target_positions, device):
    '''
    Build the ``perm_mask`` and ``target_mapping`` tensors for one batch.

    perm_mask has shape (batch_size, seq_len, seq_len) and is 1.0 in every column
    listed in ``target_positions``. target_mapping has shape
    (batch_size, len(target_positions), seq_len); row i is one-hot at
    ``target_positions[i]``. Both are created directly on ``device``.
    '''
    perm_mask = torch.zeros((batch_size, seq_len, seq_len), dtype=torch.float, device=device)
    perm_mask[:, :, target_positions] = 1.0
    target_mapping = torch.zeros((batch_size, len(target_positions), seq_len), dtype=torch.float, device=device)
    for row, position in enumerate(target_positions):
        target_mapping[:, row, position] = 1.0
    return perm_mask, target_mapping


# Make sure the checkpoint directory exists before any training time is spent;
# otherwise the first torch.save would fail after several epochs.
os.makedirs(SAVE_MODEL_PATH, exist_ok=True)

# Fine-tune one group at a time, starting with the longest labels.
token_size_list = sorted(input_ids_dictionary, reverse=True)
for label_token_size in token_size_list:

    train_dataloader, validation_dataloader = create_dataloader(input_ids_dictionary[label_token_size],
                                                                label_list_dictionary[label_token_size],
                                                                input_ids_attention_mask_dictionary[label_token_size],
                                                                BATCH_SIZE)

    # A fresh linear decay schedule (no warmup) per group, on the shared optimizer.
    epochs = EPOCH_PER_TOKENSIZE[label_token_size]
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0,
                                                num_training_steps = total_steps)

    # Re-seed every random number generator so each group trains reproducibly.
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # Positions of the mask tokens in the input ids, counted from the end of the
    # sequence: -3 for one label token, [-4, -3] for two, and so on.
    # NOTE(review): the -2 offset assumes the tokenizer appends two special tokens
    # after the masks built in tokenize_and_pad - confirm.
    # Computed once per group because it depends only on label_token_size and is
    # needed by both the training and the validation loop.
    targets = [(-2-i) for i in range(label_token_size,0,-1)]

    loss_values = []
    for epoch_i in range(epochs):

        # ========================================
        #               Training
        # ========================================
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        t0 = time.time()
        total_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):

            # Report progress every 20 batches (but not before the first one).
            if step and step % 20 == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            # `batch` holds three tensors: input ids, attention masks, labels.
            # Copy each of them to the training device.
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

            # The last batch may be smaller, so the masks follow the batch's own shape.
            perm_mask_batch_tensor, target_mapping_batch_tensor = _build_prediction_masks(
                b_input_ids.shape[0], b_input_ids.shape[1], targets, device)

            model.zero_grad()

            outputs = model(input_ids=b_input_ids,attention_mask=b_input_mask,labels=b_labels,perm_mask=perm_mask_batch_tensor, target_mapping=target_mapping_batch_tensor)

            # With labels supplied, the first output is the loss.
            loss = outputs[0]
            total_loss += loss.item()

            loss.backward()

            # Clip the gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

        # Average loss over all training batches of this epoch.
        avg_train_loss = total_loss / len(train_dataloader)
        loss_values.append(avg_train_loss)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

        # ========================================
        #               Validation
        # ========================================
        # After each training epoch, measure the loss on the validation set.

        print("")
        print("Running Validation...")
        t0 = time.time()
        model.eval()
        validation_loss = 0
        nb_eval_steps = 0

        for batch in validation_dataloader:
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

            perm_mask_batch_tensor, target_mapping_batch_tensor = _build_prediction_masks(
                b_input_ids.shape[0], b_input_ids.shape[1], targets, device)

            # No gradients are needed for evaluation.
            with torch.no_grad():
                outputs = model(input_ids=b_input_ids,attention_mask=b_input_mask,labels=b_labels,perm_mask=perm_mask_batch_tensor, target_mapping=target_mapping_batch_tensor)

            # As in training, the first output is the loss because labels are passed.
            validation_loss += outputs[0].item()
            nb_eval_steps += 1

        # Report the average validation loss for this epoch.
        print("  Validation Loss: {0:.2f}".format(validation_loss/nb_eval_steps))
        print("  Validation took: {:}".format(format_time(time.time() - t0)))

        # Save a checkpoint (the whole model object) every 4th epoch.
        if (epoch_i+1) % 4 == 0:
            model_save_name=SAVE_MODEL_PATH+MODEL_SAVE_PREFIX+str(epoch_i+1)+"-v2-ts"+str(label_token_size)
            torch.save(model,model_save_name)
            print("Model saved in :"+model_save_name)

    print("")
    print("Training complete for "+str(label_token_size)+" sized labels.")

# Save the model once all groups have been trained.
model_save_name=SAVE_MODEL_PATH+MODEL_SAVE_PREFIX+"-final"
torch.save(model,model_save_name)
print("Final Model saved in :"+model_save_name)