In [None]:
! pip install peft transformers datasets evaluate accelerate sentencepiece --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m72.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m66.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoModelForSeq2SeqLM
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup

from peft import get_peft_config, PrefixTuningConfig, TaskType, get_peft_model, PeftConfig

import pandas as pd
import torch
import csv
import regex as re
import os

In [None]:
# Use the first CUDA GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")

# Data Preprocessing

In [None]:
# Load the DPR table (tab-separated).
dpr_data_path = '/content/DPR Fixed Set.tsv'
dpr_data = pd.read_csv(dpr_data_path, sep='\t')

# Each 'candidates' cell is a string; drop its first and last two characters and
# split on "' '" to obtain a Python list of candidate strings.
dpr_data['candidates'] = [raw[2:-2].split("' '") for raw in dpr_data['candidates']]

# Keep only the rows whose cell parsed into exactly two candidates.
has_two_candidates = dpr_data['candidates'].map(len) == 2
dpr_data = dpr_data[has_two_candidates]

In [None]:
# Partition the rows using the dataset's own 'split' column.
is_train_row = dpr_data['split'].eq('train')
is_dev_row = dpr_data['split'].eq('dev')
dpr_data_train = dpr_data.loc[is_train_row]
dpr_data_dev = dpr_data.loc[is_dev_row]

In [None]:
# Modify the checkpoint name here for the large and XL variants
# (keep it identical to the name used when loading the model).
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

In [None]:
# Modify the checkpoint name here for the large and XL variants
# (keep it identical to the name used when loading the tokenizer).
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base", device_map="auto")

In [None]:
# Prefix-tuning settings: a seq2seq task, training mode, 8 virtual tokens.
prefix_tuning_options = dict(
    peft_type="PREFIX_TUNING",
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    num_virtual_tokens=8,
)
peft_config = PrefixTuningConfig(**prefix_tuning_options)

# NOTE: this rebinds `model` to the PEFT-wrapped model, so re-running the cell
# wraps the already-wrapped model again; reload the base model first.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
model.to(device)

In [None]:
def preprocess(input_data):
  """
  Turn DPR rows into parallel prompt / target lists.

  Args:
      input_data (pandas.DataFrame): rows providing 'left', 'right',
          'correct' and a two-element 'candidates' entry

  Returns:
      dict: three equally long lists -- 'input' (the instruction prompt built
      from the row), 'labels' (the lower-cased 'correct' value) and 'right'
      (the row's 'right' value, unchanged)
  """
  newline = os.linesep
  instruction = 'Based on the given two options, select one option that best fits the pronoun in the below sentence.'
  # Alternative instruction that was tried here: 'How does the sentence end?'

  prompts, targets, suffixes = [], [], []
  for _, record in input_data.iterrows():
    options = record['candidates']
    ending = record['right']
    # Both options are the candidate followed by the shared sentence ending.
    pieces = [
        instruction, newline, newline,
        record['left'], newline, newline,
        ' OPTIONS: ', newline,
        '- ', options[0], ' ', ending, newline,
        ' - ', options[1], ' ', ending, newline,
        'Output: ',
    ]
    prompts.append(''.join(pieces))
    targets.append(record['correct'].lower())
    suffixes.append(ending)

  return {'input': prompts, 'labels': targets, 'right': suffixes}

In [None]:
dpr_pp_data_train = preprocess(dpr_data_train)
dpr_pp_data_dev = preprocess(dpr_data_dev)

In [None]:
from torch.utils.data import Dataset, DataLoader
class CustomDataset(Dataset):
  """
  Map-style dataset that tokenizes one (source, target) text pair per item
  so a DataLoader can batch them for fine-tuning.
  """

  def __init__(
    self, dataframe, tokenizer, source_len, target_len, source_text, target_text
    ):
    """
    Initializes a Dataset class

    Args:
        dataframe (dict or pandas.DataFrame): column-keyed container; this
            notebook passes the dict of lists returned by preprocess().
            It must also contain a 'right' column.
        tokenizer (transformers.tokenizer): Transformers tokenizer
        source_len (int): Max length of source text
        target_len (int): Max length of target text
        source_text (str): column name of source text
        target_text (str): column name of target text
    """
    self.tokenizer = tokenizer
    self.data = dataframe
    self.source_len = source_len
    self.summ_len = target_len
    self.target_text = self.data[target_text]
    self.source_text = self.data[source_text]
    self.suffix = self.data['right']

  def __len__(self):
    """returns the number of examples"""

    return len(self.target_text)

  def _encode(self, text, max_length):
    """Tokenize one string, truncated/padded to exactly `max_length` tokens."""
    # `padding="max_length"` alone fixes the length; the deprecated
    # `pad_to_max_length` flag the code used to pass as well was redundant.
    return self.tokenizer(
        [text],
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

  def __getitem__(self, index):
    """
    Return the tokenized example at `index`.

    Returns:
        dict: 'input_ids' / 'attention_mask' (source, length source_len),
        'target_ids' and 'target_ids_y' (both the target ids, length
        target_len), plus the untouched 'label', 'source_text' and 'suffix'
        values for this index.
    """
    # Force str and collapse every whitespace run (including the prompt's
    # line breaks) into a single space before tokenizing.
    source_text = " ".join(str(self.source_text[index]).split())
    target_text = " ".join(str(self.target_text[index]).split())

    source = self._encode(source_text, self.source_len)
    target = self._encode(target_text, self.summ_len)

    # Drop only the batch dimension; a bare squeeze() would also remove the
    # sequence dimension when the max length is 1.
    source_ids = source["input_ids"].squeeze(0)
    source_mask = source["attention_mask"].squeeze(0)
    target_ids = target["input_ids"].squeeze(0)

    return {
        "input_ids": source_ids.to(dtype=torch.long),
        "attention_mask": source_mask.to(dtype=torch.long),
        "target_ids": target_ids.to(dtype=torch.long),
        "target_ids_y": target_ids.to(dtype=torch.long),
        "label": self.target_text[index],
        "source_text": self.source_text[index],
        "suffix": self.suffix[index]
    }

In [None]:
# Both splits share the tokenizer, the length limits (128 source / 32 target
# tokens) and the column names produced by preprocess().
dataset_options = dict(
    tokenizer=tokenizer,
    source_len=128,
    target_len=32,
    source_text='input',
    target_text='labels',
)
train_dataset = CustomDataset(dpr_pp_data_train, **dataset_options)
dev_dataset = CustomDataset(dpr_pp_data_dev, **dataset_options)

# Model Training

In [None]:
# Training hyper-parameters used by the data loaders, optimizer and training loop.
BATCH_SIZE = 32   # examples per batch in both data loaders
LR = 0.005        # learning rate handed to the optimizer
NUM_EPOCHS = 5    # full passes over the training split

In [None]:
# Both loaders use the same batch size and reshuffle on every pass.
loader_options = dict(batch_size=BATCH_SIZE, shuffle=True)
train_loader = DataLoader(train_dataset, **loader_options)
dev_loader = DataLoader(dev_dataset, **loader_options)

In [None]:
# Use PyTorch's AdamW: the implementation exported by `transformers` is deprecated.
# eps=1e-6 and weight_decay=0.0 reproduce the defaults of the transformers
# implementation, so the optimisation settings stay the same.
optim = torch.optim.AdamW(model.parameters(), lr=LR, eps=1e-6, weight_decay=0.0)

# Linear decay from LR to 0 over every optimisation step of the run, no warm-up.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optim,
    num_warmup_steps=0,
    num_training_steps=(len(train_loader) * NUM_EPOCHS),
)



In [None]:
def exactMatch(pred, truth, suffix):
  """
  Compare a prediction with the gold answer, ignoring the shared sentence suffix.

  All three strings lose every character that is not an ASCII letter, digit
  or space, and are lower-cased. Every occurrence of the cleaned suffix is
  then removed from the prediction and the gold answer, and what is left is
  compared after trimming surrounding whitespace.

  Args:
      pred (str): generated answer
      truth (str): gold answer
      suffix (str): sentence ending shared by both answer options

  Returns:
      bool: True when the two remainders are identical
  """
  non_alnum = r'[^A-Za-z0-9 ]+'

  # The suffix must be lower-cased like the texts it is removed from;
  # otherwise a suffix containing a capital letter can never match.
  replacement = re.sub(non_alnum, '', suffix).lower()
  pred = re.sub(non_alnum, '', pred).strip().lower()
  truth = re.sub(non_alnum, '', truth).strip().lower()

  temp_pred = pred.replace(replacement, '').strip()
  temp_truth = truth.replace(replacement, '').strip()
  return temp_pred == temp_truth

In [None]:
def get_accuracy(true_labels, generated_text, suffix, source_text=None):
  """
  Score decoded generations against the gold labels with exactMatch.

  Each generation is expected to look like '<pad> answer</s>...'; the text
  between '<pad> ' and the last '</s>' is the prediction. A generation that
  does not fit that pattern counts as an empty prediction.

  Args:
      true_labels (list[str]): gold answers, parallel to generated_text
      generated_text (list[str]): decoded outputs with special tokens kept
      suffix (list[str]): per-example sentence suffix passed to exactMatch
      source_text (list[str], optional): prompts; when given, the function
          returns the misclassified examples instead of the accuracy

  Returns:
      float: accuracy in percent (0.0 when there is nothing to score), or,
      when source_text is given, a list of (gold, predicted, source) tuples
      for the examples that did not match
  """
  correct_count = 0
  total_count = 0
  incorrect_pairs = []
  for i, text in enumerate(generated_text):
    total_count += 1
    found = re.search(r'^(<pad> )(.+)(</s>)', text)
    predicted_label = found.group(2) if found is not None else ''

    if exactMatch(predicted_label.lower(), true_labels[i].lower(), suffix[i]):
      correct_count += 1
    elif source_text is not None:
      incorrect_pairs.append((true_labels[i].lower(), predicted_label.lower(), source_text[i]))

  if source_text is not None:
    return incorrect_pairs
  if total_count == 0:
    # Nothing was generated; avoid dividing by zero.
    return 0.0
  return 100 * correct_count / total_count


In [None]:
model.to(device)


def _collect_generations(loader):
  """
  Generate an output for every example served by `loader`.

  Returns four parallel lists: decoded generations (special tokens are kept
  because get_accuracy looks for '<pad> ' and '</s>'), gold labels, suffixes
  and source prompts.
  """
  generated, labels, suffixes, sources = [], [], [], []
  for batch in loader:
    outputs = model.generate(
        input_ids=batch['input_ids'].to(device),
        attention_mask=batch['attention_mask'].to(device),
    )
    generated.extend(tokenizer.batch_decode(outputs))
    labels.extend(batch['label'])
    suffixes.extend(batch['suffix'])
    sources.extend(batch['source_text'])
  return generated, labels, suffixes, sources


best_model = model
best_acc = 0
best_state = None  # copy of the trainable weights at the best dev accuracy
for epoch in range(NUM_EPOCHS):
  total_loss = 0.0

  model.train()

  for batch in train_loader:
    optim.zero_grad()

    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    # Pass the full target as labels and let T5 build the decoder inputs by
    # shifting them right behind the decoder start token. Slicing the target
    # by hand dropped the first target token from the loss and never fed the
    # start token that generate() begins with.
    lm_labels = batch["target_ids"].to(device).clone()
    lm_labels[lm_labels == tokenizer.pad_token_id] = -100  # ignore padding in the loss

    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=lm_labels)

    loss = outputs.loss
    loss.backward()
    optim.step()
    lr_scheduler.step()

    # .item() keeps a plain float instead of a tensor attached to the graph.
    total_loss += loss.item()

  print("==============================")
  print("Training loss at epoch {0}: {1}".format(epoch, total_loss))

  model.eval()

  train_generated, train_labels, train_suffix, _train_sources = _collect_generations(train_loader)
  train_acc = get_accuracy(train_labels, train_generated, train_suffix)
  print("Train Accuracy:", train_acc)

  generated_text, true_labels, suffix, source_text = _collect_generations(dev_loader)
  dev_acc = get_accuracy(true_labels, generated_text, suffix)
  print("\nDev Accuracy:", dev_acc)

  if dev_acc > best_acc:
    best_acc = dev_acc
    # `best_model = model` would only alias the live model, which keeps
    # training; snapshot the trainable weights instead.
    best_state = {
        name: param.detach().clone()
        for name, param in model.named_parameters()
        if param.requires_grad
    }

# Put the best-scoring trainable weights back so that `model` (and its alias
# `best_model`) hold the best epoch rather than the last one.
if best_state is not None:
  model.load_state_dict(best_state, strict=False)
best_model = model



In [None]:
# Write the fine-tuned model to the mounted Google Drive folder.
adapter_output_dir = '/content/drive/MyDrive/Colab Notebooks/685 NLP/FLAN_Base_Prefix_Tunning'
model.save_pretrained(adapter_output_dir)

# Test on WSC

In [None]:
# Load the WSC examples (tab-separated) and preview the first rows.
wsc_data_path = '/content/wsc273.tsv'
wsc_data_dev = pd.read_csv(wsc_data_path, sep='\t')
wsc_data_dev.head(5)

Unnamed: 0,left,pron,right,candidates,selected
0,The city councilmen refused the demonstrators ...,they,feared violence.,"The city councilmen,The demonstrators",0
1,The city councilmen refused the demonstrators ...,they,advocated violence.,"The city councilmen,The demonstrators",1
2,The trophy doesn't fit into the brown suitcase...,it,is too large.,"the trophy,the suitcase",0
3,The trophy doesn't fit into the brown suitcase...,it,is too small.,"the trophy,the suitcase",1
4,Joan made sure to thank Susan for all the help,she,had recieved.,"Joan,Susan",0


In [None]:
wsc_results = []  # one dict per WSC example: the input row plus prompt, prediction and verdict
c = 0             # number of examples answered correctly

model.eval()
for _, row in wsc_data_dev.iterrows():
  row = dict(row)
  cand = row['candidates'].split(',')

  # NOTE(review): this instruction differs from the one preprocess() builds
  # for training -- confirm the mismatch is intended.
  input_sentence = (
      'How does the sentence end?' + os.linesep + os.linesep
      + row['left'] + os.linesep + os.linesep
      + ' OPTIONS: ' + os.linesep
      + '- ' + cand[0] + ' ' + row['right'] + os.linesep
      + ' - ' + cand[1] + ' ' + row['right'] + os.linesep
      + 'Output: '
  )
  row['input_sentence'] = input_sentence

  # Gold answer: the selected candidate followed by the sentence ending.
  correct = cand[row['selected']].lower() + ' ' + row["right"]

  # max_length only takes effect together with an explicit truncation flag.
  tokenized_input = tokenizer(input_sentence, return_tensors="pt", truncation=True, max_length=1000)
  input_ids = tokenized_input.input_ids.to(device)
  attention_mask = tokenized_input.attention_mask.to(device)

  outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, pad_token_id=tokenizer.eos_token_id, max_new_tokens=1000)

  # Let the tokenizer drop the special tokens; slicing fixed character counts
  # off both ends cuts real text whenever '<pad> ' or '</s>' is missing.
  predicted_label = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

  row['predicted_label'] = predicted_label
  row['true_label'] = correct
  row['correct'] = exactMatch(predicted_label.lower(), correct.lower(), row['right'])
  if row['correct']:
    c += 1
  wsc_results.append(row)

In [None]:
# Save the per-example WSC results as a CSV in the working directory.
results_csv_path = 'FLAN_Base_Prefix_Tunning.csv'
result_df = pd.DataFrame(wsc_results)
result_df.to_csv(results_csv_path)

In [None]:
# WSC accuracy in percent, over the examples actually evaluated rather than a
# hard-coded dataset size (the file loaded above is wsc273.tsv, not 285 rows).
100 * c / len(wsc_results) if wsc_results else 0.0