In [1]:

import torch
from torch import nn
import pandas as pd
from transformers.models.t5 import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
device = 'cuda' if torch.cuda.is_available() else 'cpu'
from rouge_score import rouge_scorer
import pickle
import random
random.seed(42)
import wandb


In [2]:
class DataSet(Dataset):
  """
  Dataset over a dataframe of (source, target) text pairs, tokenized on access
  so it can be fed to a DataLoader for fine-tuning.

  Each item is a dict of torch.long tensors:
    'source_ids'   : source token ids padded/truncated to source_len
    'source_mask'  : all-ones mask of length prefix_len + source_len
    'target_ids'   : target token ids padded/truncated to target_len
    'target_ids_y' : the same tensor as 'target_ids'
  """

  def __init__(self, dataframe, tokenizer, source_len, target_len, source_text, target_text, prefix_len=100):
    """
    Args:
      dataframe: table holding the text columns.
      tokenizer: object exposing batch_encode_plus(...) that returns a mapping
        with an 'input_ids' tensor of shape (1, max_length).
      source_len: max token length of the source text.
      target_len: max token length of the target text.
      source_text: name of the source column in dataframe.
      target_text: name of the target column in dataframe.
      prefix_len: number of extra leading mask positions reserved for the
        tuned prefix (defaults to the previously hard-coded 100).
    """
    self.tokenizer = tokenizer
    self.data = dataframe
    self.source_len = source_len
    self.summ_len = target_len
    self.prefix_len = prefix_len
    self.target_text = self.data[target_text]
    self.source_text = self.data[source_text]

  def __len__(self):
    return len(self.target_text)

  def _encode(self, text, max_length):
    """Tokenize one string to a 1-D id tensor of exactly max_length entries."""
    encoded = self.tokenizer.batch_encode_plus(
        [text], max_length=max_length, truncation=True, padding="max_length", return_tensors='pt')
    # squeeze(0) drops only the batch axis; a bare squeeze() would also
    # collapse the sequence axis when max_length == 1.
    return encoded['input_ids'].squeeze(0)

  def __getitem__(self, index):
    # .iloc is positional, matching the 0..len-1 indices a DataLoader sampler
    # produces even when the dataframe does not carry a default RangeIndex.
    source_text = str(self.source_text.iloc[index])
    target_text = str(self.target_text.iloc[index])

    # collapse every run of whitespace (including newlines) to a single space
    source_text = ' '.join(source_text.split())
    target_text = ' '.join(target_text.split())

    source_ids = self._encode(source_text, self.source_len)
    target_ids = self._encode(target_text, self.summ_len)

    # UPDATE FOR PREFIX FINE TUNING: the mask covers prefix + source positions.
    # NOTE(review): the tokenizer's own attention mask is not used, so padded
    # source positions are marked as attended - confirm this is intended.
    source_mask = torch.ones(self.prefix_len + self.source_len, dtype=torch.long)

    return {
        'source_ids': source_ids.to(dtype=torch.long),
        'source_mask': source_mask,
        'target_ids': target_ids.to(dtype=torch.long),
        'target_ids_y': target_ids.to(dtype=torch.long)
    }

In [4]:
from pickle import load

# Load the lecture -> question-unit data used for training and evaluation.
# NOTE(review): pickle can execute arbitrary code while loading; only open
# files from a trusted source.
with open('/home/kjros2/cs546/project/2021cs546project/data/text_pairs.pkl', 'rb') as pairs_file:
    # the context manager closes the handle once the data is in memory
    test_pairs = load(pairs_file)



In [4]:
"""
input_ids = tokenizer.encode("hello world this is a neural test. Can you hear me?", return_tensors='pt').to(device)
attention_mask = torch.ones((1, input_ids.size()[1] + 50)).to(device)
outputs = model.generate(
    input_ids=input_ids,
    max_length=64,
    do_sample=True,
    top_k=10,
    attention_mask=attention_mask,
    num_return_sequences=3)

for i in range(3):
    print(f'sample {i + 1}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}')
"""

'\ninput_ids = tokenizer.encode("hello world this is a neural test. Can you hear me?", return_tensors=\'pt\').to(device)\nattention_mask = torch.ones((1, input_ids.size()[1] + 50)).to(device)\noutputs = model.generate(\n    input_ids=input_ids,\n    max_length=64,\n    do_sample=True,\n    top_k=10,\n    attention_mask=attention_mask,\n    num_return_sequences=3)\n\nfor i in range(3):\n    print(f\'sample {i + 1}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}\')\n'

In [4]:
def train(epoch, tokenizer, model, device, loader, optimizer):

  """
  Run one pass over `loader`, taking an optimizer step after every batch.

  Targets are shifted by one position: the decoder is fed all tokens but the
  last and is scored against all tokens but the first, with padding positions
  set to -100 in the labels. On every 100th step (starting with step 0) the
  loss is printed and logged to wandb.

  """

  model.train()

  for step, batch in enumerate(loader):
    targets = batch['target_ids'].to(device, dtype=torch.long)
    input_ids = batch['source_ids'].to(device, dtype=torch.long)
    attention = batch['source_mask'].to(device, dtype=torch.long)

    # teacher forcing: inputs are tokens [0, n-1), labels are tokens [1, n)
    decoder_inputs = targets[:, :-1].contiguous()
    next_tokens = targets[:, 1:]
    labels = next_tokens.clone().detach()
    labels[next_tokens == tokenizer.pad_token_id] = -100

    loss = model(
        input_ids=input_ids,
        attention_mask=attention,
        decoder_input_ids=decoder_inputs,
        labels=labels,
    )[0]

    if step % 100 == 0:
      print(str(epoch), str(step), str(loss))
      wandb.log({'epoch': epoch, 'train_loss': float(loss)})

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [5]:
def val(epoch, fold_idx, tokenizer, model, device, loader):

  """
  Compute the mean per-batch loss of `model` over `loader` without gradients.

  Batches are prepared the same way as in training (decoder input is the
  target minus its last token; labels are the target minus its first token
  with padding replaced by -100). The mean is printed together with `epoch`
  and `fold_idx`, then returned as a float.

  """

  model.eval()

  running_loss = 0
  with torch.no_grad():
    for batch in loader:
      targets = batch['target_ids'].to(device, dtype=torch.long)
      input_ids = batch['source_ids'].to(device, dtype=torch.long)
      attention = batch['source_mask'].to(device, dtype=torch.long)

      decoder_inputs = targets[:, :-1].contiguous()
      next_tokens = targets[:, 1:]
      labels = next_tokens.clone().detach()
      labels[next_tokens == tokenizer.pad_token_id] = -100

      result = model(
          input_ids=input_ids,
          attention_mask=attention,
          decoder_input_ids=decoder_inputs,
          labels=labels,
      )
      running_loss += float(result[0])

  # averaged over the number of batches, not the number of samples
  mean_loss = running_loss / len(loader)
  print({'epoch': epoch, 'fold_idx': fold_idx, 'val_loss': mean_loss})
  return mean_loss

In [6]:
def test(tokenizer, model, device, loader):

  """
  Function to evaluate model for predictions

  """
  model.eval()
  
  predictions = []
  actuals = []
  with torch.no_grad():
      for _, data in enumerate(loader, 0):
          y = data['target_ids'].to(device, dtype = torch.long)
          ids = data['source_ids'].to(device, dtype = torch.long)
          mask = data['source_mask'].to(device, dtype = torch.long)

          generated_ids = model.generate(
              input_ids = ids,
              attention_mask = mask, 
              max_length=150, 
              num_beams=2,
              repetition_penalty=2.5, 
              length_penalty=1.0, 
              early_stopping=True
              )
          preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
          target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]

          predictions.extend(preds)
          actuals.extend(target)
  return predictions, actuals

In [5]:
def score(preds, acts, typ, aggregate=True):
  """
  Score predictions against references with ROUGE-1, ROUGE-2 and ROUGE-L.

  Args:
    preds: predicted strings.
    acts: reference strings, paired with `preds` by position. If the two
      lengths differ, only the pairs produced by zip() are scored.
    typ: label for the evaluated split (e.g. 'val' or 'test'). Currently
      unused; kept so existing calls keep working.
    aggregate: when True (default) each statistic is averaged over the scored
      pairs; when False the per-sample lists are returned instead, which is
      what paired significance tests need.

  Returns:
    A tuple (rouge1, rouge2, rougeL). Each element is a dict with the keys
    'precision', 'recall' and 'fmeasure', holding either a float (aggregate)
    or a list with one value per scored pair. With no scored pairs the
    aggregated values are 0.0.
  """
  metric_names = ['rouge1', 'rouge2', 'rougeL']
  stat_names = ['precision', 'recall', 'fmeasure']
  scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=False)

  per_sample = {metric: {stat: [] for stat in stat_names} for metric in metric_names}
  for pred, act in zip(preds, acts):
      # RougeScorer.score takes (target, prediction), in that order
      sample_scores = scorer.score(act, pred)
      for metric in metric_names:
          # each entry unpacks as (precision, recall, fmeasure)
          for stat, value in zip(stat_names, sample_scores[metric]):
              per_sample[metric][stat].append(value)

  if not aggregate:
      return tuple(per_sample[metric] for metric in metric_names)

  def _mean(values):
      # divide by the number of pairs actually scored; empty input -> 0.0
      return sum(values) / len(values) if values else 0.0

  return tuple(
      {stat: _mean(per_sample[metric][stat]) for stat in stat_names}
      for metric in metric_names
  )

In [8]:
import numpy as np

def _pairs_to_frame(test_pairs, lectures):
  """Flatten the question units of `lectures` into a ('lecture', 'question') DataFrame."""
  lecture_text, questions = [], []
  for lecture in lectures:
    for question_unit in test_pairs[lecture]:
      questions.append(question_unit['text'])
      lecture_text.append(question_unit['lecture_text'])
  return pd.DataFrame({'lecture': lecture_text, 'question': questions})


def _dump_pickle(obj, path):
  """Pickle `obj` to `path`, closing the file afterwards."""
  with open(path, 'wb') as handle:
    pickle.dump(obj, handle)


def T5Trainer(test_pairs, val_idx, source_text, target_text, model_params, output_dir="./outputs/", mode="tune"):

  """
  Train a T5 model on lecture/question pairs and evaluate it.

  Args:
    test_pairs: mapping from lecture name to a list of question units, each a
      mapping with the keys 'text' (the question) and 'lecture_text'.
    val_idx: position in the shuffled lecture list where the 10-lecture
      validation fold starts.
    source_text: dataframe column used as model input ('lecture' or 'question').
    target_text: dataframe column used as generation target.
    model_params: dict with the keys "SEED", "MODEL", "LEARNING_RATE",
      "MAX_SOURCE_TEXT_LENGTH", "MAX_TARGET_TEXT_LENGTH", "TRAIN_BATCH_SIZE",
      "VALID_BATCH_SIZE", "TEST_BATCH_SIZE" and "TRAIN_EPOCHS".
    output_dir: currently unused; kept for call compatibility.
    mode: "tune" trains on the training folds and validates after every epoch
      with early stopping; "test" trains on training + validation folds for all
      epochs, then evaluates on the held-out test lectures and writes
      'preds.pkl' and 'acts.pkl' to the working directory.

  Returns:
    In "tune" mode, the best validation ROUGE-1 F-measure over the epochs run
    (None if no epoch was validated). In "test" mode, the test ROUGE-1
    F-measure.
  """

  # Set random seeds and deterministic pytorch for reproducibility
  torch.manual_seed(model_params["SEED"]) # pytorch random seed
  np.random.seed(model_params["SEED"]) # numpy random seed
  torch.backends.cudnn.deterministic = True

  # logging
  print(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

  # tokenizer for encoding the text
  tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

  # Load the pretrained conditional-generation model and move it to the
  # selected device.
  model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
  model = model.to(device)

  # for prefix tuning (comment out for regular fine tuning)
  # Freeze every parameter, then re-enable only the prefix modules.
  # requires_grad_() propagates to the parameters; assigning a plain
  # `requires_grad` attribute on a module (or on the `_modules` dict) leaves
  # all parameters trainable.
  # NOTE(review): encoder.embedding / linear1 / linear2 are not defined in this
  # notebook - presumably they come from a modified model implementation.
  model.requires_grad_(False)
  for prefix_module in ('embedding', 'linear1', 'linear2'):
    getattr(model.encoder, prefix_module).requires_grad_(True)

  # Optimize only the parameters left trainable above.
  optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, model.parameters()),
                               lr=model_params["LEARNING_RATE"])

  # logging
  print(f"[Data]: Reading data...\n")

  # Shuffle with a private, seeded generator so that every call (each fold,
  # and the final test run) sees the same lecture order and therefore the same
  # held-out test set, independent of the global RNG state.
  shuff_lec_names = list(test_pairs.keys())
  random.Random(model_params["SEED"]).shuffle(shuff_lec_names)

  # Split by lecture: positions [val_idx, val_idx+10) validate, positions from
  # 85 onward are the test set, everything else below 85 trains.
  # NOTE(review): 85 presumably leaves the last 5 of 90 lectures for testing -
  # confirm against the data.
  val_lectures = shuff_lec_names[val_idx:val_idx+10]
  train_lectures = shuff_lec_names[0:val_idx] + shuff_lec_names[val_idx+10:85]
  if mode == "test":
      # final run: fold the validation lectures back into the training data
      train_lectures = train_lectures + val_lectures

  test_dataset = _pairs_to_frame(test_pairs, shuff_lec_names[85:])
  val_dataset = _pairs_to_frame(test_pairs, val_lectures)
  train_dataset = _pairs_to_frame(test_pairs, train_lectures)

  print(f"TRAIN Dataset: {train_dataset.shape}")
  print(f"VAL Dataset: {val_dataset.shape}")
  print(f"TEST Dataset: {test_dataset.shape}\n")

  # Wrap each split in a tokenizing dataset
  max_source = model_params["MAX_SOURCE_TEXT_LENGTH"]
  max_target = model_params["MAX_TARGET_TEXT_LENGTH"]
  training_set = DataSet(train_dataset, tokenizer, max_source, max_target, source_text, target_text)
  val_set = DataSet(val_dataset, tokenizer, max_source, max_target, source_text, target_text)
  test_set = DataSet(test_dataset, tokenizer, max_source, max_target, source_text, target_text)

  # Only the training loader shuffles; evaluation order stays fixed so that
  # predictions line up across runs.
  training_loader = DataLoader(training_set, batch_size=model_params["TRAIN_BATCH_SIZE"], shuffle=True, num_workers=0)
  val_loader = DataLoader(val_set, batch_size=model_params["VALID_BATCH_SIZE"], shuffle=False, num_workers=0)
  test_loader = DataLoader(test_set, batch_size=model_params["TEST_BATCH_SIZE"], shuffle=False, num_workers=0)

  # Training loop
  print(f'[Initiating Fine Tuning]...\n')

  prev_rouge1f = []
  for epoch in range(model_params["TRAIN_EPOCHS"]):
      train(epoch, tokenizer, model, device, training_loader, optimizer)
      if mode == "test":
          # no validation split is held out in test mode
          continue
      loss = val(epoch, val_idx, tokenizer, model, device, val_loader)
      preds, acts = test(tokenizer, model, device, val_loader)
      wandb.log({'epoch': epoch, 'val_loss': loss})

      # score() returns (rouge1, rouge2, rougeL) dicts; model selection uses
      # the ROUGE-1 F-measure as a plain float.
      rouge1, _, _ = score(preds, acts, 'val')
      prev_rouge1f.append(rouge1['fmeasure'])

      # early stopping: ROUGE-1 F dropped in each of the last two epochs
      if len(prev_rouge1f) > 2 and prev_rouge1f[-1] < prev_rouge1f[-2] < prev_rouge1f[-3]:
          break

  best_rouge1f = max(prev_rouge1f) if prev_rouge1f else None

  if mode == "test":
      preds, acts = test(tokenizer, model, device, test_loader)
      rouge1, rouge2, rougeL = score(preds, acts, 'test')
      print({'test_rouge1': rouge1, 'test_rouge2': rouge2, 'test_rougeL': rougeL})
      _dump_pickle(preds, 'preds.pkl')
      _dump_pickle(acts, 'acts.pkl')
      best_rouge1f = rouge1['fmeasure']

  # drop the local reference so the model can be garbage-collected
  del model
  return best_rouge1f


In [None]:
# castorini/doc2query-t5-base-msmarco
# t5-base 
# mode: "tune" runs the validation folds and logs their mean score;
#       "test" trains once and evaluates on the held-out test lectures.
mode = "test"
model_params={
    "MODEL":"castorini/doc2query-t5-base-msmarco",
    "TRAIN_BATCH_SIZE":1,          # training batch size
    "VALID_BATCH_SIZE":1,          # validation batch size
    "TEST_BATCH_SIZE":1,           # test batch size
    "TRAIN_EPOCHS":7,              # number of training epochs
    "VAL_EPOCHS":1,                # number of validation epochs
    "LEARNING_RATE":1e-4,          # learning rate
    "MAX_SOURCE_TEXT_LENGTH":512,  # max length of source text
    "MAX_TARGET_TEXT_LENGTH":50,   # max length of target text
    "SEED": 42,                    # set seed for reproducibility 
    # the remaining entries are descriptive run metadata recorded in the wandb config
    "type": 'prefix with embedding layer (post review recovering morning-pond-100)',
    "mode": mode,
    "input_dim": "768*20",
    "hidden": "800",
    "output_dim": "768*100"
}
wandb.init(project="prefix-questions", entity="kevinros", config=model_params)
if mode == "tune":
    # start position of each validation fold in the shuffled lecture list
    fold_starts = [0, 10, 20]
    all_f1s = []
    for fold_idx in fold_starts:
        wandb.log({'fold_idx': fold_idx})
        f1 = T5Trainer(test_pairs=test_pairs, val_idx=fold_idx,
            source_text="lecture", target_text="question", model_params=model_params, output_dir="outputs")
        all_f1s.append(f1)
    # average over however many folds were run, not a separately hard-coded count
    avg_f1 = sum(all_f1s) / len(all_f1s)
    wandb.log({'selection_crit': avg_f1})
elif mode == "test":
    T5Trainer(test_pairs=test_pairs, val_idx=0,
            source_text="lecture", target_text="question", model_params=model_params, output_dir="outputs", mode=mode)
else:
    # fail loudly instead of silently finishing without training anything
    raise ValueError(f"unknown mode {mode!r}; expected 'tune' or 'test'")


In [2]:
def _load_run(path):
    """Unpickle one saved run artifact, closing the file afterwards."""
    # NOTE(review): pickle can execute arbitrary code while loading; only open
    # files produced by trusted runs.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# references are taken from one run and compared against every system's predictions
acts = _load_run('runs/docTTTTTquery/acts.pkl')
t5_base_ft_pred = _load_run('runs/t5-base_ft/preds.pkl')
t5_base_prefix_pred = _load_run('runs/t5-base_prefix/preds.pkl')


docTTTTTquery_pred = _load_run('runs/docTTTTTquery/preds.pkl')
docTTTTTquery_ft_pred = _load_run('runs/docTTTTTquery_ft/preds.pkl')
#docTTTTTquery_prefix_pred = pickle.load(open('runs/docTTTTTquery_prefix/preds.pkl', 'rb'))
# old run was different than reported in paper, so had to rerun and update
docTTTTTquery_prefix_pred = _load_run('runs/docTTTTTquery_prefix_v2/preds.pkl')


In [3]:
# Sanity check: number of predictions loaded for the docTTTTTquery prefix run.
len(docTTTTTquery_prefix_pred)

53

In [6]:
from scipy.stats import wilcoxon
import seaborn as sb


def per_sample_scores(preds, acts):
    """
    Collect per-sample ROUGE statistics for paired significance tests.

    score() averages over the samples it is given, so each (prediction,
    reference) pair is scored on its own and the single-sample results are
    gathered into lists.

    Returns a tuple (rouge1, rouge2, rougeL) of dicts whose 'precision',
    'recall' and 'fmeasure' entries are lists with one value per pair.
    """
    collected = tuple({'precision': [], 'recall': [], 'fmeasure': []} for _ in range(3))
    for pred, act in zip(preds, acts):
        single = score([pred], [act], 'test')
        for metric_lists, metric_values in zip(collected, single):
            for key in metric_lists:
                value = metric_values[key]
                # tolerate a score() that already returns per-sample lists
                if isinstance(value, list):
                    value = value[0]
                metric_lists[key].append(value)
    return collected


t5_base_ft_pred_scores = per_sample_scores(t5_base_ft_pred, acts)
t5_base_prefix_pred_scores = per_sample_scores(t5_base_prefix_pred, acts)


docTTTTTquery_pred_scores = per_sample_scores(docTTTTTquery_pred, acts)
docTTTTTquery_ft_pred_scores = per_sample_scores(docTTTTTquery_ft_pred, acts)
docTTTTTquery_prefix_pred_scores = per_sample_scores(docTTTTTquery_prefix_pred, acts)

In [None]:
# rq1: is the precision of the t5 models lower than the docTTTTTquery models?
# One-sided Wilcoxon signed-rank tests (alternative='less') on precision, run for
# each entry of the score tuples in order (index 0/1/2 = rouge1/rouge2/rougeL).
rq1_pairs = [
    # yes in t5-base FT to docTTTTTquery
    (t5_base_ft_pred_scores, docTTTTTquery_pred_scores),
    # yes in t5-base FT to docTTTTTquery FT
    (t5_base_ft_pred_scores, docTTTTTquery_ft_pred_scores),
    # ? in t5-base prefix to docTTTTTquery prefix
    (t5_base_prefix_pred_scores, docTTTTTquery_prefix_pred_scores),
]

for pair_no, (t5_scores, doc_scores) in enumerate(rq1_pairs):
    if pair_no > 0:
        # separator between comparison groups
        print('=')
    for metric_idx in range(3):
        print(wilcoxon(t5_scores[metric_idx]['precision'], doc_scores[metric_idx]['precision'], alternative='less'))

In [None]:
# rq2: effect of continuous prefix tuning
# One-sided Wilcoxon signed-rank tests (alternative='less') on F-measure, comparing
# each fine-tuned model with its prefix-tuned counterpart, for each entry of the
# score tuples in order (index 0/1/2 = rouge1/rouge2/rougeL).
rq2_pairs = [
    # t5-base case
    (t5_base_ft_pred_scores, t5_base_prefix_pred_scores),
    # docTTTTTquery case
    (docTTTTTquery_ft_pred_scores, docTTTTTquery_prefix_pred_scores),
]

for pair_no, (ft_scores, prefix_scores) in enumerate(rq2_pairs):
    if pair_no > 0:
        # separator between comparison groups
        print('=')
    for metric_idx in range(3):
        print(wilcoxon(ft_scores[metric_idx]['fmeasure'], prefix_scores[metric_idx]['fmeasure'], alternative='less'))