In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [1]:
!pip install transformers
!pip install datasets
!pip install accelerate
!pip install sentencepiece
!pip install huggingface_hub
!pip install nltk



In [2]:
import transformers
from transformers import AutoTokenizer
from datasets import load_dataset, load_metric, Dataset
from datasets.dataset_dict import DatasetDict
from tqdm import tqdm
import pandas as pd
import ast
import gc
import torch
import collections
import numpy as np
from transformers import Trainer
from sklearn.utils import shuffle
import os
from accelerate import Accelerator
from transformers import get_scheduler
import math
from time import time
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    AutoTokenizer,
    T5Tokenizer,
    get_linear_schedule_with_warmup,
    default_data_collator
)
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
def process_data(tokenizer, df_dir_path, df_name, max_len=256, truncation=True, pad_to_max_len=True):
  path = df_dir_path + df_name
  df = pd.read_csv(path)
  df.drop(columns=['Unnamed: 0'], inplace=True)
  d = {}
  input_ids = []
  attention_mask= []
  decoder_input_ids = []
  decoder_attention_mask = []
  for idx in tqdm(range(df.shape[0])):
    context, true_false, target = df.loc[idx, 'passage'], df.loc[idx, 'answer'], df.loc[idx, 'question']
    tokenizer_input = "answer: %s context: %s" % (true_false, context)
    tokenized_input = tokenizer(tokenizer_input, max_length=max_len, truncation=truncation, pad_to_max_length=pad_to_max_len, return_tensors='pt')
    tokenized_target = tokenizer(target, max_length=max_len, truncation=truncation, pad_to_max_length=pad_to_max_len, return_tensors='pt')
    input_ids.append(tokenized_input['input_ids'].squeeze())
    attention_mask.append(tokenized_input['attention_mask'].squeeze())
    decoder_input_ids.append(tokenized_target['input_ids'].squeeze())
    decoder_attention_mask.append(tokenized_target['attention_mask'].squeeze())
  d['input_ids'] = input_ids
  d['attention_mask'] = attention_mask
  d['decoder_input_ids'] = decoder_input_ids
  d['decoder_attention_mask'] = decoder_attention_mask
  dataset = Dataset.from_dict(d)
  return dataset

In [26]:
def get_mean_loss(dir_path, eval_csv, model_path, batch_size, tokenizer_name='t5-base'):
    tokenizer = T5Tokenizer.from_pretrained(tokenizer_name)
    model = torch.load(model_path)
    eval_dataset = process_data(tokenizer, dir_path, eval_csv)
    data_collator = default_data_collator
    eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, collate_fn=data_collator)
    num_iterations = len(eval_dataset)//batch_size
    progress_bar_ = tqdm(range(num_iterations))
    device = torch.device('cuda:0') if torch.cuda.is_available() else 'cpu'
    torch.cuda.empty_cache()
    model.to(device)
    mean_loss = 0
    num_batches = len(eval_dataset)//batch_size
    model.eval()
    with torch.no_grad():
        for batch in tqdm(eval_dataloader):
                batch = {k: v.to(device) for k, v in batch.items()}
                input_ids, attention_mask, labels, decoder_attention_mask = batch['input_ids'], batch['attention_mask'], batch['decoder_input_ids'], batch['decoder_attention_mask']
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, decoder_attention_mask=decoder_attention_mask, labels=labels)
                loss = outputs.loss
                mean_loss += loss
    mean_loss/= num_batches
    return mean_loss

In [27]:
dir_path = "/home/jovyan/PlanetaryComputerExamples/AI Planet Task/dataset/"
eval_csv = "val_df.csv"
batch_size = 4

In [19]:
model_path = '/home/jovyan/PlanetaryComputerExamples/AI Planet Task/t5-base-boolean-qgen_pretrained-finetuned.bin'
mean_loss = get_mean_loss(dir_path, eval_csv, model_path, batch_size)
print("t5 base pretrained qgen finetuned eval mean loss: ", mean_loss)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.




  4%|▍         | 127/3270 [00:00<00:02, 1264.89it/s][A[A

  8%|▊         | 254/3270 [00:00<00:02, 1245.07it/s][A[A

 12%|█▏        | 379/3270 [00:00<00:02, 1238.90it/s][A[A

 16%|█▌        | 509/3270 [00:00<00:02, 1258.90it/s][A[A

 19%|█▉        | 635/3270 [00:00<00:02, 1251.27it/s][A[A

 23%|██▎       | 766/3270 [00:00<00:01, 1268.38it/s][A[A

 27%|██▋       | 893/3270 [00:00<00:01, 1264.57it/s][A[A

 31%|███       | 1020/3270 [00:00<00:01, 1241.78it/s][A[A

 35%|███▌      | 1147/3270 [00:00<00:01, 1249.12it/s][A[A

 39%|███▉      | 1272/3270 [00:01<00:01, 1248.50it/s

t5 base pretrained qgen finetuned eval mean loss:  tensor(0.0631, device='cuda:0')


In [20]:
model_path = '/home/jovyan/PlanetaryComputerExamples/AI Planet Task/t5-base-finetuned.bin'
mean_loss2 = get_mean_loss(dir_path, eval_csv, model_path, batch_size)
print("t5 base finetuned eval mean loss: ", mean_loss)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.




  4%|▍         | 125/3270 [00:00<00:02, 1248.80it/s][A[A

  8%|▊         | 250/3270 [00:00<00:02, 1227.07it/s][A[A

 11%|█▏        | 373/3270 [00:00<00:02, 1204.21it/s][A[A

 15%|█▌        | 502/3270 [00:00<00:02, 1234.85it/s][A[A

 19%|█▉        | 628/3270 [00:00<00:02, 1241.27it/s][A[A

 23%|██▎       | 755/3270 [00:00<00:02, 1250.45it/s][A[A

 27%|██▋       | 882/3270 [00:00<00:01, 1256.52it/s][A[A

 31%|███       | 1008/3270 [00:00<00:01, 1236.78it/s][A[A

 35%|███▍      | 1133/3270 [00:00<00:01, 1240.84it/s][A[A

 39%|███▊      | 1260/3270 [00:01<00:01, 1248.49it/s

t5 base finetuned eval mean loss:  tensor(0.0631, device='cuda:0')


In [30]:
model_path = '/home/jovyan/PlanetaryComputerExamples/AI Planet Task/t5-small_5epochs.bin'
mean_loss3 = get_mean_loss(dir_path, eval_csv, model_path, batch_size, tokenizer_name='t5-small')
print("t5 small finetuned eval mean loss: ", mean_loss)





  4%|▍         | 126/3270 [00:00<00:02, 1259.08it/s][A[A

  8%|▊         | 252/3270 [00:00<00:02, 1252.11it/s][A[A

 12%|█▏        | 379/3270 [00:00<00:02, 1257.90it/s][A[A

 16%|█▌        | 512/3270 [00:00<00:02, 1281.35it/s][A[A

 20%|█▉        | 641/3270 [00:00<00:02, 1278.55it/s][A[A

 24%|██▎       | 773/3270 [00:00<00:01, 1292.40it/s][A[A

 28%|██▊       | 903/3270 [00:00<00:01, 1288.61it/s][A[A

 32%|███▏      | 1032/3270 [00:00<00:01, 1266.47it/s][A[A

 36%|███▌      | 1162/3270 [00:00<00:01, 1276.07it/s][A[A

 39%|███▉      | 1290/3270 [00:01<00:01, 1272.96it/s][A[A

 43%|████▎     | 1418/3270 [00:01<00:01, 1266.38it/s][A[A

 47%|████▋     | 1547/3270 [00:01<00:01, 1271.29it/s][A[A

 51%|█████     | 1675/3270 [00:01<00:01, 1265.04it/s][A[A

 55%|█████▌    | 1802/3270 [00:01<00:01, 1238.03it/s][A[A

 59%|█████▉    | 1928/3270 [00:01<00:01, 1243.93it/s][A[A

 63%|██████▎   | 2053/3270 [00:01<00:00, 1243.57it/s][A[A

 67%|██████▋   | 2178/3270 

t5 base finetuned eval mean loss:  tensor(0.0631, device='cuda:0')



