In [1]:
import torch
import torch.nn as nn
print(torch.cuda.is_available())
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
os.environ['WANDB_DISABLED'] = 'true'

import numpy as np
import pandas as pd
import transformers
import accelerate
import tensorboard
import bitsandbytes as bnb

True


  from .autonotebook import tqdm as notebook_tqdm


#### 1. Quantization

In [2]:
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

#### 2. Model lnitialization

In [18]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_id = "google/flan-t5-base"

model = AutoModelForSeq2SeqLM.from_pretrained(
        model_id, 
        quantization_config = bnb_config, 
        torch_dtype = torch.float16,
        device_map = {"":0}
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

Downloading model.safetensors:  30%|███       | 912M/3.00G [08:13<18:50, 1.85MB/s]


In [4]:
for param in model.parameters():
    param.requires_grad = False  # freeze the model - train adapters later
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
    def forward(self, x):
        return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

#### 3. Observe model output before fine-tuning

In [5]:
from IPython.display import display, Markdown

def make_inference(model, context, question, max_new_tokens=200):
    batch = tokenizer(f"#### CONTEXT\n{context}\n\n#### QUESTION\n{question}\n\n#### ANSWER\n", return_tensors='pt', return_token_type_ids=False).to('cuda')

    with torch.cuda.amp.autocast():
        output_tokens = model.generate(**batch, max_new_tokens=max_new_tokens)

    display(Markdown((tokenizer.decode(output_tokens[0], skip_special_tokens=True))))

In [6]:
context = "Cheese is the best food."
question = "What is the best food?"

make_inference(model, context, question)

Cheese

In [7]:
context = "Cheese is the best food."
question = "How far away is the Moon from the Earth?"

make_inference(model, context, question)

The Moon is approximately 1.3 billion light years away.

In [8]:
context = "The Moon orbits Earth at an average distance of 384,400 km (238,900 miles), or about 30 times Earth's diameter. Its gravitational influence is the main driver of Earth's tides and very slowly lengthens Earth's day. The Moon's orbit around Earth has a sidereal period of 27.3 days. During each synodic period of 29.5 days, the amount of visible surface illuminated by the Sun varies from none up to 100%, resulting in lunar phases that form the basis for the months of a lunar calendar. The Moon is tidally locked to Earth, which means that the length of a full rotation of the Moon on its own axis causes its same side (the near side) to always face Earth, and the somewhat longer lunar day is the same as the synodic period. However, 59% of the total lunar surface can be seen from Earth through cyclical shifts in perspective known as libration."
question = "At what distance does the Moon orbit the Earth?"

make_inference(model, context, question)

30 times Earth's diameter

#### 4. Helper functions

In [10]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [11]:
def create_prompt(context, question, answer):
    if len(answer["text"]) < 1:
        answer = "Cannot Find Answer"
    else:
        answer = answer["text"][0]
    prompt_template = f"### CONTEXT\n{context}\n\n### QUESTION\n{question}\n\n### ANSWER\n{answer}</s>"
    return prompt_template

 #### 5. Load preprocessed dataset from disk (skip to step 7)

In [216]:
## Load from disk

from datasets import load_dataset, Dataset, load_from_disk

dataset = load_from_disk('/home/dataiku/data/saved_datasets/squad/raw/train')
tokenized_dataset = load_from_disk('/home/dataiku/data/saved_datasets/squad/tokenized/train')

#### 6a. Load raw dataset from HuggingFace

In [219]:
from datasets import load_dataset, Dataset

dataset = load_dataset("squad_v2")
dataset = pd.DataFrame(dataset['train'])

# remove rows with empty answers
exclude = []
for i in range(len(dataset)):
    if not dataset.iloc[i]['answers']['text']:
        exclude.append(i)
dataset = dataset.drop(exclude)
print(f'{len(exclude)} rows removed.')

# accept only the first answer in every line of data
answer = []
for i in range(len(dataset)):
    answer.append(dataset.iloc[i]['answers']['text'][0])
dataset['answer'] = answer

dataset = Dataset.from_pandas(dataset)
dataset = dataset.train_test_split(train_size=0.15, test_size=0.02) # smaller dataset

dataset["validation"] = dataset["test"]
del dataset["test"]

print(f"Train dataset size: {len(dataset['train'])}")
print(f"Test dataset size: {len(dataset['validation'])}")


## Save dataset to disk for easy loading later

dataset_path = '/home/dataiku/data/saved_datasets/squad'
dataset.save_to_disk(f'{dataset_path}/raw')

43498 rows removed.
Train dataset size: 13023
Test dataset size: 1737



Saving the dataset (0/1 shards):   0%|          | 0/13023 [00:00<?, ? examples/s][A
Saving the dataset (0/1 shards):  15%|█▌        | 2000/13023 [00:00<00:00, 19896.18 examples/s][A
Saving the dataset (0/1 shards):  54%|█████▍    | 7000/13023 [00:00<00:00, 31883.86 examples/s][A
Saving the dataset (0/1 shards): 100%|█████████▉| 13000/13023 [00:00<00:00, 37680.72 examples/s][A
Saving the dataset (1/1 shards): 100%|██████████| 13023/13023 [00:00<00:00, 32627.31 examples/s][A

Saving the dataset (0/1 shards):   0%|          | 0/1737 [00:00<?, ? examples/s][A
Saving the dataset (1/1 shards): 100%|██████████| 1737/1737 [00:00<00:00, 20940.78 examples/s][A


In [230]:
## To load dataset from local directory

load_from_disk('/home/dataiku/data/saved_datasets/squad/raw/')

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 13023
})

#### 6b. Preprocess training dataset

In [111]:
from datasets import concatenate_datasets

## Determine maximum total input sequence length after tokenization => 
## Sequences beyond this will be truncated, sequences shorter will be padded

tokenized_inputs = concatenate_datasets([dataset["train"], dataset["validation"]]).map(lambda x: tokenizer(x["context"], truncation=True), batched=True, remove_columns=['id', 'title', 'context', 'question', 'answers', 'answer', '__index_level_0__'])
input_lengths = [len(x) for x in tokenized_inputs["input_ids"]]
max_source_length = int(np.percentile(input_lengths, 85))    # 85% of max length for better utilization
print(f"Max source length: {max_source_length}")


## Determine maximum total sequence length for target text after tokenization =>  
## Sequences beyond this will be truncated, sequences shorter will be padded
tokenized_targets = concatenate_datasets([dataset["train"], dataset["validation"]]).map(lambda x: tokenizer(x["answer"], truncation=True), batched=True, remove_columns=['id', 'title', 'context', 'question', 'answers', 'answer', '__index_level_0__'])
target_lengths = [len(x) for x in tokenized_targets["input_ids"]]
max_target_length = int(np.percentile(target_lengths, 90))    # 90% of max length for better utilization
print(f"Max target length: {max_target_length}")


Map:   0%|          | 0/14760 [00:00<?, ? examples/s][A
Map:   7%|▋         | 1000/14760 [00:00<00:03, 3670.79 examples/s][A
Map:  14%|█▎        | 2000/14760 [00:00<00:03, 3899.45 examples/s][A
Map:  20%|██        | 3000/14760 [00:00<00:02, 4069.40 examples/s][A
Map:  27%|██▋       | 4000/14760 [00:00<00:02, 4224.46 examples/s][A
Map:  34%|███▍      | 5000/14760 [00:01<00:02, 4216.91 examples/s][A
Map:  41%|████      | 6000/14760 [00:01<00:02, 3807.28 examples/s][A
Map:  47%|████▋     | 7000/14760 [00:01<00:02, 3835.22 examples/s][A
Map:  54%|█████▍    | 8000/14760 [00:02<00:01, 3896.34 examples/s][A
Map:  61%|██████    | 9000/14760 [00:02<00:01, 3899.55 examples/s][A
Map:  68%|██████▊   | 10000/14760 [00:02<00:01, 3824.29 examples/s][A
Map:  75%|███████▍  | 11000/14760 [00:02<00:00, 3814.15 examples/s][A
Map:  81%|████████▏ | 12000/14760 [00:03<00:00, 3879.74 examples/s][A
Map:  88%|████████▊ | 13000/14760 [00:03<00:00, 3874.62 examples/s][A
Map:  95%|█████████▍| 14000/

Max source length: 243



Map:   0%|          | 0/14760 [00:00<?, ? examples/s][A
Map:  20%|██        | 3000/14760 [00:00<00:00, 20649.42 examples/s][A
Map:  41%|████      | 6000/14760 [00:00<00:00, 21161.22 examples/s][A
Map:  61%|██████    | 9000/14760 [00:00<00:00, 22118.97 examples/s][A
Map:  81%|████████▏ | 12000/14760 [00:01<00:00, 9087.57 examples/s][A
Map: 100%|██████████| 14760/14760 [00:01<00:00, 12504.39 examples/s][A


Max target length: 11


In [112]:
def preprocess_function(sample, padding="max_length"):
    # add prefix to the input for t5
    inputs = [f'context: {i} question: {j}' for i, j in zip(sample["context"], sample["question"])]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["answer"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, 
                                batched=True, 
                                remove_columns=['id', 'title', 'context', 'question', 'answers', 'answer', '__index_level_0__'], 
                                desc="Running tokenizer on dataset")
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

      
## Save tokenized_dataset to disk for later easy loading

dataset_path = '/home/dataiku/data/saved_datasets/squad/tokenized'
tokenized_dataset['train'].save_to_disk(f'{dataset_path}/train')
tokenized_dataset['validation'].save_to_disk(f'{dataset_path}/test')   # used for evaluation


Running tokenizer on dataset:   0%|          | 0/13023 [00:00<?, ? examples/s][A
Running tokenizer on dataset:   8%|▊         | 1000/13023 [00:00<00:03, 3401.40 examples/s][A
Running tokenizer on dataset:  15%|█▌        | 2000/13023 [00:00<00:03, 3410.19 examples/s][A
Running tokenizer on dataset:  23%|██▎       | 3000/13023 [00:00<00:03, 3085.31 examples/s][A
Running tokenizer on dataset:  31%|███       | 4000/13023 [00:01<00:02, 3129.54 examples/s][A
Running tokenizer on dataset:  38%|███▊      | 5000/13023 [00:01<00:02, 3187.97 examples/s][A
Running tokenizer on dataset:  46%|████▌     | 6000/13023 [00:01<00:02, 3242.43 examples/s][A
Running tokenizer on dataset:  54%|█████▍    | 7000/13023 [00:02<00:01, 3286.57 examples/s][A
Running tokenizer on dataset:  61%|██████▏   | 8000/13023 [00:02<00:01, 3260.45 examples/s][A
Running tokenizer on dataset:  69%|██████▉   | 9000/13023 [00:02<00:01, 3199.85 examples/s][A
Running tokenizer on dataset:  77%|███████▋  | 10000/13023 [00

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']



Saving the dataset (0/1 shards):   0%|          | 0/13023 [00:00<?, ? examples/s][A
Saving the dataset (1/1 shards): 100%|██████████| 13023/13023 [00:00<00:00, 180625.52 examples/s][A

Saving the dataset (0/1 shards):   0%|          | 0/1737 [00:00<?, ? examples/s][A
Saving the dataset (1/1 shards): 100%|██████████| 1737/1737 [00:00<00:00, 76934.92 examples/s][A


#### 7. Fine-Tune T5 with LoRA and bnb int-8

In addition to the LoRA technique, we will use [bitsanbytes LLM.int8()](https://huggingface.co/blog/hf-bitsandbytes-integration) to quantize out frozen LLM to int8. This allows us to reduce the needed memory for FLAN-T5 XXL ~4x.  

The first step of our training is to load the model. We are going to use [philschmid/flan-t5-xxl-sharded-fp16](https://huggingface.co/philschmid/flan-t5-xxl-sharded-fp16), which is a sharded version of [google/flan-t5-xxl](https://huggingface.co/google/flan-t5-xxl). The sharding will help us to not run off of memory when loading the model.

In [160]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_id = "google/flan-t5-base"

model = AutoModelForSeq2SeqLM.from_pretrained(
        model_id, 
        quantization_config = bnb_config, 
        torch_dtype = torch.float16,
        device_map = {"":0}
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

In [161]:
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# Define LoRA Config 
lora_config = LoraConfig(
 r=16,              # 4
 lora_alpha=32,     # 8
 target_modules=["q", "v"],
 lora_dropout=0.05,
 bias="none",
 task_type=TaskType.SEQ_2_SEQ_LM
)

# prepare int-8 model for training
model = prepare_model_for_int8_training(model)

# add LoRA adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1769472 || all params: 249347328 || trainable%: 0.7096414524241463


In [162]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

training_args = Seq2SeqTrainingArguments(
    output_dir = "outputs",
    save_strategy = "no",
    report_to = "tensorboard",
    auto_find_batch_size = True,
    warmup_steps = 100,
    learning_rate = 1e-3, 
    weight_decay = 0.001, 
    fp16_full_eval = True, 
    fp16 = False,                         # 16 bits precision is sufficient and good
    num_train_epochs = 3,
    logging_strategy = "steps", 
    logging_steps = 100, 
#     max_steps = 2000,                   # disable if specifying no. of epochs
#     gradient_accumulation_steps = 4,    # no. of updates steps to accumulate gradients, before updating it (higher = more accurate, but takes longer)
#     optim='adamw_bnb_8bit', 
#     save_total_limit = 8,               # no. of checkpoints (models) saved in output_dir
#     evaluation_strategy = 'epoch', 
#     logging_dir = f"{output_dir}/logs",

)

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model = model,
    label_pad_token_id = -100,   # we want to ignore tokenizer pad token in the loss
    pad_to_multiple_of = 8
)

trainer = Seq2SeqTrainer(
    model = model, 
    args = training_args, 
    data_collator = data_collator, 
    train_dataset = tokenized_dataset       # why when add the eval_dataset argument, training loss becomes 0
    # if tokenized_dataset regenerated in 6b. (not loaded from disk), need to add in ['train'] indices
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!


trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
50,0.9644
100,0.7185
150,0.7782
200,0.7159
250,0.7919
300,0.7763
350,0.7032
400,0.7382
450,0.7638
500,0.6268


TrainOutput(global_step=4884, training_loss=0.6191869320974889, metrics={'train_runtime': 2055.125, 'train_samples_per_second': 19.011, 'train_steps_per_second': 2.376, 'total_flos': 1.3061247910060032e+16, 'train_loss': 0.6191869320974889, 'epoch': 3.0})

#### 8. Saving model

In [164]:
print(f'total: {round(torch.cuda.get_device_properties(0).total_memory / 10**9, 3)} GB')
print(f'reserved: {round(torch.cuda.memory_reserved(0) / 10**9, 3)} GB')     # reserved = allocated + cached
print(f'allocated: {round(torch.cuda.memory_allocated(0) / 10**9, 3)} GB')

[i/10**9 for i in torch.cuda.mem_get_info()]  # (free memory usage, total available memory)

total: 15.656 GB
reserved: 6.164 GB
allocated: 2.653 GB


[9.2192768, 15.655829504]

In [165]:
peft_model_path = '<path>'
trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)   # not rly necessary unless changes made to tokenizer: add new tokens to its vocab, redefine special symbols such as '[CLS]', '[MASK]', '[SEP]', '[PAD]' etc.

('/home/dataiku/data/saved_models/flant5base_lora_squad/tokenizer_config.json',
 '/home/dataiku/data/saved_models/flant5base_lora_squad/special_tokens_map.json',
 '/home/dataiku/data/saved_models/flant5base_lora_squad/spiece.model',
 '/home/dataiku/data/saved_models/flant5base_lora_squad/added_tokens.json',
 '/home/dataiku/data/saved_models/flant5base_lora_squad/tokenizer.json')

In [15]:
## To push model to HuggingFace

# trainer.model.push_to_hub("<huggingface directory>",
#                   use_auth_token='<token>',
#                   commit_message="v1",
#                   private=True)

#### 9. Model inference

In [166]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load peft config for pre-trained checkpoint etc. 
peft_model_path = '<saved model path>'
config = PeftConfig.from_pretrained(peft_model_path)

# load base LLM model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, 
                                              return_dict=True, 
                                              load_in_8bit=True,    # True if quantizing
                                              device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
peft_model = PeftModel.from_pretrained(model, peft_model_path, device_map={"":0})
peft_model.eval()

print("Peft model loaded")



Peft model loaded


In [167]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_id = "google/flan-t5-base"

model = AutoModelForSeq2SeqLM.from_pretrained(
        model_id, 
        quantization_config = bnb_config, 
        torch_dtype = torch.float16,
        device_map = {"":0}
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

In [174]:
context = "Cheese is the best food."
question = "What is the best food?"

print('model:')
make_inference(model, context, question)
print('peft_model:')
make_inference(peft_model, context, question)

model:


Cheese

peft_model:


Cheese

In [175]:
context = "Cheese is the best food."
question = "How far away is the Moon from the Earth?"

print('model:')
make_inference(model, context, question)
print('peft_model:')
make_inference(peft_model, context, question)

model:


The Moon is approximately 1.3 billion light years away.

peft_model:


a distance of 0.002 miles

In [176]:
context = "The Moon orbits Earth at an average distance of 384,400 km (238,900 miles), or about 30 times Earth's diameter. Its gravitational influence is the main driver of Earth's tides and very slowly lengthens Earth's day. The Moon's orbit around Earth has a sidereal period of 27.3 days. During each synodic period of 29.5 days, the amount of visible surface illuminated by the Sun varies from none up to 100%, resulting in lunar phases that form the basis for the months of a lunar calendar. The Moon is tidally locked to Earth, which means that the length of a full rotation of the Moon on its own axis causes its same side (the near side) to always face Earth, and the somewhat longer lunar day is the same as the synodic period. However, 59% of the total lunar surface can be seen from Earth through cyclical shifts in perspective known as libration."
question = "At what distance does the Moon orbit the Earth?"

print('model:')
make_inference(model, context, question)
print('peft_model:')
make_inference(peft_model, context, question)

model:


30 times Earth's diameter

peft_model:


384,400 km (238,900 miles),

In [177]:
## Basic

context = f"""
Another approach to brain function is to examine the consequences of damage to specific brain areas. 
Even though it is protected by the skull and meninges, surrounded by cerebrospinal fluid, 
and isolated from the bloodstream by the blood–brain barrier, 
the delicate nature of the brain makes it vulnerable to numerous diseases and several types of damage. 
In humans, the effects of strokes and other types of brain damage have been a key source of information about brain function. 
Because there is no ability to experimentally control the nature of the damage, however, 
this information is often difficult to interpret. In animal studies, most commonly involving rats, 
it is possible to use electrodes or loclly injected chemicals to produce precise patterns of damage 
and then examine the consequences for behavior.
"""
question = "Why is it difficult to study the brain?"

print('model:')
make_inference(model, context, question)
print('peft_model:')
make_inference(peft_model, context, question)

model:


Because there is no ability to experimentally control the nature of the damage, however, this information is often difficult to interpret

peft_model:


there is no ability to experimentally control the nature

In [178]:
## Intermediate

context = f"""
Another approach to brain function is to examine the consequences of damage to specific brain areas. 
Even though it is protected by the skull and meninges, surrounded by cerebrospinal fluid, 
and isolated from the bloodstream by the blood–brain barrier, 
the delicate nature of the brain makes it vulnerable to numerous diseases and several types of damage. 
In humans, the effects of strokes and other types of brain damage have been a key source of information about brain function. 
Because there is no ability to experimentally control the nature of the damage, however, 
this information is often difficult to interpret. In animal studies, most commonly involving rats, 
it is possible to use electrodes or locally injected chemicals to produce precise patterns of damage 
and then examine the consequences for behavior.
"""
question = "How do we check for brain damage?"

print('model:')
make_inference(model, context, question)
print('peft_model:')
make_inference(peft_model, context, question)

model:


In animal studies, most commonly involving rats, it is possible to use electrodes or locally injected chemicals to produce precise patterns of damage and then examine the consequences for behavior.

peft_model:


use electrodes or locally injected chemicals to produce

#### 10. Model evaluation

In [148]:
## Helper functions

def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

In [163]:
from datasets import load_from_disk
from tqdm import tqdm

## function to generate predictions
def evaluate_peft_model(sample, max_target_length=200):
    outputs = model.generate(input_ids=sample["input_ids"].unsqueeze(0).cuda(), do_sample=True, top_p=0.9, max_new_tokens=max_target_length)    
    prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)
    labels = np.where(sample['labels'] != -100, sample['labels'], tokenizer.pad_token_id)    # Replace -100 in the labels as cannot be decoded
    labels = tokenizer.decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    return prediction, labels

## load test dataset from distk
test_dataset = load_from_disk('/home/dataiku/data/saved_datasets/squad/tokenized/test').with_format("torch")

## compute score
f1_scores, exact_scores = [], []
for sample in tqdm(test_dataset, miniters=100, maxinterval=float("inf"), position=0, leave=True):
    p, l = evaluate_peft_model(sample)
    f1_scores.append(compute_f1(p, l))
    exact_scores.append(compute_exact_match(p, l))

print(np.mean(f1_scores))
print(np.mean(exact_scores))

100%|██████████| 1737/1737 [08:48<00:00,  3.29it/s]

0.6424420525932688
0.514104778353483



