In [2]:
import os
import math
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, default_data_collator
from peft import PeftModel
from datasets import load_dataset

In [3]:
# Checkpoint to evaluate and the local directory holding the LoRA adapter.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
adapter_path = './accountant_lora_model'

# 4-bit NF4 quantisation with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
)


def _load_quantized_base():
    """Return a fresh copy of `model_name` loaded with `bnb_config` and device_map='auto'."""
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map='auto',
        trust_remote_code=True,
    )


# Reference model: the untouched base checkpoint, in eval mode.
base_model = _load_quantized_base().eval()

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Second, independent copy of the base weights that receives the adapter from
# `adapter_path`; the adapter is then merged and the result put in eval mode.
tmp_model = _load_quantized_base()
tuned_model = PeftModel.from_pretrained(tmp_model, adapter_path).merge_and_unload().eval()

Loading weights: 100%|██████████| 201/201 [00:14<00:00, 14.16it/s, Materializing param=model.norm.weight]                              
Loading weights: 100%|██████████| 201/201 [00:13<00:00, 15.11it/s, Materializing param=model.norm.weight]                              


ValueError: Can't find 'adapter_config.json' at './accountant_lora_model'

In [5]:
def tokenize(batch):
    """Tokenize a batch of question/answer pairs for causal-LM evaluation.

    Each example is rendered as
    ``### Instruction:\\n<question>\\n### Response:\\n<answer>`` and then padded
    or truncated to exactly 256 tokens.

    Args:
        batch: Mapping with parallel sequences under the keys ``'question'``
            and ``'answer'``.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``, ...) with an
        added ``labels`` tensor. ``labels`` equals ``input_ids`` on real tokens
        and is ``-100`` on padding positions.
    """
    texts = [
        f"### Instruction:\n{instruction}\n### Response:\n{out}"
        for instruction, out in zip(batch['question'], batch['answer'])
    ]

    tokens = tokenizer(
        texts,
        padding = 'max_length',
        max_length = 256,
        truncation = True,
        return_tensors = 'pt'
    )

    # Padding must not contribute to the loss: every sequence is padded to 256
    # tokens, so unmasked pad positions would dominate the perplexity.
    # -100 is the label value the causal-LM loss ignores. The mask comes from
    # attention_mask rather than the pad token id, because the pad token may
    # share its id with a real token such as EOS.
    labels = tokens['input_ids'].clone()
    labels[tokens['attention_mask'] == 0] = -100
    tokens['labels'] = labels
    return tokens

In [6]:
# First 200 GSM8K training examples, tokenized in batches by `tokenize`; the raw
# text columns are dropped and the remaining fields are exposed as torch tensors.
eval_ds = (
    load_dataset('openai/gsm8k', 'main', split='train[:200]')
    .map(tokenize, batched=True, remove_columns=['question', 'answer'])
    .with_format('torch')
)

Map: 100%|██████████| 200/200 [00:00<00:00, 5134.35 examples/s]


In [7]:
# Iterate over `eval_ds` in batches of 8, collated with transformers' default_data_collator.
eval_loader = DataLoader(dataset=eval_ds, collate_fn=default_data_collator, batch_size=8)

In [8]:
@torch.no_grad()
def compute_perplexity(model, loader=None):
    """Compute token-level perplexity of `model` over a loader of LM batches.

    Args:
        model: Causal LM that is called as ``model(**batch)`` and returns an
            object whose ``.loss`` is the mean loss over the scored tokens.
        loader: Iterable of dict batches of tensors; each batch must contain a
            2-D ``'labels'`` tensor. Defaults to the notebook-level
            ``eval_loader``.

    Returns:
        ``exp`` of the average loss per scored token, as a float.

    Raises:
        ValueError: If the loader yields no scored tokens at all.
    """
    if loader is None:
        loader = eval_loader

    # Follow the model's own placement instead of assuming 'cuda'
    # (the models are loaded with device_map='auto').
    device = getattr(model, 'device', None)
    if device is None:
        device = next(model.parameters()).device

    total_nll = 0.0
    total_tokens = 0
    for batch in loader:
        batch = {k: v.to(device) for k, v in batch.items()}

        # The LM loss shifts labels by one position and skips entries equal to
        # -100, so this is the number of tokens the batch's mean loss covers.
        n_tokens = int((batch['labels'][:, 1:] != -100).sum().item())
        if n_tokens == 0:
            # Nothing is scored in this batch; its mean loss would be undefined.
            continue

        loss = model(**batch).loss
        # Weight each batch mean by its token count so that batches with
        # different amounts of scored text contribute proportionally.
        total_nll += loss.item() * n_tokens
        total_tokens += n_tokens

    if total_tokens == 0:
        raise ValueError("compute_perplexity: loader produced no scored tokens")

    return math.exp(total_nll / total_tokens)


In [None]:
# Score the base model first, then the adapter-merged model, reporting each as soon as it is ready.
base_perplexity = compute_perplexity(base_model)
print("Base model perplexity:", base_perplexity)

tuned_perplexity = compute_perplexity(tuned_model)
print("Tuned model perplexity:", tuned_perplexity)

In [9]:
import random

# Untokenized view of the same GSM8K slice; `refs` holds its 'answer' column.
raw_data = load_dataset(
    'openai/gsm8k',
    name='main',
    split='train[:200]',
)
refs = raw_data['answer']

def generate(model, instruction, max_new_tokens=256, return_full_text=True):
    """Generate a response to `instruction` using the notebook's prompt template.

    The prompt is ``### Instruction:\\n<instruction>\\n### Response:\\n``.

    Args:
        model: Model exposing ``.generate`` and ``.device``.
        instruction: Text inserted into the instruction slot of the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
        return_full_text: When True (default), the decoded string contains the
            prompt followed by the continuation. When False, only the newly
            generated continuation is decoded.

    Returns:
        The decoded text with special tokens removed.
    """
    prompt = f'### Instruction:\n{instruction}\n### Response:\n'

    # Keep the whole encoding (input_ids and attention_mask) and move it to the
    # model's own device rather than assuming 'cuda'. Passing the attention mask
    # explicitly avoids generate() having to infer it from the pad token.
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=max_new_tokens)

    sequence = out[0]
    if not return_full_text:
        # The output starts with the prompt tokens; drop them.
        sequence = sequence[inputs['input_ids'].shape[1]:]
    return tokenizer.decode(sequence, skip_special_tokens=True)