### This file was created to evaluate models trained on the OASST1 dataset with the IFEval prompts, before the file 'nlp_ifeval.ipynb' existed. Here the evaluation relies on human judgment of the results: the model generates a response to each prompt, and the responses are saved to a file named 'ifeval_human_perception.jsonl'.

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
import torch
import json
from tqdm import tqdm

In [None]:
# Checkpoint path handed to `from_pretrained` for both the tokenizer and the model.
model_name = "model_OASST1_2000/checkpoint-2000"

In [None]:
# Step 1: Load the tokenizer and the 4-bit quantized model
device = torch.device('cuda')
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
compute_dtype = torch.bfloat16  # dtype used for computation on top of the 4-bit weights
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                    # Load the model weights in 4-bit precision
    bnb_4bit_quant_type="nf4",            # Normal Float 4 quantization
    bnb_4bit_compute_dtype=compute_dtype, # Run computations in bfloat16
    bnb_4bit_use_double_quant=True,       # Nested quantization of the quantization constants (saves memory)
)

# `device_map` already places the quantized weights on `device`, so no `.to(device)`
# is chained here: it would be redundant, and transformers raises a ValueError when
# `.to()` is called on a bitsandbytes-quantized model with some library versions.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    trust_remote_code=True,
    quantization_config=bnb_config,
)


In [None]:
# Step 2: Load the google/IFEval dataset
# No split is requested here; the generation step indexes the result with
# ['train'] and reads each sample's 'prompt' field.
dataset = load_dataset("google/IFEval")
# Bare expression so the notebook displays the loaded object.
dataset


In [None]:
# keep 20 examples only to speed up the evaluation
# dataset = dataset['train'].select(range(20))
# dataset

In [None]:
# Step 3: Generate a response for every prompt and store prompt/response pairs
# as one JSON object per line.
output_file = "ifeval_human_perception.jsonl"
max_prompt_chars = 512  # prompts are cut to this many *characters* (not tokens)
max_total_tokens = 512  # `max_length` budget: prompt tokens + generated tokens

with open(output_file, 'w', encoding='utf-8') as f_out:
    for sample in tqdm(dataset['train']):
        # Slicing is a no-op for prompts that are already short enough.
        prompt = sample['prompt'][:max_prompt_chars]

        # Calling the tokenizer (instead of `encode`) also yields the attention
        # mask, which is passed to `generate` explicitly rather than left to be inferred.
        encoded = tokenizer(prompt, return_tensors="pt").to(device)
        input_ids = encoded["input_ids"]

        outputs = model.generate(
            input_ids,
            attention_mask=encoded.get("attention_mask"),
            max_length=max_total_tokens,
            eos_token_id=tokenizer.eos_token_id,
        )

        # The returned sequence starts with the prompt tokens. Drop them by token
        # count rather than slicing the decoded string by len(prompt): decoding
        # does not always reproduce the prompt text character-for-character, so a
        # character offset can cut into the response or leave prompt residue.
        new_token_ids = outputs[0][input_ids.shape[-1]:]
        response = tokenizer.decode(new_token_ids, skip_special_tokens=True)

        print("Prompt", prompt)
        print("Response", response)

        # One JSON object per line (JSON Lines).
        f_out.write(json.dumps({"prompt": prompt, "response": response}) + '\n')

In [None]:
# Read back the saved prompt/response pairs and print them for manual review.
# The path is the file this notebook writes in the generation step; the former
# "model_responses.jsonl" is not produced anywhere in this notebook.
with open("ifeval_human_perception.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at end of file
        record = json.loads(line)  # parse each line once
        prompt = record["prompt"]
        response = record["response"]
        print("####################")
        print(f"Prompt: {prompt}")
        print(f"Response: {response}")