In [1]:
import torch, re, gc, evaluate
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
from datasets import load_dataset
from sklearn.metrics import f1_score
from tqdm import tqdm
import os

gc.collect()
torch.cuda.empty_cache()

if torch.cuda.is_bf16_supported():
  compute_dtype = torch.bfloat16
  attn_implementation = 'flash_attention_2'
else:
  compute_dtype = torch.float16
  attn_implementation = 'sdpa'

# Bits and Bytes configuration for the model
use_4bit = True
bnb_4bit_compute_dtype = "bfloat16"
bnb_4bit_quant_type = "nf4"
use_double_quant = True

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_double_quant,
)

# Load Model on GPU 
device_map = {"": 0}

# Load the evaluation dataset
eval_dataset = load_dataset("lex_glue", "case_hold", split="test")
# eval_dataset = eval_dataset.take(100)

# Prepare the evaluation data without including the label in the input text
def create_message_column(row):
    messages = []
    answers = "".join([f"{i}. {val}\n" for i, val in enumerate(row['endings'])])
    user = {
        "content": f"Choose the best option to fill <HOLDING> based on this context, answer only with the number of option(for example: 0):\n{row['context']}\nOptions to choose:\n{answers}",
        "role": "user"
    }
    messages.append(user)
    return {"messages": messages, "label": row["label"]}

def format_dataset_chatml(row):
    return {"text": tokenizer.apply_chat_template(row["messages"], add_generation_prompt=False, tokenize=False), "label": row["label"]}


def format_func(example):
    answers = "".join([f"{i}. {val}\n" for i, val in enumerate(example['endings'])])
    text = f"<|user|>Choose the best option to fill <HOLDING> based on this context, answer only with the number of option (for example: 0):\n{example['context']}\nOptions to choose:\n{answers}<|end|>\n<|assistant|>"
    return {"text": text, "label": example["label"], "endings": example["endings"], "context":  example["context"]}

rouge = evaluate.load("rouge")

models = ["./CH-Phi3m4k","microsoft/Phi-3-mini-4k-instruct"]

for model in models:
    local_model_dir = model
    print(f"\n\n\nEVALUATE {local_model_dir}")
    try:
        # Load the tokenizer and model from the local directory
        tokenizer = AutoTokenizer.from_pretrained(local_model_dir, trust_remote_code=True, device_map=device_map, attn_implementation=attn_implementation)
        model = AutoModelForCausalLM.from_pretrained(local_model_dir, torch_dtype=compute_dtype, trust_remote_code=True, quantization_config=bnb_config, device_map=device_map, attn_implementation=attn_implementation)

        eval_dataset = eval_dataset.map(format_func)
        # eval_dataset = eval_dataset.map(format_dataset_chatml)

        # Set up the evaluation pipeline
        evaluation_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

        # Perform evaluation
        true_labels = []
        predicted_labels = []
        error_answers = []

        for i, example in enumerate(tqdm(eval_dataset, desc="Evaluating")):
            input_text = example['text']
            expected_label = example['label']

            expected_answer = [str(example['label']), example['endings'][example['label']], str(example['label']) +". "+ example['endings'][example['label']]]
            # expected_answer = "\n".join(expected_answer_arr)

            result = evaluation_pipeline(input_text, max_length=1024, num_return_sequences=1)
            generated_text = result[0]['generated_text']

            # print(f"Expected Label: {expected_label}")
            # print(f"Generated Text: {generated_text}")

            try:
                answer = generated_text.strip().split("<|assistant|>")[-1].strip().replace("<|end|>", "").replace("<|endoftext|>", "").strip()
                predicted_label = int(re.search(r"(\d+)", answer).group(1))
                if(predicted_label != expected_label):
                    pred = [generated_text]
                    refs = [expected_answer]
                    scores = rouge.compute(predictions=pred, references=refs)
                    total_score = sum(scores.values())
                    if(total_score == 4):
                        predicted_label = expected_label
                    else:
                        predicted_label = -1
            except Exception:
                    predicted_label = -1
            
            true_labels.append(expected_label)
            predicted_labels.append(predicted_label)
            if(predicted_label == -1):
                error_answers.append({"label": expected_label, "response": generated_text})

        print("Evaluation Complete")
        # Calculate micro and macro F1 scores
        micro_f1 = f1_score(true_labels, predicted_labels, average='micro')
        # macro_f1 = f1_score(true_labels, predicted_labels, average='macro')

        print(f"F1 Score: {(micro_f1*100):.2f}")
        # print(f"Macro F1 Score: {(macro_f1*100):.2f}")

        # Save the error answers to json file
        err_file_name = local_model_dir.replace("/", "_")
        error_file = f"{err_file_name}_error_answers.json"
        with open(error_file, "w") as f:
            f.write(str(error_answers))

        # Clear memory after evaluation
        del model
        del tokenizer
        torch.cuda.empty_cache()
        # clear RAM after finishing the evaluation
        gc.collect()
        torch.cuda.empty_cache()
    except Exception as e:
        print(f"Error: {e}")
        del model
        del tokenizer
        torch.cuda.empty_cache()
        gc.collect()
        torch.cuda.empty_cache()


  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.





EVALUATE ./CH-Phi3m4k


Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.50s/it]
Evaluating:   0%|          | 0/3600 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Evaluating:   0%|          | 10/3600 [00:05<26:55,  2.22it/s] You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Evaluating: 100%|██████████| 3600/3600 [25:55<00:00,  2.31it/s]


Evaluation Complete
F1 Score: 76.89



EVALUATE microsoft/Phi-3-mini-4k-instruct


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.41s/it]
Evaluating:   0%|          | 0/3600 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Evaluating: 100%|██████████| 3600/3600 [1:19:59<00:00,  1.33s/it]  

Evaluation Complete
F1 Score: 64.89



