In [None]:
import logging
import sys
from functools import partial
from pathlib import Path

import pandas as pd
from datasets import load_dataset
from omegaconf import OmegaConf
from peft import PeftModel
from transformers import (
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# --- Run configuration -------------------------------------------------------
# Hub dataset id and the configuration name passed to `load_dataset`.
dataset_name = "kamel-usp/aes_enem_dataset"
dataset_split = "JBCS2025"
# Directory handed to the loaders below as their download cache.
cache_dir = "/tmp/"
# Hub id of the backbone checkpoint and of the fine-tuned model under evaluation.
base_model = "microsoft/Phi-3.5-mini-instruct"
fine_tuned_model_id = "kamel-usp/jbcs2025_phi35-balanced-C3"

# The fine-tuned id ends in "-C<n>" (1-based); the grade index is n - 1.
# Parse the whole numeric suffix instead of only the last character so that a
# malformed or multi-digit id fails loudly rather than yielding a wrong index.
competency_suffix = fine_tuned_model_id.rpartition("-C")[2]
if not competency_suffix.isdigit():
    raise ValueError(
        f"Cannot derive grade index: {fine_tuned_model_id!r} does not end in '-C<number>'"
    )
grade_index = int(competency_suffix) - 1
print(f"Grade index being used: {grade_index}")

Grade index being used: 2


In [2]:
# Make the sibling `scripts/` directory importable (the notebook lives one level
# below the project root). `parent_dir` stays a plain string, as before.
parent_dir = str(Path(".").resolve().parent / "scripts")
# Guard the append so re-running this cell does not pile up duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [3]:
from preprocess import load_tokenizer, tokenize_dataset

from metrics.metrics import compute_metrics
from models.fine_tuning_models.model_types_enum import ModelTypesEnum

In [4]:
# String identifier of the model family being evaluated. It is passed to the
# tokenizer/preprocessing helpers and decides which loading branch is taken.
model_type = ModelTypesEnum.PHI35_CLASSIFICATION_LORA.value
model_type  # bare last expression so the notebook displays the resolved value

'phi35_classification_lora'

In [5]:
# Both loaders share the same download cache location.
hub_cache_kwargs = {"cache_dir": cache_dir}

# Fetch the requested dataset configuration.
dataset = load_dataset(dataset_name, dataset_split, **hub_cache_kwargs)

# Load the tokenizer that matches the backbone checkpoint and model type.
tokenizer = load_tokenizer(model_type, base_model, **hub_cache_kwargs)

# Tokenize the essays; `grade_index` is forwarded so the helper can pick the
# grade to use (presumably as the label — confirm in `preprocess`).
root_logger = logging.getLogger()
tokenized_dataset = tokenize_dataset(
    dataset,
    tokenizer,
    text_column="essay_text",
    grade_index=grade_index,
    model_type=model_type,
    logger=root_logger,
)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/132 [00:00<?, ? examples/s]

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/132 [00:00<?, ? examples/s]

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

In [6]:
# Size of the classification head requested from `from_pretrained`.
NUM_LABELS = 6
if model_type in [
    ModelTypesEnum.LLAMA31_CLASSIFICATION_LORA.value,
    ModelTypesEnum.PHI35_CLASSIFICATION_LORA.value,
    ModelTypesEnum.PHI4_CLASSIFICATION_LORA.value,
]:
    # LoRA model types: load the backbone first, then attach the adapter
    # weights published under `fine_tuned_model_id`.
    # The backbone gets its own name so that `base_model` keeps holding the
    # checkpoint id string; rebinding it to a model object made this cell (and
    # the tokenizer cell) fail when re-run on a live kernel.
    backbone = AutoModelForSequenceClassification.from_pretrained(
        base_model, cache_dir=cache_dir, num_labels=NUM_LABELS
    )
    model = PeftModel.from_pretrained(backbone, fine_tuned_model_id)
else:
    # Fully fine-tuned checkpoints are loaded directly.
    model = AutoModelForSequenceClassification.from_pretrained(
        fine_tuned_model_id, cache_dir=cache_dir, num_labels=NUM_LABELS
    )
model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of Phi3ForSequenceClassification were not initialized from the model checkpoint at microsoft/Phi-3.5-mini-instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


adapter_config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/50.4M [00:00<?, ?B/s]

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): Phi3ForSequenceClassification(
      (model): Phi3Model(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (layers): ModuleList(
          (0-31): 32 x Phi3DecoderLayer(
            (self_attn): Phi3Attention(
              (o_proj): lora.Linear(
                (base_layer): Linear(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
           

In [7]:
# Minimal config exposing the model type under experiments.model.type; it is
# handed to `compute_metrics` as its `cfg` argument.
model_section = {"type": model_type}
experiment_config = OmegaConf.create({"experiments": {"model": model_section}})

In [8]:
# Evaluation-only Trainer: no training dataset is supplied, only an eval split.
PER_DEVICE_EVAL = 2
# Bind the experiment config so Trainer can call the metric function directly.
compute_metrics_partial = partial(compute_metrics, cfg=experiment_config)

eval_arguments = {
    "output_dir": "test_trainer",
    "do_eval": True,
    "per_device_eval_batch_size": PER_DEVICE_EVAL,
    "bf16": True,
}
training_args = TrainingArguments(**eval_arguments)

# NOTE(review): Trainer warns that it cannot infer `label_names` for the PEFT
# wrapper. Passing `label_names` explicitly would silence it — confirm the
# label column name produced by `tokenize_dataset` before doing so.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics_partial,
    eval_dataset=tokenized_dataset["test"],  # or whichever split you want
)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [9]:
# For the LoRA model types, reuse the end-of-sequence id as the padding id.
lora_model_types = (
    ModelTypesEnum.LLAMA31_CLASSIFICATION_LORA.value,
    ModelTypesEnum.PHI35_CLASSIFICATION_LORA.value,
    ModelTypesEnum.PHI4_CLASSIFICATION_LORA.value,
)
uses_lora_adapter = model_type in lora_model_types
if uses_lora_adapter:
    model.config.pad_token_id = model.config.eos_token_id
# Whatever the branch, evaluation must not start without a padding id.
assert model.config.pad_token_id is not None

In [10]:
# Run evaluation on the test split and keep the returned metrics dict.
test_split = tokenized_dataset["test"]
eval_results = trainer.evaluate(eval_dataset=test_split)

In [11]:
# Show every metric as a column of a single-row frame (row label 0).
pd.Series(eval_results).to_frame().T

Unnamed: 0,eval_loss,eval_model_preparation_time,eval_accuracy,eval_RMSE,eval_QWK,eval_HDIV,eval_Macro_F1,eval_Micro_F1,eval_Weighted_F1,eval_Macro_F1_(ignoring_nan),eval_runtime,eval_samples_per_second,eval_steps_per_second
0,1.67208,0.0244,0.333333,60.433219,0.235356,0.115942,0.262559,0.333333,0.333661,0.31507,18.2815,7.549,3.774


In [12]:
# Pull out the three metrics that go into the model-card snippet.
macro_f1_ignoring_nan = eval_results["eval_Macro_F1_(ignoring_nan)"]
qwk = eval_results["eval_QWK"]
weighted_f1 = eval_results["eval_Weighted_F1"]

# The indentation inside the template is part of the emitted text.
model_card_metrics = f"""
metrics:
          - name: Macro F1 (ignoring nan)
            type: f1
            value: {macro_f1_ignoring_nan}
          - name: QWK
            type: qwk
            value: {qwk}
          - name: Weighted Macro F1
            type: f1
            value: {weighted_f1}"""
print(model_card_metrics)


metrics:
          - name: Macro F1 (ignoring nan)
            type: f1
            value: 0.3150704718786213
          - name: QWK
            type: qwk
            value: 0.23535620052770445
          - name: Weighted Macro F1
            type: f1
            value: 0.3336611749101599


In [14]:
# Metrics to report, in the order they should appear in the table.
columns_to_use = [
    "eval_accuracy",
    "eval_RMSE",
    "eval_QWK",
    "eval_Macro_F1",
    "eval_Macro_F1_(ignoring_nan)",
    "eval_Weighted_F1",
    "eval_Micro_F1",
    "eval_HDIV",
]
# orient="index" already gives one row per metric (single column named 0), so
# the wanted metrics can be selected by row label.
metrics_by_name = pd.DataFrame.from_dict(eval_results, orient="index")
report_df = metrics_by_name.loc[columns_to_use]
report_markdown = report_df.rename(columns={0: "test_data"}).to_markdown()
print(report_markdown)

|                              |   test_data |
|:-----------------------------|------------:|
| eval_accuracy                |    0.333333 |
| eval_RMSE                    |   60.4332   |
| eval_QWK                     |    0.235356 |
| eval_Macro_F1                |    0.262559 |
| eval_Macro_F1_(ignoring_nan) |    0.31507  |
| eval_Weighted_F1             |    0.333661 |
| eval_Micro_F1                |    0.333333 |
| eval_HDIV                    |    0.115942 |
