In [None]:
import logging
import sys
from functools import partial
from pathlib import Path

from datasets import load_dataset
from omegaconf import OmegaConf
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import pandas as pd

# Evaluation configuration: dataset repository/configuration, local cache
# directory, base checkpoint and the fine-tuned model under evaluation.
dataset_name = "kamel-usp/aes_enem_dataset"
dataset_split = "JBCS2025"
cache_dir = "/tmp/"
base_model = "meta-llama/Llama-3.1-8B"
fine_tuned_model_id = "kamel-usp/jbcs2025_llama31_8b-balanced-C3"

# The model id ends in "-C<k>" (presumably the 1-based competency the model was
# trained on -- confirm); the grade index used downstream is the zero-based k - 1.
# Parse the whole suffix after the last "-C" rather than only the final
# character, so multi-digit suffixes work and a malformed id fails loudly.
_id_head, _id_sep, _grade_suffix = fine_tuned_model_id.rpartition("-C")
if not _id_sep or not (_grade_suffix.isascii() and _grade_suffix.isdigit()) or int(_grade_suffix) < 1:
    raise ValueError(
        f"Cannot derive grade index: expected model id ending in '-C<k>' with k >= 1, "
        f"got {fine_tuned_model_id!r}"
    )
grade_index = int(_grade_suffix) - 1
# NOTE(review): for an id ending in "-C3" this prints 2; re-run the cell if the
# saved output shows a different value.
print(f"Grade index being used: {grade_index}")

Grade index being used: 1


In [2]:
# Make the "scripts" directory that sits next to this notebook's folder
# importable, so the project-local modules can be imported in later cells.
parent_dir = str(Path(".").resolve().parent / "scripts")
# Guard the append so re-running this cell does not stack duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [3]:
from preprocess import load_tokenizer, tokenize_dataset
from metrics.metrics import compute_metrics
from models.fine_tuning_models.model_types_enum import ModelTypesEnum

In [4]:
# String identifier of the model variant; it is passed to the tokenizer loader,
# the tokenization helper and the metrics config further down.
model_type = ModelTypesEnum.LLAMA31_CLASSIFICATION_LORA.value
# Bare expression so the notebook displays the resolved value.
model_type

'llama31_classification_lora'

In [5]:
# Fetch the dataset; downloads are kept under `cache_dir`.
# NOTE(review): the second positional argument of `load_dataset` is the
# configuration name, so `dataset_split` presumably names a config -- confirm.
dataset = load_dataset(dataset_name, dataset_split, cache_dir=cache_dir)

# Tokenizer for the base checkpoint, built by the project helper.
tokenizer = load_tokenizer(model_type, base_model, cache_dir=cache_dir)

# Tokenize the essay text with the project helper, selecting the grade at
# `grade_index`; progress messages go to the root logger.
tokenized_dataset = tokenize_dataset(
    dataset,
    tokenizer,
    text_column="essay_text",
    grade_index=grade_index,
    model_type=model_type,
    logger=logging.getLogger(),
)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/132 [00:00<?, ? examples/s]

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

In [6]:
# Size of the classification head.
# NOTE(review): presumably the number of distinct grade levels -- confirm.
NUM_LABELS = 6
# Load the fine-tuned checkpoint as a sequence-classification model.
# NOTE(review): the recorded output warns that 'score.weight' was newly
# initialized when the base checkpoint was loaded; verify that the adapter
# weights restore the classification head before trusting the metrics.
model = AutoModelForSequenceClassification.from_pretrained(
    fine_tuned_model_id, cache_dir=cache_dir, num_labels=NUM_LABELS
)

adapter_config.json:   0%|          | 0.00/825 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.1-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


adapter_model.safetensors:   0%|          | 0.00/84.0M [00:00<?, ?B/s]

In [7]:
# Minimal stand-in config: only `experiments.model.type` is populated.
# It is later bound as the `cfg` argument of `compute_metrics`.
experiment_config = OmegaConf.create({"experiments": {"model": {"type": model_type}}})

In [8]:
# Per-device batch size used during evaluation.
PER_DEVICE_EVAL = 2

# Pre-bind the config so the Trainer can invoke the metric function without
# having to supply `cfg` itself.
compute_metrics_partial = partial(compute_metrics, cfg=experiment_config)

# Evaluation-only arguments; `output_dir` is where the Trainer writes its files.
training_args = TrainingArguments(
    output_dir="test_trainer", do_eval=True, per_device_eval_batch_size=PER_DEVICE_EVAL
)

# No training dataset is given: this Trainer is only used to score the test split.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics_partial,
    eval_dataset=tokenized_dataset["test"],  # swap the key to evaluate another split
)

In [9]:
# Copy the tokenizer's padding token id into the model config. The Trainer was
# given this same model object, so the change is in place before evaluation.
# NOTE(review): presumably required for batched sequence classification -- confirm.
model.config.pad_token_id = tokenizer.pad_token_id

In [10]:
# Evaluate on the dataset passed as `eval_dataset`; the result is indexed by
# metric name (e.g. "eval_QWK") in the cells below.
# The recorded run reports an eval_runtime of roughly 429 seconds.
eval_results = trainer.evaluate()

In [11]:
# Show every metric as one wide row (metric names as columns, single row 0).
pd.Series(eval_results).to_frame().T

Unnamed: 0,eval_loss,eval_model_preparation_time,eval_accuracy,eval_RMSE,eval_QWK,eval_HDIV,eval_Macro_F1,eval_Micro_F1,eval_Weighted_F1,eval_Macro_F1_(ignoring_nan),eval_runtime,eval_samples_per_second,eval_steps_per_second
0,1.749257,0.0147,0.304348,73.819207,0.199259,0.15942,0.186761,0.304348,0.291343,0.224113,428.7594,0.322,0.161


In [12]:
# Print a YAML-style `metrics:` snippet with three of the evaluation results.
# NOTE(review): presumably meant to be pasted into model-card metadata -- confirm.
# NOTE(review): the entry labelled "Weighted Macro F1" reports
# eval_Weighted_F1; confirm the label is intended.
print(f"""
metrics:
          - name: Macro F1 (ignoring nan)
            type: f1
            value: {eval_results["eval_Macro_F1_(ignoring_nan)"]}
          - name: QWK
            type: qwk
            value: {eval_results["eval_QWK"]}
          - name: Weighted Macro F1
            type: f1
            value: {eval_results["eval_Weighted_F1"]}""")


metrics:
          - name: Macro F1 (ignoring nan)
            type: f1
            value: 0.22411344969352892
          - name: QWK
            type: qwk
            value: 0.19925925925925914
          - name: Weighted Macro F1
            type: f1
            value: 0.29134261033082426


In [13]:
# Metrics included in the report, in the order they should appear.
columns_to_use = [
    "eval_accuracy", "eval_RMSE", "eval_QWK",
    "eval_Macro_F1", "eval_Macro_F1_(ignoring_nan)",
    "eval_Weighted_F1", "eval_Micro_F1", "eval_HDIV",
]
# One row per selected metric in a single column labelled 0; a missing metric
# raises KeyError.
report_df = pd.Series(eval_results).loc[columns_to_use].to_frame()
# The column is renamed only for the printed markdown table.
print(report_df.rename(columns={0: "test_data"}).to_markdown())

|                              |   test_data |
|:-----------------------------|------------:|
| eval_accuracy                |    0.304348 |
| eval_RMSE                    |   73.8192   |
| eval_QWK                     |    0.199259 |
| eval_Macro_F1                |    0.186761 |
| eval_Macro_F1_(ignoring_nan) |    0.224113 |
| eval_Weighted_F1             |    0.291343 |
| eval_Micro_F1                |    0.304348 |
| eval_HDIV                    |    0.15942  |
