In [1]:
from unsloth import FastLanguageModel
from src.llm_helper import invoke_llm, max_seq_length
import pandas as pd
import numpy as np
from tqdm import tqdm
from src.helper import rouge_lsum

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
from datasets import load_dataset

# Read the local JSON Lines file and expose it as the 'test' split.
dataset = load_dataset('json', data_files={'test': 'data/test.jsonl'})

In [3]:
# Fine-tuned checkpoint, read from the local `saved_models/full_model2` path.
# 4-bit loading is switched off and 8-bit loading is switched on.
ft_model, ft_tokenizer = FastLanguageModel.from_pretrained(
    model_name="saved_models/full_model2",
    max_seq_length=max_seq_length,
    load_in_4bit=False,
    load_in_8bit=True,
)

==((====))==  Unsloth 2025.8.9: Fast Gemma3 patching. Transformers: 4.55.4.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.988 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [32]:
# Base (not fine-tuned) model used as the comparison baseline.
# Loaded with the same sequence length and quantization flags as `ft_model`.
base_model, base_tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/gemma-3-270m-it",
    max_seq_length=max_seq_length,
    load_in_4bit=False,
    load_in_8bit=True,
)

==((====))==  Unsloth 2025.8.9: Fast Gemma3 patching. Transformers: 4.55.4.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.988 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [33]:
# Inference with the base model: generate an answer for every test example
# and score it against the reference output.
# Each stored record is [precision, recall, fmeasure, instruction, reference, prediction].
base_test_res = []
for example in tqdm(dataset['test']):
    instruction, reference = example['instruction'], example['output']
    answer = invoke_llm(instruction, base_model, base_tokenizer)
    S = rouge_lsum(answer, reference)
    base_test_res.append(
        [S.precision, S.recall, S.fmeasure, instruction, reference, answer]
    )


100%|██████████| 1000/1000 [13:40<00:00,  1.22it/s]


In [37]:

# Persist the raw base-model records so the long inference run need not be repeated.
# No column names are supplied, so pandas writes its default integer column labels.
pd.DataFrame(data=base_test_res).to_csv(path_or_buf='_base_test_res', index=False)

In [None]:
# Inference with the fine-tuned model: generate an answer for every test
# example and score it against the reference output.
# Each stored record is [precision, recall, fmeasure, instruction, reference, prediction].
ft_test_res = []
for example in tqdm(dataset['test']):
    instruction, reference = example['instruction'], example['output']
    answer = invoke_llm(instruction, ft_model, ft_tokenizer)
    S = rouge_lsum(answer, reference)
    ft_test_res.append(
        [S.precision, S.recall, S.fmeasure, instruction, reference, answer]
    )


  0%|          | 1/1000 [00:13<3:40:16, 13.23s/it]

In [None]:
def eval(ft_res, base_res, score_idx):
    """Combine fine-tuned and base-model results into one comparison table.

    Records from ``ft_res`` and ``base_res`` are paired by position.

    NOTE: this function shadows the built-in ``eval``; the name is kept so
    existing callers continue to work.

    Args:
        ft_res: Sequence of per-example records for the fine-tuned model.
            Each record is indexable, with precision, recall and f-score at
            positions 0, 1 and 2, the query at position 3, the reference
            answer at position 4 and the model prediction as its last element.
        base_res: Records for the base model in the same layout. Must contain
            at least as many records as ``ft_res``; extra records are ignored.
        score_idx: Unused. Retained only for backward compatibility.

    Returns:
        A ``pandas.DataFrame`` with one row per record in ``ft_res`` and the
        columns ``ft_precision``, ``ft_recall``, ``ft_fscore``,
        ``base_precision``, ``base_recall``, ``base_fscore``, ``query``,
        ``answer``, ``pred_ft`` and ``pred_base``. The query and answer are
        taken from ``ft_res``.

    Raises:
        IndexError: If ``base_res`` has fewer records than ``ft_res``.
    """
    columns = [
        'ft_precision', 'ft_recall', 'ft_fscore',
        'base_precision', 'base_recall', 'base_fscore',
        'query', 'answer',
        'pred_ft', 'pred_base'
    ]

    # Fail early with a clear message instead of an opaque index error
    # halfway through building the table.
    if len(base_res) < len(ft_res):
        raise IndexError(
            f"base_res has {len(base_res)} records but ft_res has {len(ft_res)}"
        )

    # Build every row first and construct the frame once; growing a DataFrame
    # row by row with ``.loc`` re-allocates on each insert.
    rows = [
        [
            ft[0], ft[1], ft[2],
            base[0], base[1], base[2],
            ft[3], ft[4],
            ft[-1], base[-1],
        ]
        for ft, base in zip(ft_res, base_res)  # zip stops at len(ft_res)
    ]
    return pd.DataFrame(rows, columns=columns)
    


In [None]:
# Merge both result sets into a single comparison table and write it out as CSV.
eval_df = eval(ft_res=ft_test_res, base_res=base_test_res, score_idx=0)
eval_df.to_csv(path_or_buf='evaluation/eval_df.csv', index=False)

In [None]:
# Seed NumPy's legacy global RNG. `np.seed` does not exist and raises
# AttributeError; the seeding function lives in the `np.random` namespace.
np.random.seed(1)
# A cell renders only its final expression, so bundle both previews into one
# tuple to display the first two base and fine-tuned records together.
base_test_res[:2], ft_test_res[:2]