In [None]:
!pip install torch datasets tdqm transformers peft

In [None]:
import torch
import time
import os
from datasets import load_dataset
import pandas as pd
from tqdm.notebook import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import json

In [None]:
# --- 1. Configuration ---
# Checkpoint id used for both the base model and its tokenizer.
base_model_name = "microsoft/phi-3-mini-4k-instruct"

# --- !! SET YOUR ADAPTER PATH !! ---
# Location handed to PeftModel.from_pretrained when the adapter is loaded.
adapter_path = "/content/drive/MyDrive/my-phi3-json-adapter" # CHANGE THIS

# JSON-lines test set to read, and the name of the file the raw results go to.
test_dataset_path = "test_json_extraction.jsonl"
output_results_filename = "results_slm_tuned_task2.jsonl" # Output file

# Optional: to write to Google Drive instead, mount it and switch the base path.
# from google.colab import drive
# drive.mount('/content/drive')
# output_base_path = "/content/drive/MyDrive/"
output_base_path = "./"
output_results_filepath = os.path.join(output_base_path, output_results_filename)

# Inference moves its inputs to "cuda", so stop right away when no GPU is present.
if torch.cuda.is_available():
    print(f"GPU detected: {torch.cuda.get_device_name(0)}")
else:
    raise SystemError("GPU not available.")

In [None]:
# --- 2. Load Base Model and Tokenizer (Quantized) ---
print("Loading base model and tokenizer (4-bit)...")

# Quantization settings: 4-bit NF4 weights with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)

tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
# Pad on the left and reuse the EOS token as the padding token.
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# --- 3. Load Fine-Tuned Adapter ---
print(f"Loading adapter from: {adapter_path}")
# Wrap the quantized base model with the adapter found at adapter_path.
model = PeftModel.from_pretrained(model=base_model, model_id=adapter_path)
# model = model.merge_and_unload() # Optional merge
# Evaluation mode for inference.
model.eval()
print("Using FINE-TUNED model for inference.")

In [None]:
# --- 4. Load Test Data ---
print(f"Loading test dataset from: {test_dataset_path}")
try:
    test_dataset = load_dataset("json", data_files=test_dataset_path, split="train")
    print(f"Test dataset loaded with {len(test_dataset)} examples.")
    required_cols = ['formatted_prompt', 'ground_truth_json', 'schema']
    # Reject the file if any column read by the evaluation loop is absent.
    if any(col not in test_dataset.column_names for col in required_cols):
        raise ValueError(f"Test file missing required columns: {required_cols}")
except Exception as load_error:
    # Report the failure, then let it propagate so the notebook stops here.
    print(f"Error loading test dataset: {load_error}")
    raise

In [None]:
# --- 5. Function to Run Inference (Tuned Model) ---
@torch.inference_mode()
def get_tuned_slm_json_output(prompt_text):
    """Generate a completion for one prompt and extract a JSON object from it.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        prompt_text: Prompt string. Everything from the first "</s>" onward is
            dropped and a single "</s>" is re-appended before tokenization.

    Returns:
        A ``(json_output, latency, cleaned_response)`` tuple where
        ``json_output`` is the text of the JSON value that starts at the first
        "{" of the response (None when there is no "{" or the text there does
        not parse), ``latency`` is the number of seconds spent inside
        ``model.generate``, and ``cleaned_response`` is the stripped decoded
        continuation.
    """
    # Tuned model expects the full prompt including <s>[INST]...[/INST]
    # NOTE(review): "</s>" looks like an end-of-sequence marker; appending it
    # before generation may not be what the adapter was trained on -- confirm
    # against the fine-tuning prompt format. Behavior is left unchanged here.
    input_prompt = prompt_text.split("</s>")[0] + "</s>"

    inputs = tokenizer(input_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024).to("cuda")

    # perf_counter is monotonic, so the interval is immune to system clock adjustments.
    start_time = time.perf_counter()
    outputs = model.generate(
        **inputs,
        max_new_tokens=512, # Allow space for JSON
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=False,
    )
    latency = time.perf_counter() - start_time

    # The output sequence starts with the prompt tokens; decode only what follows them.
    input_token_len = inputs.input_ids.shape[1]
    generated_text = tokenizer.decode(outputs[0][input_token_len:], skip_special_tokens=True)
    cleaned_response = generated_text.strip()

    # Parse exactly one JSON value beginning at the first "{". Unlike slicing up
    # to the last "}", this still succeeds when the model keeps generating after
    # the object (repeated objects, explanations containing braces, ...), and it
    # yields the same string whenever first-"{"-to-last-"}" was itself valid JSON.
    json_output = None
    start_brace = cleaned_response.find('{')
    if start_brace != -1:
        try:
            _, end_index = json.JSONDecoder().raw_decode(cleaned_response, start_brace)
            json_output = cleaned_response[start_brace:end_index]
        except (json.JSONDecodeError, RecursionError):
            # Invalid, truncated or pathologically nested JSON: leave json_output
            # as None so the evaluation can count it as a miss.
            pass

    return json_output, latency, cleaned_response

In [None]:
# --- 6. Run Evaluation Loop ---
results = []
print("\nStarting evaluation for FINE-TUNED SLM - Task 2...")
for example in tqdm(test_dataset):
    # One generation per test example; the raw response is kept alongside the
    # extracted JSON string so failures can be inspected later.
    predicted_json_str, latency, raw_response = get_tuned_slm_json_output(
        example['formatted_prompt']
    )
    results.append(dict(
        schema=example['schema'],
        ground_truth=example['ground_truth_json'],
        predicted_json_str=predicted_json_str,
        latency=latency,
        raw_response=raw_response,
    ))

print("\nEvaluation complete.")

In [None]:
# --- 7. Save Results ---
print(f"Saving raw results to: {output_results_filepath}")
try:
    with open(output_results_filepath, 'w', encoding='utf-8') as out_file:
        for record in results:
            # ground_truth is written as a JSON-encoded string; every other
            # field passes through unchanged. The in-memory record is not modified.
            serializable = {
                **record,
                'ground_truth': json.dumps(record['ground_truth'], ensure_ascii=False),
            }
            out_file.write(json.dumps(serializable, ensure_ascii=False) + '\n')
    print(f"Successfully saved {len(results)} results.")
except Exception as save_error:
    # Best-effort save: report the problem without raising.
    print(f"Error saving results file: {save_error}")