In [21]:
!pip install transformers peft bitsandbytes datasets accelerate tqdm torch




In [22]:
import torch
import time
import os
from datasets import load_dataset
import pandas as pd
from tqdm.notebook import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import json

In [23]:
# --- 1. Configuration ---
base_model_name = "microsoft/phi-3-mini-4k-instruct"
test_dataset_path = "/content/drive/MyDrive/test_json_extraction.jsonl" # Path to your test data
output_results_filename = "results_slm_zeroshot_task2.jsonl" # Output file

# Optional: Google Drive path
# from google.colab import drive
# drive.mount('/content/drive')
output_base_path = "/content/drive/MyDrive/"
output_base_path = "./" # Save to current Colab directory
output_results_filepath = os.path.join(output_base_path, output_results_filename)

# Check GPU
if not torch.cuda.is_available():
    raise SystemError("GPU not available.")
print(f"GPU detected: {torch.cuda.get_device_name(0)}")

GPU detected: Tesla T4


In [28]:
print("Loading BASE model and tokenizer (4-bit)...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    attn_implementation="eager"  # <-- ADICIONE ESTA LINHA
)
model.config.use_cache = True
model.eval()

tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
print("Using BASE model (untuned) for inference.")

Loading BASE model and tokenizer (4-bit)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using BASE model (untuned) for inference.


In [25]:
# --- 3. Load Test Data (Prepared for Evaluation) ---
print(f"Loading test dataset from: {test_dataset_path}")
try:
    # Load the test file which should contain 'formatted_prompt', 'ground_truth_json', 'schema'
    test_dataset = load_dataset("json", data_files=test_dataset_path, split="train")
    print(f"Test dataset loaded with {len(test_dataset)} examples.")
    # Check required columns
    required_cols = ['formatted_prompt', 'ground_truth_json', 'schema']
    if not all(col in test_dataset.column_names for col in required_cols):
         raise ValueError(f"Test file missing required columns: {required_cols}")
except Exception as e:
    print(f"Error loading test dataset: {e}")
    raise

Loading test dataset from: /content/drive/MyDrive/test_json_extraction.jsonl
Test dataset loaded with 40 examples.


In [26]:
# --- 4. Function to Run Inference ---
@torch.inference_mode()
def get_slm_json_output(prompt_text):
    # Extract the part needed for the base model prompt (inside <s>[INST]...[/INST])
    # The base instruct model might work better if prompted more directly
    try:
        instruction_part = prompt_text.split("[INST]")[1].split("[/INST]")[0].strip()
        # Maybe simplify for zero-shot base model? Or keep the full instruction? Test both if needed.
        # Let's try keeping the full instruction format for consistency first.
        input_prompt = prompt_text.split("</s>")[0] + "</s>" # Keep original format for now
        # Alternative: input_prompt = instruction_part # Just the inner instruction part

    except IndexError:
         print(f"Warning: Could not parse [INST] tags in: {prompt_text[:100]}...")
         input_prompt = prompt_text # Fallback to using the full string if parsing fails

    inputs = tokenizer(input_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024).to("cuda")

    start_time = time.time()

    outputs = model.generate(
        **inputs,
        max_new_tokens=512, # Allow more tokens for potentially complex JSON
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=False, # Use greedy for consistency
        temperature=None,
        top_p=None,
    )

    end_time = time.time()
    latency = end_time - start_time

    input_token_len = inputs.input_ids.shape[1]
    generated_text = tokenizer.decode(outputs[0][input_token_len:], skip_special_tokens=True)
    cleaned_response = generated_text.strip()

    # Attempt to find the JSON part (heuristic)
    json_output = None
    try:
        # Look for the first '{' and the last '}'
        start_brace = cleaned_response.find('{')
        end_brace = cleaned_response.rfind('}')
        if start_brace != -1 and end_brace != -1 and end_brace > start_brace:
            potential_json = cleaned_response[start_brace : end_brace + 1]
            # Try parsing to validate
            json.loads(potential_json)
            json_output = potential_json # Assume valid if parsing works
        else:
             # Look for ```json ... ``` markdown block if no braces found
             start_md = cleaned_response.find('```json')
             end_md = cleaned_response.rfind('```')
             if start_md != -1 and end_md != -1 and end_md > start_md:
                  potential_json = cleaned_response[start_md + 7 : end_md].strip()
                  json.loads(potential_json)
                  json_output = potential_json

    except (json.JSONDecodeError, Exception) as e:
        # If parsing fails or braces not found, keep json_output as None
        pass # Handle invalid/missing JSON during evaluation

    return json_output, latency, cleaned_response # Return extracted JSON, latency, raw response

In [27]:
# --- 5. Run Evaluation Loop ---
results = []
print("\nStarting evaluation for UNTUNED SLM (Zero-Shot) - Task 2...")
for example in tqdm(test_dataset):
    prompt = example['formatted_prompt'] # The full prompt string
    ground_truth = example['ground_truth_json'] # The GT JSON object
    schema = example['schema'] # The schema string
    predicted_json_str, latency, raw_response = get_slm_json_output(prompt)

    results.append({
        "schema": schema,
        "ground_truth": ground_truth,
        "predicted_json_str": predicted_json_str, # Store the extracted string (or None)
        "latency": latency,
        "raw_response": raw_response
    })

print("\nEvaluation complete.")


Starting evaluation for UNTUNED SLM (Zero-Shot) - Task 2...


  0%|          | 0/40 [00:00<?, ?it/s]

aqui


AttributeError: 'DynamicCache' object has no attribute 'seen_tokens'

In [None]:
# --- 6. Save Results ---
print(f"Saving raw results to: {output_results_filepath}")
try:
    with open(output_results_filepath, 'w', encoding='utf-8') as f:
        for item in results:
            # Convert ground_truth back to string for saving in JSONL
            item_to_save = item.copy()
            item_to_save['ground_truth'] = json.dumps(item_to_save['ground_truth'], ensure_ascii=False)
            f.write(json.dumps(item_to_save, ensure_ascii=False) + '\n')
    print(f"Successfully saved {len(results)} results.")
except Exception as e:
    print(f"Error saving results file: {e}")