# Product Owner LoRA Model - Evaluation (Baseline vs Student)

Este notebook evalúa el modelo Product Owner entrenado con LoRA contra el baseline.

**Requisitos**:
- Google Colab con GPU (T4 Free)
- El notebook clona automáticamente el repositorio

**Pasos**:
1. Verificar GPU
2. Instalar dependencias
3. Clonar repositorio con el modelo LoRA
4. Ejecutar evaluación baseline (Qwen2.5-7B sin LoRA)
5. Ejecutar evaluación student (Qwen2.5-7B + LoRA)
6. Comparar resultados

## 1. Verificar GPU

In [None]:
# Run nvidia-smi through the shell to confirm that a GPU is attached to this runtime.
!nvidia-smi

## 2. Instalar Dependencias

In [None]:
%%bash
# Install the evaluation dependencies.
# Each version specifier is quoted: unquoted, bash parses ">=X" as an output
# redirection, which silently drops the constraint and creates files such as
# "=4.36.0" in the working directory.
pip install -q "transformers>=4.36.0" "peft>=0.7.0" "bitsandbytes>=0.41.0" "accelerate>=0.25.0" torch typer pyyaml

## 3. Clonar Repositorio y Verificar Modelo LoRA

In [None]:
import os
from pathlib import Path

# 1. Clonar repositorio con el modelo
print("üì• Clonando repositorio con el modelo LoRA...")

repo_url = "https://github.com/krukmat/agnostic-ai-pipeline.git"
repo_branch = "dspy-multi-role"
repo_path = "/content/agnostic-ai-pipeline"

if not os.path.exists(repo_path):
    !git clone --depth 1 --branch {repo_branch} {repo_url} {repo_path}
    print(f"‚úÖ Repositorio clonado (branch: {repo_branch})")
else:
    print(f"‚úÖ Repositorio ya existe en: {repo_path}")

# 2. Verificar que el modelo est√° en el repo
model_path = f"{repo_path}/artifacts/models/po_student_v1"
valset_path = f"{repo_path}/artifacts/synthetic/product_owner/product_owner_val.jsonl"

if not os.path.exists(model_path):
    print(f"\n‚ùå ERROR: Modelo no encontrado en: {model_path}")
    raise FileNotFoundError("Modelo LoRA no encontrado en el repositorio")

if not os.path.exists(valset_path):
    print(f"\n‚ùå ERROR: Dataset de validaci√≥n no encontrado en: {valset_path}")
    raise FileNotFoundError("Dataset de validaci√≥n no encontrado")

print(f"‚úÖ Modelo encontrado en: {model_path}")
print(f"‚úÖ Dataset de validaci√≥n encontrado: {valset_path}")

# 3. Verificar archivos cr√≠ticos del modelo
print(f"\nüìÇ Contenido del modelo:")
!ls -lh {model_path}

required_files = ["adapter_config.json", "adapter_model.safetensors", "tokenizer_config.json"]
missing_files = []

for file in required_files:
    file_path = os.path.join(model_path, file)
    if not os.path.exists(file_path):
        missing_files.append(file)
    else:
        file_size = os.path.getsize(file_path) / 1024**2  # MB
        print(f"  ‚úì {file} ({file_size:.1f} MB)")

if missing_files:
    print(f"\n‚ö†Ô∏è  ADVERTENCIA: Faltan archivos del modelo: {missing_files}")
    raise FileNotFoundError(f"Archivos cr√≠ticos faltantes: {missing_files}")
else:
    print("\n‚úÖ Todos los archivos del modelo est√°n presentes")

# 4. Cambiar al directorio del repo
os.chdir(repo_path)
print(f"\n‚úÖ Working directory: {os.getcwd()}")

## 4. Evaluación Baseline (Qwen2.5-7B sin LoRA)

Esta evaluación usa el modelo base sin el adapter LoRA.

In [None]:
%%bash
# Baseline run: invoke scripts/eval_po_student.py from the repository root with
# --tag baseline and no --adapter-path. PYTHONPATH=. puts the current directory
# (the repo root after the cd) on Python's import path.
# NOTE(review): the remaining flags (--max-samples, --retries, --max-new-tokens,
# --load-4bit, --bnb-compute-dtype) are interpreted by the script, which is not
# shown here; confirm their exact semantics against its CLI.
cd /content/agnostic-ai-pipeline

PYTHONPATH=. python scripts/eval_po_student.py \
  --tag baseline \
  --base-model Qwen/Qwen2.5-7B-Instruct \
  --max-samples 20 \
  --retries 2 \
  --max-new-tokens 1200 \
  --load-4bit \
  --bnb-compute-dtype float16

## 5. Evaluación Student (Qwen2.5-7B + LoRA)

Esta evaluación usa el modelo base con el adapter LoRA entrenado.

In [None]:
%%bash
# Student run: same invocation as the baseline cell, but with --tag student and
# --adapter-path pointing at artifacts/models/po_student_v1 (relative to the
# repo root entered by the cd below).
# NOTE(review): every other flag matches the baseline cell; keep the two in sync,
# presumably so the comparison stays like-for-like.
cd /content/agnostic-ai-pipeline

PYTHONPATH=. python scripts/eval_po_student.py \
  --tag student \
  --base-model Qwen/Qwen2.5-7B-Instruct \
  --adapter-path artifacts/models/po_student_v1 \
  --max-samples 20 \
  --retries 2 \
  --max-new-tokens 1200 \
  --load-4bit \
  --bnb-compute-dtype float16

## 6. Comparar Resultados

In [None]:
import json
import glob
from pathlib import Path

# Purpose: load the last baseline and student result files, print a side-by-side
# comparison of their metrics and YAML success rates, and evaluate the two
# acceptance criteria (YAML validity rate and student-vs-baseline mean score).


def success_rate(valid, total):
    """Return valid/total as a percentage, or 0 when total is not positive."""
    return (valid / total * 100) if total > 0 else 0


def quality_gate(baseline_metrics, student_metrics, quality_threshold=0.90):
    """Evaluate the 'student mean >= threshold x baseline mean' criterion.

    Returns a (target, actual, passed) tuple. The check is made on the 'mean'
    metric (missing means count as 0) and fails when either metrics dict is
    empty.
    """
    target = quality_threshold * baseline_metrics.get('mean', 0)
    actual = student_metrics.get('mean', 0)
    passed = bool(baseline_metrics and student_metrics) and actual >= target
    return target, actual, passed


def print_format_errors(label, data):
    """Print *label*, then concept id and tier of each 'format_error' result in *data*."""
    print(label)
    for result in data.get('results', []):
        if result.get('status') == 'format_error':
            print(f"  - {result.get('concept_id')} (tier: {result.get('tier')})")


# Look for result files
results_dir = Path("/content/agnostic-ai-pipeline/inference_results")
baseline_files = sorted(results_dir.glob("baseline_*.json"))
student_files = sorted(results_dir.glob("student_*.json"))

if not baseline_files:
    print("⚠️  No se encontraron resultados de baseline")
else:
    print("\n📊 Archivos de resultados encontrados:")
    print(f"  Baseline: {len(baseline_files)} archivo(s)")
    print(f"  Student: {len(student_files)} archivo(s)")

# Load the last file of each kind in name order.
# NOTE(review): this is the most recent run only if the file names sort
# chronologically (e.g. embed a timestamp) - confirm against the eval script.
if baseline_files and student_files:
    with open(baseline_files[-1], 'r', encoding='utf-8') as f:
        baseline_data = json.load(f)

    with open(student_files[-1], 'r', encoding='utf-8') as f:
        student_data = json.load(f)

    print(f"\n{'='*60}")
    print("COMPARACIÓN DE RESULTADOS")
    print(f"{'='*60}\n")

    # General metrics
    print("📈 MÉTRICAS GENERALES\n")
    print(f"{'Métrica':<30} {'Baseline':<15} {'Student':<15} {'Diff'}")
    print("-" * 70)

    baseline_metrics = baseline_data.get('metrics', {})
    student_metrics = student_data.get('metrics', {})

    if baseline_metrics and student_metrics:
        for metric in ['mean', 'std', 'min', 'max']:
            b_val = baseline_metrics.get(metric, 0)
            s_val = student_metrics.get(metric, 0)
            diff = s_val - b_val
            # Relative change is reported as 0 when the baseline value is 0.
            diff_pct = (diff / b_val * 100) if b_val != 0 else 0

            print(f"{metric.upper():<30} {b_val:<15.4f} {s_val:<15.4f} {diff:+.4f} ({diff_pct:+.1f}%)")

    # YAML success rate
    print("\n📋 TASA DE ÉXITO YAML\n")
    print(f"{'Modelo':<30} {'Total':<10} {'Válidos':<10} {'Errores':<10} {'Tasa Éxito'}")
    print("-" * 70)

    b_total = baseline_data.get('total_samples', 0)
    b_valid = baseline_data.get('valid_samples', 0)
    b_failed = baseline_data.get('failed_samples', 0)
    b_rate = success_rate(b_valid, b_total)

    s_total = student_data.get('total_samples', 0)
    s_valid = student_data.get('valid_samples', 0)
    s_failed = student_data.get('failed_samples', 0)
    s_rate = success_rate(s_valid, s_total)

    print(f"{'Baseline':<30} {b_total:<10} {b_valid:<10} {b_failed:<10} {b_rate:.1f}%")
    print(f"{'Student':<30} {s_total:<10} {s_valid:<10} {s_failed:<10} {s_rate:.1f}%")

    # Acceptance criteria
    print("\n✅ CRITERIOS DE ACEPTACIÓN (9.D.4)\n")
    print("-" * 70)

    yaml_valid_threshold = 0.90
    quality_threshold = 0.90

    b_yaml_ok = b_rate >= yaml_valid_threshold * 100
    s_yaml_ok = s_rate >= yaml_valid_threshold * 100
    yaml_pass = b_yaml_ok and s_yaml_ok

    # The quality gate is computed from the 'mean' metric, the same value that
    # is printed below. Reusing the loop variables from the metrics table would
    # compare the last metric iterated ('max') instead.
    target, actual, quality_pass = quality_gate(
        baseline_metrics, student_metrics, quality_threshold
    )

    print("1. YAML válido ≥90%:")
    print(f"   Baseline: {b_rate:.1f}% {'✅ PASS' if b_yaml_ok else '❌ FAIL'}")
    print(f"   Student:  {s_rate:.1f}% {'✅ PASS' if s_yaml_ok else '❌ FAIL'}")

    if baseline_metrics and student_metrics:
        print("\n2. Student ≥ 0.9 × Baseline:")
        print(f"   Target:  {target:.4f}")
        print(f"   Actual:  {actual:.4f} {'✅ PASS' if quality_pass else '❌ FAIL'}")

    overall_pass = yaml_pass and quality_pass
    print(f"\n{'='*70}")
    print(f"RESULTADO GENERAL: {'✅ PASS - Listo para 9.D.5' if overall_pass else '❌ FAIL - Requiere ajustes'}")
    print(f"{'='*70}")

    # Samples that failed with a format error
    if b_failed > 0 or s_failed > 0:
        print("\n⚠️  CASOS CON ERROR DE FORMATO:\n")

        if b_failed > 0:
            print_format_errors("Baseline:", baseline_data)

        if s_failed > 0:
            print_format_errors("\nStudent:", student_data)

else:
    print("⚠️  No se pueden comparar resultados: falta algún archivo")

## 7. Descargar Resultados

Descarga los archivos JSON para incluirlos en el repositorio.

In [None]:
# Purpose: zip the evaluation results directory and trigger a browser download
# of the archive from Colab.
# `os` is imported here so the cell also works when run on its own, e.g. after
# a kernel restart where the earlier cell that imported it has not been run.
import os
import shutil

from google.colab import files

# Compress the results
results_dir = "/content/agnostic-ai-pipeline/inference_results"
archive_path = "/content/eval_results_20251115"  # make_archive appends ".zip"

if os.path.exists(results_dir):
    shutil.make_archive(archive_path, 'zip', results_dir)
    print(f"✅ Resultados comprimidos en: {archive_path}.zip")

    # Download
    files.download(f"{archive_path}.zip")
    print("✅ Descarga iniciada")
else:
    print("❌ No se encontró el directorio de resultados")