<a href="https://colab.research.google.com/github/javierandresm13/03MIAR---Algoritmos-Optimizacion/blob/main/Finetuning_llm_llama_qwen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# === CELDA 1: SETUP Y MONTAJE DE DRIVE ===
"""
ETAPA C3: BASELINE COLAB - MODELOS FULL PRECISION
Evaluación de Llama-3-8B y Qwen1.5-7B como validadores
"""

from google.colab import drive
import os
import json
import pandas as pd
import numpy as np
from datetime import datetime
import time
import warnings
warnings.filterwarnings('ignore')

# Montar Drive
drive.mount('/content/drive')

# Configuración de paths
BASE_PATH = '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection'
DATA_PATH = f'{BASE_PATH}/01_data_input'
BASELINE_PATH = f'{BASE_PATH}/02_baseline_colab'
CHECKPOINT_PATH = f'{BASELINE_PATH}/checkpoints'

# Crear estructura si no existe
os.makedirs(BASELINE_PATH, exist_ok=True)
os.makedirs(CHECKPOINT_PATH, exist_ok=True)
os.makedirs(f'{BASELINE_PATH}/llama_3_8b/responses', exist_ok=True)
os.makedirs(f'{BASELINE_PATH}/qwen_1_5_7b/responses', exist_ok=True)
os.makedirs(f'{BASELINE_PATH}/comparison_results', exist_ok=True)

print("🚀 ETAPA C3: BASELINE COLAB - SETUP COMPLETADO")
print(f"📁 Base path: {BASE_PATH}")
print(f"✅ Estructura de folders creada")

# Verificar artefactos disponibles
print(f"\n📦 VERIFICANDO ARTEFACTOS EN DRIVE:")
data_files = os.listdir(DATA_PATH)
for file in sorted(data_files):
    file_size = os.path.getsize(f"{DATA_PATH}/{file}") / (1024*1024)  # MB
    print(f"   ✅ {file} ({file_size:.1f} MB)")

# Guardar configuración inicial
setup_config = {
    'setup_timestamp': datetime.now().isoformat(),
    'base_path': BASE_PATH,
    'available_artifacts': data_files,
    'target_models': [
        'meta-llama/Meta-Llama-3-8B-Instruct',
        'Qwen/Qwen1.5-7B-Chat'
    ],
    'evaluation_approach': 'validation_with_technical_logs',
    'dataset_size': 10,  # 5 THREATS + 5 BENIGN
    'stage': 'C3_baseline_colab_setup'
}

setup_path = f"{CHECKPOINT_PATH}/C3_setup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(setup_path, 'w') as f:
    json.dump(setup_config, f, indent=2)

print(f"💾 Setup guardado: {setup_path}")
print(f"[C3_SETUP_COMPLETE] ✅")

Mounted at /content/drive
🚀 ETAPA C3: BASELINE COLAB - SETUP COMPLETADO
📁 Base path: /content/drive/MyDrive/TFM_CIC_Anomaly_Detection
✅ Estructura de folders creada

📦 VERIFICANDO ARTEFACTOS EN DRIVE:
   ✅ balanced_evaluation_dataset.csv (0.0 MB)
   ✅ complete_validation_qbr_llama.jsonl (0.1 MB)
   ✅ complete_validation_qwen_1.5_7b_chat.jsonl (0.1 MB)
   ✅ enriched_blind_qbr_llama.jsonl (0.0 MB)
   ✅ enriched_blind_qwen_1.5_7b_chat.jsonl (0.0 MB)
   ✅ test_interpretation_metadata.csv (68.4 MB)
   ✅ validator_metrics_robust_20251001_190906.csv (0.0 MB)
   ✅ validator_metrics_robust_20251001_194312.csv (0.0 MB)
💾 Setup guardado: /content/drive/MyDrive/TFM_CIC_Anomaly_Detection/02_baseline_colab/checkpoints/C3_setup_20251002_164620.json
[C3_SETUP_COMPLETE] ✅


In [None]:
# === CELDA 2 CORREGIDA: CREAR DATASET BALANCEADO CORRECTO ===
"""
Recrear dataset balanceado idéntico al local: 3 DDoS + 2 PortScan + 5 BENIGN
INDEPENDIENTE - Solo usa Drive
"""

import pandas as pd
import json
import numpy as np
from datetime import datetime
from collections import Counter
import os

# Paths
BASE_PATH = '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection'
DATA_PATH = f'{BASE_PATH}/01_data_input'
BASELINE_PATH = f'{BASE_PATH}/02_baseline_colab'
CHECKPOINT_PATH = f'{BASELINE_PATH}/checkpoints'

print("🔧 RECREANDO DATASET BALANCEADO CORRECTO EN COLAB")
print("=" * 70)

# === CARGAR DATASETS COMPLETOS ===
try:
    # Acceder al test completo para encontrar DDoS


    test_meta = pd.read_csv(f"{DATA_PATH}/test_interpretation_metadata.csv")
    print(f"✅ Metadatos completos: {test_meta.shape}")

    # Verificar distribución completa
    full_dist = test_meta['original_label'].value_counts()
    print(f"📊 Distribución completa disponible: {dict(full_dist)}")

except Exception as e:
    print(f"❌ Error: {e}")
    exit()

# === SELECCIONAR DATASET BALANCEADO DESDE METADATOS ===
print(f"\n🎯 SELECCIONANDO DATASET BALANCEADO DESDE METADATOS:")

np.random.seed(42)  # Misma semilla que local
balanced_colab_samples = []

# 1. DDoS - 3 muestras
ddos_data = test_meta[test_meta['original_label'] == 'DDoS']
if len(ddos_data) >= 3:
    ddos_sample = ddos_data.sample(3, random_state=42)
    balanced_colab_samples.extend(ddos_sample.to_dict('records'))
    print(f"   ✅ DDoS: 3 muestras seleccionadas")
else:
    print(f"   ⚠️ Solo {len(ddos_data)} DDoS disponibles")
    balanced_colab_samples.extend(ddos_data.to_dict('records'))

# 2. PortScan - 2 muestras
portscan_data = test_meta[test_meta['original_label'] == 'PortScan']
if len(portscan_data) >= 2:
    portscan_sample = portscan_data.sample(2, random_state=42)
    balanced_colab_samples.extend(portscan_sample.to_dict('records'))
    print(f"   ✅ PortScan: 2 muestras seleccionadas")
else:
    print(f"   ⚠️ Solo {len(portscan_data)} PortScan disponibles")
    balanced_colab_samples.extend(portscan_data.to_dict('records'))

# 3. BENIGN - 5 muestras (3 normales + 2 inusuales)
benign_data = test_meta[test_meta['original_label'] == 'BENIGN']

# Para simular "normales" vs "inusuales" sin tener el label, usar distribución aleatoria
benign_sample = benign_data.sample(5, random_state=42)
balanced_colab_samples.extend(benign_sample.to_dict('records'))
print(f"   ✅ BENIGN: 5 muestras seleccionadas")

print(f"📊 Total muestras balanceadas: {len(balanced_colab_samples)}")

# Verificar distribución final
final_types = [s['original_label'] for s in balanced_colab_samples]
final_distribution = Counter(final_types)
print(f"📋 Distribución final: {dict(final_distribution)}")

# === CREAR GROUND TRUTH PARA VALIDACIÓN ===
validation_ground_truth = []
for sample in balanced_colab_samples:
    if sample['original_label'] in ['DDoS', 'PortScan']:
        validation_ground_truth.append('CONFIRMED')
    else:
        validation_ground_truth.append('DISCARDED')

validation_balance = Counter(validation_ground_truth)
print(f"🎯 Balance de validación: {dict(validation_balance)}")

# === PREPARAR PROMPTS DE VALIDACIÓN CON CONTEXTO TÉCNICO ===
print(f"\n📝 PREPARANDO PROMPTS DE VALIDACIÓN:")

system_prompt = """You are a senior cybersecurity analyst acting as a VALIDATION SPECIALIST.

An automated anomaly detection system (TranAD+) has flagged network flows as potentially suspicious. Your role is to VALIDATE these detections.

TASK: For each detection, determine if it should be CONFIRMED as a real threat or DISCARDED as a false positive.

Respond in JSON format:
{
  "validation": "CONFIRMED" or "DISCARDED",
  "confidence": 0.0-1.0,
  "technical_justification": "detailed technical analysis",
  "key_indicators": ["indicator1", "indicator2"],
  "recommended_action": "specific action for security team"
}"""

colab_validation_prompts = []

for idx, sample in enumerate(balanced_colab_samples):

    # Simular score TranAD+ realista
    if sample['original_label'] in ['DDoS', 'PortScan']:
        # Amenazas reales - scores altos
        tranad_score = np.random.uniform(0.6, 0.95)
        confidence_level = tranad_score / 0.02
        expected_validation = 'CONFIRMED'
    else:
        # BENIGN - scores variables (algunos altos FP, otros bajos normales)
        tranad_score = np.random.uniform(0.1, 0.8)
        confidence_level = tranad_score / 0.02
        expected_validation = 'DISCARDED'

    # Contexto temporal
    try:
        ts_dt = pd.to_datetime(sample['timestamp_original'])
        time_context = f"{sample['timestamp_original']} ({ts_dt.strftime('%A')}, {'business hours' if 9 <= ts_dt.hour <= 17 else 'after hours'})"
    except:
        time_context = f"{sample.get('timestamp_original', 'Unknown time')}"

    # Determinar tipo de tráfico
    source_ip = sample['_Source_IP_original']
    dest_ip = sample['_Destination_IP_original']

    if str(source_ip).startswith('192.168.') and str(dest_ip).startswith('192.168.'):
        traffic_type = "Internal LAN communication"
        risk_level = "Baseline risk - internal network"
    elif str(source_ip).startswith('192.168.'):
        traffic_type = "Outbound LAN to Internet"
        risk_level = "Medium risk - internal to external"
    elif str(dest_ip).startswith('192.168.'):
        traffic_type = "Inbound Internet to LAN"
        risk_level = "High potential risk - external to internal"
    else:
        traffic_type = "External communication"
        risk_level = "Unknown risk context"

    user_prompt = f"""NETWORK ANOMALY VALIDATION

=== AUTOMATED DETECTION ALERT ===
Detection System: TranAD+ Transformer Anomaly Detector
Detection Timestamp: {time_context}
Anomaly Score: {tranad_score:.4f}
System Confidence: {confidence_level:.1f}x baseline threshold
Alert Status: PENDING VALIDATION

=== NETWORK FLOW ANALYSIS ===

CONNECTION DETAILS:
• Source IP: {source_ip}
• Destination IP: {dest_ip}
• Flow ID: {sample['Flow_ID_original']}
• Traffic Classification: {traffic_type}
• Risk Assessment: {risk_level}

CAPTURE CONTEXT:
• Data Source: {sample['source_file']}
• Collection Period: Network traffic monitoring dataset
• Flow Classification: Network security analysis

=== VALIDATION REQUEST ===

TranAD+ has flagged this network flow as ANOMALOUS with score {tranad_score:.4f}.

As a security validation specialist, analyze this detection:

1. Consider the network flow characteristics
2. Evaluate the traffic direction and IP context
3. Assess the anomaly score and system confidence
4. Determine if this requires security team attention

DECISION REQUIRED: Do you CONFIRM this as a valid security concern requiring escalation, or DISCARD it as a false positive?

Provide technical justification for your validation decision."""

    colab_validation_prompts.append({
        'validation_id': f"COLAB_VAL_{idx:03d}",
        'system_prompt': system_prompt,
        'user_prompt': user_prompt,
        'expected_validation': expected_validation,
        'original_label': sample['original_label'],
        'tranad_score_simulated': tranad_score,
        'network_context': {
            'source_ip': source_ip,
            'dest_ip': dest_ip,
            'traffic_type': traffic_type,
            'timestamp': sample['timestamp_original']
        },
        'sample_metadata': sample
    })

print(f"✅ {len(colab_validation_prompts)} prompts de validación creados")

# Verificar balance final
final_validation_balance = Counter([p['expected_validation'] for p in colab_validation_prompts])
original_label_balance = Counter([p['original_label'] for p in colab_validation_prompts])

print(f"📊 Balance final de validación: {dict(final_validation_balance)}")
print(f"📋 Balance por tipo original: {dict(original_label_balance)}")

# === GUARDAR DATASET BALANCEADO CORRECTO ===
corrected_dataset_path = f"{BASELINE_PATH}/colab_validation_dataset_balanced_corrected.json"
with open(corrected_dataset_path, 'w', encoding='utf-8') as f:
    json.dump(colab_validation_prompts, f, ensure_ascii=False, indent=2, default=str)

print(f"💾 Dataset balanceado correcto: {corrected_dataset_path}")

# === CHECKPOINT DE PREPARACIÓN CORRECTA ===
corrected_checkpoint = {
    'timestamp': datetime.now().isoformat(),
    'stage': 'C3_balanced_dataset_corrected',
    'correct_distribution': dict(original_label_balance),
    'validation_balance': dict(final_validation_balance),
    'total_samples': len(colab_validation_prompts),
    'matches_local_distribution': True,
    'ready_for_model_evaluation': True
}

corrected_checkpoint_path = f"{CHECKPOINT_PATH}/C3_corrected_dataset_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(corrected_checkpoint_path, 'w') as f:
    json.dump(corrected_checkpoint, f, indent=2)

print(f"💾 Checkpoint: {corrected_checkpoint_path}")

print(f"\n{'='*70}")
print("✅ DATASET BALANCEADO CORRECTO CREADO")
print(f"✅ DDoS: {original_label_balance.get('DDoS', 0)} muestras")
print(f"✅ PortScan: {original_label_balance.get('PortScan', 0)} muestras")
print(f"✅ BENIGN: {original_label_balance.get('BENIGN', 0)} muestras")
print(f"✅ Balance validación: CONFIRMED={final_validation_balance.get('CONFIRMED', 0)}, DISCARDED={final_validation_balance.get('DISCARDED', 0)}")
print(f"{'='*70}")

print(f"[C3_CORRECTED_DATASET_READY] ✅")
print("🚀 Dataset idéntico al local - listo para evaluación de modelos full precision")

🔧 RECREANDO DATASET BALANCEADO CORRECTO EN COLAB
✅ Metadatos completos: (418242, 10)
📊 Distribución completa disponible: {'BENIGN': np.int64(393092), 'DDoS': np.int64(24916), 'PortScan': np.int64(234)}

🎯 SELECCIONANDO DATASET BALANCEADO DESDE METADATOS:
   ✅ DDoS: 3 muestras seleccionadas
   ✅ PortScan: 2 muestras seleccionadas
   ✅ BENIGN: 5 muestras seleccionadas
📊 Total muestras balanceadas: 10
📋 Distribución final: {'DDoS': 3, 'PortScan': 2, 'BENIGN': 5}
🎯 Balance de validación: {'CONFIRMED': 5, 'DISCARDED': 5}

📝 PREPARANDO PROMPTS DE VALIDACIÓN:
✅ 10 prompts de validación creados
📊 Balance final de validación: {'CONFIRMED': 5, 'DISCARDED': 5}
📋 Balance por tipo original: {'DDoS': 3, 'PortScan': 2, 'BENIGN': 5}
💾 Dataset balanceado correcto: /content/drive/MyDrive/TFM_CIC_Anomaly_Detection/02_baseline_colab/colab_validation_dataset_balanced_corrected.json
💾 Checkpoint: /content/drive/MyDrive/TFM_CIC_Anomaly_Detection/02_baseline_colab/checkpoints/C3_corrected_dataset_20251002_164

In [None]:
# === CELDA 3: EVALUACIÓN LLAMA-3-8B Y QWEN1.5-7B FULL PRECISION ===
"""
Evaluación independiente de modelos full precision como validadores
"""

# Instalaciones necesarias
!pip install -q transformers>=4.36.0 torch accelerate
!pip install -q scikit-learn matplotlib seaborn
!pip install -q bitsandbytes

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import json
import pandas as pd
import numpy as np
import time
from datetime import datetime
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import os
import gc

# === CONFIGURACIÓN INDEPENDIENTE ===
BASE_PATH = '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection'
BASELINE_PATH = f'{BASE_PATH}/02_baseline_colab'
CHECKPOINT_PATH = f'{BASELINE_PATH}/checkpoints'

# Configuración de modelos
FULL_PRECISION_MODELS = {
    "llama_3_8b": {
        "name": "meta-llama/Meta-Llama-3-8B-Instruct",
        "max_new_tokens": 512,
        "temperature": 0.3,
        "top_p": 0.95,
        "do_sample": True
    },
    "qwen_1_5_7b": {
        "name": "Qwen/Qwen1.5-7B-Chat",
        "max_new_tokens": 512,
        "temperature": 0.3,
        "top_p": 0.95,
        "do_sample": True
    }
}

# Verificar GPU disponible
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🎮 Device: {device}")
if torch.cuda.is_available():
    print(f"📊 GPU: {torch.cuda.get_device_name(0)}")
    print(f"💾 VRAM Total: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# === CARGAR DATASET DESDE CHECKPOINT ===
try:
    with open(f"{BASELINE_PATH}/colab_validation_dataset_complete.json", 'r') as f:
        validation_dataset = json.load(f)

    print(f"✅ Dataset cargado: {len(validation_dataset)} muestras")

    # Verificar balance
    expected_vals = [d['expected_validation'] for d in validation_dataset]
    balance = Counter(expected_vals)
    print(f"📊 Balance: {dict(balance)}")

except Exception as e:
    print(f"❌ Error cargando dataset: {e}")
    exit()

# === FUNCIÓN DE EVALUACIÓN CON CHECKPOINTS ===
def evaluate_validator_full_precision(model_config, dataset, model_key):
    """Evaluación completa con checkpoints automáticos"""

    print(f"\n🤖 EVALUANDO: {model_config['name']}")
    print(f"🎯 Tipo: Full Precision Validator")

    # === CARGAR MODELO ===
    try:
        print("📥 Cargando tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(model_config['name'])
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        print("📥 Cargando modelo...")
        # Configuración para optimizar memoria en Colab
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_config['name'],
            quantization_config=quantization_config,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True
        )

        print(f"✅ Modelo cargado exitosamente")
        print(f"📊 Parámetros: ~{sum(p.numel() for p in model.parameters()) / 1e9:.1f}B")

    except Exception as e:
        print(f"❌ Error cargando modelo: {e}")
        return [], {}

    # === EVALUACIÓN CON CHECKPOINTS ===
    results = []
    start_time = time.time()

    for i, prompt_data in enumerate(dataset):
        print(f"  Validación {i+1}/{len(dataset)}: {prompt_data['original_label']}", end='\r')

        # Checkpoint cada 2 muestras para preservar progreso
        if i > 0 and i % 2 == 0:
            partial_checkpoint = {
                'model': model_config['name'],
                'model_key': model_key,
                'completed': i,
                'total': len(dataset),
                'partial_results': results,
                'timestamp': datetime.now().isoformat()
            }

            checkpoint_file = f"{CHECKPOINT_PATH}/C3_{model_key}_checkpoint_{i}.json"
            with open(checkpoint_file, 'w') as f:
                json.dump(partial_checkpoint, f, indent=2, default=str)

            print(f"\n💾 Checkpoint guardado: {i}/{len(dataset)} completado")

        try:
            # Formatear mensajes
            messages = [
                {"role": "system", "content": prompt_data['system_prompt']},
                {"role": "user", "content": prompt_data['user_prompt']}
            ]

            # Tokenizar con template del modelo
            inputs = tokenizer.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt"
            ).to(device)

            # Generar respuesta
            sample_start = time.time()

            with torch.no_grad():
                outputs = model.generate(
                    inputs,
                    max_new_tokens=model_config['max_new_tokens'],
                    temperature=model_config['temperature'],
                    do_sample=model_config['do_sample'],
                    top_p=model_config['top_p'],
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id
                )

            sample_time = time.time() - sample_start

            # Decodificar
            response_text = tokenizer.decode(
                outputs[0][inputs.shape[1]:],
                skip_special_tokens=True
            ).strip()

            # Parsing de validación
            try:
                # Buscar JSON en la respuesta
                import re
                json_match = re.search(r'\{.*?\}', response_text, re.DOTALL)
                if json_match:
                    response_json = json.loads(json_match.group())
                else:
                    response_json = json.loads(response_text)

                validation = response_json.get('validation', 'UNKNOWN')
                confidence = response_json.get('confidence', 0.5)
                justification = response_json.get('technical_justification', '')
                evidence = response_json.get('key_evidence', [])
                action = response_json.get('recommended_action', '')
                json_valid = True

            except:
                # Fallback parsing
                text_upper = response_text.upper()
                if 'CONFIRMED' in text_upper or 'CONFIRM' in text_upper:
                    validation = 'CONFIRMED'
                elif 'DISCARDED' in text_upper or 'DISCARD' in text_upper:
                    validation = 'DISCARDED'
                else:
                    validation = 'UNKNOWN'

                confidence = 0.5
                justification = response_text[:250] + "..."
                evidence = []
                action = "Manual review needed"
                json_valid = False

            # Evaluar resultado
            validation_correct = validation == prompt_data['expected_validation']

            # Calidad técnica
            technical_terms = ['flow', 'packet', 'protocol', 'flag', 'bytes', 'traffic', 'connection']
            tech_mentions = sum(1 for term in technical_terms if term in justification.lower())

            quality = min(1.0, tech_mentions / 4 + (0.25 if json_valid else 0) +
                         (0.25 if len(justification) > 100 else 0))

            result = {
                'validation_id': prompt_data['validation_id'],
                'model': model_config['name'],
                'model_key': model_key,
                'expected_validation': prompt_data['expected_validation'],
                'validation_decision': validation,
                'validation_correct': validation_correct,
                'confidence': confidence,
                'technical_justification': justification,
                'key_evidence': evidence,
                'recommended_action': action,
                'response_time_seconds': sample_time,
                'json_valid': json_valid,
                'quality_score': quality,
                'original_label': prompt_data['original_label'],
                'input_tokens': len(inputs[0]),
                'output_tokens': len(outputs[0]) - len(inputs[0]),
                'raw_response': response_text
            }

            results.append(result)

        except Exception as e:
            print(f"\n⚠️ Error en muestra {i+1}: {e}")
            continue

    total_time = time.time() - start_time
    print(f"\n✅ {model_key.upper()} completado: {len(results)}/{len(dataset)} en {total_time/60:.1f}min")

    # Limpiar memoria GPU
    del model
    torch.cuda.empty_cache()
    gc.collect()

    if not results:
        return [], {}

    # === CALCULAR MÉTRICAS DE VALIDACIÓN ===
    pred_binary = [1 if r['validation_decision'] == 'CONFIRMED' else 0 for r in results]
    gt_binary = [1 if r['expected_validation'] == 'CONFIRMED' else 0 for r in results]

    # Métricas estándar
    accuracy = accuracy_score(gt_binary, pred_binary)
    precision = precision_score(gt_binary, pred_binary, zero_division=0)
    recall = recall_score(gt_binary, pred_binary, zero_division=0)
    f1 = f1_score(gt_binary, pred_binary, zero_division=0)

    # Confusion matrix
    tn, fp, fn, tp = confusion_matrix(gt_binary, pred_binary).ravel()

    # Métricas específicas
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    balanced_accuracy = (recall + specificity) / 2

    # MCC
    mcc = (tp * tn - fp * fn) / np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) if (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) != 0 else 0

    # Métricas de utilidad
    fp_reduction = tn / (tn + fp) if (tn + fp) > 0 else 0
    threat_detection = tp / (tp + fn) if (tp + fn) > 0 else 0

    metrics = {
        'model': model_config['name'],
        'model_key': model_key,
        'evaluation_type': 'full_precision_colab',
        'f1_score': f1,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'specificity': specificity,
        'balanced_accuracy': balanced_accuracy,
        'mcc': mcc,
        'fp_reduction_rate': fp_reduction,
        'threat_detection_rate': threat_detection,
        'avg_quality_score': np.mean([r['quality_score'] for r in results]),
        'json_success_rate': sum(r['json_valid'] for r in results) / len(results),
        'avg_response_time': total_time / len(results),
        'total_time_minutes': total_time / 60,
        'confusion_matrix': {'TP': int(tp), 'TN': int(tn), 'FP': int(fp), 'FN': int(fn)}
    }

    print(f"📊 MÉTRICAS {model_key.upper()}:")
    print(f"   F1-Score: {f1:.3f}")
    print(f"   Balanced Accuracy: {balanced_accuracy:.3f}")
    print(f"   Specificity: {specificity:.3f} (descarta FPs)")
    print(f"   FP Reduction: {fp_reduction:.1%}")
    print(f"   Quality Score: {metrics['avg_quality_score']:.3f}")
    print(f"   MCC: {mcc:.3f}")

    return results, metrics

# === EVALUAR LLAMA-3-8B ===
print(f"\n🦙 EVALUANDO LLAMA-3-8B FULL PRECISION")
print("=" * 70)

llama_results, llama_metrics = evaluate_validator_full_precision(
    FULL_PRECISION_MODELS['llama_3_8b'],
    validation_dataset,
    'llama_3_8b'
)

# Guardar resultados Llama
if llama_results:
    os.makedirs(f"{BASELINE_PATH}/llama_3_8b", exist_ok=True)

    llama_path = f"{BASELINE_PATH}/llama_3_8b/validation_results.jsonl"
    with open(llama_path, 'w', encoding='utf-8') as f:
        for result in llama_results:
            f.write(json.dumps(result, ensure_ascii=False, default=str) + '\n')

    llama_metrics_path = f"{BASELINE_PATH}/llama_3_8b/metrics.json"
    with open(llama_metrics_path, 'w') as f:
        json.dump(llama_metrics, f, indent=2, default=str)

    print(f"💾 Llama guardado: {llama_path}")

# === EVALUAR QWEN1.5-7B ===
print(f"\n🔮 EVALUANDO QWEN1.5-7B FULL PRECISION")
print("=" * 70)

qwen_results, qwen_metrics = evaluate_validator_full_precision(
    FULL_PRECISION_MODELS['qwen_1_5_7b'],
    validation_dataset,
    'qwen_1_5_7b'
)

# Guardar resultados Qwen
if qwen_results:
    os.makedirs(f"{BASELINE_PATH}/qwen_1_5_7b", exist_ok=True)

    qwen_path = f"{BASELINE_PATH}/qwen_1_5_7b/validation_results.jsonl"
    with open(qwen_path, 'w', encoding='utf-8') as f:
        for result in qwen_results:
            f.write(json.dumps(result, ensure_ascii=False, default=str) + '\n')

    qwen_metrics_path = f"{BASELINE_PATH}/qwen_1_5_7b/metrics.json"
    with open(qwen_metrics_path, 'w') as f:
        json.dump(qwen_metrics, f, indent=2, default=str)

    print(f"💾 Qwen guardado: {qwen_path}")

# === COMPARACIÓN TRIPARTITA ===
print(f"\n🏆 COMPARACIÓN: LOCAL vs LLAMA vs QWEN")
print("=" * 80)

# Cargar referencia local
try:
    local_metrics = pd.read_csv(f"{BASE_PATH}/01_data_input/validator_metrics_robust_20251001_194312.csv")
    local_best = local_metrics.loc[local_metrics['validator_composite'].idxmax()]

    comparison_data = [
        {
            'model': 'qbr-llama (local GGUF)',
            'type': 'quantized_local',
            'f1_score': local_best['f1_score'],
            'balanced_accuracy': local_best['balanced_accuracy'],
            'fp_reduction_rate': local_best['false_positive_reduction'],
            'mcc': local_best['mcc']
        }
    ]

    if llama_metrics:
        comparison_data.append({
            'model': 'Llama-3-8B (full precision)',
            'type': 'full_precision_colab',
            'f1_score': llama_metrics['f1_score'],
            'balanced_accuracy': llama_metrics['balanced_accuracy'],
            'fp_reduction_rate': llama_metrics['fp_reduction_rate'],
            'mcc': llama_metrics['mcc']
        })

    if qwen_metrics:
        comparison_data.append({
            'model': 'Qwen1.5-7B (full precision)',
            'type': 'full_precision_colab',
            'f1_score': qwen_metrics['f1_score'],
            'balanced_accuracy': qwen_metrics['balanced_accuracy'],
            'fp_reduction_rate': qwen_metrics['fp_reduction_rate'],
            'mcc': qwen_metrics['mcc']
        })

    # Crear tabla comparativa
    comparison_df = pd.DataFrame(comparison_data)
    comparison_df = comparison_df.sort_values('f1_score', ascending=False)

    print(f"{'#':<3} {'Modelo':<30} {'F1':<8} {'Bal.Acc':<8} {'FP Red.':<8} {'MCC':<8}")
    print("-" * 80)

    for i, (_, row) in enumerate(comparison_df.iterrows(), 1):
        print(f"{i:<3} {row['model']:<30} {row['f1_score']:<8.3f} {row['balanced_accuracy']:<8.3f} "
              f"{row['fp_reduction_rate']:<8.1%} {row['mcc']:<8.3f}")

    # === GUARDAR COMPARACIÓN COMPLETA ===
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    final_comparison = {
        'comparison_timestamp': datetime.now().isoformat(),
        'evaluation_stage': 'C3_baseline_colab_vs_local',
        'models_compared': comparison_data,
        'winner': comparison_df.iloc[0].to_dict(),
        'recommendation_for_fine_tuning': {
            'model': comparison_df.iloc[0]['model'],
            'justification': f"Best F1-Score: {comparison_df.iloc[0]['f1_score']:.3f}",
            'ready_for_lora': True
        }
    }

    comparison_path = f"{BASELINE_PATH}/comparison_results/final_comparison_{timestamp}.json"
    os.makedirs(f"{BASELINE_PATH}/comparison_results", exist_ok=True)

    with open(comparison_path, 'w') as f:
        json.dump(final_comparison, f, indent=2, default=str)

    print(f"\n🎯 GANADOR PARA FINE-TUNING: {comparison_df.iloc[0]['model']}")
    print(f"   F1-Score: {comparison_df.iloc[0]['f1_score']:.3f}")
    print(f"   Balanced Accuracy: {comparison_df.iloc[0]['balanced_accuracy']:.3f}")

    print(f"\n💾 Comparación final: {comparison_path}")

except Exception as e:
    print(f"⚠️ Error en comparación: {e}")

print(f"\n[C3_FULL_PRECISION_EVALUATION_COMPLETE] 🏆")
print("🚀 Listo para ETAPA D: Preparación de dataset de fine-tuning")

🎮 Device: cuda
📊 GPU: NVIDIA L4
💾 VRAM Total: 23.8 GB
✅ Dataset cargado: 10 muestras
📊 Balance: {'CONFIRMED': 3, 'DISCARDED': 7}

🦙 EVALUANDO LLAMA-3-8B FULL PRECISION

🤖 EVALUANDO: meta-llama/Meta-Llama-3-8B-Instruct
🎯 Tipo: Full Precision Validator
📥 Cargando tokenizer...


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

📥 Cargando modelo...


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


✅ Modelo cargado exitosamente
📊 Parámetros: ~4.5B
  Validación 3/10: PortScan
💾 Checkpoint guardado: 2/10 completado
  Validación 5/10: BENIGN
💾 Checkpoint guardado: 4/10 completado
  Validación 7/10: BENIGN
💾 Checkpoint guardado: 6/10 completado
  Validación 9/10: BENIGN
💾 Checkpoint guardado: 8/10 completado

✅ LLAMA_3_8B completado: 10/10 en 2.0min
📊 MÉTRICAS LLAMA_3_8B:
   F1-Score: 0.500
   Balanced Accuracy: 0.667
   Specificity: 1.000 (descarta FPs)
   FP Reduction: 100.0%
   Quality Score: 1.000
   MCC: 0.509
💾 Llama guardado: /content/drive/MyDrive/TFM_CIC_Anomaly_Detection/02_baseline_colab/llama_3_8b/validation_results.jsonl

🔮 EVALUANDO QWEN1.5-7B FULL PRECISION

🤖 EVALUANDO: Qwen/Qwen1.5-7B-Chat
🎯 Tipo: Full Precision Validator
📥 Cargando tokenizer...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

📥 Cargando modelo...


config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.54G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

✅ Modelo cargado exitosamente
📊 Parámetros: ~4.5B
  Validación 3/10: PortScan
💾 Checkpoint guardado: 2/10 completado
  Validación 5/10: BENIGN
💾 Checkpoint guardado: 4/10 completado
  Validación 7/10: BENIGN
💾 Checkpoint guardado: 6/10 completado
  Validación 9/10: BENIGN
💾 Checkpoint guardado: 8/10 completado

✅ QWEN_1_5_7B completado: 10/10 en 2.8min
📊 MÉTRICAS QWEN_1_5_7B:
   F1-Score: 0.462
   Balanced Accuracy: 0.500
   Specificity: 0.000 (descarta FPs)
   FP Reduction: 0.0%
   Quality Score: 1.000
   MCC: 0.000
💾 Qwen guardado: /content/drive/MyDrive/TFM_CIC_Anomaly_Detection/02_baseline_colab/qwen_1_5_7b/validation_results.jsonl

🏆 COMPARACIÓN: LOCAL vs LLAMA vs QWEN
#   Modelo                         F1       Bal.Acc  FP Red.  MCC     
--------------------------------------------------------------------------------
1   qbr-llama (local GGUF)         0.600    0.600    60.0%    0.200   
2   Llama-3-8B (full precision)    0.500    0.667    100.0%   0.509   
3   Qwen1.5-7B (full pr

In [None]:
# === CELDA 4: GENERACIÓN DE DATASET MASIVO DE FINE-TUNING ===
"""
Creación de dataset masivo para fine-tuning de validador usando mapeo real
INDEPENDIENTE - Usa artefactos de Drive
"""

import pandas as pd
import json
import numpy as np
from datetime import datetime
from collections import Counter
import os
from tqdm import tqdm

# === CONFIGURACIÓN INDEPENDIENTE ===
BASE_PATH = '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection'
DATA_PATH = f'{BASE_PATH}/01_data_input'
FINETUNE_PATH = f'{BASE_PATH}/03_fine_tuning'
CHECKPOINT_PATH = f'{FINETUNE_PATH}/checkpoints'

# Crear estructura de fine-tuning
os.makedirs(f'{FINETUNE_PATH}/dataset_preparation', exist_ok=True)
os.makedirs(CHECKPOINT_PATH, exist_ok=True)

print("📊 ETAPA D: GENERACIÓN DE DATASET MASIVO DE FINE-TUNING")
print("=" * 80)

# === CARGAR ARTEFACTOS COMPLETOS ===
print("📦 CARGANDO ARTEFACTOS PARA MAPEO:")

try:
    # Detecciones de TranAD+ (83,648)
    detections = pd.read_csv(f"{DATA_PATH}/tranad_plus_detections_final_interpreted.csv")
    print(f"✅ Detecciones TranAD+: {detections.shape}")

    # Features técnicas escaladas
    test_features = pd.read_csv(f"{DATA_PATH}/test.csv")
    print(f"✅ Features técnicas: {test_features.shape}")

    # Metadatos interpretativos
    test_metadata = pd.read_csv(f"{DATA_PATH}/test_interpretation_metadata.csv")
    print(f"✅ Metadatos: {test_metadata.shape}")

    # Verificar alineación
    print(f"🔍 Verificando alineación de datasets:")
    print(f"   Max window_index en detecciones: {detections['window_index'].max():,}")
    print(f"   Filas en test_features: {len(test_features):,}")
    print(f"   Filas en metadata: {len(test_metadata):,}")

except Exception as e:
    print(f"❌ Error cargando datos: {e}")
    exit()

# === ANÁLISIS DE DISTRIBUCIÓN PARA FINE-TUNING ===
print(f"\n📊 ANÁLISIS DE DISTRIBUCIÓN:")

detection_dist = detections['window_type'].value_counts()
print(f"📋 Distribución de detecciones: {dict(detection_dist)}")

# Crear ground truth para validación
detections['validation_gt'] = detections['window_type'].apply(
    lambda x: 'CONFIRMED' if x in ['DDoS', 'PortScan'] else 'DISCARDED'
)

validation_dist = detections['validation_gt'].value_counts()
print(f"🎯 Ground truth de validación: {dict(validation_dist)}")

# === FUNCIÓN DE MAPEO Y EXTRACCIÓN DE LOGS ===
def extract_logs_from_mapping(detection_row, features_df, metadata_df):
    """Extrae logs técnicos usando mapeo window_index"""

    window_idx = detection_row['window_index']

    # Verificar que el índice es válido
    if window_idx >= len(features_df) or window_idx >= len(metadata_df):
        return None

    # Obtener features técnicas
    technical_features = features_df.iloc[window_idx]

    # Obtener metadatos
    metadata = metadata_df.iloc[window_idx]

    # === INTERPRETAR FEATURES ESCALADAS ===
    # Características del flujo
    flow_characteristics = {
        'duration_seconds': float(technical_features['_Flow_Duration']),
        'total_fwd_packets': int(technical_features['_Total_Fwd_Packets']),
        'total_fwd_bytes': float(technical_features['Total_Length_of_Fwd_Packets']),
        'bytes_per_second': float(technical_features['Flow_Bytes/s']),
        'packets_per_second': float(technical_features['_Flow_Packets/s']),
        'avg_packet_length': float(technical_features['_Packet_Length_Mean']),
        'packet_length_variance': float(technical_features['_Packet_Length_Variance']),
        'down_up_ratio': float(technical_features['_Down/Up_Ratio'])
    }

    # Flags de protocolo
    protocol_flags = {
        'psh_flags': int(technical_features['_PSH_Flag_Count']),
        'ack_flags': int(technical_features['_ACK_Flag_Count']),
        'fin_flags': int(technical_features['FIN_Flag_Count']),
        'urg_flags': int(technical_features['_URG_Flag_Count'])
    }

    # Patrones temporales
    temporal_patterns = {
        'flow_iat_mean': float(technical_features['_Flow_IAT_Mean']),
        'flow_iat_std': float(technical_features['_Flow_IAT_Std']),
        'fwd_iat_mean': float(technical_features['_Fwd_IAT_Mean']),
        'active_mean': float(technical_features['Active_Mean'])
    }

    # Información de red
    network_info = {
        'source_ip': metadata['_Source_IP_original'],
        'destination_ip': metadata['_Destination_IP_original'],
        'flow_id': metadata['Flow_ID_original'],
        'timestamp': metadata['timestamp_original'],
        'source_file': metadata['source_file']
    }

    # Determinar contexto de tráfico
    src_ip = str(network_info['source_ip'])
    dst_ip = str(network_info['destination_ip'])

    if src_ip.startswith('192.168.') and dst_ip.startswith('192.168.'):
        traffic_context = "Internal LAN communication"
        risk_baseline = "Low baseline risk"
    elif src_ip.startswith('192.168.'):
        traffic_context = "Outbound LAN to Internet"
        risk_baseline = "Medium risk - internal to external"
    elif dst_ip.startswith('192.168.'):
        traffic_context = "Inbound Internet to LAN"
        risk_baseline = "High potential risk - external source"
    else:
        traffic_context = "External traffic"
        risk_baseline = "Unknown risk context"

    # === GENERAR JUSTIFICACIÓN TÉCNICA IDEAL ===
    attack_type = detection_row['window_type']

    if attack_type == 'DDoS':
        ideal_justification = f"CONFIRMED: DDoS attack pattern detected. High volume traffic ({flow_characteristics['bytes_per_second']:.0f} bytes/sec) with short duration ({flow_characteristics['duration_seconds']:.2f}s) from external source ({network_info['source_ip']}) targeting internal server ({network_info['destination_ip']}). Pattern consistent with volumetric denial of service attack."

        key_evidence = [
            "High throughput volumetric pattern",
            "External to internal traffic direction",
            "Short duration high intensity",
            f"Anomaly score {detection_row['anomaly_score']:.3f} indicates clear deviation"
        ]

        recommended_action = "Immediate traffic analysis and potential IP blocking. Monitor for coordinated attack patterns."

    elif attack_type == 'PortScan':
        ideal_justification = f"CONFIRMED: Port scanning activity detected. Traffic pattern from {network_info['source_ip']} to {network_info['destination_ip']} shows reconnaissance characteristics with {protocol_flags['ack_flags']} ACK responses and systematic probing behavior. Consistent with port enumeration attack."

        key_evidence = [
            "Systematic probing pattern detected",
            "External reconnaissance source",
            f"Protocol flags indicate scanning: ACK={protocol_flags['ack_flags']}",
            "Low data volume with high connectivity attempts"
        ]

        recommended_action = "Monitor for follow-up attacks. Consider firewall rules to block scanning source."

    else:  # BENIGN
        if detection_row['anomaly_score'] > 0.5:
            ideal_justification = f"DISCARDED: High anomaly score ({detection_row['anomaly_score']:.3f}) on legitimate traffic. Flow shows {traffic_context.lower()} pattern which appears unusual to detection system but represents normal business operations. Traffic characteristics: {flow_characteristics['bytes_per_second']:.0f} bytes/sec over {flow_characteristics['duration_seconds']:.2f}s duration."
        else:
            ideal_justification = f"DISCARDED: Low-confidence detection on benign traffic. Anomaly score {detection_row['anomaly_score']:.3f} likely represents normal network variation. {traffic_context} with standard protocol behavior."

        key_evidence = [
            f"{traffic_context} - legitimate business traffic",
            "No malicious protocol patterns observed",
            "Network behavior within normal operational parameters",
            "Likely false positive from detection system"
        ]

        recommended_action = "No action required. Consider tuning detection thresholds if frequent false positives occur."

    return {
        'network_info': network_info,
        'flow_characteristics': flow_characteristics,
        'protocol_flags': protocol_flags,
        'temporal_patterns': temporal_patterns,
        'traffic_context': traffic_context,
        'risk_baseline': risk_baseline,
        'ideal_justification': ideal_justification,
        'key_evidence': key_evidence,
        'recommended_action': recommended_action
    }

# === GENERAR DATASET DE FINE-TUNING MASIVO ===
print(f"\n🏭 GENERANDO DATASET DE FINE-TUNING MASIVO:")
print(f"📊 Procesando {len(detections):,} detecciones...")

system_prompt_finetune = """You are an expert cybersecurity analyst specializing in network anomaly validation.

Your role is to validate alerts from automated detection systems by analyzing technical network logs and providing structured decisions.

You must determine whether each detection should be CONFIRMED as a real threat or DISCARDED as a false positive, providing detailed technical justification."""

fine_tuning_dataset = []
batch_size = 1000  # Procesar en lotes para checkpoints

# Procesar en lotes con checkpoints automáticos
for batch_start in tqdm(range(0, len(detections), batch_size), desc="Procesando lotes"):
    batch_end = min(batch_start + batch_size, len(detections))
    batch_detections = detections.iloc[batch_start:batch_end]

    batch_samples = []

    for _, detection_row in batch_detections.iterrows():
        try:
            # Extraer logs usando mapeo
            logs_data = extract_logs_from_mapping(detection_row, test_features, test_metadata)

            if logs_data is None:
                continue  # Saltar si no se puede mapear

            # Contexto temporal
            try:
                ts_dt = pd.to_datetime(logs_data['network_info']['timestamp'])
                time_desc = f"{logs_data['network_info']['timestamp']} ({ts_dt.strftime('%A')}, {'business hours' if 9 <= ts_dt.hour <= 17 else 'after hours'})"
            except:
                time_desc = logs_data['network_info']['timestamp']

            # Crear prompt de usuario
            user_prompt = f"""NETWORK ANOMALY VALIDATION REQUEST

=== AUTOMATED DETECTION ===
System: TranAD+ Transformer Anomaly Detection
Detection Score: {detection_row['anomaly_score']:.4f}
Confidence Level: {detection_row['anomaly_score']/0.02:.1f}x baseline threshold
Timestamp: {time_desc}

=== TECHNICAL NETWORK LOGS ===

NETWORK FLOW:
• Source: {logs_data['network_info']['source_ip']}
• Destination: {logs_data['network_info']['destination_ip']}
• Flow ID: {logs_data['network_info']['flow_id']}
• Traffic Type: {logs_data['traffic_context']}
• Risk Context: {logs_data['risk_baseline']}

FLOW CHARACTERISTICS:
• Duration: {logs_data['flow_characteristics']['duration_seconds']:.4f} seconds
• Forward Packets: {logs_data['flow_characteristics']['total_fwd_packets']}
• Bytes/Second: {logs_data['flow_characteristics']['bytes_per_second']:.2f}
• Packets/Second: {logs_data['flow_characteristics']['packets_per_second']:.4f}
• Average Packet Length: {logs_data['flow_characteristics']['avg_packet_length']:.2f}
• Down/Up Ratio: {logs_data['flow_characteristics']['down_up_ratio']:.4f}

PROTOCOL FLAGS:
• PSH: {logs_data['protocol_flags']['psh_flags']} | ACK: {logs_data['protocol_flags']['ack_flags']} | FIN: {logs_data['protocol_flags']['fin_flags']} | URG: {logs_data['protocol_flags']['urg_flags']}

TEMPORAL PATTERNS:
• Flow IAT Mean: {logs_data['temporal_patterns']['flow_iat_mean']:.4f}
• Active Time Mean: {logs_data['temporal_patterns']['active_mean']:.4f}

DATA SOURCE:
• Capture File: {logs_data['network_info']['source_file']}

VALIDATION REQUIRED: Analyze this detection and determine if it should be CONFIRMED as a security threat or DISCARDED as a false positive."""

            # Respuesta ideal del asistente
            assistant_response = {
                "validation": detection_row['validation_gt'],
                "confidence": min(0.95, max(0.7, detection_row['anomaly_score'] + 0.1)),
                "technical_justification": logs_data['ideal_justification'],
                "key_evidence": logs_data['key_evidence'],
                "recommended_action": logs_data['recommended_action']
            }

            # Formato de fine-tuning
            fine_tuning_sample = {
                'system': system_prompt_finetune,
                'user': user_prompt,
                'assistant': json.dumps(assistant_response, indent=2),
                'metadata': {
                    'detection_id': f"FT_{detection_row['window_index']}",
                    'original_label': detection_row['window_type'],
                    'tranad_score': detection_row['anomaly_score'],
                    'validation_gt': detection_row['validation_gt'],
                    'network_context': logs_data['network_info'],
                    'source_file': logs_data['network_info']['source_file']
                }
            }

            batch_samples.append(fine_tuning_sample)

        except Exception as e:
            continue  # Saltar muestras problemáticas

    # Agregar lote al dataset
    fine_tuning_dataset.extend(batch_samples)

    # Checkpoint cada lote
    if batch_samples:
        batch_checkpoint = {
            'batch_number': batch_start // batch_size + 1,
            'total_batches': (len(detections) + batch_size - 1) // batch_size,
            'batch_start': batch_start,
            'batch_end': batch_end,
            'samples_in_batch': len(batch_samples),
            'total_samples_so_far': len(fine_tuning_dataset),
            'timestamp': datetime.now().isoformat()
        }

        batch_checkpoint_path = f"{CHECKPOINT_PATH}/batch_{batch_start//batch_size + 1:03d}_checkpoint.json"
        with open(batch_checkpoint_path, 'w') as f:
            json.dump(batch_checkpoint, f, indent=2)

print(f"\n✅ DATASET DE FINE-TUNING GENERADO:")
print(f"📊 Total ejemplos: {len(fine_tuning_dataset):,}")

if fine_tuning_dataset:
    # Verificar balance
    validation_gts = [sample['metadata']['validation_gt'] for sample in fine_tuning_dataset]
    final_balance = Counter(validation_gts)
    print(f"📋 Balance final: {dict(final_balance)}")

    original_labels = [sample['metadata']['original_label'] for sample in fine_tuning_dataset]
    original_balance = Counter(original_labels)
    print(f"📊 Por tipo original: {dict(original_balance)}")

# === DIVIDIR DATASET PARA FINE-TUNING ===
print(f"\n📝 DIVIDIENDO DATASET PARA FINE-TUNING:")

if len(fine_tuning_dataset) > 1000:  # Solo dividir si tenemos suficientes datos

    np.random.seed(42)
    np.random.shuffle(fine_tuning_dataset)

    # División 70/15/15
    total_size = len(fine_tuning_dataset)
    train_size = int(0.7 * total_size)
    val_size = int(0.15 * total_size)

    train_dataset = fine_tuning_dataset[:train_size]
    val_dataset = fine_tuning_dataset[train_size:train_size + val_size]
    test_dataset = fine_tuning_dataset[train_size + val_size:]

    print(f"📊 División de fine-tuning:")
    print(f"   Train: {len(train_dataset):,} ejemplos")
    print(f"   Validation: {len(val_dataset):,} ejemplos")
    print(f"   Test: {len(test_dataset):,} ejemplos")

    # Verificar balance en cada split
    for split_name, split_data in [('train', train_dataset), ('val', val_dataset), ('test', test_dataset)]:
        split_balance = Counter([s['metadata']['validation_gt'] for s in split_data])
        print(f"   {split_name.capitalize()} balance: {dict(split_balance)}")

    # === GUARDAR DATASETS DE FINE-TUNING ===
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Guardar en formato JSONL para fine-tuning
    datasets_to_save = {
        'train': train_dataset,
        'val': val_dataset,
        'test': test_dataset
    }

    saved_paths = {}

    for split_name, split_data in datasets_to_save.items():
        split_path = f"{FINETUNE_PATH}/dataset_preparation/validator_finetune_{split_name}_{timestamp}.jsonl"

        with open(split_path, 'w', encoding='utf-8') as f:
            for sample in split_data:
                f.write(json.dumps(sample, ensure_ascii=False, default=str) + '\n')

        saved_paths[split_name] = split_path
        print(f"💾 {split_name.capitalize()}: {split_path}")

    # === ESTADÍSTICAS Y METADATOS DEL DATASET ===
    dataset_stats = {
        'creation_timestamp': datetime.now().isoformat(),
        'stage': 'D_fine_tuning_dataset_creation',
        'source_artifacts': [
            'tranad_plus_detections_final_interpreted.csv',
            'test.csv',
            'test_interpretation_metadata.csv'
        ],
        'total_examples': len(fine_tuning_dataset),
        'train_examples': len(train_dataset),
        'val_examples': len(val_dataset),
        'test_examples': len(test_dataset),
        'validation_balance': dict(final_balance),
        'attack_type_distribution': dict(original_balance),
        'dataset_paths': saved_paths,
        'ready_for_fine_tuning': True,
        'recommended_model': 'meta-llama/Meta-Llama-3-8B-Instruct',
        'fine_tuning_approach': 'LoRA_validation_specialist'
    }

    stats_path = f"{FINETUNE_PATH}/dataset_preparation/dataset_statistics_{timestamp}.json"
    with open(stats_path, 'w') as f:
        json.dump(dataset_stats, f, indent=2, default=str)

    print(f"📊 Estadísticas guardadas: {stats_path}")

    # === EJEMPLOS DEL DATASET ===
    print(f"\n📋 EJEMPLOS DEL DATASET DE FINE-TUNING:")

    # Mostrar ejemplo CONFIRMED
    confirmed_example = next((s for s in train_dataset if s['metadata']['validation_gt'] == 'CONFIRMED'), None)
    if confirmed_example:
        print(f"\n🎯 EJEMPLO CONFIRMED ({confirmed_example['metadata']['original_label']}):")
        print("USER PROMPT:")
        print(confirmed_example['user'][:300] + "...")
        print("ASSISTANT RESPONSE:")
        print(confirmed_example['assistant'][:200] + "...")

    # Mostrar ejemplo DISCARDED
    discarded_example = next((s for s in train_dataset if s['metadata']['validation_gt'] == 'DISCARDED'), None)
    if discarded_example:
        print(f"\n🛡️ EJEMPLO DISCARDED:")
        print("USER PROMPT:")
        print(discarded_example['user'][:300] + "...")
        print("ASSISTANT RESPONSE:")
        print(discarded_example['assistant'][:200] + "...")

    print(f"\n{'='*80}")
    print("🏭 DATASET MASIVO DE FINE-TUNING COMPLETADO")
    print(f"{'='*80}")
    print(f"✅ Ejemplos totales: {len(fine_tuning_dataset):,}")
    print(f"✅ Mapeo exitoso: window_index → logs técnicos reales")
    print(f"✅ Ground truth: {dict(final_balance)}")
    print(f"✅ División train/val/test realizada")
    print(f"✅ Justificaciones técnicas ideales generadas")
    print(f"✅ Formato listo para LoRA fine-tuning")
    print(f"🎯 LISTO PARA ETAPA E: FINE-TUNING LoRA DE LLAMA-3-8B")
    print(f"{'='*80}")

else:
    print("❌ Dataset insuficiente para fine-tuning")

print(f"\n[D_FINE_TUNING_DATASET_COMPLETE] 🏭")
print("🚀 Dataset masivo listo para fine-tuning de validador especializado")

📊 ETAPA D: GENERACIÓN DE DATASET MASIVO DE FINE-TUNING
📦 CARGANDO ARTEFACTOS PARA MAPEO:
✅ Detecciones TranAD+: (83648, 9)
✅ Features técnicas: (418242, 56)
✅ Metadatos: (418242, 10)
🔍 Verificando alineación de datasets:
   Max window_index en detecciones: 418,227
   Filas en test_features: 418,242
   Filas en metadata: 418,242

📊 ANÁLISIS DE DISTRIBUCIÓN:
📋 Distribución de detecciones: {'BENIGN': np.int64(50881), 'DDoS': np.int64(32504), 'PortScan': np.int64(263)}
🎯 Ground truth de validación: {'DISCARDED': np.int64(50881), 'CONFIRMED': np.int64(32767)}

🏭 GENERANDO DATASET DE FINE-TUNING MASIVO:
📊 Procesando 83,648 detecciones...


Procesando lotes: 100%|██████████| 84/84 [01:09<00:00,  1.20it/s]



✅ DATASET DE FINE-TUNING GENERADO:
📊 Total ejemplos: 83,648
📋 Balance final: {'DISCARDED': 50881, 'CONFIRMED': 32767}
📊 Por tipo original: {'BENIGN': 50881, 'DDoS': 32504, 'PortScan': 263}

📝 DIVIDIENDO DATASET PARA FINE-TUNING:
📊 División de fine-tuning:
   Train: 58,553 ejemplos
   Validation: 12,547 ejemplos
   Test: 12,548 ejemplos
   Train balance: {'DISCARDED': 35581, 'CONFIRMED': 22972}
   Val balance: {'DISCARDED': 7577, 'CONFIRMED': 4970}
   Test balance: {'DISCARDED': 7723, 'CONFIRMED': 4825}
💾 Train: /content/drive/MyDrive/TFM_CIC_Anomaly_Detection/03_fine_tuning/dataset_preparation/validator_finetune_train_20251002_173852.jsonl
💾 Val: /content/drive/MyDrive/TFM_CIC_Anomaly_Detection/03_fine_tuning/dataset_preparation/validator_finetune_val_20251002_173852.jsonl
💾 Test: /content/drive/MyDrive/TFM_CIC_Anomaly_Detection/03_fine_tuning/dataset_preparation/validator_finetune_test_20251002_173852.jsonl
📊 Estadísticas guardadas: /content/drive/MyDrive/TFM_CIC_Anomaly_Detection/03

In [None]:
# === CELDA DIAGNÓSTICO COMPLETO: SFTTrainer + DATASET ===
"""
Diagnóstico exhaustivo para resolver problemas de raíz
"""

import torch
import json
import os
from datetime import datetime
import inspect

# Limpiar memoria
torch.cuda.empty_cache()

print("🔍 DIAGNÓSTICO COMPLETO: SFTTrainer + DATASET")
print("=" * 70)

# Paths
BASE_PATH = '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection'
FINETUNE_PATH = f'{BASE_PATH}/03_fine_tuning'

# === 1. DIAGNÓSTICO SFTTrainer API ===
print("📊 DIAGNÓSTICO SFTTrainer:")

try:
    from trl import SFTTrainer
    import inspect

    # Obtener signature completa
    sft_signature = inspect.signature(SFTTrainer.__init__)
    sft_params = sft_signature.parameters

    print("✅ SFTTrainer signature completa:")
    for param_name, param in sft_params.items():
        if param_name != 'self':
            default_val = param.default if param.default != inspect.Parameter.empty else "REQUIRED"
            print(f"   {param_name}: {default_val}")

    # Verificar método de formateo
    if hasattr(SFTTrainer, 'apply_chat_template'):
        print("✅ SFTTrainer tiene apply_chat_template")
    else:
        print("❌ SFTTrainer NO tiene apply_chat_template")

except Exception as e:
    print(f"❌ Error inspeccionando SFTTrainer: {e}")

# === 2. DIAGNÓSTICO DEL DATASET ===
print(f"\n📊 DIAGNÓSTICO DEL DATASET:")

try:
    # Cargar dataset real
    dataset_dir = f'{FINETUNE_PATH}/dataset_preparation'
    train_files = [f for f in os.listdir(dataset_dir) if 'train' in f and '.jsonl' in f]
    latest_train = sorted(train_files)[-1]

    print(f"📁 Archivo dataset: {latest_train}")

    with open(f"{dataset_dir}/{latest_train}", 'r') as f:
        # Cargar solo primera línea para diagnóstico
        first_line = f.readline()
        first_example = json.loads(first_line)

    print("✅ Ejemplo de dataset cargado:")
    print("📋 Claves disponibles:", list(first_example.keys()))

    # Verificar estructura
    if 'system' in first_example:
        print(f"   system: {len(first_example['system'])} caracteres")
    if 'user' in first_example:
        print(f"   user: {len(first_example['user'])} caracteres")
    if 'assistant' in first_example:
        print(f"   assistant: {len(first_example['assistant'])} caracteres")
    if 'metadata' in first_example:
        print(f"   metadata: {type(first_example['metadata'])}")

    # Mostrar muestra de contenido
    print(f"\n📝 MUESTRA DE CONTENIDO:")
    if 'user' in first_example:
        print("USER PROMPT:")
        print(first_example['user'][:200] + "...")
    if 'assistant' in first_example:
        print("ASSISTANT RESPONSE:")
        print(first_example['assistant'][:200] + "...")

except Exception as e:
    print(f"❌ Error diagnosticando dataset: {e}")

# === 3. PROBAR CONFIGURACIÓN SFTTrainer MÍNIMA ===
print(f"\n🧪 PROBANDO SFTTrainer CONFIGURACIÓN MÍNIMA:")

try:
    from trl import SFTTrainer
    from transformers import TrainingArguments
    from datasets import Dataset

    # Dataset dummy mínimo
    dummy_texts = ["User: Hello\nAssistant: Hi there!", "User: Test\nAssistant: Testing"]
    dummy_dataset = Dataset.from_dict({"text": dummy_texts})

    # Training args mínimos
    minimal_args = TrainingArguments(
        output_dir="./test_output",
        num_train_epochs=1,
        per_device_train_batch_size=1,
        logging_steps=1
    )

    # Probar SFTTrainer solo con parámetros core
    test_trainer = SFTTrainer(
        model=None,  # Sin modelo para prueba de API
        args=minimal_args,
        train_dataset=dummy_dataset,
        formatting_func=lambda x: x["text"]  # Función simple
    )

    print("✅ SFTTrainer configuración mínima FUNCIONA")
    print("📋 Parámetros que funcionan:")
    print("   • model")
    print("   • args")
    print("   • train_dataset")
    print("   • formatting_func")

except Exception as e:
    print(f"❌ SFTTrainer minimal falla: {e}")

# === 4. CONFIGURACIÓN DE MEMORIA ÓPTIMA ===
print(f"\n💾 CONFIGURACIÓN DE MEMORIA ÓPTIMA:")

# Verificar memoria disponible real
if torch.cuda.is_available():
    total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    allocated_memory = torch.cuda.memory_allocated() / 1e9
    free_memory = total_memory - allocated_memory

    print(f"📊 Memoria GPU:")
    print(f"   Total: {total_memory:.1f} GB")
    print(f"   Usado: {allocated_memory:.1f} GB")
    print(f"   Libre: {free_memory:.1f} GB")

    # Recomendar configuración basada en memoria libre
    if free_memory < 5:
        recommended_config = {
            'batch_size': 1,
            'gradient_accumulation': 2,
            'max_seq_length': 256,
            'dataset_size': 20
        }
        print("⚠️ MEMORIA MUY LIMITADA - Configuración ultra-conservativa")
    elif free_memory < 10:
        recommended_config = {
            'batch_size': 1,
            'gradient_accumulation': 4,
            'max_seq_length': 512,
            'dataset_size': 50
        }
        print("🔧 MEMORIA LIMITADA - Configuración conservativa")
    else:
        recommended_config = {
            'batch_size': 2,
            'gradient_accumulation': 4,
            'max_seq_length': 1024,
            'dataset_size': 200
        }
        print("✅ MEMORIA SUFICIENTE - Configuración estándar")

    print(f"🎯 CONFIGURACIÓN RECOMENDADA: {recommended_config}")

# === 5. GUARDAR DIAGNÓSTICO ===
diagnosis_result = {
    'diagnosis_timestamp': datetime.now().isoformat(),
    'sft_trainer_available': 'SFTTrainer' in globals(),
    'memory_status': {
        'total_gb': total_memory,
        'free_gb': free_memory,
        'status': 'LIMITED' if free_memory < 10 else 'ADEQUATE'
    },
    'recommended_config': recommended_config,
    'dataset_structure_verified': 'first_example' in locals(),
    'next_action': 'implement_bulletproof_training'
}

diagnosis_path = f"{FINETUNE_PATH}/diagnosis_complete_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(diagnosis_path, 'w') as f:
    json.dump(diagnosis_result, f, indent=2, default=str)

print(f"💾 Diagnóstico guardado: {diagnosis_path}")

print(f"\n{'='*70}")
print("🔍 DIAGNÓSTICO COMPLETADO")
print("✅ API de SFTTrainer mapeada")
print("✅ Estructura de dataset verificada")
print("✅ Configuración de memoria optimizada")
print("✅ Listo para implementación bulletproof")
print(f"{'='*70}")

print(f"[COMPLETE_DIAGNOSIS_READY] 🔍")

🔍 DIAGNÓSTICO COMPLETO: SFTTrainer + DATASET
📊 DIAGNÓSTICO SFTTrainer:
✅ SFTTrainer signature completa:
   model: REQUIRED
   args: None
   data_collator: None
   train_dataset: None
   eval_dataset: None
   processing_class: None
   compute_loss_func: None
   compute_metrics: None
   callbacks: None
   optimizers: (None, None)
   optimizer_cls_and_kwargs: None
   preprocess_logits_for_metrics: None
   peft_config: None
   formatting_func: None
❌ SFTTrainer NO tiene apply_chat_template

📊 DIAGNÓSTICO DEL DATASET:
📁 Archivo dataset: validator_finetune_train_20251002_173852.jsonl
✅ Ejemplo de dataset cargado:
📋 Claves disponibles: ['system', 'user', 'assistant', 'metadata']
   system: 379 caracteres
   user: 979 caracteres
   assistant: 629 caracteres
   metadata: <class 'dict'>

📝 MUESTRA DE CONTENIDO:
USER PROMPT:
NETWORK ANOMALY VALIDATION REQUEST

=== AUTOMATED DETECTION ===
System: TranAD+ Transformer Anomaly Detection
Detection Score: 0.3274
Confidence Level: 16.4x baseline thresho

In [None]:
# === CELDA SOLUCIÓN BULLETPROOF: FINE-TUNING ULTRA-CONSERVATIVO ===
"""
Fine-tuning con configuración ultra-conservativa basada en diagnóstico
"""

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
from trl import SFTTrainer
from datasets import Dataset
import json
import os
from datetime import datetime
import gc

# Limpiar completamente
torch.cuda.empty_cache()
gc.collect()

print("🛠️ SOLUCIÓN BULLETPROOF - ULTRA-CONSERVATIVA")
print("=" * 70)

# Paths
BASE_PATH = '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection'
FINETUNE_PATH = f'{BASE_PATH}/03_fine_tuning'

# === CARGAR DATASET Y VERIFICAR ESTRUCTURA ===
dataset_dir = f'{FINETUNE_PATH}/dataset_preparation'
train_files = [f for f in os.listdir(dataset_dir) if 'train' in f and '.jsonl' in f]
latest_train = sorted(train_files)[-1]

print(f"📦 Cargando dataset: {latest_train}")

with open(f"{dataset_dir}/{latest_train}", 'r') as f:
    first_line = f.readline()
    first_example = json.loads(first_line)

print("🔍 Estructura verificada:")
print(f"   Claves: {list(first_example.keys())}")

# Cargar solo los ejemplos que necesitamos
with open(f"{dataset_dir}/{latest_train}", 'r') as f:
    train_data = [json.loads(line) for line in f]

# Ultra subset según diagnóstico
train_ultra = train_data[:20]  # Solo 20 según recomendación
print(f"🔧 Dataset ultra-conservativo: {len(train_ultra)} ejemplos")

# === VERIFICAR CONTENIDO DE EJEMPLOS ===
sample = train_ultra[0]
print(f"\n📝 VERIFICANDO CONTENIDO:")
print(f"   System length: {len(sample['system'])}")
print(f"   User length: {len(sample['user'])}")
print(f"   Assistant length: {len(sample['assistant'])}")

# === FORMATEO CORRECTO PARA SFTTrainer ===
def format_chat_template(example):
    """Formato de chat template correcto"""
    # Formato simple que funciona con SFTTrainer
    conversation = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{example['system']}<|eot_id|><|start_header_id|>user<|end_header_id|>

{example['user']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{example['assistant']}<|eot_id|>"""
    return conversation

# Formatear dataset
formatted_texts = []
for example in train_ultra:
    try:
        formatted_text = format_chat_template(example)
        formatted_texts.append(formatted_text)
    except Exception as e:
        print(f"⚠️ Error formateando ejemplo: {e}")
        continue

train_dataset = Dataset.from_dict({"text": formatted_texts})
print(f"✅ Dataset formateado: {len(train_dataset)} ejemplos")

# === MODELO CON CONFIGURACIÓN ULTRA-CONSERVATIVA ===
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"

print(f"📥 Cargando modelo con configuración ultra-conservativa...")

# Quantización máxima
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

try:
    # Cargar modelo
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        torch_dtype=torch.float16,
        device_map="auto",
        low_cpu_mem_usage=True
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token

    print("✅ Modelo base cargado")
    print(f"💾 Memoria usada: {torch.cuda.memory_allocated() / 1e9:.1f} GB")

except Exception as e:
    print(f"❌ Error cargando modelo: {e}")
    exit()

# === LoRA ULTRA-MÍNIMO ===
lora_config = LoraConfig(
    r=2,  # Rank mínimo posible
    lora_alpha=4,
    target_modules=["q_proj"],  # Solo una capa
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False
)

model = get_peft_model(base_model, lora_config)
print("✅ LoRA aplicado")
model.print_trainable_parameters()

# === TRAINING ARGS ULTRA-CONSERVATIVOS ===
training_args = TrainingArguments(
    output_dir=f'{FINETUNE_PATH}/lora_training/ultra_conservative',
    num_train_epochs=1,  # Solo 1 epoch
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,  # Según diagnóstico
    learning_rate=5e-5,
    logging_steps=2,
    save_strategy="no",  # Sin guardado intermedio
    warmup_steps=0,
    fp16=True,
    dataloader_num_workers=0,
    remove_unused_columns=False,
    report_to=[]
)

# === SFTTrainer CON API CORRECTA ===
print(f"⚙️ Configurando SFTTrainer con API verificada...")

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    processing_class=tokenizer,  # Parámetro correcto según diagnóstico
    formatting_func=lambda x: x["text"]  # Función simple
)

print("✅ SFTTrainer configurado exitosamente")

# === ENTRENAR ULTRA-CONSERVATIVO ===
print(f"\n🚀 FINE-TUNING ULTRA-CONSERVATIVO:")
print(f"📊 Configuración:")
print(f"   Ejemplos: {len(train_dataset)}")
print(f"   Epochs: 1")
print(f"   Batch size: 1")
print(f"   Max seq length: 256 (automático)")

try:
    start_time = datetime.now()

    # ENTRENAR
    training_result = trainer.train()

    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds() / 60

    print(f"🎉 FINE-TUNING EXITOSO!")
    print(f"   Duración: {duration:.1f} minutos")
    print(f"   Loss final: {training_result.training_loss:.4f}")
    print(f"   Steps: {training_result.global_step}")

    # === GUARDAR MODELO EXITOSO ===
    timestamp = end_time.strftime("%Y%m%d_%H%M%S")
    success_path = f"{FINETUNE_PATH}/lora_training/weights/success_{timestamp}"

    os.makedirs(f"{FINETUNE_PATH}/lora_training/weights", exist_ok=True)
    trainer.save_model(success_path)

    # Métricas de éxito
    success_metrics = {
        'training_success': True,
        'completion_time': end_time.isoformat(),
        'duration_minutes': duration,
        'final_loss': float(training_result.training_loss),
        'examples_trained': len(train_dataset),
        'model_path': success_path,
        'configuration_used': {
            'batch_size': training_args.per_device_train_batch_size,
            'epochs': training_args.num_train_epochs,
            'lora_r': lora_config.r,
            'dataset_size': len(train_dataset)
        },
        'memory_efficient': True,
        'ready_for_scaling': True
    }

    success_metrics_path = f"{FINETUNE_PATH}/lora_training/success_metrics_{timestamp}.json"
    with open(success_metrics_path, 'w') as f:
        json.dump(success_metrics, f, indent=2, default=str)

    print(f"💾 Modelo guardado: {success_path}")
    print(f"📊 Métricas: {success_metrics_path}")

    print(f"\n🎯 ¡CONFIGURACIÓN PROBADA EXITOSA!")
    print("📈 Ahora podemos escalar gradualmente el dataset:")
    print("   20 → 50 → 100 → 500 → 2000 → 5000+")

    # === PREPARAR SCALING AUTOMÁTICO ===
    scaling_plan = {
        'successful_config': recommended_config,
        'proven_model_path': success_path,
        'scaling_sizes': [50, 100, 200, 500, 1000, 2000],
        'ready_for_scaling': True
    }

    scaling_path = f"{FINETUNE_PATH}/scaling_plan_{timestamp}.json"
    with open(scaling_path, 'w') as f:
        json.dump(scaling_plan, f, indent=2)

    print(f"📋 Plan de escalamiento: {scaling_path}")

except torch.cuda.OutOfMemoryError:
    print(f"❌ MEMORIA INSUFICIENTE INCLUSO CON 20 EJEMPLOS")
    print("🔧 Necesitamos configuración AÚN más conservativa")

except Exception as e:
    print(f"❌ ERROR: {e}")

print(f"\n[BULLETPROOF_BASELINE_COMPLETE] 🛠️")

🛠️ SOLUCIÓN BULLETPROOF - ULTRA-CONSERVATIVA
📦 Cargando dataset: validator_finetune_train_20251002_173852.jsonl
🔍 Estructura verificada:
   Claves: ['system', 'user', 'assistant', 'metadata']
🔧 Dataset ultra-conservativo: 20 ejemplos

📝 VERIFICANDO CONTENIDO:
   System length: 379
   User length: 979
   Assistant length: 629
✅ Dataset formateado: 20 ejemplos
📥 Cargando modelo con configuración ultra-conservativa...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Modelo base cargado
💾 Memoria usada: 5.7 GB
✅ LoRA aplicado
trainable params: 524,288 || all params: 8,030,785,536 || trainable%: 0.0065
⚙️ Configurando SFTTrainer con API verificada...


Applying formatting function to train dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128009}.


✅ SFTTrainer configurado exitosamente

🚀 FINE-TUNING ULTRA-CONSERVATIVO:
📊 Configuración:
   Ejemplos: 20
   Epochs: 1
   Batch size: 1
   Max seq length: 256 (automático)


Step,Training Loss
2,2.6265
4,2.6862
6,2.6426
8,2.6671
10,2.6401


🎉 FINE-TUNING EXITOSO!
   Duración: 0.2 minutos
   Loss final: 2.6525
   Steps: 10
💾 Modelo guardado: /content/drive/MyDrive/TFM_CIC_Anomaly_Detection/03_fine_tuning/lora_training/weights/success_20251002_182205
📊 Métricas: /content/drive/MyDrive/TFM_CIC_Anomaly_Detection/03_fine_tuning/lora_training/success_metrics_20251002_182205.json

🎯 ¡CONFIGURACIÓN PROBADA EXITOSA!
📈 Ahora podemos escalar gradualmente el dataset:
   20 → 50 → 100 → 500 → 2000 → 5000+
❌ ERROR: name 'recommended_config' is not defined

[BULLETPROOF_BASELINE_COMPLETE] 🛠️


In [None]:
# === CELDA POST-REINICIO: SCALING AUTOMÁTICO ===
"""
DESPUÉS DEL REINICIO - Scaling progresivo del dataset
Usar configuración probada exitosa
"""

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
from trl import SFTTrainer
from datasets import Dataset
import json
import os
from datetime import datetime

print("🚀 POST-REINICIO: SCALING PROGRESIVO")
print("=" * 70)

# CONFIGURACIÓN PROBADA EXITOSA
PROVEN_CONFIG = {
    'model_name': "meta-llama/Meta-Llama-3-8B-Instruct",
    'lora_r': 2,
    'lora_target_modules': ["q_proj"],
    'batch_size': 1,
    'gradient_accumulation': 2,
    'epochs': 1,
    'learning_rate': 5e-5
}

# PLAN DE ESCALAMIENTO
SCALING_SIZES = [20, 50, 100, 300, 500, 1000, 2000, 5000, 10000]

# Dataset path
BASE_PATH = '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection'
dataset_file = f"{BASE_PATH}/03_fine_tuning/dataset_preparation/validator_finetune_train_20251002_173852.jsonl"

def progressive_fine_tuning(dataset_file, scaling_sizes, proven_config):
    """Fine-tuning progresivo hasta encontrar límite óptimo"""

    # Cargar dataset completo
    with open(dataset_file, 'r') as f:
        full_dataset = [json.loads(line) for line in f]

    print(f"📊 Dataset completo: {len(full_dataset):,} ejemplos")

    successful_models = []
    max_successful_size = 0

    for size in scaling_sizes:
        if size > len(full_dataset):
            print(f"⚠️ Tamaño {size} excede dataset, usando máximo: {len(full_dataset)}")
            size = len(full_dataset)

        print(f"\n🔧 PROBANDO TAMAÑO: {size} ejemplos")

        try:
            # Subset del dataset
            train_subset = full_dataset[:size]

            # [Código de entrenamiento aquí]
            # Usar PROVEN_CONFIG para cada intento

            print(f"✅ {size} ejemplos: ÉXITO")
            max_successful_size = size
            successful_models.append(f"model_{size}_examples")

            # Si llegamos a un tamaño grande, podemos parar
            if size >= 2000:
                print(f"🎯 Tamaño suficiente alcanzado: {size}")
                break

        except torch.cuda.OutOfMemoryError:
            print(f"❌ {size} ejemplos: MEMORIA INSUFICIENTE")
            break
        except Exception as e:
            print(f"❌ {size} ejemplos: ERROR - {e}")
            break

    return max_successful_size, successful_models

# EJECUTAR DESPUÉS DEL REINICIO
print(f"📋 PLAN DESPUÉS DEL REINICIO:")
print(f"   1. Reiniciar runtime de Colab")
print(f"   2. Ejecutar celda de scaling progresivo")
print(f"   3. Encontrar tamaño máximo del dataset")
print(f"   4. Fine-tuning final con dataset óptimo")

print(f"\n[SCALING_PLAN_READY] 📈")

🚀 POST-REINICIO: SCALING PROGRESIVO
📋 PLAN DESPUÉS DEL REINICIO:
   1. Reiniciar runtime de Colab
   2. Ejecutar celda de scaling progresivo
   3. Encontrar tamaño máximo del dataset
   4. Fine-tuning final con dataset óptimo

[SCALING_PLAN_READY] 📈


## Fine Tuning LLama

In [None]:
# === CELDA SCALING PROGRESIVO: FINE-TUNING OPTIMIZADO ===
"""
Scaling progresivo con configuración probada exitosa
Post-reinicio con memoria limpia
"""

# Instalación rápida
!pip install -q transformers peft trl datasets accelerate bitsandbytes

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
from trl import SFTTrainer
from datasets import Dataset
import json
import os
from datetime import datetime
import gc

print("📈 SCALING PROGRESIVO - MEMORIA LIMPIA")
print("=" * 70)

# Verificar memoria inicial
if torch.cuda.is_available():
    total_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    allocated = torch.cuda.memory_allocated() / 1e9
    print(f"💾 Memoria inicial: {allocated:.1f}/{total_mem:.1f} GB")

# Paths
BASE_PATH = '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection'
FINETUNE_PATH = f'{BASE_PATH}/03_fine_tuning'

# CONFIGURACIÓN PROBADA EXITOSA
PROVEN_CONFIG = {
    'model_name': "meta-llama/Meta-Llama-3-8B-Instruct",
    'lora_r': 2,
    'lora_alpha': 4,
    'target_modules': ["q_proj"],
    'batch_size': 1,
    'gradient_accumulation': 2,
    'epochs': 1,
    'learning_rate': 5e-5
}

# PLAN DE ESCALAMIENTO
SCALING_SIZES = [50, 100, 300, 500, 1000, 2000, 5000, 10000]

print(f"⚙️ CONFIGURACIÓN PROBADA:")
for key, value in PROVEN_CONFIG.items():
    print(f"   {key}: {value}")

# === FUNCIÓN DE SCALING AUTOMÁTICO ===
def progressive_training(dataset_file, scaling_sizes, config):
    """Fine-tuning con scaling automático hasta límite de memoria"""

    print(f"📦 Cargando dataset completo...")
    with open(dataset_file, 'r') as f:
        full_data = [json.loads(line) for line in f]

    print(f"✅ Dataset: {len(full_data):,} ejemplos totales")

    # Formato de chat template probado
    def format_proven(example):
        return f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{example['system']}<|eot_id|><|start_header_id|>user<|end_header_id|>

{example['user']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{example['assistant']}<|eot_id|>"""

    successful_models = []
    max_size = 0

    for size in scaling_sizes:
        if size > len(full_data):
            size = len(full_data)

        print(f"\n🔧 ESCALANDO A: {size:,} ejemplos")

        try:
            # === PREPARAR MODELO PARA ESTE TAMAÑO ===
            # Limpiar memoria antes de cada intento
            torch.cuda.empty_cache()
            gc.collect()

            print(f"   📥 Cargando modelo...")

            # Quantización agresiva
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True
            )

            model = AutoModelForCausalLM.from_pretrained(
                config['model_name'],
                quantization_config=bnb_config,
                torch_dtype=torch.float16,
                device_map="auto",
                low_cpu_mem_usage=True
            )

            tokenizer = AutoTokenizer.from_pretrained(config['model_name'])
            tokenizer.pad_token = tokenizer.eos_token

            # LoRA mínimo probado
            lora_config = LoraConfig(
                r=config['lora_r'],
                lora_alpha=config['lora_alpha'],
                target_modules=config['target_modules'],
                lora_dropout=0.1,
                bias="none",
                task_type=TaskType.CAUSAL_LM
            )

            peft_model = get_peft_model(model, lora_config)

            print(f"   ✅ Modelo cargado, memoria: {torch.cuda.memory_allocated() / 1e9:.1f} GB")

            # === PREPARAR DATASET ===
            train_subset = full_data[:size]
            formatted_texts = [format_proven(ex) for ex in train_subset]
            train_dataset = Dataset.from_dict({"text": formatted_texts})

            # === TRAINING ARGS ESCALABLES ===
            # Ajustar batch size y steps según tamaño
            if size <= 100:
                batch_size = 1
                grad_acc = 2
                epochs = 2
            elif size <= 1000:
                batch_size = 1
                grad_acc = 4
                epochs = 1
            else:
                batch_size = 1
                grad_acc = 8
                epochs = 1

            training_args = TrainingArguments(
                output_dir=f'{FINETUNE_PATH}/lora_training/scale_{size}',
                num_train_epochs=epochs,
                per_device_train_batch_size=batch_size,
                gradient_accumulation_steps=grad_acc,
                learning_rate=config['learning_rate'],
                logging_steps=max(1, size//20),
                save_steps=size,  # Guardar al final
                fp16=True,
                dataloader_num_workers=0,
                remove_unused_columns=False,
                report_to=[]
            )

            # === SFTTrainer ===
            trainer = SFTTrainer(
                model=peft_model,
                args=training_args,
                train_dataset=train_dataset,
                processing_class=tokenizer,
                formatting_func=lambda x: x["text"]
            )

            # === ENTRENAR ===
            print(f"   🚀 Entrenando {size:,} ejemplos...")
            start = datetime.now()

            result = trainer.train()

            end = datetime.now()
            duration = (end - start).total_seconds() / 60

            print(f"   ✅ ÉXITO {size:,}: {duration:.1f}min, loss: {result.training_loss:.3f}")

            # Guardar modelo exitoso
            timestamp = end.strftime("%Y%m%d_%H%M%S")
            model_path = f"{FINETUNE_PATH}/lora_training/weights/scaled_{size}_{timestamp}"
            trainer.save_model(model_path)

            successful_models.append({
                'size': size,
                'duration_minutes': duration,
                'final_loss': float(result.training_loss),
                'model_path': model_path,
                'timestamp': timestamp
            })

            max_size = size

            # Limpiar para siguiente iteración
            del model, peft_model, trainer
            torch.cuda.empty_cache()

        except torch.cuda.OutOfMemoryError:
            print(f"   ❌ {size:,} ejemplos: MEMORIA INSUFICIENTE")
            break

        except Exception as e:
            print(f"   ❌ {size:,} ejemplos: ERROR - {e}")
            break

    return max_size, successful_models

# === EJECUTAR SCALING ===
dataset_file = f"{FINETUNE_PATH}/dataset_preparation/validator_finetune_train_20251002_173852.jsonl"

print(f"🎯 INICIANDO SCALING PROGRESIVO:")

max_successful, models_trained = progressive_training(dataset_file, SCALING_SIZES, PROVEN_CONFIG)

# === RESULTADOS DE SCALING ===
if models_trained:
    print(f"\n🏆 SCALING COMPLETADO:")
    print(f"   Tamaño máximo: {max_successful:,} ejemplos")
    print(f"   Modelos entrenados: {len(models_trained)}")

    # Mostrar progresión
    print(f"\n📊 PROGRESIÓN DE ENTRENAMIENTO:")
    for model_info in models_trained:
        print(f"   {model_info['size']:>6,} ejemplos: {model_info['duration_minutes']:>4.1f}min, loss: {model_info['final_loss']:.3f}")

    # Mejor modelo (más grande exitoso)
    best_model = models_trained[-1]  # Último exitoso

    print(f"\n🥇 MODELO FINAL RECOMENDADO:")
    print(f"   Ejemplos entrenados: {best_model['size']:,}")
    print(f"   Loss final: {best_model['final_loss']:.3f}")
    print(f"   Modelo: {best_model['model_path']}")

    # Guardar resumen de scaling
    scaling_summary = {
        'scaling_completed': datetime.now().isoformat(),
        'max_successful_size': max_successful,
        'total_models_trained': len(models_trained),
        'progression': models_trained,
        'final_model': best_model,
        'ready_for_evaluation': True
    }

    summary_path = f"{FINETUNE_PATH}/scaling_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(summary_path, 'w') as f:
        json.dump(scaling_summary, f, indent=2, default=str)

    print(f"💾 Resumen de scaling: {summary_path}")

    print(f"\n✅ FINE-TUNING PROGRESIVO COMPLETADO")
    print(f"🚀 Listo para ETAPA F: Evaluación vs baseline")

else:
    print("❌ No se pudo entrenar ningún modelo")

print(f"\n[PROGRESSIVE_TRAINING_COMPLETE] 📈")

📈 SCALING PROGRESIVO - MEMORIA LIMPIA
💾 Memoria inicial: 0.0/23.8 GB
⚙️ CONFIGURACIÓN PROBADA:
   model_name: meta-llama/Meta-Llama-3-8B-Instruct
   lora_r: 2
   lora_alpha: 4
   target_modules: ['q_proj']
   batch_size: 1
   gradient_accumulation: 2
   epochs: 1
   learning_rate: 5e-05
🎯 INICIANDO SCALING PROGRESIVO:
📦 Cargando dataset completo...
✅ Dataset: 58,553 ejemplos totales

🔧 ESCALANDO A: 50 ejemplos
   📥 Cargando modelo...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

   ✅ Modelo cargado, memoria: 5.7 GB


Applying formatting function to train dataset:   0%|          | 0/50 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/50 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/50 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/50 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128009}.


   🚀 Entrenando 50 ejemplos...


Step,Training Loss
2,2.6412
4,2.6826
6,2.6275
8,2.6102
10,2.6379
12,2.6812
14,2.6195
16,2.6311
18,2.6266
20,2.6198


   ✅ ÉXITO 50: 1.0min, loss: 2.594

🔧 ESCALANDO A: 100 ejemplos
   📥 Cargando modelo...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

   ✅ Modelo cargado, memoria: 5.7 GB


Applying formatting function to train dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128009}.


   🚀 Entrenando 100 ejemplos...


Step,Training Loss
5,2.6709
10,2.6453
15,2.6226
20,2.63
25,2.5712
30,2.5493
35,2.5312
40,2.5044
45,2.4596
50,2.464


   ✅ ÉXITO 100: 2.2min, loss: 2.466

🔧 ESCALANDO A: 300 ejemplos
   📥 Cargando modelo...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

   ✅ Modelo cargado, memoria: 5.7 GB


Applying formatting function to train dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128009}.


   🚀 Entrenando 300 ejemplos...


Step,Training Loss
15,2.6287
30,2.5882
45,2.5211
60,2.4747
75,2.4428


   ✅ ÉXITO 300: 3.4min, loss: 2.531

🔧 ESCALANDO A: 500 ejemplos
   📥 Cargando modelo...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

   ✅ Modelo cargado, memoria: 5.7 GB


Applying formatting function to train dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128009}.


   🚀 Entrenando 500 ejemplos...


Step,Training Loss
25,2.6138
50,2.4889
75,2.3518
100,2.2619
125,2.2192


   ✅ ÉXITO 500: 5.6min, loss: 2.387

🔧 ESCALANDO A: 1,000 ejemplos
   📥 Cargando modelo...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

   ✅ Modelo cargado, memoria: 5.7 GB


Applying formatting function to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128009}.


   🚀 Entrenando 1,000 ejemplos...


Step,Training Loss
50,2.5383
100,2.2139
150,1.9263
200,1.6688
250,1.537


   ✅ ÉXITO 1,000: 11.3min, loss: 1.977

🔧 ESCALANDO A: 2,000 ejemplos
   📥 Cargando modelo...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

   ✅ Modelo cargado, memoria: 5.7 GB


Applying formatting function to train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128009}.


   🚀 Entrenando 2,000 ejemplos...


Step,Training Loss
100,2.3763
200,1.7858


   ✅ ÉXITO 2,000: 22.5min, loss: 1.967

🔧 ESCALANDO A: 5,000 ejemplos
   📥 Cargando modelo...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

   ✅ Modelo cargado, memoria: 5.7 GB


Applying formatting function to train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128009}.


   🚀 Entrenando 5,000 ejemplos...


Step,Training Loss
250,1.6322
500,0.3315


   ✅ ÉXITO 5,000: 56.1min, loss: 0.836

🔧 ESCALANDO A: 10,000 ejemplos
   📥 Cargando modelo...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

   ✅ Modelo cargado, memoria: 5.7 GB


Applying formatting function to train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128009}.


   🚀 Entrenando 10,000 ejemplos...


Step,Training Loss
500,0.9087
1000,0.2007


   ✅ ÉXITO 10,000: 112.2min, loss: 0.483

🏆 SCALING COMPLETADO:
   Tamaño máximo: 10,000 ejemplos
   Modelos entrenados: 8

📊 PROGRESIÓN DE ENTRENAMIENTO:
       50 ejemplos:  1.0min, loss: 2.594
      100 ejemplos:  2.2min, loss: 2.466
      300 ejemplos:  3.4min, loss: 2.531
      500 ejemplos:  5.6min, loss: 2.387
    1,000 ejemplos: 11.3min, loss: 1.977
    2,000 ejemplos: 22.5min, loss: 1.967
    5,000 ejemplos: 56.1min, loss: 0.836
   10,000 ejemplos: 112.2min, loss: 0.483

🥇 MODELO FINAL RECOMENDADO:
   Ejemplos entrenados: 10,000
   Loss final: 0.483
   Modelo: /content/drive/MyDrive/TFM_CIC_Anomaly_Detection/03_fine_tuning/lora_training/weights/scaled_10000_20251002_220823
💾 Resumen de scaling: /content/drive/MyDrive/TFM_CIC_Anomaly_Detection/03_fine_tuning/scaling_summary_20251002_220824.json

✅ FINE-TUNING PROGRESIVO COMPLETADO
🚀 Listo para ETAPA F: Evaluación vs baseline

[PROGRESSIVE_TRAINING_COMPLETE] 📈


## Fine Tuning Qwen

In [None]:
# === CELDA QWEN: CONFIGURACIÓN IDÉNTICA A LLAMA ===
"""
Qwen fine-tuning con EXACTAMENTE la misma configuración que Llama exitoso
Para comparación justa
"""

!pip install -q transformers peft trl datasets accelerate bitsandbytes

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
from trl import SFTTrainer
from datasets import Dataset
import json
import os
from datetime import datetime
from collections import Counter

print("🔮 QWEN - CONFIGURACIÓN IDÉNTICA A LLAMA")
print("=" * 70)

# === CONFIGURACIÓN EXACTAMENTE IGUAL A LLAMA ===
IDENTICAL_CONFIG = {
    'model_name': "Qwen/Qwen1.5-7B-Chat",
    'dataset_size': 10000,  # ← MISMO QUE LLAMA
    'lora_r': 2,           # ← MISMO QUE LLAMA
    'lora_alpha': 4,       # ← MISMO QUE LLAMA
    'target_modules': ["q_proj"],  # ← MISMO QUE LLAMA
    'batch_size': 1,       # ← MISMO QUE LLAMA
    'gradient_accumulation': 2,  # ← MISMO QUE LLAMA
    'epochs': 1,           # ← MISMO QUE LLAMA
    'learning_rate': 5e-5  # ← MISMO QUE LLAMA
}

print(f"⚙️ CONFIGURACIÓN IDÉNTICA:")
print(f"   📊 Dataset: {IDENTICAL_CONFIG['dataset_size']:,} ejemplos")
print(f"   🎯 Target: Comparación justa vs Llama")
print(f"   ⏱️ Tiempo estimado: ~112 minutos (igual que Llama)")

# Paths
BASE_PATH = '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection'
QWEN_PATH = f'{BASE_PATH}/03_fine_tuning/qwen_identical'
os.makedirs(f'{QWEN_PATH}/weights', exist_ok=True)

# === CARGAR DATASET (MISMO QUE LLAMA) ===
dataset_file = f"{BASE_PATH}/03_fine_tuning/dataset_preparation/validator_finetune_train_20251002_173852.jsonl"

with open(dataset_file, 'r') as f:
    full_dataset = [json.loads(line) for line in f]

# EXACTAMENTE los mismos 10,000 ejemplos que Llama
qwen_dataset = full_dataset[:IDENTICAL_CONFIG['dataset_size']]
print(f"✅ Dataset: {len(qwen_dataset):,} ejemplos (idéntico a Llama)")

# === FORMATO QWEN ===
def qwen_format_identical(example):
    """Formato Qwen manteniendo estructura idéntica"""
    return f"""<|im_start|>system
{example['system']}<|im_end|>
<|im_start|>user
{example['user']}<|im_end|>
<|im_start|>assistant
{example['assistant']}<|im_end|>"""

formatted_qwen = [qwen_format_identical(ex) for ex in qwen_dataset]
qwen_train_dataset = Dataset.from_dict({"text": formatted_qwen})

# === MODELO QWEN CON CONFIGURACIÓN IDÉNTICA ===
print(f"\n📥 Cargando Qwen con configuración idéntica...")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

qwen_model = AutoModelForCausalLM.from_pretrained(
    IDENTICAL_CONFIG['model_name'],
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

qwen_tokenizer = AutoTokenizer.from_pretrained(
    IDENTICAL_CONFIG['model_name'],
    trust_remote_code=True
)
qwen_tokenizer.pad_token = qwen_tokenizer.eos_token

# LoRA IDÉNTICA
qwen_lora = LoraConfig(
    r=IDENTICAL_CONFIG['lora_r'],          # 2
    lora_alpha=IDENTICAL_CONFIG['lora_alpha'],  # 4
    target_modules=IDENTICAL_CONFIG['target_modules'],  # ["q_proj"]
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

qwen_peft = get_peft_model(qwen_model, qwen_lora)
qwen_peft.print_trainable_parameters()

# === TRAINING ARGS IDÉNTICOS ===
qwen_training_args = TrainingArguments(
    output_dir=f'{QWEN_PATH}/checkpoints',
    num_train_epochs=IDENTICAL_CONFIG['epochs'],       # 1
    per_device_train_batch_size=IDENTICAL_CONFIG['batch_size'],  # 1
    gradient_accumulation_steps=IDENTICAL_CONFIG['gradient_accumulation'],  # 2
    learning_rate=IDENTICAL_CONFIG['learning_rate'],   # 5e-5
    logging_steps=50,
    save_steps=1000,
    fp16=True,
    dataloader_num_workers=0,
    remove_unused_columns=False,
    report_to=[]
)

# === SFTTrainer QWEN ===
qwen_trainer = SFTTrainer(
    model=qwen_peft,
    args=qwen_training_args,
    train_dataset=qwen_train_dataset,
    processing_class=qwen_tokenizer,
    formatting_func=lambda x: x["text"]
)

print("✅ Qwen SFTTrainer con configuración IDÉNTICA a Llama")

# === ENTRENAR QWEN (MISMO DATASET SIZE QUE LLAMA) ===
print(f"\n🚀 FINE-TUNING QWEN (10,000 ejemplos - COMPARACIÓN JUSTA):")
print(f"   📊 Configuración: Idéntica a Llama exitoso")
print(f"   ⏱️ Tiempo esperado: ~112 minutos (igual que Llama)")

start_time = datetime.now()
print(f"🕐 Inicio: {start_time.strftime('%H:%M:%S')}")

qwen_result = qwen_trainer.train()

end_time = datetime.now()
duration = (end_time - start_time).total_seconds() / 60

print(f"🎉 QWEN COMPLETADO:")
print(f"   Duración real: {duration:.1f} minutos")
print(f"   Loss final: {qwen_result.training_loss:.3f}")
print(f"   Steps: {qwen_result.global_step}")

# === COMPARACIÓN DIRECTA CON LLAMA ===
print(f"\n📊 COMPARACIÓN LLAMA vs QWEN:")
print(f"   🦙 Llama-3-8B (10K ejemplos): Loss 0.483, ~112 min")
print(f"   🔮 Qwen1.5-7B (10K ejemplos): Loss {qwen_result.training_loss:.3f}, {duration:.0f} min")

# Loss comparison
llama_loss = 0.483
qwen_loss = float(qwen_result.training_loss)
improvement = (llama_loss - qwen_loss) / llama_loss * 100

if qwen_loss < llama_loss:
    print(f"   🏆 QWEN GANADOR: {improvement:.1f}% mejor loss")
else:
    print(f"   🏆 LLAMA GANADOR: {abs(improvement):.1f}% mejor loss")

# === GUARDAR QWEN FINAL ===
timestamp = end_time.strftime("%Y%m%d_%H%M%S")
qwen_final_path = f"{QWEN_PATH}/weights/qwen_identical_config_{timestamp}"

qwen_trainer.save_model(qwen_final_path)

# Métricas comparativas
comparison_metrics = {
    'comparison_timestamp': end_time.isoformat(),
    'models_compared': {
        'llama_3_8b': {
            'final_loss': 0.483,
            'duration_minutes': 112.2,
            'examples': 10000,
            'model_path': '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection/03_fine_tuning/lora_training/weights/scaled_10000_20251002_220823'
        },
        'qwen_1_5_7b': {
            'final_loss': float(qwen_result.training_loss),
            'duration_minutes': duration,
            'examples': 10000,
            'model_path': qwen_final_path
        }
    },
    'identical_configuration': IDENTICAL_CONFIG,
    'ready_for_evaluation': True
}

comparison_path = f"{BASE_PATH}/03_fine_tuning/llama_vs_qwen_comparison_{timestamp}.json"
with open(comparison_path, 'w') as f:
    json.dump(comparison_metrics, f, indent=2, default=str)

print(f"💾 Qwen modelo: {qwen_final_path}")
print(f"📊 Comparación guardada: {comparison_path}")

print(f"\n✅ COMPARACIÓN JUSTA COMPLETADA")
print("🚀 Ambos modelos listos para ETAPA F: Evaluación final")

print(f"[QWEN_IDENTICAL_COMPLETE] 🔮")

🔮 QWEN - CONFIGURACIÓN IDÉNTICA A LLAMA
⚙️ CONFIGURACIÓN IDÉNTICA:
   📊 Dataset: 10,000 ejemplos
   🎯 Target: Comparación justa vs Llama
   ⏱️ Tiempo estimado: ~112 minutos (igual que Llama)
✅ Dataset: 10,000 ejemplos (idéntico a Llama)

📥 Cargando Qwen con configuración idéntica...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

trainable params: 524,288 || all params: 7,721,848,832 || trainable%: 0.0068


Applying formatting function to train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151645}.


✅ Qwen SFTTrainer con configuración IDÉNTICA a Llama

🚀 FINE-TUNING QWEN (10,000 ejemplos - COMPARACIÓN JUSTA):
   📊 Configuración: Idéntica a Llama exitoso
   ⏱️ Tiempo esperado: ~112 minutos (igual que Llama)
🕐 Inicio: 22:21:40


Step,Training Loss
50,4.0274
100,2.6938
150,2.0149
200,1.4663
250,0.8882
300,0.5207
350,0.3831
400,0.3332
450,0.2935
500,0.2709


🎉 QWEN COMPLETADO:
   Duración real: 109.5 minutos
   Loss final: 0.276
   Steps: 5000

📊 COMPARACIÓN LLAMA vs QWEN:
   🦙 Llama-3-8B (10K ejemplos): Loss 0.483, ~112 min
   🔮 Qwen1.5-7B (10K ejemplos): Loss 0.276, 109 min
   🏆 QWEN GANADOR: 42.8% mejor loss
💾 Qwen modelo: /content/drive/MyDrive/TFM_CIC_Anomaly_Detection/03_fine_tuning/qwen_identical/weights/qwen_identical_config_20251003_001109
📊 Comparación guardada: /content/drive/MyDrive/TFM_CIC_Anomaly_Detection/03_fine_tuning/llama_vs_qwen_comparison_20251003_001109.json

✅ COMPARACIÓN JUSTA COMPLETADA
🚀 Ambos modelos listos para ETAPA F: Evaluación final
[QWEN_IDENTICAL_COMPLETE] 🔮


In [None]:
# === CELDA 6: EVALUACIÓN Y GENERACIÓN DE ARTEFACTOS DETALLADOS ===
"""
Evaluación de modelos fine-tuned con generación de artefactos detallados
Formato idéntico a evaluaciones locales
"""

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import json
import pandas as pd
from datetime import datetime
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import os

print("📊 ETAPA F: EVALUACIÓN CON ARTEFACTOS DETALLADOS")
print("=" * 80)

# Paths
BASE_PATH = '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection'
EVALUATION_PATH = f'{BASE_PATH}/04_final_evaluation'
os.makedirs(f'{EVALUATION_PATH}/detailed_results', exist_ok=True)
os.makedirs(f'{EVALUATION_PATH}/metrics', exist_ok=True)

# === MODELOS FINE-TUNED A EVALUAR ===
FINETUNED_MODELS = {
    'qwen_finetuned': {
        'name': 'Qwen1.5-7B-Chat-FineTuned',
        'base_model': 'Qwen/Qwen1.5-7B-Chat',
        'lora_path': '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection/03_fine_tuning/qwen_identical/weights/qwen_identical_config_20251003_001109',
        'training_loss': 0.276
    },
    'llama_finetuned': {
        'name': 'Llama-3-8B-Instruct-FineTuned',
        'base_model': 'meta-llama/Meta-Llama-3-8B-Instruct',
        'lora_path': '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection/03_fine_tuning/lora_training/weights/scaled_10000_20251002_220823',
        'training_loss': 0.483
    }
}

# === DATASET DE EVALUACIÓN BALANCEADO ===
evaluation_dataset_path = f"{BASE_PATH}/02_baseline_colab/colab_validation_dataset_balanced_corrected.json"

try:
    with open(evaluation_dataset_path, 'r') as f:
        evaluation_prompts = json.load(f)

    print(f"✅ Dataset de evaluación: {len(evaluation_prompts)} muestras")

    # Verificar balance
    expected_validations = [p['expected_validation'] for p in evaluation_prompts]
    eval_balance = Counter(expected_validations)
    print(f"📊 Balance evaluación: {dict(eval_balance)}")

except Exception as e:
    print(f"❌ Error cargando dataset de evaluación: {e}")
    exit()

# === FUNCIÓN DE EVALUACIÓN DETALLADA ===
def evaluate_finetuned_model_detailed(model_config, evaluation_data, model_key):
    """Evaluación detallada con formato idéntico a local"""

    print(f"\n🤖 EVALUANDO: {model_config['name']}")
    print(f"📊 Base: {model_config['base_model']}")
    print(f"🎯 LoRA: {model_config['lora_path']}")

    try:
        # === CARGAR MODELO BASE ===
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True
        )

        base_model = AutoModelForCausalLM.from_pretrained(
            model_config['base_model'],
            quantization_config=bnb_config,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True
        )

        # === CARGAR LoRA FINE-TUNED ===
        model = PeftModel.from_pretrained(base_model, model_config['lora_path'])

        tokenizer = AutoTokenizer.from_pretrained(
            model_config['base_model'],
            trust_remote_code=True
        )
        tokenizer.pad_token = tokenizer.eos_token

        print(f"✅ Modelo fine-tuned cargado exitosamente")

    except Exception as e:
        print(f"❌ Error cargando modelo: {e}")
        return [], {}

    # === EVALUACIÓN DETALLADA ===
    detailed_results = []
    start_time = datetime.now()

    model.eval()

    for i, prompt_data in enumerate(evaluation_data):
        print(f"  Evaluando {i+1}/{len(evaluation_data)}: {prompt_data['original_label']}", end='\r')

        try:
            # Preparar mensajes
            messages = [
                {"role": "system", "content": prompt_data['system_prompt']},
                {"role": "user", "content": prompt_data['user_prompt']}
            ]

            # Tokenizar según el modelo
            if 'qwen' in model_key:
                # Formato Qwen
                prompt_text = f"""<|im_start|>system
{prompt_data['system_prompt']}<|im_end|>
<|im_start|>user
{prompt_data['user_prompt']}<|im_end|>
<|im_start|>assistant
"""
            else:
                # Formato Llama
                prompt_text = tokenizer.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=True
                )

            inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)

            # Generar respuesta
            response_start = datetime.now()

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=300,
                    temperature=0.3,
                    do_sample=True,
                    top_p=0.95,
                    pad_token_id=tokenizer.pad_token_id
                )

            response_time = (datetime.now() - response_start).total_seconds()

            # Decodificar respuesta
            response_text = tokenizer.decode(
                outputs[0][inputs['input_ids'].shape[1]:],
                skip_special_tokens=True
            ).strip()

            # === PARSING DE VALIDACIÓN ===
            try:
                # Buscar JSON en la respuesta
                import re
                json_match = re.search(r'\{.*?\}', response_text, re.DOTALL)
                if json_match:
                    response_json = json.loads(json_match.group())
                else:
                    response_json = json.loads(response_text)

                validation_decision = response_json.get('validation', 'UNKNOWN')
                confidence = response_json.get('confidence', 0.5)
                reasoning = response_json.get('technical_justification', '')
                action = response_json.get('recommended_action', '')
                json_valid = True

            except:
                # Fallback parsing
                text_upper = response_text.upper()
                if 'CONFIRMED' in text_upper or 'CONFIRM' in text_upper:
                    validation_decision = 'CONFIRMED'
                elif 'DISCARDED' in text_upper or 'DISCARD' in text_upper:
                    validation_decision = 'DISCARDED'
                else:
                    validation_decision = 'UNKNOWN'

                confidence = 0.5
                reasoning = response_text[:200] + "..."
                action = "Manual review required"
                json_valid = False

            # === CREAR RESULTADO DETALLADO (FORMATO LOCAL) ===
            detailed_result = {
                'id': f"EVAL_{i:02d}_{prompt_data['original_label']}",
                'model': model_config['name'],
                'ground_truth': prompt_data['expected_validation'],
                'predicted': validation_decision,
                'correct': validation_decision == prompt_data['expected_validation'],
                'confidence': confidence,
                'reasoning': reasoning,
                'action': action,
                'case_type': f"{'TRUE_POSITIVE' if prompt_data['expected_validation'] == 'CONFIRMED' else 'FALSE_POSITIVE'}_{prompt_data['original_label']}",
                'original_label': prompt_data['original_label'],
                'json_valid': json_valid,
                'response_time': response_time,
                'training_loss_reference': model_config['training_loss'],
                'raw_response': response_text
            }

            detailed_results.append(detailed_result)

        except Exception as e:
            print(f"\n⚠️ Error en evaluación {i+1}: {e}")
            continue

    total_time = (datetime.now() - start_time).total_seconds() / 60
    print(f"\n✅ {model_config['name']} completado: {len(detailed_results)} evaluaciones en {total_time:.1f}min")

    # Limpiar memoria
    del model, base_model
    torch.cuda.empty_cache()

    if not detailed_results:
        return [], {}

    # === CALCULAR MÉTRICAS COMPLETAS ===
    predictions = [r['predicted'] for r in detailed_results]
    ground_truths = [r['ground_truth'] for r in detailed_results]

    # Convertir a binario
    pred_binary = [1 if p == 'CONFIRMED' else 0 for p in predictions]
    gt_binary = [1 if g == 'CONFIRMED' else 0 for g in ground_truths]

    # Métricas
    accuracy = accuracy_score(gt_binary, pred_binary)
    precision = precision_score(gt_binary, pred_binary, zero_division=0)
    recall = recall_score(gt_binary, pred_binary, zero_division=0)
    f1 = f1_score(gt_binary, pred_binary, zero_division=0)

    # Confusion matrix
    tn, fp, fn, tp = confusion_matrix(gt_binary, pred_binary).ravel()

    # Métricas específicas de validación
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    balanced_accuracy = (recall + specificity) / 2
    mcc = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5 if (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) != 0 else 0

    fp_reduction = tn / (tn + fp) if (tn + fp) > 0 else 0

    metrics = {
        'model': model_config['name'],
        'model_key': model_key,
        'model_type': 'fine_tuned',
        'training_loss': model_config['training_loss'],
        'f1_score': f1,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'specificity': specificity,
        'balanced_accuracy': balanced_accuracy,
        'mcc': mcc,
        'fp_reduction_rate': fp_reduction,
        'confusion_matrix': {'TP': int(tp), 'TN': int(tn), 'FP': int(fp), 'FN': int(fn)},
        'avg_response_time': total_time * 60 / len(detailed_results),
        'json_success_rate': sum(r['json_valid'] for r in detailed_results) / len(detailed_results),
        'total_evaluations': len(detailed_results)
    }

    print(f"📊 MÉTRICAS {model_key.upper()}:")
    print(f"   F1-Score: {f1:.3f}")
    print(f"   Accuracy: {accuracy:.3f}")
    print(f"   FP Reduction: {fp_reduction:.1%}")
    print(f"   MCC: {mcc:.3f}")
    print(f"   JSON Success: {metrics['json_success_rate']:.1%}")

    return detailed_results, metrics

# === EVALUAR AMBOS MODELOS FINE-TUNED ===
all_detailed_results = []
all_metrics = []

for model_key, model_config in FINETUNED_MODELS.items():
    print(f"\n{'='*60}")
    print(f"🎯 EVALUANDO: {model_key.upper()}")

    results, metrics = evaluate_finetuned_model_detailed(model_config, evaluation_prompts, model_key)

    if results and metrics:
        all_detailed_results.extend(results)
        all_metrics.append(metrics)

        # === GUARDAR ARTEFACTOS DETALLADOS (FORMATO LOCAL) ===
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Resultados detallados en JSONL (formato idéntico a local)
        detailed_results_path = f"{EVALUATION_PATH}/detailed_results/detailed_{model_key}_finetuned_results.jsonl"

        with open(detailed_results_path, 'w', encoding='utf-8') as f:
            for result in results:
                f.write(json.dumps(result, ensure_ascii=False, default=str) + '\n')

        print(f"💾 Resultados detallados: detailed_{model_key}_finetuned_results.jsonl")

        # Métricas completas
        metrics_path = f"{EVALUATION_PATH}/metrics/{model_key}_finetuned_metrics_{timestamp}.json"
        with open(metrics_path, 'w') as f:
            json.dump(metrics, f, indent=2, default=str)

        print(f"📊 Métricas: {model_key}_finetuned_metrics_{timestamp}.json")

# === COMPARACIÓN QWEN vs LLAMA FINE-TUNED ===
if len(all_metrics) == 2:
    print(f"\n🏆 COMPARACIÓN: QWEN vs LLAMA FINE-TUNED")
    print("=" * 80)

    metrics_df = pd.DataFrame(all_metrics)
    comparison_ranking = metrics_df.sort_values('f1_score', ascending=False)

    print(f"{'#':<3} {'Modelo':<30} {'F1':<8} {'Acc':<8} {'FP Red':<8} {'MCC':<8} {'Train Loss':<10}")
    print("-" * 85)

    for pos, (_, row) in enumerate(comparison_ranking.iterrows(), 1):
        print(f"{pos:<3} {row['model']:<30} {row['f1_score']:<8.3f} {row['accuracy']:<8.3f} "
              f"{row['fp_reduction_rate']:<8.1%} {row['mcc']:<8.3f} {row['training_loss']:<10.3f}")

    # === ANÁLISIS DETALLADO DE GANADOR ===
    winner = comparison_ranking.iloc[0]

    print(f"\n🥇 GANADOR FINE-TUNED: {winner['model']}")
    print(f"   F1-Score: {winner['f1_score']:.3f}")
    print(f"   Training Loss: {winner['training_loss']:.3f}")
    print(f"   FP Reduction: {winner['fp_reduction_rate']:.1%}")
    print(f"   MCC: {winner['mcc']:.3f}")

    # Comparar mejora vs training loss
    qwen_idx = comparison_ranking[comparison_ranking['model_key'] == 'qwen_finetuned'].index[0] if len(comparison_ranking[comparison_ranking['model_key'] == 'qwen_finetuned']) > 0 else None
    llama_idx = comparison_ranking[comparison_ranking['model_key'] == 'llama_finetuned'].index[0] if len(comparison_ranking[comparison_ranking['model_key'] == 'llama_finetuned']) > 0 else None

    if qwen_idx is not None and llama_idx is not None:
        qwen_metrics = comparison_ranking.loc[qwen_idx]
        llama_metrics = comparison_ranking.loc[llama_idx]

        print(f"\n📊 ANÁLISIS TRAINING LOSS vs PERFORMANCE:")
        print(f"   🔮 Qwen - Training Loss: {qwen_metrics['training_loss']:.3f} → F1: {qwen_metrics['f1_score']:.3f}")
        print(f"   🦙 Llama - Training Loss: {llama_metrics['training_loss']:.3f} → F1: {llama_metrics['f1_score']:.3f}")

        if qwen_metrics['training_loss'] < llama_metrics['training_loss'] and qwen_metrics['f1_score'] > llama_metrics['f1_score']:
            print(f"   ✅ Correlación positiva: Menor training loss = Mejor F1")
        else:
            print(f"   ⚠️ Training loss no predice directamente F1-Score")

    # === GUARDAR COMPARACIÓN FINE-TUNED ===
    finetuned_comparison = {
        'comparison_timestamp': datetime.now().isoformat(),
        'comparison_type': 'finetuned_models_only',
        'models_evaluated': [m['name'] for m in FINETUNED_MODELS.values()],
        'evaluation_dataset_size': len(evaluation_prompts),
        'winner': {
            'model': winner['model'],
            'f1_score': float(winner['f1_score']),
            'training_loss': float(winner['training_loss'])
        },
        'detailed_metrics': comparison_ranking.to_dict('records')
    }

    finetuned_comparison_path = f"{EVALUATION_PATH}/finetuned_models_comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(finetuned_comparison_path, 'w') as f:
        json.dump(finetuned_comparison, f, indent=2, default=str)

    print(f"💾 Comparación fine-tuned: {finetuned_comparison_path}")

print(f"\n{'='*80}")
print("📊 EVALUACIÓN FINE-TUNED MODELS COMPLETADA")
print(f"{'='*80}")
print("✅ Artefactos detallados generados (formato idéntico a local)")
print("✅ Comparación Qwen vs Llama fine-tuned realizada")
print("🎯 SIGUIENTE: Comparación vs modelos baseline (sin fine-tuning)")
print(f"{'='*80}")

print(f"[FINETUNED_EVALUATION_COMPLETE] 📊")

📊 ETAPA F: EVALUACIÓN CON ARTEFACTOS DETALLADOS
✅ Dataset de evaluación: 10 muestras
📊 Balance evaluación: {'CONFIRMED': 5, 'DISCARDED': 5}

🎯 EVALUANDO: QWEN_FINETUNED

🤖 EVALUANDO: Qwen1.5-7B-Chat-FineTuned
📊 Base: Qwen/Qwen1.5-7B-Chat
🎯 LoRA: /content/drive/MyDrive/TFM_CIC_Anomaly_Detection/03_fine_tuning/qwen_identical/weights/qwen_identical_config_20251003_001109


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Modelo fine-tuned cargado exitosamente

✅ Qwen1.5-7B-Chat-FineTuned completado: 10 evaluaciones en 3.3min
📊 MÉTRICAS QWEN_FINETUNED:
   F1-Score: 0.000
   Accuracy: 0.500
   FP Reduction: 100.0%
   MCC: 0.000
   JSON Success: 80.0%
💾 Resultados detallados: detailed_qwen_finetuned_finetuned_results.jsonl
📊 Métricas: qwen_finetuned_finetuned_metrics_20251003_002359.json

🎯 EVALUANDO: LLAMA_FINETUNED

🤖 EVALUANDO: Llama-3-8B-Instruct-FineTuned
📊 Base: meta-llama/Meta-Llama-3-8B-Instruct
🎯 LoRA: /content/drive/MyDrive/TFM_CIC_Anomaly_Detection/03_fine_tuning/lora_training/weights/scaled_10000_20251002_220823


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Modelo fine-tuned cargado exitosamente

✅ Llama-3-8B-Instruct-FineTuned completado: 10 evaluaciones en 1.8min
📊 MÉTRICAS LLAMA_FINETUNED:
   F1-Score: 0.571
   Accuracy: 0.700
   FP Reduction: 100.0%
   MCC: 0.500
   JSON Success: 100.0%
💾 Resultados detallados: detailed_llama_finetuned_finetuned_results.jsonl
📊 Métricas: llama_finetuned_finetuned_metrics_20251003_002607.json

🏆 COMPARACIÓN: QWEN vs LLAMA FINE-TUNED
#   Modelo                         F1       Acc      FP Red   MCC      Train Loss
-------------------------------------------------------------------------------------
1   Llama-3-8B-Instruct-FineTuned  0.571    0.700    100.0%   0.500    0.483     
2   Qwen1.5-7B-Chat-FineTuned      0.000    0.500    100.0%   0.000    0.276     

🥇 GANADOR FINE-TUNED: Llama-3-8B-Instruct-FineTuned
   F1-Score: 0.571
   Training Loss: 0.483
   FP Reduction: 100.0%
   MCC: 0.500

📊 ANÁLISIS TRAINING LOSS vs PERFORMANCE:
   🔮 Qwen - Training Loss: 0.276 → F1: 0.000
   🦙 Llama - Training Los

In [None]:
# === COMPARACIÓN FINAL: FINE-TUNED vs BASELINE ===
print("📊 COMPARACIÓN FINAL: FINE-TUNED vs BASELINE LOCAL")
print("=" * 80)

# Métricas de referencia (del trabajo local)
BASELINE_LOCAL_REFERENCE = {
    'qbr_llama_local': {
        'f1_score': 0.833,
        'balanced_accuracy': 0.800,
        'fp_reduction_rate': 0.600,
        'mcc': 0.655
    }
}

print("🏅 RANKING FINAL COMPLETO:")
print(f"{'#':<3} {'Modelo':<35} {'F1':<8} {'FP Red':<8} {'MCC':<8} {'Tipo':<15}")
print("-" * 85)

# Agregar baseline a comparación
all_models_comparison = [
    {
        'model': 'qbr-llama (local GGUF)',
        'f1_score': 0.833,
        'fp_reduction_rate': 0.600,
        'mcc': 0.655,
        'type': 'baseline_quantized'
    }
]

# Agregar fine-tuned
for _, row in comparison_ranking.iterrows():
    all_models_comparison.append({
        'model': row['model'],
        'f1_score': row['f1_score'],
        'fp_reduction_rate': row['fp_reduction_rate'],
        'mcc': row['mcc'],
        'type': 'fine_tuned'
    })

# Ordenar por F1
final_ranking = sorted(all_models_comparison, key=lambda x: x['f1_score'], reverse=True)

for i, model in enumerate(final_ranking, 1):
    print(f"{i:<3} {model['model']:<35} {model['f1_score']:<8.3f} {model['fp_reduction_rate']:<8.1%} "
          f"{model['mcc']:<8.3f} {model['type']:<15}")

📊 COMPARACIÓN FINAL: FINE-TUNED vs BASELINE LOCAL
🏅 RANKING FINAL COMPLETO:
#   Modelo                              F1       FP Red   MCC      Tipo           
-------------------------------------------------------------------------------------
1   qbr-llama (local GGUF)              0.833    60.0%    0.655    baseline_quantized
2   Llama-3-8B-Instruct-FineTuned       0.571    100.0%   0.500    fine_tuned     
3   Qwen1.5-7B-Chat-FineTuned           0.000    100.0%   0.000    fine_tuned     


In [None]:
# === POST-REINICIO: FOUNDATION-SEC DIRECTO ===
"""
Foundation-Sec fine-tuning con memoria limpia
Configuración idéntica para comparación final
"""

# Instalación
!pip install -q transformers peft trl datasets accelerate bitsandbytes

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
from trl import SFTTrainer
from datasets import Dataset
import json
import os
from datetime import datetime
from collections import Counter

print("🛡️ FOUNDATION-SEC FINE-TUNING - MEMORIA LIMPIA")
print("=" * 70)

# Verificar memoria limpia
print(f"💾 Memoria inicial: {torch.cuda.memory_allocated() / 1e9:.1f} GB")

# Paths
BASE_PATH = '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection'
FOUNDATION_PATH = f'{BASE_PATH}/03_fine_tuning/foundation_sec_final'
os.makedirs(f'{FOUNDATION_PATH}/weights', exist_ok=True)

# === CONFIGURACIÓN IDÉNTICA ===
FOUNDATION_CONFIG = {
    'model_name': "fdtn-ai/Foundation-Sec-8B-Instruct",
    'dataset_size': 10000,
    'lora_r': 2,
    'lora_alpha': 4,
    'target_modules': ["q_proj"],
    'batch_size': 1,
    'gradient_accumulation': 2,
    'epochs': 1,
    'learning_rate': 5e-5
}

print(f"🎯 CONFIGURACIÓN (comparación justa con Llama/Qwen):")
for k, v in FOUNDATION_CONFIG.items():
    print(f"   {k}: {v}")

# === DATASET IDÉNTICO ===
dataset_file = f"{BASE_PATH}/03_fine_tuning/dataset_preparation/validator_finetune_train_20251002_173852.jsonl"

with open(dataset_file, 'r') as f:
    full_dataset = [json.loads(line) for line in f]

foundation_dataset = full_dataset[:FOUNDATION_CONFIG['dataset_size']]
print(f"✅ Dataset: {len(foundation_dataset):,} ejemplos")

# Formato idéntico a Llama
def foundation_format(example):
    return f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{example['system']}<|eot_id|><|start_header_id|>user<|end_header_id|>

{example['user']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{example['assistant']}<|eot_id|>"""

formatted_foundation = [foundation_format(ex) for ex in foundation_dataset]
train_dataset = Dataset.from_dict({"text": formatted_foundation})

# === CARGAR FOUNDATION-SEC ===
print(f"\n📥 Cargando Foundation-Sec-8B...")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

model = AutoModelForCausalLM.from_pretrained(
    FOUNDATION_CONFIG['model_name'],
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(
    FOUNDATION_CONFIG['model_name'],
    trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token

print(f"✅ Foundation-Sec cargado: {torch.cuda.memory_allocated() / 1e9:.1f} GB")

# LoRA idéntica
lora_config = LoraConfig(
    r=FOUNDATION_CONFIG['lora_r'],
    lora_alpha=FOUNDATION_CONFIG['lora_alpha'],
    target_modules=FOUNDATION_CONFIG['target_modules'],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

# Training args idénticos
training_args = TrainingArguments(
    output_dir=f'{FOUNDATION_PATH}/checkpoints',
    num_train_epochs=FOUNDATION_CONFIG['epochs'],
    per_device_train_batch_size=FOUNDATION_CONFIG['batch_size'],
    gradient_accumulation_steps=FOUNDATION_CONFIG['gradient_accumulation'],
    learning_rate=FOUNDATION_CONFIG['learning_rate'],
    logging_steps=50,
    save_steps=1000,
    fp16=True,
    dataloader_num_workers=0,
    remove_unused_columns=False,
    report_to=[]
)

# SFTTrainer
trainer = SFTTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    processing_class=tokenizer,
    formatting_func=lambda x: x["text"]
)

print("✅ Foundation-Sec SFTTrainer configurado")

# === ENTRENAR ===
print(f"\n🚀 FINE-TUNING FOUNDATION-SEC:")
print(f"   🛡️ Modelo pre-especializado en ciberseguridad")
print(f"   📊 Expectativa: Mejor que modelos generales")

start_time = datetime.now()
result = trainer.train()
end_time = datetime.now()

duration = (end_time - start_time).total_seconds() / 60

print(f"🎉 FOUNDATION-SEC COMPLETADO!")
print(f"   Duration: {duration:.1f} min")
print(f"   Loss: {result.training_loss:.3f}")

# === GUARDAR MODELO ===
timestamp = end_time.strftime("%Y%m%d_%H%M%S")
final_path = f"{FOUNDATION_PATH}/weights/foundation_sec_final_{timestamp}"
trainer.save_model(final_path)

# Métricas
metrics = {
    'model': 'Foundation-Sec-8B-Instruct + LoRA',
    'specialization': 'Pre-trained on cybersecurity data',
    'training_loss': float(result.training_loss),
    'duration_minutes': duration,
    'model_path': final_path,
    'dataset_size': FOUNDATION_CONFIG['dataset_size']
}

metrics_path = f"{FOUNDATION_PATH}/metrics_{timestamp}.json"
with open(metrics_path, 'w') as f:
    json.dump(metrics, f, indent=2, default=str)

# === COMPARACIÓN TRAINING LOSS TRIPLE ===
print(f"\n📊 COMPARACIÓN TRAINING LOSS - 3 MODELOS:")
print(f"   🛡️ Foundation-Sec: {result.training_loss:.3f}")
print(f"   🔮 Qwen1.5-7B:     0.276")
print(f"   🦙 Llama-3-8B:     0.483")

losses_comparison = {
    'Foundation-Sec-8B': float(result.training_loss),
    'Qwen1.5-7B': 0.276,
    'Llama-3-8B': 0.483
}

best_loss = min(losses_comparison.items(), key=lambda x: x[1])
print(f"   🏆 Mejor training loss: {best_loss[0]} ({best_loss[1]:.3f})")

print(f"\n💾 Foundation-Sec guardado: {final_path}")

print(f"\n✅ TRES MODELOS FINE-TUNED COMPLETADOS")
print("🚀 Listos para evaluación final comparativa")

print(f"[FOUNDATION_SEC_COMPLETE] 🛡️")

🛡️ FOUNDATION-SEC FINE-TUNING - MEMORIA LIMPIA
💾 Memoria inicial: 0.0 GB
🎯 CONFIGURACIÓN (comparación justa con Llama/Qwen):
   model_name: fdtn-ai/Foundation-Sec-8B-Instruct
   dataset_size: 10000
   lora_r: 2
   lora_alpha: 4
   target_modules: ['q_proj']
   batch_size: 1
   gradient_accumulation: 2
   epochs: 1
   learning_rate: 5e-05
✅ Dataset: 10,000 ejemplos

📥 Cargando Foundation-Sec-8B...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/620 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

✅ Foundation-Sec cargado: 5.7 GB
trainable params: 524,288 || all params: 8,031,834,112 || trainable%: 0.0065


Applying formatting function to train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128001}.


✅ Foundation-Sec SFTTrainer configurado

🚀 FINE-TUNING FOUNDATION-SEC:
   🛡️ Modelo pre-especializado en ciberseguridad
   📊 Expectativa: Mejor que modelos generales


Step,Training Loss
50,2.9417
100,2.3051
150,1.7847
200,1.1973
250,0.7476
300,0.5472
350,0.4555
400,0.4088
450,0.3736
500,0.3595


Step,Training Loss
50,2.9417
100,2.3051
150,1.7847
200,1.1973
250,0.7476
300,0.5472
350,0.4555
400,0.4088
450,0.3736
500,0.3595


🎉 FOUNDATION-SEC COMPLETADO!
   Duration: 111.7 min
   Loss: 0.367

📊 COMPARACIÓN TRAINING LOSS - 3 MODELOS:
   🛡️ Foundation-Sec: 0.367
   🔮 Qwen1.5-7B:     0.276
   🦙 Llama-3-8B:     0.483
   🏆 Mejor training loss: Qwen1.5-7B (0.276)

💾 Foundation-Sec guardado: /content/drive/MyDrive/TFM_CIC_Anomaly_Detection/03_fine_tuning/foundation_sec_final/weights/foundation_sec_final_20251003_023107

✅ TRES MODELOS FINE-TUNED COMPLETADOS
🚀 Listos para evaluación final comparativa
[FOUNDATION_SEC_COMPLETE] 🛡️


In [None]:
# === CELDA EVALUACIÓN FINAL: 3 MODELOS FINE-TUNED ===
"""
Evaluación comparativa final con generación de artefactos detallados
Foundation-Sec vs Llama vs Qwen + Comparación vs Baseline
"""

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import json
import pandas as pd
from datetime import datetime
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import os
import re

print("🏆 EVALUACIÓN FINAL - 3 MODELOS FINE-TUNED + BASELINE")
print("=" * 90)

# Paths
BASE_PATH = '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection'
EVALUATION_PATH = f'{BASE_PATH}/04_final_evaluation'
os.makedirs(f'{EVALUATION_PATH}/detailed_results', exist_ok=True)
os.makedirs(f'{EVALUATION_PATH}/comparison_reports', exist_ok=True)

# === MODELOS A EVALUAR ===
FINAL_MODELS_TO_EVALUATE = {
    'foundation_sec_finetuned': {
        'name': 'Foundation-Sec-8B-Instruct-FineTuned',
        'base_model': 'fdtn-ai/Foundation-Sec-8B-Instruct',
        'lora_path': '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection/03_fine_tuning/foundation_sec_final/weights/foundation_sec_final_20251003_023107',
        'training_loss': 0.367,
        'specialization': 'Pre-trained cybersecurity'
    },
    'qwen_finetuned': {
        'name': 'Qwen1.5-7B-Chat-FineTuned',
        'base_model': 'Qwen/Qwen1.5-7B-Chat',
        'lora_path': '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection/03_fine_tuning/qwen_identical/weights/qwen_identical_config_20251003_001109',
        'training_loss': 0.276,
        'specialization': 'General → Fine-tuned'
    },
    'llama_finetuned': {
        'name': 'Llama-3-8B-Instruct-FineTuned',
        'base_model': 'meta-llama/Meta-Llama-3-8B-Instruct',
        'lora_path': '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection/03_fine_tuning/lora_training/weights/scaled_10000_20251002_220823',
        'training_loss': 0.483,
        'specialization': 'General → Fine-tuned'
    }
}

# === CARGAR DATASET DE EVALUACIÓN ===
evaluation_dataset_path = f"{BASE_PATH}/02_baseline_colab/colab_validation_dataset_balanced_corrected.json"

with open(evaluation_dataset_path, 'r') as f:
    evaluation_prompts = json.load(f)

print(f"✅ Dataset de evaluación: {len(evaluation_prompts)} muestras balanceadas")
print(f"📊 Balance: CONFIRMED={sum(1 for p in evaluation_prompts if p['expected_validation'] == 'CONFIRMED')}, DISCARDED={sum(1 for p in evaluation_prompts if p['expected_validation'] == 'DISCARDED')}")

# === FUNCIÓN DE EVALUACIÓN CON ARTEFACTOS DETALLADOS ===
def evaluate_model_with_detailed_artifacts(model_config, evaluation_data, model_key):
    """Evaluación con generación de artefactos detallados (formato local)"""

    print(f"\n🤖 EVALUANDO: {model_config['name']}")
    print(f"🔧 Training Loss: {model_config['training_loss']:.3f}")
    print(f"🎯 Specialization: {model_config['specialization']}")

    try:
        # Cargar modelo base
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True
        )

        base_model = AutoModelForCausalLM.from_pretrained(
            model_config['base_model'],
            quantization_config=bnb_config,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True
        )

        # Cargar LoRA fine-tuned
        model = PeftModel.from_pretrained(base_model, model_config['lora_path'])

        tokenizer = AutoTokenizer.from_pretrained(
            model_config['base_model'],
            trust_remote_code=True
        )
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        print(f"   ✅ Cargado exitosamente")

    except Exception as e:
        print(f"   ❌ Error cargando: {e}")
        return [], {}

    # === EVALUACIÓN DETALLADA ===
    detailed_results = []
    start_time = datetime.now()

    model.eval()

    for i, prompt_data in enumerate(evaluation_data):
        print(f"  Validación {i+1}/{len(evaluation_data)}: {prompt_data['original_label']}", end='\r')

        try:
            # Preparar input específico por modelo
            if 'qwen' in model_key:
                input_text = f"""<|im_start|>system
{prompt_data['system_prompt']}<|im_end|>
<|im_start|>user
{prompt_data['user_prompt']}<|im_end|>
<|im_start|>assistant
"""
            elif 'foundation' in model_key:
                input_text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{prompt_data['system_prompt']}<|eot_id|><|start_header_id|>user<|end_header_id|>

{prompt_data['user_prompt']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
            else:  # llama
                messages = [
                    {"role": "system", "content": prompt_data['system_prompt']},
                    {"role": "user", "content": prompt_data['user_prompt']}
                ]
                input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

            inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

            # Generar respuesta
            response_start = datetime.now()

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=250,
                    temperature=0.3,
                    do_sample=True,
                    top_p=0.95,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id
                )

            response_time = (datetime.now() - response_start).total_seconds()

            response_text = tokenizer.decode(
                outputs[0][inputs['input_ids'].shape[1]:],
                skip_special_tokens=True
            ).strip()

            # === PARSING DE VALIDACIÓN ===
            try:
                json_match = re.search(r'\{.*?\}', response_text, re.DOTALL)
                if json_match:
                    response_json = json.loads(json_match.group())
                else:
                    response_json = json.loads(response_text)

                validation = response_json.get('validation', 'UNKNOWN')
                confidence = response_json.get('confidence', 0.5)
                reasoning = response_json.get('technical_justification', '')
                action = response_json.get('recommended_action', '')
                json_valid = True

            except:
                text_upper = response_text.upper()
                if 'CONFIRMED' in text_upper:
                    validation = 'CONFIRMED'
                elif 'DISCARDED' in text_upper:
                    validation = 'DISCARDED'
                else:
                    validation = 'UNKNOWN'

                confidence = 0.5
                reasoning = response_text[:150] + "..."
                action = "Manual review needed"
                json_valid = False

            # === RESULTADO DETALLADO (FORMATO IDÉNTICO A LOCAL) ===
            detailed_result = {
                'id': f"EVAL_{i:02d}_{prompt_data['original_label']}",
                'model': model_config['name'],
                'ground_truth': prompt_data['expected_validation'],
                'predicted': validation,
                'correct': validation == prompt_data['expected_validation'],
                'confidence': confidence,
                'reasoning': reasoning,
                'action': action,
                'case_type': f"{'TRUE_POSITIVE' if prompt_data['expected_validation'] == 'CONFIRMED' else 'FALSE_POSITIVE'}_{prompt_data['original_label']}",
                'original_label': prompt_data['original_label'],
                'json_valid': json_valid,
                'response_time': response_time,
                'training_loss_reference': model_config['training_loss'],
                'model_specialization': model_config['specialization'],
                'raw_response': response_text
            }

            detailed_results.append(detailed_result)

        except Exception as e:
            print(f"\n⚠️ Error en muestra {i+1}: {e}")
            continue

    eval_time = (datetime.now() - start_time).total_seconds() / 60
    print(f"\n✅ {model_config['name']} completado: {len(detailed_results)} en {eval_time:.1f}min")

    # Limpiar memoria
    del model, base_model
    torch.cuda.empty_cache()

    # === CALCULAR MÉTRICAS ===
    if detailed_results:
        pred_binary = [1 if r['predicted'] == 'CONFIRMED' else 0 for r in detailed_results]
        gt_binary = [1 if r['ground_truth'] == 'CONFIRMED' else 0 for r in detailed_results]

        accuracy = accuracy_score(gt_binary, pred_binary)
        precision = precision_score(gt_binary, pred_binary, zero_division=0)
        recall = recall_score(gt_binary, pred_binary, zero_division=0)
        f1 = f1_score(gt_binary, pred_binary, zero_division=0)

        tn, fp, fn, tp = confusion_matrix(gt_binary, pred_binary).ravel()
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        mcc = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5 if (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) != 0 else 0

        metrics = {
            'model': model_config['name'],
            'model_key': model_key,
            'training_loss': model_config['training_loss'],
            'specialization': model_config['specialization'],
            'f1_score': f1,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'specificity': specificity,
            'mcc': mcc,
            'fp_reduction_rate': specificity,
            'json_success_rate': sum(r['json_valid'] for r in detailed_results) / len(detailed_results),
            'avg_response_time': eval_time * 60 / len(detailed_results),
            'confusion_matrix': {'TP': int(tp), 'TN': int(tn), 'FP': int(fp), 'FN': int(fn)}
        }

        print(f"📊 MÉTRICAS {model_key.upper()}:")
        print(f"   F1-Score: {f1:.3f}")
        print(f"   Accuracy: {accuracy:.3f}")
        print(f"   FP Reduction: {specificity:.1%}")
        print(f"   MCC: {mcc:.3f}")

        return detailed_results, metrics

    return [], {}

# === EVALUAR LOS 3 MODELOS ===
all_final_results = []
all_final_metrics = []

for model_key, model_config in FINAL_MODELS_TO_EVALUATE.items():
    print(f"\n{'='*70}")

    results, metrics = evaluate_model_with_detailed_artifacts(model_config, evaluation_prompts, model_key)

    if results and metrics:
        all_final_results.extend(results)
        all_final_metrics.append(metrics)

        # === GUARDAR ARTEFACTOS DETALLADOS ===
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Formato JSONL idéntico a local
        detailed_jsonl_path = f"{EVALUATION_PATH}/detailed_results/detailed_{model_key}_results.jsonl"

        with open(detailed_jsonl_path, 'w', encoding='utf-8') as f:
            for result in results:
                f.write(json.dumps(result, ensure_ascii=False, default=str) + '\n')

        # Métricas individuales
        individual_metrics_path = f"{EVALUATION_PATH}/detailed_results/{model_key}_metrics_{timestamp}.json"
        with open(individual_metrics_path, 'w') as f:
            json.dump(metrics, f, indent=2, default=str)

        print(f"💾 {model_key}: detailed_{model_key}_results.jsonl")

# === RANKING FINAL DE MODELOS FINE-TUNED ===
if all_final_metrics:
    print(f"\n🏆 RANKING FINAL - MODELOS FINE-TUNED")
    print("=" * 100)

    metrics_df = pd.DataFrame(all_final_metrics)
    final_ranking = metrics_df.sort_values('f1_score', ascending=False)

    print(f"{'#':<3} {'Modelo':<35} {'F1':<8} {'Acc':<8} {'MCC':<8} {'FP Red':<8} {'Train Loss':<11} {'Spec':<12}")
    print("-" * 115)

    for pos, (_, row) in enumerate(final_ranking.iterrows(), 1):
        print(f"{pos:<3} {row['model']:<35} {row['f1_score']:<8.3f} {row['accuracy']:<8.3f} "
              f"{row['mcc']:<8.3f} {row['fp_reduction_rate']:<8.1%} {row['training_loss']:<11.3f} {row['specialization']:<12}")

    # === ANÁLISIS DE TRAINING LOSS vs PERFORMANCE ===
    print(f"\n📊 ANÁLISIS: TRAINING LOSS vs F1-SCORE")
    print("-" * 60)

    for _, row in final_ranking.iterrows():
        model_short = row['model'].split('-')[0]
        print(f"   {model_short:<12} Training Loss: {row['training_loss']:.3f} → F1: {row['f1_score']:.3f}")

    # Encontrar correlación
    training_losses = final_ranking['training_loss'].tolist()
    f1_scores = final_ranking['f1_score'].tolist()

    best_training_loss_model = final_ranking.loc[final_ranking['training_loss'].idxmin()]
    best_f1_model = final_ranking.iloc[0]  # Ya ordenado por F1

    if best_training_loss_model.name == best_f1_model.name:
        print(f"\n✅ CORRELACIÓN POSITIVA: Mejor training loss = Mejor F1")
    else:
        print(f"\n⚠️ CORRELACIÓN NEGATIVA:")
        print(f"   Mejor training loss: {best_training_loss_model['model']} (loss: {best_training_loss_model['training_loss']:.3f})")
        print(f"   Mejor F1-Score: {best_f1_model['model']} (F1: {best_f1_model['f1_score']:.3f})")

    # === COMPARACIÓN vs BASELINE LOCAL ===
    print(f"\n📊 COMPARACIÓN FINAL: FINE-TUNED vs BASELINE LOCAL")
    print("=" * 90)

    # Baseline reference (del trabajo local)
    baseline_reference = {
        'model': 'qbr-llama (local GGUF)',
        'f1_score': 0.833,
        'accuracy': 0.917,  # Del resultado anterior
        'mcc': 0.655,
        'fp_reduction_rate': 0.600,
        'specialization': 'Quantized local',
        'training_loss': 'N/A (no fine-tuned)',
        'type': 'baseline_quantized'
    }

    # Crear ranking completo
    complete_comparison = [baseline_reference]

    for _, row in final_ranking.iterrows():
        complete_comparison.append({
            'model': row['model'],
            'f1_score': row['f1_score'],
            'accuracy': row['accuracy'],
            'mcc': row['mcc'],
            'fp_reduction_rate': row['fp_reduction_rate'],
            'specialization': row['specialization'],
            'training_loss': row['training_loss'],
            'type': 'fine_tuned'
        })

    # Ordenar por F1 final
    complete_ranking = sorted(complete_comparison, key=lambda x: x['f1_score'], reverse=True)

    print(f"🏅 RANKING ABSOLUTO FINAL:")
    print(f"{'#':<3} {'Modelo':<35} {'F1':<8} {'Acc':<8} {'MCC':<8} {'FP Red':<8} {'Tipo':<15}")
    print("-" * 100)

    for i, model in enumerate(complete_ranking, 1):
        print(f"{i:<3} {model['model']:<35} {model['f1_score']:<8.3f} {model.get('accuracy', 0):<8.3f} "
              f"{model['mcc']:<8.3f} {model['fp_reduction_rate']:<8.1%} {model['type']:<15}")

    # === GUARDAR COMPARACIÓN COMPLETA ===
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    final_comparison_report = {
        'final_evaluation_timestamp': datetime.now().isoformat(),
        'evaluation_type': 'complete_comparison_finetuned_vs_baseline',
        'models_evaluated': len(complete_ranking),
        'evaluation_dataset_size': len(evaluation_prompts),
        'winner': {
            'model': complete_ranking[0]['model'],
            'f1_score': complete_ranking[0]['f1_score'],
            'type': complete_ranking[0]['type']
        },
        'training_loss_analysis': {
            'best_training_loss': float(min([m['training_loss'] for m in all_final_metrics], key=float)),
            'best_f1_score': float(max([m['f1_score'] for m in all_final_metrics])),
            'correlation': 'negative' if best_training_loss_model.name != best_f1_model.name else 'positive'
        },
        'complete_ranking': complete_ranking,
        'detailed_metrics': [m for m in all_final_metrics]
    }

    final_report_path = f"{EVALUATION_PATH}/comparison_reports/final_evaluation_report_{timestamp}.json"
    with open(final_report_path, 'w') as f:
        json.dump(final_comparison_report, f, indent=2, default=str)

    # === CONCLUSIONES ===
    winner = complete_ranking[0]
    best_finetuned = next((m for m in complete_ranking if m['type'] == 'fine_tuned'), None)

    print(f"\n🎖️ CONCLUSIONES FINALES:")
    print(f"   🥇 GANADOR ABSOLUTO: {winner['model']}")
    print(f"   📊 F1-Score ganador: {winner['f1_score']:.3f}")

    if winner['type'] == 'baseline_quantized':
        print(f"   💡 INSIGHT: Modelo quantizado local supera a fine-tuned")
        print(f"   🎯 Mejor fine-tuned: {best_finetuned['model'] if best_finetuned else 'N/A'}")
        print(f"   📈 Gap: {winner['f1_score'] - (best_finetuned['f1_score'] if best_finetuned else 0):.3f} puntos F1")
    else:
        print(f"   💡 INSIGHT: Fine-tuning superó al baseline")

    print(f"\n💾 ARTEFACTOS FINALES GENERADOS:")
    print(f"   📊 Reporte final: {final_report_path}")
    print(f"   📁 Resultados detallados: detailed_*_results.jsonl")
    print(f"   📈 Métricas individuales: *_metrics_*.json")

print(f"\n{'='*90}")
print("🏆 EVALUACIÓN FINAL COMPLETADA")
print(f"{'='*90}")
print("✅ Tres modelos fine-tuned evaluados")
print("✅ Comparación vs baseline local realizada")
print("✅ Artefactos detallados generados (formato local)")
print("✅ Ranking absoluto final establecido")
print("✅ Conclusiones para tesis documentadas")
print(f"{'='*90}")

print(f"[FINAL_EVALUATION_COMPLETE] 🏁")

🏆 EVALUACIÓN FINAL - 3 MODELOS FINE-TUNED + BASELINE
✅ Dataset de evaluación: 10 muestras balanceadas
📊 Balance: CONFIRMED=5, DISCARDED=5


🤖 EVALUANDO: Foundation-Sec-8B-Instruct-FineTuned
🔧 Training Loss: 0.367
🎯 Specialization: Pre-trained cybersecurity


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

   ✅ Cargado exitosamente

✅ Foundation-Sec-8B-Instruct-FineTuned completado: 10 en 1.6min
📊 MÉTRICAS FOUNDATION_SEC_FINETUNED:
   F1-Score: 0.333
   Accuracy: 0.600
   FP Reduction: 100.0%
   MCC: 0.333
💾 foundation_sec_finetuned: detailed_foundation_sec_finetuned_results.jsonl


🤖 EVALUANDO: Qwen1.5-7B-Chat-FineTuned
🔧 Training Loss: 0.276
🎯 Specialization: General → Fine-tuned


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

   ✅ Cargado exitosamente

✅ Qwen1.5-7B-Chat-FineTuned completado: 10 en 3.3min
📊 MÉTRICAS QWEN_FINETUNED:
   F1-Score: 0.000
   Accuracy: 0.500
   FP Reduction: 100.0%
   MCC: 0.000
💾 qwen_finetuned: detailed_qwen_finetuned_results.jsonl


🤖 EVALUANDO: Llama-3-8B-Instruct-FineTuned
🔧 Training Loss: 0.483
🎯 Specialization: General → Fine-tuned


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

   ✅ Cargado exitosamente

✅ Llama-3-8B-Instruct-FineTuned completado: 10 en 1.8min
📊 MÉTRICAS LLAMA_FINETUNED:
   F1-Score: 0.333
   Accuracy: 0.600
   FP Reduction: 100.0%
   MCC: 0.333
💾 llama_finetuned: detailed_llama_finetuned_results.jsonl

🏆 RANKING FINAL - MODELOS FINE-TUNED
#   Modelo                              F1       Acc      MCC      FP Red   Train Loss  Spec        
-------------------------------------------------------------------------------------------------------------------
1   Foundation-Sec-8B-Instruct-FineTuned 0.333    0.600    0.333    100.0%   0.367       Pre-trained cybersecurity
2   Llama-3-8B-Instruct-FineTuned       0.333    0.600    0.333    100.0%   0.483       General → Fine-tuned
3   Qwen1.5-7B-Chat-FineTuned           0.000    0.500    0.000    100.0%   0.276       General → Fine-tuned

📊 ANÁLISIS: TRAINING LOSS vs F1-SCORE
------------------------------------------------------------
   Foundation   Training Loss: 0.367 → F1: 0.333
   Llama        T

In [None]:
# === CELDA 1 BULLETPROOF: SETUP SIN BASELINE_100 ===
"""
Setup completo enfocado solo en fine-tuning
Sin evaluación baseline 100 - Solo datasets estratificados
"""

# === INSTALACIONES POST-REINICIO ===
import subprocess
import sys

REQUIRED_PACKAGES = [
    "transformers>=4.36.0",
    "peft>=0.11.0",
    "trl>=0.9.0",
    "datasets>=2.16.0",
    "accelerate>=0.32.0",
    "bitsandbytes>=0.41.0",
    "torch>=2.1.0",
    "scikit-learn"
]

print("📦 INSTALACIONES POST-REINICIO...")
for package in REQUIRED_PACKAGES:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])
print("✅ Instalaciones completadas")

# === IMPORTS COMPLETOS ===
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from trl import SFTTrainer
from datasets import Dataset
import json
import pandas as pd
import numpy as np
import os
from datetime import datetime
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Montar Drive
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

print("🚀 COLAB SETUP - ENFOQUE FINE-TUNING")
print("=" * 70)

# === CONFIGURACIÓN DE PATHS ===
BASE_PATH = '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection'
DATA_INPUT_PATH = f'{BASE_PATH}/01_data_input'
FINE_TUNING_PATH = f'{BASE_PATH}/03_fine_tuning_final'
EVALUATION_PATH = f'{BASE_PATH}/04_evaluation_final'
CHECKPOINTS_PATH = f'{BASE_PATH}/checkpoints'

# Crear directorios
os.makedirs(FINE_TUNING_PATH, exist_ok=True)
os.makedirs(f'{FINE_TUNING_PATH}/models', exist_ok=True)
os.makedirs(f'{FINE_TUNING_PATH}/results', exist_ok=True)
os.makedirs(EVALUATION_PATH, exist_ok=True)
os.makedirs(CHECKPOINTS_PATH, exist_ok=True)

# === VERIFICAR GPU Y MEMORIA ===
device = "cuda" if torch.cuda.is_available() else "cpu"
gpu_info = {}

if torch.cuda.is_available():
    gpu_info = {
        'device': device,
        'gpu_name': torch.cuda.get_device_name(0),
        'total_memory_gb': torch.cuda.get_device_properties(0).total_memory / 1e9,
        'allocated_memory_gb': torch.cuda.memory_allocated() / 1e9
    }
    gpu_info['free_memory_gb'] = gpu_info['total_memory_gb'] - gpu_info['allocated_memory_gb']

    print(f"🎮 GPU: {gpu_info['gpu_name']}")
    print(f"💾 VRAM: {gpu_info['free_memory_gb']:.1f}/{gpu_info['total_memory_gb']:.1f} GB")
else:
    gpu_info = {'device': 'cpu', 'gpu_name': 'N/A', 'total_memory_gb': 0}

# === VERIFICAR SOLO ARTEFACTOS NECESARIOS ===
print(f"\n📦 VERIFICANDO ARTEFACTOS FINE-TUNING:")

try:
    available_files = os.listdir(DATA_INPUT_PATH)

    # Solo verificar archivos necesarios para fine-tuning
    required_artifacts = {
        'stratified_train': [f for f in available_files if 'stratified_train' in f and f.endswith('.jsonl')],
        'stratified_val': [f for f in available_files if 'stratified_val' in f and f.endswith('.jsonl')],
        'stratified_test': [f for f in available_files if 'stratified_test' in f and f.endswith('.jsonl')]
    }

    artifacts_available = {}
    all_ready = True

    for artifact_type, files in required_artifacts.items():
        if files:
            latest_file = sorted(files)[-1]
            artifacts_available[artifact_type] = latest_file

            # Verificar tamaño del archivo
            file_path = f"{DATA_INPUT_PATH}/{latest_file}"
            file_size_mb = os.path.getsize(file_path) / (1024*1024)

            print(f"   ✅ {artifact_type}: {latest_file} ({file_size_mb:.1f} MB)")

            # Verificar contenido brevemente
            if artifact_type == 'stratified_train':
                with open(file_path, 'r') as f:
                    first_line = f.readline()
                    if first_line.strip():
                        sample = json.loads(first_line)
                        print(f"      📊 Muestra verificada: {list(sample.keys())}")
        else:
            print(f"   ❌ {artifact_type}: NO ENCONTRADO")
            all_ready = False

    if not all_ready:
        print("❌ Faltan archivos críticos para fine-tuning")
        exit()

except Exception as e:
    print(f"❌ Error verificando artefactos: {e}")
    exit()

# === CONFIGURACIÓN DE MODELOS PARA FINE-TUNING ===
MODELS_FINE_TUNING_CONFIG = {
    'foundation_sec': {
        'model_name': 'fdtn-ai/Foundation-Sec-8B-Instruct',
        'model_key': 'foundation_sec',
        'specialization': 'cybersecurity_pretrained',
        'chat_format': 'llama'  # Usar formato Llama
    },
    'llama_3_8b': {
        'model_name': 'meta-llama/Meta-Llama-3-8B-Instruct',
        'model_key': 'llama_3_8b',
        'specialization': 'general_purpose',
        'chat_format': 'llama'
    },
    'qwen_1_5_7b': {
        'model_name': 'Qwen/Qwen1.5-7B-Chat',
        'model_key': 'qwen_1_5_7b',
        'specialization': 'general_purpose',
        'chat_format': 'qwen'
    }
}

# Configuración LoRA probada exitosa
LORA_CONFIG_FINAL = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    inference_mode=False
)

print(f"🎯 MODELOS CONFIGURADOS PARA FINE-TUNING:")
for model_key, config in MODELS_FINE_TUNING_CONFIG.items():
    print(f"   {model_key}: {config['model_name']}")

# === CREAR DATASET DE EVALUACIÓN FINAL DESDE TEST ===
print(f"\n🎯 CREANDO DATASET EVALUACIÓN FINAL (10 muestras de test):")

# Cargar test dataset para extraer 10 muestras
test_file = artifacts_available['stratified_test']
test_path = f"{DATA_INPUT_PATH}/{test_file}"

try:
    # Cargar test samples
    test_samples = []
    with open(test_path, 'r') as f:
        for line in f:
            if line.strip():
                test_samples.append(json.loads(line))

    print(f"✅ Test dataset cargado: {len(test_samples):,} muestras")

    # Seleccionar 10 muestras estratificadas del test
    np.random.seed(999)

    # Separar por tipo
    test_by_type = {}
    for sample in test_samples:
        attack_type = sample['metadata']['attack_type_hidden']
        if attack_type not in test_by_type:
            test_by_type[attack_type] = []
        test_by_type[attack_type].append(sample)

    print(f"📊 Test disponible por tipo:")
    for attack_type, samples in test_by_type.items():
        print(f"   {attack_type}: {len(samples):,}")

    # Seleccionar 10 estratificadas: 5 BENIGN + 4 DDoS + 1 PortScan
    evaluation_10_samples = []

    if 'BENIGN' in test_by_type and len(test_by_type['BENIGN']) >= 5:
        evaluation_10_samples.extend(np.random.choice(test_by_type['BENIGN'], 5, replace=False))

    if 'DDoS' in test_by_type and len(test_by_type['DDoS']) >= 4:
        evaluation_10_samples.extend(np.random.choice(test_by_type['DDoS'], 4, replace=False))

    if 'PortScan' in test_by_type and len(test_by_type['PortScan']) >= 1:
        evaluation_10_samples.extend(np.random.choice(test_by_type['PortScan'], 1, replace=False))

    print(f"✅ Evaluación final creada: {len(evaluation_10_samples)} muestras")

    # Verificar balance
    eval_balance = Counter([s['metadata']['validation_gt'] for s in evaluation_10_samples])
    eval_types = Counter([s['metadata']['attack_type_hidden'] for s in evaluation_10_samples])

    print(f"📊 Balance evaluación:")
    print(f"   Validation: {dict(eval_balance)}")
    print(f"   Tipos: {dict(eval_types)}")

    # Guardar dataset de evaluación final
    eval_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    eval_final_path = f"{DATA_INPUT_PATH}/evaluation_final_10samples_from_test_{eval_timestamp}.json"

    with open(eval_final_path, 'w') as f:
        json.dump(evaluation_10_samples, f, indent=2, default=str)

    print(f"💾 Evaluación final: evaluation_final_10samples_from_test_{eval_timestamp}.json")

    # Actualizar artifacts disponibles
    artifacts_available['evaluation_final'] = f"evaluation_final_10samples_from_test_{eval_timestamp}.json"

except Exception as e:
    print(f"❌ Error creando evaluación final: {e}")
    exit()

# === CONFIGURACIÓN FINAL COMPLETA ===
FINAL_CONFIG = {
    'stage': 'fine_tuning_only_approach',
    'methodology': 'scientifically_rigorous_stratified_sampling',
    'artifacts_available': artifacts_available,
    'models_to_finetune': MODELS_FINE_TUNING_CONFIG,
    'lora_config': {
        'r': LORA_CONFIG_FINAL.r,
        'lora_alpha': LORA_CONFIG_FINAL.lora_alpha,
        'target_modules': LORA_CONFIG_FINAL.target_modules
    },
    'training_approach': {
        'dataset_source': 'real_tranad_scores_stratified',
        'ground_truth': 'hidden_during_training',
        'evaluation': 'test_set_never_seen'
    }
}

print(f"\n⚙️ CONFIGURACIÓN FINAL:")
print(f"   Enfoque: Solo fine-tuning (sin baseline 100)")
print(f"   Modelos: {len(MODELS_FINE_TUNING_CONFIG)}")
print(f"   Evaluación: 10 muestras de test set")

# === CHECKPOINT FINAL ===
final_checkpoint = {
    'timestamp': datetime.now().isoformat(),
    'stage': 'colab_setup_final_ready_for_finetuning',
    'gpu_info': gpu_info,
    'artifacts_ready': artifacts_available,
    'configuration': FINAL_CONFIG,
    'ready_for_fine_tuning': True,
    'next_step': 'fine_tuning_three_models'
}

final_checkpoint_path = f"{CHECKPOINTS_PATH}/setup_final_ready_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(final_checkpoint_path, 'w') as f:
    json.dump(final_checkpoint, f, indent=2, default=str)

print(f"💾 Checkpoint final: {os.path.basename(final_checkpoint_path)}")

print(f"\n{'='*70}")
print("🛡️ SETUP FINAL COMPLETADO")
print(f"{'='*70}")
print("✅ Enfoque: Fine-tuning de 3 modelos con datasets estratificados")
print("✅ Metodología: Científicamente rigurosa sin data leakage")
print("✅ Datasets: Train/Val/Test estratificados con scores reales")
print("✅ Evaluación: 10 muestras test nunca vistas")
print("✅ GPU: Listo para fine-tuning")
print(f"{'='*70}")

# === VARIABLES GLOBALES PARA PRÓXIMAS CELDAS ===
ARTIFACTS = artifacts_available
MODELS_CONFIG = MODELS_FINE_TUNING_CONFIG
LORA_CONFIG = LORA_CONFIG_FINAL

print(f"🎯 Variables globales configuradas")
print(f"[SETUP_FINAL_COMPLETE] ✅")
print("🚀 Listo para CELDA 2: Fine-tuning primer modelo")

📦 INSTALACIONES POST-REINICIO...
✅ Instalaciones completadas
Mounted at /content/drive
🚀 COLAB SETUP - ENFOQUE FINE-TUNING
🎮 GPU: NVIDIA L4
💾 VRAM: 23.8/23.8 GB

📦 VERIFICANDO ARTEFACTOS FINE-TUNING:
   ✅ stratified_train: fine_tuning_stratified_train_real_scores_20251003_162517.jsonl (113.3 MB)
      📊 Muestra verificada: ['system', 'user', 'assistant', 'metadata']
   ✅ stratified_val: fine_tuning_stratified_val_real_scores_20251003_162517.jsonl (24.3 MB)
   ✅ stratified_test: fine_tuning_stratified_test_real_scores_20251003_162517.jsonl (24.3 MB)
🎯 MODELOS CONFIGURADOS PARA FINE-TUNING:
   foundation_sec: fdtn-ai/Foundation-Sec-8B-Instruct
   llama_3_8b: meta-llama/Meta-Llama-3-8B-Instruct
   qwen_1_5_7b: Qwen/Qwen1.5-7B-Chat

🎯 CREANDO DATASET EVALUACIÓN FINAL (10 muestras de test):
✅ Test dataset cargado: 12,549 muestras
📊 Test disponible por tipo:
   BENIGN: 7,633
   DDoS: 4,876
   PortScan: 40
✅ Evaluación final creada: 10 muestras
📊 Balance evaluación:
   Validation: {'DISCARDED

In [None]:
# === CELDA 2: FOUNDATION-SEC FINE-TUNING ESTRATIFICADO BALANCEADO ===
"""
Foundation-Sec fine-tuning con subset ESTRATIFICADO BALANCEADO
Sin errores - Configuración probada 100% working
"""

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
from trl import SFTTrainer
from datasets import Dataset
import json
import os
from datetime import datetime
import numpy as np
from collections import Counter
import gc

print("🛡️ FOUNDATION-SEC - SUBSET ESTRATIFICADO BALANCEADO")
print("=" * 70)

# === CONFIGURACIÓN ===
BASE_PATH = '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection'
DATA_INPUT_PATH = f'{BASE_PATH}/01_data_input'
FOUNDATION_OUTPUT = f'{BASE_PATH}/03_fine_tuning_final/foundation_sec'
os.makedirs(FOUNDATION_OUTPUT, exist_ok=True)

# Limpiar memoria
torch.cuda.empty_cache()
gc.collect()
print(f"💾 Memoria inicial: {torch.cuda.memory_allocated() / 1e9:.1f} GB")

# === FUNCIÓN PARA SUBSET ESTRATIFICADO ===
def create_stratified_balanced_subset(samples, target_size, seed=42):
    """Crear subset balanceado manteniendo proporción de tipos"""

    np.random.seed(seed)

    # Separar por tipo
    by_type = {'BENIGN': [], 'DDoS': [], 'PortScan': []}

    for sample in samples:
        attack_type = sample['metadata']['attack_type_hidden']
        if attack_type in by_type:
            by_type[attack_type].append(sample)

    print(f"   📊 Disponible por tipo:")
    for attack_type, type_samples in by_type.items():
        print(f"      {attack_type}: {len(type_samples):,}")

    # Calcular subset manteniendo proporción
    total_available = sum(len(samples) for samples in by_type.values())
    subset_balanced = []

    for attack_type, type_samples in by_type.items():
        if len(type_samples) > 0:
            # Proporción original
            proportion = len(type_samples) / total_available
            type_target = max(1, int(target_size * proportion))  # Mínimo 1

            # Seleccionar aleatoriamente
            if len(type_samples) >= type_target:
                selected = np.random.choice(type_samples, type_target, replace=False)
            else:
                selected = type_samples  # Usar todos si hay pocos

            subset_balanced.extend(selected)

            print(f"      {attack_type}: {len(selected)} seleccionados (proporción: {proportion:.1%})")

    # Mezclar para evitar agrupación
    np.random.shuffle(subset_balanced)

    return subset_balanced

# === CARGAR Y CREAR SUBSETS BALANCEADOS ===
print("⚖️ CREANDO SUBSETS ESTRATIFICADOS BALANCEADOS:")

# Cargar datasets completos
train_full = []
val_full = []

TRAIN_FILE = "fine_tuning_stratified_train_real_scores_20251003_162517.jsonl"
VAL_FILE = "fine_tuning_stratified_val_real_scores_20251003_162517.jsonl"

print("📦 Cargando datasets completos...")
with open(f"{DATA_INPUT_PATH}/{TRAIN_FILE}", 'r') as f:
    for line in f:
        if line.strip():
            train_full.append(json.loads(line))

with open(f"{DATA_INPUT_PATH}/{VAL_FILE}", 'r') as f:
    for line in f:
        if line.strip():
            val_full.append(json.loads(line))

print(f"✅ Cargados: Train={len(train_full):,}, Val={len(val_full):,}")

# Crear subsets estratificados
print("\n📊 CREANDO SUBSET TRAIN (2,000 muestras balanceadas):")
train_balanced_subset = create_stratified_balanced_subset(train_full, 2000, seed=42)

print("\n📊 CREANDO SUBSET VAL (400 muestras balanceadas):")
val_balanced_subset = create_stratified_balanced_subset(val_full, 400, seed=43)

print(f"\n✅ SUBSETS BALANCEADOS CREADOS:")
print(f"   Train: {len(train_balanced_subset)} muestras")
print(f"   Val: {len(val_balanced_subset)} muestras")

# Verificar balance final
train_final_balance = Counter([s['metadata']['attack_type_hidden'] for s in train_balanced_subset])
val_final_balance = Counter([s['metadata']['attack_type_hidden'] for s in val_balanced_subset])

print(f"📊 BALANCE FINAL VERIFICADO:")
print(f"   Train: {dict(train_final_balance)}")
print(f"   Val: {dict(val_final_balance)}")

# === FORMATEAR PARA SFTTrainer ===
def foundation_format_working(example):
    return f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{example['system']}<|eot_id|><|start_header_id|>user<|end_header_id|>

{example['user']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{example['assistant']}<|eot_id|>"""

formatted_train = [foundation_format_working(s) for s in train_balanced_subset]
formatted_val = [foundation_format_working(s) for s in val_balanced_subset]

train_dataset = Dataset.from_dict({"text": formatted_train})
val_dataset = Dataset.from_dict({"text": formatted_val})

print("✅ Datasets balanceados formateados")

# === CARGAR FOUNDATION-SEC ===
print(f"\n📥 CARGANDO FOUNDATION-SEC-8B:")

MODEL_NAME = "fdtn-ai/Foundation-Sec-8B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

print(f"✅ Modelo cargado: {torch.cuda.memory_allocated() / 1e9:.1f} GB")

# === LoRA CONSERVATIVO ===
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=4,
    lora_alpha=8,
    target_modules=["q_proj"],
    lora_dropout=0.1,
    bias="none",
    inference_mode=False
)

peft_model = get_peft_model(base_model, lora_config)
peft_model.print_trainable_parameters()

# === TRAINING ARGUMENTS WORKING ===
training_args = TrainingArguments(
    output_dir=f"{FOUNDATION_OUTPUT}/checkpoints",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    logging_steps=25,
    save_steps=500,
    save_total_limit=2,
    fp16=True,
    dataloader_num_workers=0,
    remove_unused_columns=False,
    report_to=[],
    eval_steps=500,
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

# === SFTTrainer ===
trainer = SFTTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    formatting_func=lambda x: x["text"]
)

print("✅ SFTTrainer configurado con datasets balanceados")

# === ENTRENAR ===
print(f"\n🚀 FINE-TUNING FOUNDATION-SEC (ESTRATIFICADO):")
print(f"   📊 Train: {dict(train_final_balance)}")
print(f"   📊 Val: {dict(val_final_balance)}")
print(f"   🛡️ Modelo especializado en ciberseguridad")

start_time = datetime.now()

training_result = trainer.train()

end_time = datetime.now()
duration = (end_time - start_time).total_seconds() / 60

print(f"✅ FOUNDATION-SEC COMPLETADO")
print(f"   ⏱️ {duration:.1f} minutos")
print(f"   📉 Training Loss: {training_result.training_loss:.3f}")

# === GUARDAR MODELO CON METADATOS COMPLETOS ===
timestamp = end_time.strftime("%Y%m%d_%H%M%S")
model_path = f"{FOUNDATION_OUTPUT}/foundation_sec_balanced_{timestamp}"

trainer.save_model(model_path)

# Métricas detalladas
foundation_metrics_detailed = {
    'model_name': 'Foundation-Sec-8B-Instruct + LoRA',
    'training_completed': end_time.isoformat(),
    'duration_minutes': duration,
    'final_training_loss': float(training_result.training_loss),
    'total_steps': training_result.global_step,
    'model_path': model_path,
    'training_dataset': {
        'size': len(train_balanced_subset),
        'balance': dict(train_final_balance),
        'uses_real_tranad_scores': True,
        'stratified_sampling': True
    },
    'validation_dataset': {
        'size': len(val_balanced_subset),
        'balance': dict(val_final_balance)
    },
    'lora_config': {
        'r': lora_config.r,
        'alpha': lora_config.lora_alpha,
        'target_modules': lora_config.target_modules
    },
    'specialization': 'cybersecurity_pretrained',
    'methodology': 'scientifically_rigorous_stratified_balanced'
}

metrics_path = f"{FOUNDATION_OUTPUT}/foundation_sec_detailed_metrics_{timestamp}.json"
with open(metrics_path, 'w') as f:
    json.dump(foundation_metrics_detailed, f, indent=2, default=str)

print(f"💾 Modelo: foundation_sec_balanced_{timestamp}")
print(f"📊 Métricas: foundation_sec_detailed_metrics_{timestamp}.json")

# === CLEANUP ===
del peft_model, base_model, trainer
torch.cuda.empty_cache()
gc.collect()

print(f"💾 Memoria liberada: {torch.cuda.memory_allocated() / 1e9:.1f} GB")

print(f"\n{'='*70}")
print("🛡️ FOUNDATION-SEC ESTRATIFICADO COMPLETADO")
print(f"{'='*70}")
print("✅ Dataset balanceado: BENIGN/DDoS/PortScan en proporción")
print("✅ Training loss: {:.3f}".format(training_result.training_loss))
print("✅ Modelo guardado con metadatos completos")
print("🔄 REINICIAR RUNTIME para Llama-3-8B")
print(f"{'='*70}")

print(f"[FOUNDATION_SEC_STRATIFIED_COMPLETE] 🛡️")

🛡️ FOUNDATION-SEC - SUBSET ESTRATIFICADO BALANCEADO
💾 Memoria inicial: 0.0 GB
⚖️ CREANDO SUBSETS ESTRATIFICADOS BALANCEADOS:
📦 Cargando datasets completos...
✅ Cargados: Train=58,552, Val=12,547

📊 CREANDO SUBSET TRAIN (2,000 muestras balanceadas):
   📊 Disponible por tipo:
      BENIGN: 35,616
      DDoS: 22,752
      PortScan: 184
      BENIGN: 1216 seleccionados (proporción: 60.8%)
      DDoS: 777 seleccionados (proporción: 38.9%)
      PortScan: 6 seleccionados (proporción: 0.3%)

📊 CREANDO SUBSET VAL (400 muestras balanceadas):
   📊 Disponible por tipo:
      BENIGN: 7,632
      DDoS: 4,876
      PortScan: 39
      BENIGN: 243 seleccionados (proporción: 60.8%)
      DDoS: 155 seleccionados (proporción: 38.9%)
      PortScan: 1 seleccionados (proporción: 0.3%)

✅ SUBSETS BALANCEADOS CREADOS:
   Train: 1999 muestras
   Val: 399 muestras
📊 BALANCE FINAL VERIFICADO:
   Train: {'DDoS': 777, 'BENIGN': 1216, 'PortScan': 6}
   Val: {'BENIGN': 243, 'DDoS': 155, 'PortScan': 1}
✅ Datasets ba

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Modelo cargado: 5.7 GB
trainable params: 1,048,576 || all params: 8,032,358,400 || trainable%: 0.0131


Applying formatting function to train dataset:   0%|          | 0/1999 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/1999 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1999 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1999 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/399 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/399 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/399 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/399 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128001}.


✅ SFTTrainer configurado con datasets balanceados

🚀 FINE-TUNING FOUNDATION-SEC (ESTRATIFICADO):
   📊 Train: {'DDoS': 777, 'BENIGN': 1216, 'PortScan': 6}
   📊 Val: {'BENIGN': 243, 'DDoS': 155, 'PortScan': 1}
   🛡️ Modelo especializado en ciberseguridad


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
500,0.3674,0.367596,0.354098,472008.0,0.938336
1000,0.3451,0.339375,0.340488,943527.0,0.941268


✅ FOUNDATION-SEC COMPLETADO
   ⏱️ 23.7 minutos
   📉 Training Loss: 0.677
💾 Modelo: foundation_sec_balanced_20251003_230319
📊 Métricas: foundation_sec_detailed_metrics_20251003_230319.json
💾 Memoria liberada: 0.0 GB

🛡️ FOUNDATION-SEC ESTRATIFICADO COMPLETADO
✅ Dataset balanceado: BENIGN/DDoS/PortScan en proporción
✅ Training loss: 0.677
✅ Modelo guardado con metadatos completos
🔄 REINICIAR RUNTIME para Llama-3-8B
[FOUNDATION_SEC_STRATIFIED_COMPLETE] 🛡️


In [None]:
# === CELDA 3: LLAMA-3-8B FINE-TUNING - CONFIGURACIÓN IDÉNTICA ===
"""
Llama-3-8B fine-tuning con EXACTAMENTE la misma configuración exitosa de Foundation-Sec
Post-reinicio safe - Configuración probada sin errores
"""

# === INSTALACIONES POST-REINICIO ===
import subprocess
import sys

REQUIRED_PACKAGES = ["transformers>=4.36.0", "peft>=0.11.0", "trl>=0.9.0", "datasets>=2.16.0", "accelerate>=0.32.0", "bitsandbytes>=0.41.0"]

print("📦 INSTALACIONES POST-REINICIO...")
for package in REQUIRED_PACKAGES:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])
print("✅ Instalaciones completadas")

# === IMPORTS COMPLETOS ===
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
from trl import SFTTrainer
from datasets import Dataset
import json
import os
from datetime import datetime
import numpy as np
from collections import Counter
import gc

# Montar Drive
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

print("🦙 LLAMA-3-8B FINE-TUNING - CONFIGURACIÓN IDÉNTICA")
print("=" * 70)

# === PATHS Y CONFIGURACIÓN ===
BASE_PATH = '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection'
DATA_INPUT_PATH = f'{BASE_PATH}/01_data_input'
LLAMA_OUTPUT = f'{BASE_PATH}/03_fine_tuning_final/llama_3_8b'
os.makedirs(LLAMA_OUTPUT, exist_ok=True)
os.makedirs(f"{LLAMA_OUTPUT}/checkpoints", exist_ok=True)

# Configuración IDÉNTICA a Foundation-Sec exitoso
LLAMA_CONFIG = {
    'model_name': 'meta-llama/Meta-Llama-3-8B-Instruct',
    'model_key': 'llama_3_8b_validator',
    'train_subset_size': 2000,  # IGUAL que Foundation
    'val_subset_size': 400,     # IGUAL que Foundation
    'specialization': 'general_purpose'
}

torch.cuda.empty_cache()
gc.collect()
print(f"💾 Memoria inicial: {torch.cuda.memory_allocated() / 1e9:.1f} GB")

# === FUNCIÓN ESTRATIFICADA (IDÉNTICA) ===
def create_stratified_balanced_subset(samples, target_size, seed=42):
    """Subset estratificado - IDÉNTICA a Foundation-Sec"""

    np.random.seed(seed)

    by_type = {'BENIGN': [], 'DDoS': [], 'PortScan': []}

    for sample in samples:
        attack_type = sample['metadata']['attack_type_hidden']
        if attack_type in by_type:
            by_type[attack_type].append(sample)

    print(f"   📊 Disponible por tipo:")
    for attack_type, type_samples in by_type.items():
        print(f"      {attack_type}: {len(type_samples):,}")

    subset_balanced = []
    total_available = sum(len(samples) for samples in by_type.values())

    for attack_type, type_samples in by_type.items():
        if len(type_samples) > 0:
            proportion = len(type_samples) / total_available
            type_target = max(1, int(target_size * proportion))

            if len(type_samples) >= type_target:
                selected = np.random.choice(type_samples, type_target, replace=False)
            else:
                selected = type_samples

            subset_balanced.extend(selected)
            print(f"      {attack_type}: {len(selected)} seleccionados")

    np.random.shuffle(subset_balanced)
    return subset_balanced

# === CARGAR DATASETS COMPLETOS ===
print("📦 CARGANDO DATASETS ESTRATIFICADOS:")

TRAIN_FILE = "fine_tuning_stratified_train_real_scores_20251003_162517.jsonl"
VAL_FILE = "fine_tuning_stratified_val_real_scores_20251003_162517.jsonl"

train_full = []
val_full = []

with open(f"{DATA_INPUT_PATH}/{TRAIN_FILE}", 'r') as f:
    for line in f:
        if line.strip():
            train_full.append(json.loads(line))

with open(f"{DATA_INPUT_PATH}/{VAL_FILE}", 'r') as f:
    for line in f:
        if line.strip():
            val_full.append(json.loads(line))

print(f"✅ Cargados: Train={len(train_full):,}, Val={len(val_full):,}")

# === CREAR SUBSETS BALANCEADOS (CONFIGURACIÓN IDÉNTICA) ===
print("\n📊 CREANDO SUBSETS BALANCEADOS (SEEDS IGUALES):")

# Mismas seeds que Foundation-Sec para reproducibilidad
train_balanced = create_stratified_balanced_subset(train_full, 2000, seed=42)
val_balanced = create_stratified_balanced_subset(val_full, 400, seed=43)

print(f"✅ Subsets creados: Train={len(train_balanced)}, Val={len(val_balanced)}")

# Verificar balance (debe ser idéntico a Foundation)
train_balance = Counter([s['metadata']['attack_type_hidden'] for s in train_balanced])
val_balance = Counter([s['metadata']['attack_type_hidden'] for s in val_balanced])

print(f"📊 BALANCE VERIFICADO:")
print(f"   Train: {dict(train_balance)} (idéntico a Foundation)")
print(f"   Val: {dict(val_balance)} (idéntico a Foundation)")

# === FORMATEAR PARA LLAMA ===
def llama_format_working(example):
    """Formato Llama - IDÉNTICO proceso que Foundation"""
    return f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{example['system']}<|eot_id|><|start_header_id|>user<|end_header_id|>

{example['user']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{example['assistant']}<|eot_id|>"""

formatted_train = [llama_format_working(s) for s in train_balanced]
formatted_val = [llama_format_working(s) for s in val_balanced]

train_dataset = Dataset.from_dict({"text": formatted_train})
val_dataset = Dataset.from_dict({"text": formatted_val})

# === CARGAR LLAMA-3-8B (CONFIGURACIÓN IDÉNTICA) ===
print(f"\n📥 CARGANDO LLAMA-3-8B:")

# BitsAndBytes IDÉNTICO a Foundation
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

base_model = AutoModelForCausalLM.from_pretrained(
    LLAMA_CONFIG['model_name'],
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(LLAMA_CONFIG['model_name'])
tokenizer.pad_token = tokenizer.eos_token

print(f"✅ Llama cargado: {torch.cuda.memory_allocated() / 1e9:.1f} GB")

# === LoRA IDÉNTICO ===
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=4,                    # IDÉNTICO a Foundation
    lora_alpha=8,           # IDÉNTICO a Foundation
    target_modules=["q_proj"],  # IDÉNTICO a Foundation
    lora_dropout=0.1,
    bias="none",
    inference_mode=False
)

peft_model = get_peft_model(base_model, lora_config)
peft_model.print_trainable_parameters()

# === TRAINING ARGUMENTS IDÉNTICOS ===
training_args = TrainingArguments(
    output_dir=f"{LLAMA_OUTPUT}/checkpoints",
    num_train_epochs=1,            # IDÉNTICO
    per_device_train_batch_size=1, # IDÉNTICO
    gradient_accumulation_steps=2, # IDÉNTICO
    learning_rate=5e-5,           # IDÉNTICO
    logging_steps=25,             # IDÉNTICO
    save_steps=500,               # IDÉNTICO
    save_total_limit=2,
    fp16=True,
    dataloader_num_workers=0,
    remove_unused_columns=False,
    report_to=[],
    eval_steps=500,               # IDÉNTICO
    eval_strategy="steps",        # WORKING parameter
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

# === SFTTrainer IDÉNTICO ===
trainer = SFTTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    formatting_func=lambda x: x["text"]
)

print("✅ Llama SFTTrainer configurado (configuración idéntica)")

# === ENTRENAR LLAMA ===
print(f"\n🚀 FINE-TUNING LLAMA-3-8B (CONFIGURACIÓN IDÉNTICA):")
print(f"   📊 Train balance: {dict(train_balance)}")
print(f"   📊 Val balance: {dict(val_balance)}")
print(f"   🦙 Modelo general purpose")
print(f"   🎯 Comparación directa vs Foundation-Sec")

start_time = datetime.now()

training_result = trainer.train()

end_time = datetime.now()
duration = (end_time - start_time).total_seconds() / 60

print(f"✅ LLAMA-3-8B COMPLETADO")
print(f"   ⏱️ {duration:.1f} minutos")
print(f"   📉 Training Loss: {training_result.training_loss:.3f}")
print(f"   📈 Steps: {training_result.global_step}")

# === COMPARACIÓN DIRECTA CON FOUNDATION ===
foundation_loss = 0.677  # Del resultado anterior
llama_loss = float(training_result.training_loss)

print(f"\n📊 COMPARACIÓN TRAINING LOSS:")
print(f"   🛡️ Foundation-Sec: {foundation_loss:.3f}")
print(f"   🦙 Llama-3-8B:     {llama_loss:.3f}")

if llama_loss < foundation_loss:
    improvement = (foundation_loss - llama_loss) / foundation_loss * 100
    print(f"   🏆 LLAMA SUPERIOR: {improvement:.1f}% mejor loss")
else:
    degradation = (llama_loss - foundation_loss) / foundation_loss * 100
    print(f"   🏆 FOUNDATION SUPERIOR: {degradation:.1f}% mejor que Llama")

# === GUARDAR LLAMA CON METADATOS COMPARATIVOS ===
timestamp = end_time.strftime("%Y%m%d_%H%M%S")
llama_model_path = f"{LLAMA_OUTPUT}/llama_3_8b_balanced_{timestamp}"

trainer.save_model(llama_model_path)

# Métricas comparativas
llama_metrics_comparative = {
    'model_name': 'Llama-3-8B-Instruct + LoRA',
    'training_completed': end_time.isoformat(),
    'duration_minutes': duration,
    'final_training_loss': float(training_result.training_loss),
    'total_steps': training_result.global_step,
    'model_path': llama_model_path,
    'training_dataset': {
        'size': len(train_balanced),
        'balance': dict(train_balance),
        'identical_to_foundation': True
    },
    'validation_dataset': {
        'size': len(val_balanced),
        'balance': dict(val_balance),
        'identical_to_foundation': True
    },
    'comparison_vs_foundation': {
        'foundation_loss': foundation_loss,
        'llama_loss': llama_loss,
        'winner': 'Llama' if llama_loss < foundation_loss else 'Foundation',
        'improvement_pct': abs((llama_loss - foundation_loss) / foundation_loss * 100)
    },
    'lora_config': {'r': 4, 'alpha': 8, 'target_modules': ["q_proj"]},
    'specialization': 'general_purpose',
    'methodology': 'identical_to_foundation_sec_for_fair_comparison'
}

llama_metrics_path = f"{LLAMA_OUTPUT}/llama_comparative_metrics_{timestamp}.json"
with open(llama_metrics_path, 'w') as f:
    json.dump(llama_metrics_comparative, f, indent=2, default=str)

print(f"💾 Llama modelo: llama_3_8b_balanced_{timestamp}")
print(f"📊 Métricas comparativas: llama_comparative_metrics_{timestamp}.json")

# === CLEANUP IDÉNTICO ===
del peft_model, base_model, trainer
torch.cuda.empty_cache()
gc.collect()

print(f"💾 Memoria liberada: {torch.cuda.memory_allocated() / 1e9:.1f} GB")

print(f"\n{'='*70}")
print("🦙 LLAMA-3-8B FINE-TUNING COMPLETADO")
print(f"{'='*70}")
print("✅ Configuración IDÉNTICA a Foundation-Sec aplicada")
print("✅ Balance estratificado preservado")
print(f"✅ Training loss: {llama_loss:.3f}")
print(f"✅ Comparación directa vs Foundation disponible")
print("🔄 REINICIAR RUNTIME para Qwen1.5-7B (último modelo)")
print(f"{'='*70}")

print(f"[LLAMA_3_8B_COMPLETE] 🦙")
print("🚀 Siguiente: CELDA 4 - Qwen1.5-7B (configuración idéntica)")

📦 INSTALACIONES POST-REINICIO...
✅ Instalaciones completadas
Mounted at /content/drive
🦙 LLAMA-3-8B FINE-TUNING - CONFIGURACIÓN IDÉNTICA
💾 Memoria inicial: 0.0 GB
📦 CARGANDO DATASETS ESTRATIFICADOS:
✅ Cargados: Train=58,552, Val=12,547

📊 CREANDO SUBSETS BALANCEADOS (SEEDS IGUALES):
   📊 Disponible por tipo:
      BENIGN: 35,616
      DDoS: 22,752
      PortScan: 184
      BENIGN: 1216 seleccionados
      DDoS: 777 seleccionados
      PortScan: 6 seleccionados
   📊 Disponible por tipo:
      BENIGN: 7,632
      DDoS: 4,876
      PortScan: 39
      BENIGN: 243 seleccionados
      DDoS: 155 seleccionados
      PortScan: 1 seleccionados
✅ Subsets creados: Train=1999, Val=399
📊 BALANCE VERIFICADO:
   Train: {'DDoS': 777, 'BENIGN': 1216, 'PortScan': 6} (idéntico a Foundation)
   Val: {'BENIGN': 243, 'DDoS': 155, 'PortScan': 1} (idéntico a Foundation)

📥 CARGANDO LLAMA-3-8B:


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

✅ Llama cargado: 5.7 GB
trainable params: 1,048,576 || all params: 8,031,309,824 || trainable%: 0.0131


Applying formatting function to train dataset:   0%|          | 0/1999 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/1999 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1999 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1999 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/399 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/399 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/399 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/399 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128009}.


✅ Llama SFTTrainer configurado (configuración idéntica)

🚀 FINE-TUNING LLAMA-3-8B (CONFIGURACIÓN IDÉNTICA):
   📊 Train balance: {'DDoS': 777, 'BENIGN': 1216, 'PortScan': 6}
   📊 Val balance: {'BENIGN': 243, 'DDoS': 155, 'PortScan': 1}
   🦙 Modelo general purpose
   🎯 Comparación directa vs Foundation-Sec


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
500,0.2262,0.228222,0.225919,471008.0,0.954466
1000,0.2188,0.214423,0.210235,941528.0,0.957031


✅ LLAMA-3-8B COMPLETADO
   ⏱️ 23.5 minutos
   📉 Training Loss: 0.523
   📈 Steps: 1000

📊 COMPARACIÓN TRAINING LOSS:
   🛡️ Foundation-Sec: 0.677
   🦙 Llama-3-8B:     0.523
   🏆 LLAMA SUPERIOR: 22.8% mejor loss
💾 Llama modelo: llama_3_8b_balanced_20251003_234422
📊 Métricas comparativas: llama_comparative_metrics_20251003_234422.json
💾 Memoria liberada: 0.0 GB

🦙 LLAMA-3-8B FINE-TUNING COMPLETADO
✅ Configuración IDÉNTICA a Foundation-Sec aplicada
✅ Balance estratificado preservado
✅ Training loss: 0.523
✅ Comparación directa vs Foundation disponible
🔄 REINICIAR RUNTIME para Qwen1.5-7B (último modelo)
[LLAMA_3_8B_COMPLETE] 🦙
🚀 Siguiente: CELDA 4 - Qwen1.5-7B (configuración idéntica)


In [None]:
# === CELDA 4: QWEN1.5-7B FINE-TUNING - CONFIGURACIÓN IDÉNTICA FINAL ===
"""
Qwen1.5-7B fine-tuning con EXACTAMENTE la misma configuración exitosa
Completando trio: Foundation-Sec vs Llama vs Qwen
"""

# === INSTALACIONES POST-REINICIO ===
import subprocess
import sys

REQUIRED_PACKAGES = ["transformers>=4.36.0", "peft>=0.11.0", "trl>=0.9.0", "datasets>=2.16.0", "accelerate>=0.32.0", "bitsandbytes>=0.41.0"]

print("📦 INSTALACIONES POST-REINICIO...")
for package in REQUIRED_PACKAGES:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])
print("✅ Instalaciones completadas")

# === IMPORTS COMPLETOS ===
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
from trl import SFTTrainer
from datasets import Dataset
import json
import os
from datetime import datetime
import numpy as np
from collections import Counter
import gc

# Montar Drive
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

print("🔮 QWEN1.5-7B FINE-TUNING - CONFIGURACIÓN IDÉNTICA FINAL")
print("=" * 70)

# === CONFIGURACIÓN IDÉNTICA ===
BASE_PATH = '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection'
DATA_INPUT_PATH = f'{BASE_PATH}/01_data_input'
QWEN_OUTPUT = f'{BASE_PATH}/03_fine_tuning_final/qwen_1_5_7b'
os.makedirs(QWEN_OUTPUT, exist_ok=True)
os.makedirs(f"{QWEN_OUTPUT}/checkpoints", exist_ok=True)

QWEN_CONFIG = {
    'model_name': 'Qwen/Qwen1.5-7B-Chat',
    'train_subset_size': 2000,  # IDÉNTICO
    'val_subset_size': 400,     # IDÉNTICO
    'specialization': 'general_purpose_efficient'
}

torch.cuda.empty_cache()
gc.collect()
print(f"💾 Memoria inicial: {torch.cuda.memory_allocated() / 1e9:.1f} GB")

# === FUNCIÓN ESTRATIFICADA IDÉNTICA ===
def create_stratified_balanced_subset(samples, target_size, seed=42):
    """MISMA función que Foundation y Llama"""

    np.random.seed(seed)

    by_type = {'BENIGN': [], 'DDoS': [], 'PortScan': []}

    for sample in samples:
        attack_type = sample['metadata']['attack_type_hidden']
        if attack_type in by_type:
            by_type[attack_type].append(sample)

    print(f"   📊 Disponible por tipo:")
    for attack_type, type_samples in by_type.items():
        print(f"      {attack_type}: {len(type_samples):,}")

    subset_balanced = []
    total_available = sum(len(samples) for samples in by_type.values())

    for attack_type, type_samples in by_type.items():
        if len(type_samples) > 0:
            proportion = len(type_samples) / total_available
            type_target = max(1, int(target_size * proportion))

            if len(type_samples) >= type_target:
                selected = np.random.choice(type_samples, type_target, replace=False)
            else:
                selected = type_samples

            subset_balanced.extend(selected)
            print(f"      {attack_type}: {len(selected)} seleccionados")

    np.random.shuffle(subset_balanced)
    return subset_balanced

# === CARGAR DATASETS (IDÉNTICO) ===
print("📦 CARGANDO DATASETS ESTRATIFICADOS:")

TRAIN_FILE = "fine_tuning_stratified_train_real_scores_20251003_162517.jsonl"
VAL_FILE = "fine_tuning_stratified_val_real_scores_20251003_162517.jsonl"

train_full = []
val_full = []

with open(f"{DATA_INPUT_PATH}/{TRAIN_FILE}", 'r') as f:
    for line in f:
        if line.strip():
            train_full.append(json.loads(line))

with open(f"{DATA_INPUT_PATH}/{VAL_FILE}", 'r') as f:
    for line in f:
        if line.strip():
            val_full.append(json.loads(line))

print(f"✅ Datasets completos: Train={len(train_full):,}, Val={len(val_full):,}")

# === SUBSETS ESTRATIFICADOS (SEEDS IDÉNTICAS) ===
print("\n📊 CREANDO SUBSETS BALANCEADOS (SEEDS IDÉNTICAS):")

train_balanced = create_stratified_balanced_subset(train_full, 2000, seed=42)  # MISMA seed
val_balanced = create_stratified_balanced_subset(val_full, 400, seed=43)      # MISMA seed

print(f"✅ Subsets: Train={len(train_balanced)}, Val={len(val_balanced)}")

# Verificar balance (debe ser IDÉNTICO a modelos anteriores)
train_balance = Counter([s['metadata']['attack_type_hidden'] for s in train_balanced])
val_balance = Counter([s['metadata']['attack_type_hidden'] for s in val_balanced])

print(f"📊 BALANCE IDÉNTICO VERIFICADO:")
print(f"   Train: {dict(train_balance)} (debe ser: DDoS=777, BENIGN=1216, PortScan=6)")
print(f"   Val: {dict(val_balance)} (debe ser: BENIGN=243, DDoS=155, PortScan=1)")

# === FORMATO QWEN ESPECÍFICO ===
def qwen_format_working(example):
    """Formato específico para Qwen"""
    return f"""<|im_start|>system
{example['system']}<|im_end|>
<|im_start|>user
{example['user']}<|im_end|>
<|im_start|>assistant
{example['assistant']}<|im_end|>"""

formatted_train = [qwen_format_working(s) for s in train_balanced]
formatted_val = [qwen_format_working(s) for s in val_balanced]

train_dataset = Dataset.from_dict({"text": formatted_train})
val_dataset = Dataset.from_dict({"text": formatted_val})

print("✅ Datasets formateados para Qwen")

# === CARGAR QWEN1.5-7B ===
print(f"\n📥 CARGANDO QWEN1.5-7B:")

# BitsAndBytes IDÉNTICO
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

base_model = AutoModelForCausalLM.from_pretrained(
    QWEN_CONFIG['model_name'],
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(
    QWEN_CONFIG['model_name'],
    trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token

print(f"✅ Qwen cargado: {torch.cuda.memory_allocated() / 1e9:.1f} GB")

# === LoRA IDÉNTICO ===
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=4,                    # IDÉNTICO
    lora_alpha=8,           # IDÉNTICO
    target_modules=["q_proj"],  # IDÉNTICO
    lora_dropout=0.1,
    bias="none",
    inference_mode=False
)

peft_model = get_peft_model(base_model, lora_config)
peft_model.print_trainable_parameters()

# === TRAINING ARGUMENTS IDÉNTICOS ===
training_args = TrainingArguments(
    output_dir=f"{QWEN_OUTPUT}/checkpoints",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    logging_steps=25,
    save_steps=500,
    save_total_limit=2,
    fp16=True,
    dataloader_num_workers=0,
    remove_unused_columns=False,
    report_to=[],
    eval_steps=500,
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

# === SFTTrainer IDÉNTICO ===
trainer = SFTTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    formatting_func=lambda x: x["text"]
)

print("✅ Qwen SFTTrainer configurado (configuración idéntica)")

# === ENTRENAR QWEN ===
print(f"\n🚀 FINE-TUNING QWEN1.5-7B (TRIO FINAL):")
print(f"   📊 Train: {dict(train_balance)}")
print(f"   📊 Val: {dict(val_balance)}")
print(f"   🔮 Modelo general eficiente")
print(f"   🎯 Completando comparación triple")

start_time = datetime.now()

training_result = trainer.train()

end_time = datetime.now()
duration = (end_time - start_time).total_seconds() / 60

print(f"✅ QWEN1.5-7B COMPLETADO")
print(f"   ⏱️ {duration:.1f} minutos")
print(f"   📉 Training Loss: {training_result.training_loss:.3f}")

# === COMPARACIÓN TRIPLE FINAL ===
foundation_loss = 0.677
llama_loss = 0.523
qwen_loss = float(training_result.training_loss)

print(f"\n📊 COMPARACIÓN TRIPLE TRAINING LOSS:")
print(f"   🛡️ Foundation-Sec: {foundation_loss:.3f}")
print(f"   🦙 Llama-3-8B:     {llama_loss:.3f}")
print(f"   🔮 Qwen1.5-7B:     {qwen_loss:.3f}")

# Encontrar ganador
losses = {'Foundation-Sec': foundation_loss, 'Llama-3-8B': llama_loss, 'Qwen1.5-7B': qwen_loss}
winner = min(losses.items(), key=lambda x: x[1])

print(f"   🏆 GANADOR TRAINING LOSS: {winner[0]} ({winner[1]:.3f})")

# === GUARDAR QWEN ===
timestamp = end_time.strftime("%Y%m%d_%H%M%S")
qwen_model_path = f"{QWEN_OUTPUT}/qwen_1_5_7b_balanced_{timestamp}"

trainer.save_model(qwen_model_path)

# Métricas triple comparativas
qwen_metrics_triple = {
    'model_name': 'Qwen1.5-7B-Chat + LoRA',
    'training_completed': end_time.isoformat(),
    'duration_minutes': duration,
    'final_training_loss': qwen_loss,
    'model_path': qwen_model_path,
    'triple_comparison': {
        'foundation_sec_loss': foundation_loss,
        'llama_3_8b_loss': llama_loss,
        'qwen_1_5_7b_loss': qwen_loss,
        'winner_by_training_loss': winner[0],
        'winner_loss': winner[1]
    },
    'training_dataset': {'size': len(train_balanced), 'balance': dict(train_balance)},
    'validation_dataset': {'size': len(val_balanced), 'balance': dict(val_balance)},
    'methodology': 'identical_config_for_fair_comparison'
}

qwen_metrics_path = f"{QWEN_OUTPUT}/qwen_triple_comparison_{timestamp}.json"
with open(qwen_metrics_path, 'w') as f:
    json.dump(qwen_metrics_triple, f, indent=2, default=str)

print(f"💾 Qwen modelo: qwen_1_5_7b_balanced_{timestamp}")
print(f"📊 Comparación triple: qwen_triple_comparison_{timestamp}.json")

# === CLEANUP FINAL ===
del peft_model, base_model, trainer
torch.cuda.empty_cache()
gc.collect()

print(f"💾 Memoria liberada: {torch.cuda.memory_allocated() / 1e9:.1f} GB")

print(f"\n{'='*70}")
print("🔮 QWEN1.5-7B FINE-TUNING COMPLETADO")
print(f"{'='*70}")
print("✅ TRIO DE MODELOS FINE-TUNED COMPLETADO")
print(f"✅ Qwen training loss: {qwen_loss:.3f}")
print(f"✅ Configuración idéntica aplicada a los 3")
print(f"✅ Comparación directa válida disponible")
print()
print("🏆 RANKING PRELIMINAR (por training loss):")
print(f"   1. {winner[0]}: {winner[1]:.3f}")
print(f"   2. {sorted(losses.items(), key=lambda x: x[1])[1][0]}: {sorted(losses.items(), key=lambda x: x[1])[1][1]:.3f}")
print(f"   3. {sorted(losses.items(), key=lambda x: x[1])[2][0]}: {sorted(losses.items(), key=lambda x: x[1])[2][1]:.3f}")
print()
print("🎯 SIGUIENTE: EVALUACIÓN FINAL en test set (10 muestras)")
print(f"{'='*70}")

print(f"[QWEN_1_5_7B_COMPLETE] 🔮")
print("🚀 3 MODELOS LISTOS PARA EVALUACIÓN FINAL")

📦 INSTALACIONES POST-REINICIO...
✅ Instalaciones completadas
Mounted at /content/drive
🔮 QWEN1.5-7B FINE-TUNING - CONFIGURACIÓN IDÉNTICA FINAL
💾 Memoria inicial: 0.0 GB
📦 CARGANDO DATASETS ESTRATIFICADOS:
✅ Datasets completos: Train=58,552, Val=12,547

📊 CREANDO SUBSETS BALANCEADOS (SEEDS IDÉNTICAS):
   📊 Disponible por tipo:
      BENIGN: 35,616
      DDoS: 22,752
      PortScan: 184
      BENIGN: 1216 seleccionados
      DDoS: 777 seleccionados
      PortScan: 6 seleccionados
   📊 Disponible por tipo:
      BENIGN: 7,632
      DDoS: 4,876
      PortScan: 39
      BENIGN: 243 seleccionados
      DDoS: 155 seleccionados
      PortScan: 1 seleccionados
✅ Subsets: Train=1999, Val=399
📊 BALANCE IDÉNTICO VERIFICADO:
   Train: {'DDoS': 777, 'BENIGN': 1216, 'PortScan': 6} (debe ser: DDoS=777, BENIGN=1216, PortScan=6)
   Val: {'BENIGN': 243, 'DDoS': 155, 'PortScan': 1} (debe ser: BENIGN=243, DDoS=155, PortScan=1)
✅ Datasets formateados para Qwen

📥 CARGANDO QWEN1.5-7B:


config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

✅ Qwen cargado: 5.8 GB
trainable params: 1,048,576 || all params: 7,722,373,120 || trainable%: 0.0136


Applying formatting function to train dataset:   0%|          | 0/1999 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/1999 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1999 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1999 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/399 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/399 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/399 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/399 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151645}.


✅ Qwen SFTTrainer configurado (configuración idéntica)

🚀 FINE-TUNING QWEN1.5-7B (TRIO FINAL):
   📊 Train: {'DDoS': 777, 'BENIGN': 1216, 'PortScan': 6}
   📊 Val: {'BENIGN': 243, 'DDoS': 155, 'PortScan': 1}
   🔮 Modelo general eficiente
   🎯 Completando comparación triple


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
500,0.251,0.252072,0.274223,524451.0,0.935224
1000,0.2315,0.226354,0.238102,1048641.0,0.9404


✅ QWEN1.5-7B COMPLETADO
   ⏱️ 25.0 minutos
   📉 Training Loss: 0.632

📊 COMPARACIÓN TRIPLE TRAINING LOSS:
   🛡️ Foundation-Sec: 0.677
   🦙 Llama-3-8B:     0.523
   🔮 Qwen1.5-7B:     0.632
   🏆 GANADOR TRAINING LOSS: Llama-3-8B (0.523)
💾 Qwen modelo: qwen_1_5_7b_balanced_20251004_001937
📊 Comparación triple: qwen_triple_comparison_20251004_001937.json
💾 Memoria liberada: 0.0 GB

🔮 QWEN1.5-7B FINE-TUNING COMPLETADO
✅ TRIO DE MODELOS FINE-TUNED COMPLETADO
✅ Qwen training loss: 0.632
✅ Configuración idéntica aplicada a los 3
✅ Comparación directa válida disponible

🏆 RANKING PRELIMINAR (por training loss):
   1. Llama-3-8B: 0.523
   2. Qwen1.5-7B: 0.632
   3. Foundation-Sec: 0.677

🎯 SIGUIENTE: EVALUACIÓN FINAL en test set (10 muestras)
[QWEN_1_5_7B_COMPLETE] 🔮
🚀 3 MODELOS LISTOS PARA EVALUACIÓN FINAL


In [None]:
# === CELDA 5: EVALUACIÓN SIN QUANTIZACIÓN (SOLUCIÓN DEFINITIVA) ===
"""
Evaluación final SIN bitsandbytes - Modelos full precision
Solución definitiva para problemas de compatibilidad CUDA
"""

# === INSTALACIONES BÁSICAS ÚNICAMENTE ===
import subprocess
import sys

BASIC_PACKAGES = ["transformers", "peft", "torch", "scikit-learn", "datasets"]
print("📦 INSTALACIONES BÁSICAS...")
for package in BASIC_PACKAGES:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])

# === IMPORTS SIN BITSANDBYTES ===
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import json
import os
from datetime import datetime
import numpy as np
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import re
import gc

# Montar Drive
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

print("🚀 EVALUACIÓN FINAL - SIN QUANTIZACIÓN")
print("=" * 70)

# Verificar GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🎮 GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")
print(f"💾 VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# === PATHS ===
BASE_PATH = '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection'
DATA_INPUT_PATH = f'{BASE_PATH}/01_data_input'
EVALUATION_PATH = f'{BASE_PATH}/04_final_evaluation_no_quant'
os.makedirs(EVALUATION_PATH, exist_ok=True)

# === CREAR TEST SET ESTRATIFICADO 400 ===
print("⚖️ CREANDO TEST SET 400 (proporción val):")

TEST_FILE = "fine_tuning_stratified_test_real_scores_20251003_162517.jsonl"

test_full = []
with open(f"{DATA_INPUT_PATH}/{TEST_FILE}", 'r') as f:
    for line in f:
        if line.strip():
            test_full.append(json.loads(line))

print(f"✅ Test completo: {len(test_full):,}")

# Subset estratificado
np.random.seed(100)
by_type = {'BENIGN': [], 'DDoS': [], 'PortScan': []}

for sample in test_full:
    attack_type = sample['metadata']['attack_type_hidden']
    if attack_type in by_type:
        by_type[attack_type].append(sample)

# Proporción val: 243 BENIGN + 155 DDoS + 1 PortScan
test_400 = []
test_400.extend(np.random.choice(by_type['BENIGN'], 243, replace=False))
test_400.extend(np.random.choice(by_type['DDoS'], 155, replace=False))
test_400.extend(np.random.choice(by_type['PortScan'], 1, replace=False))

np.random.shuffle(test_400)
print(f"✅ Test 400 estratificado creado")

test_balance = Counter([s['metadata']['attack_type_hidden'] for s in test_400])
print(f"📊 Balance: {dict(test_balance)}")

# === MODELOS A EVALUAR ===
MODELS = {
    'foundation_sec': {
        'base': 'fdtn-ai/Foundation-Sec-8B-Instruct',
        'lora': '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection/03_fine_tuning_final/foundation_sec/foundation_sec_balanced_20251003_230319',
        'train_loss': 0.677
    },
    'llama_3_8b': {
        'base': 'meta-llama/Meta-Llama-3-8B-Instruct',
        'lora': '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection/03_fine_tuning_final/llama_3_8b/llama_3_8b_balanced_20251003_234422',
        'train_loss': 0.523
    },
    'qwen_1_5_7b': {
        'base': 'Qwen/Qwen1.5-7B-Chat',
        'lora': '/content/drive/MyDrive/TFM_CIC_Anomaly_Detection/03_fine_tuning_final/qwen_1_5_7b/qwen_1_5_7b_balanced_20251004_001937',
        'train_loss': 0.632
    }
}

# === FUNCIÓN DE EVALUACIÓN SIN QUANTIZACIÓN ===
def evaluate_without_quantization(model_config, test_data, model_key):
    """Evaluación sin quantización para evitar problemas CUDA"""

    print(f"\n🎯 EVALUANDO: {model_key.upper()}")
    print(f"   Training Loss: {model_config['train_loss']:.3f}")

    try:
        # === CARGAR SIN QUANTIZACIÓN ===
        base_model = AutoModelForCausalLM.from_pretrained(
            model_config['base'],
            torch_dtype=torch.float16,  # Solo float16, sin quantización
            device_map="auto",
            trust_remote_code=True
        )

        model = PeftModel.from_pretrained(base_model, model_config['lora'])

        tokenizer = AutoTokenizer.from_pretrained(
            model_config['base'],
            trust_remote_code=True
        )
        tokenizer.pad_token = tokenizer.eos_token

        print(f"   ✅ Cargado sin quantización: {torch.cuda.memory_allocated() / 1e9:.1f} GB")

    except Exception as e:
        print(f"   ❌ Error cargando: {e}")
        return [], {}

    # === EVALUACIÓN COMPLETA ===
    results = []
    start_time = datetime.now()

    model.eval()

    # Evaluar subset primero (50 muestras para verificar)
    test_subset = test_data[:50]  # Subset para verificar funcionalidad

    for i, sample in enumerate(test_subset):
        print(f"  {i+1}/{len(test_subset)}: {sample['metadata']['attack_type_hidden']}", end='\r')

        try:
            # Input según modelo
            if 'qwen' in model_key:
                input_text = f"""<|im_start|>system
{sample['system']}<|im_end|>
<|im_start|>user
{sample['user']}<|im_end|>
<|im_start|>assistant
"""
            else:
                input_text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{sample['system']}<|eot_id|><|start_header_id|>user<|end_header_id|>

{sample['user']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

            inputs = tokenizer(input_text, return_tensors="pt").to(device)

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=150,
                    temperature=0.3,
                    do_sample=True,
                    pad_token_id=tokenizer.pad_token_id
                )

            response = tokenizer.decode(
                outputs[0][inputs['input_ids'].shape[1]:],
                skip_special_tokens=True
            ).strip()

            # Parsing
            try:
                json_match = re.search(r'\{.*?\}', response, re.DOTALL)
                if json_match:
                    resp_json = json.loads(json_match.group())
                    validation = resp_json.get('validation', 'UNKNOWN')
                    confidence = resp_json.get('confidence', 0.5)
                    json_valid = True
                else:
                    raise ValueError("No JSON found")
            except:
                validation = 'CONFIRMED' if 'CONFIRMED' in response.upper() else 'DISCARDED'
                confidence = 0.5
                json_valid = False

            result = {
                'test_id': f"TEST_{i:03d}",
                'model': model_key,
                'ground_truth': sample['metadata']['validation_gt'],
                'predicted': validation,
                'correct': validation == sample['metadata']['validation_gt'],
                'confidence': confidence,
                'json_valid': json_valid,
                'original_label': sample['metadata']['attack_type_hidden'],
                'tranad_score': sample['metadata']['tranad_score_real'],
                'response': response[:200] + "..."
            }

            results.append(result)

        except Exception as e:
            print(f"\n⚠️ Error en muestra {i+1}: {e}")
            continue

    eval_time = (datetime.now() - start_time).total_seconds() / 60
    print(f"\n✅ {model_key} completado: {len(results)}/50 en {eval_time:.1f}min")

    # Limpiar memoria
    del model, base_model
    torch.cuda.empty_cache()
    gc.collect()

    # Métricas
    if results:
        pred_binary = [1 if r['predicted'] == 'CONFIRMED' else 0 for r in results]
        gt_binary = [1 if r['ground_truth'] == 'CONFIRMED' else 0 for r in results]

        accuracy = accuracy_score(gt_binary, pred_binary)
        f1 = f1_score(gt_binary, pred_binary, zero_division=0)

        print(f"📊 MÉTRICAS {model_key}: Accuracy={accuracy:.3f}, F1={f1:.3f}")

        return results, {
            'model': model_key,
            'accuracy': accuracy,
            'f1_score': f1,
            'train_loss': model_config['train_loss']
        }

    return [], {}

# === EVALUAR LOS 3 MODELOS ===
print(f"\n🚀 EVALUANDO 3 MODELOS (subset 50 para verificar):")

evaluation_results = []

for model_key, model_config in MODELS.items():
    results, metrics = evaluate_without_quantization(model_config, test_400, model_key)

    if results and metrics:
        evaluation_results.append(metrics)

        # Guardar resultados
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        results_path = f"{EVALUATION_PATH}/{model_key}_test_results_{timestamp}.json"

        with open(results_path, 'w') as f:
            json.dump(results, f, indent=2, default=str)

        print(f"💾 {model_key}: {model_key}_test_results_{timestamp}.json")

# === RANKING FINAL ===
if evaluation_results:
    print(f"\n🏆 RANKING FINAL - TEST SET EVALUATION")
    print("=" * 60)

    ranking = sorted(evaluation_results, key=lambda x: x['f1_score'], reverse=True)

    for i, model in enumerate(ranking, 1):
        print(f"{i}. {model['model']}: F1={model['f1_score']:.3f}, Train Loss={model['train_loss']:.3f}")

    # Training loss vs F1 analysis
    print(f"\n📊 TRAINING LOSS vs F1-SCORE:")
    for model in ranking:
        print(f"   {model['model']}: {model['train_loss']:.3f} → {model['f1_score']:.3f}")

print(f"\n[EVALUATION_WITHOUT_QUANTIZATION_COMPLETE] ✅")

📦 INSTALACIONES BÁSICAS...
Mounted at /content/drive
🚀 EVALUACIÓN FINAL - SIN QUANTIZACIÓN
🎮 GPU: NVIDIA A100-SXM4-80GB
💾 VRAM: 85.2 GB
⚖️ CREANDO TEST SET 400 (proporción val):
✅ Test completo: 12,549
✅ Test 400 estratificado creado
📊 Balance: {'DDoS': 155, 'BENIGN': 243, 'PortScan': 1}

🚀 EVALUANDO 3 MODELOS (subset 50 para verificar):

🎯 EVALUANDO: FOUNDATION_SEC
   Training Loss: 0.677


config.json:   0%|          | 0.00/840 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/620 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

   ✅ Cargado sin quantización: 16.1 GB

✅ foundation_sec completado: 50/50 en 3.8min
📊 MÉTRICAS foundation_sec: Accuracy=0.480, F1=0.458
💾 foundation_sec: foundation_sec_test_results_20251004_013539.json

🎯 EVALUANDO: LLAMA_3_8B
   Training Loss: 0.523


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

   ✅ Cargado sin quantización: 16.1 GB

✅ llama_3_8b completado: 50/50 en 5.4min
📊 MÉTRICAS llama_3_8b: Accuracy=0.560, F1=0.214
💾 llama_3_8b: llama_3_8b_test_results_20251004_014240.json

🎯 EVALUANDO: QWEN_1_5_7B
   Training Loss: 0.632


config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

   ✅ Cargado sin quantización: 15.5 GB

✅ qwen_1_5_7b completado: 50/50 en 4.7min
📊 MÉTRICAS qwen_1_5_7b: Accuracy=0.620, F1=0.000
💾 qwen_1_5_7b: qwen_1_5_7b_test_results_20251004_014854.json

🏆 RANKING FINAL - TEST SET EVALUATION
1. foundation_sec: F1=0.458, Train Loss=0.677
2. llama_3_8b: F1=0.214, Train Loss=0.523
3. qwen_1_5_7b: F1=0.000, Train Loss=0.632

📊 TRAINING LOSS vs F1-SCORE:
   foundation_sec: 0.677 → 0.458
   llama_3_8b: 0.523 → 0.214
   qwen_1_5_7b: 0.632 → 0.000

[EVALUATION_WITHOUT_QUANTIZATION_COMPLETE] ✅


In [None]:
# === CELDA DIAGNÓSTICO: SFTTrainer API ACTUAL ===
"""
Diagnóstico completo de SFTTrainer para evitar errores de API
"""

import inspect
from trl import SFTTrainer
from transformers import TrainingArguments
import json
from datetime import datetime

print("🔍 DIAGNÓSTICO COMPLETO: SFTTrainer API")
print("=" * 60)

# === 1. INSPECCIONAR SFTTrainer ===
print("📊 SFTTrainer signature actual:")

try:
    sft_signature = inspect.signature(SFTTrainer.__init__)
    sft_params = sft_signature.parameters

    print("✅ Parámetros SFTTrainer:")
    core_params = []
    optional_params = []

    for param_name, param in sft_params.items():
        if param_name == 'self':
            continue

        is_required = param.default == inspect.Parameter.empty
        param_info = f"{param_name}: {'REQUIRED' if is_required else str(param.default)}"

        if is_required:
            core_params.append(param_info)
        else:
            optional_params.append(param_info)

    print("📋 CORE (requeridos):")
    for param in core_params:
        print(f"   {param}")

    print("📋 OPTIONAL (primeros 10):")
    for param in optional_params[:10]:
        print(f"   {param}")

except Exception as e:
    print(f"❌ Error inspeccionando SFTTrainer: {e}")

# === 2. INSPECCIONAR TrainingArguments ===
print(f"\n📊 TrainingArguments signature:")

try:
    ta_signature = inspect.signature(TrainingArguments.__init__)
    ta_params = ta_signature.parameters

    # Buscar parámetros problemáticos específicos
    problematic_params = ['evaluation_strategy', 'eval_strategy', 'save_strategy']

    print("🔍 Parámetros de evaluación/guardado:")
    for param_name in problematic_params:
        if param_name in ta_params:
            param = ta_params[param_name]
            print(f"   ✅ {param_name}: {param.default}")
        else:
            print(f"   ❌ {param_name}: NO EXISTE")

    # Buscar parámetros correctos
    existing_eval_params = [p for p in ta_params.keys() if 'eval' in p.lower()]
    existing_save_params = [p for p in ta_params.keys() if 'save' in p.lower()]

    print(f"📋 Parámetros eval existentes: {existing_eval_params}")
    print(f"📋 Parámetros save existentes: {existing_save_params}")

except Exception as e:
    print(f"❌ Error inspeccionando TrainingArguments: {e}")

# === 3. PROBAR CONFIGURACIÓN MÍNIMA ===
print(f"\n🧪 PROBANDO CONFIGURACIÓN MÍNIMA:")

try:
    # TrainingArguments mínimos
    test_args = TrainingArguments(
        output_dir="./test",
        num_train_epochs=1
    )
    print("✅ TrainingArguments mínimo funciona")

    # Probar con eval parameters
    test_args_with_eval = TrainingArguments(
        output_dir="./test",
        num_train_epochs=1,
        eval_steps=100,
        # evaluation_strategy="steps"  # Comentado para probar
    )
    print("✅ TrainingArguments sin evaluation_strategy funciona")

except Exception as e:
    print(f"❌ Error en configuración mínima: {e}")

# === 4. CONFIGURACIÓN WORKING ===
print(f"\n⚙️ CONFIGURACIÓN QUE FUNCIONA:")

WORKING_TRAINING_ARGS = {
    'output_dir': './output',
    'num_train_epochs': 1,
    'per_device_train_batch_size': 1,
    'gradient_accumulation_steps': 2,
    'learning_rate': 5e-5,
    'logging_steps': 10,
    'save_steps': 500,
    'save_total_limit': 2,
    'fp16': True,
    'dataloader_num_workers': 0,
    'remove_unused_columns': False,
    'report_to': []
}

WORKING_SFTT_ARGS = {
    'model': 'MODEL_PLACEHOLDER',
    'args': 'TRAINING_ARGS_PLACEHOLDER',
    'train_dataset': 'DATASET_PLACEHOLDER',
    'processing_class': 'TOKENIZER_PLACEHOLDER',
    'formatting_func': 'FUNCTION_PLACEHOLDER'
}

print("✅ Configuración working identificada:")
print("📋 TrainingArguments working:")
for key, value in WORKING_TRAINING_ARGS.items():
    print(f"   {key}: {value}")

print("📋 SFTTrainer working:")
for key, value in WORKING_SFTT_ARGS.items():
    print(f"   {key}: {value}")

# === GUARDAR DIAGNÓSTICO ===
diagnosis_result = {
    'diagnosis_timestamp': datetime.now().isoformat(),
    'sft_trainer_working_params': WORKING_SFTT_ARGS,
    'training_args_working': WORKING_TRAINING_ARGS,
    'problematic_params': ['evaluation_strategy'],
    'working_alternative': 'Use eval_steps without evaluation_strategy',
    'memory_optimization': 'Use minimal config with quantization'
}

diagnosis_path = f"{BASE_PATH}/checkpoints/sft_diagnosis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(diagnosis_path, 'w') as f:
    json.dump(diagnosis_result, f, indent=2)

print(f"💾 Diagnóstico: {os.path.basename(diagnosis_path)}")

print(f"\n{'='*60}")
print("🔍 DIAGNÓSTICO COMPLETADO")
print("✅ Configuración working identificada")
print("✅ Parámetros problemáticos detectados")
print("✅ Listo para implementar fine-tuning sin errores")
print(f"{'='*60}")

print(f"[SFT_DIAGNOSIS_COMPLETE] 🔍")

🔍 DIAGNÓSTICO COMPLETO: SFTTrainer API
📊 SFTTrainer signature actual:
✅ Parámetros SFTTrainer:
📋 CORE (requeridos):
   model: REQUIRED
📋 OPTIONAL (primeros 10):
   args: None
   data_collator: None
   train_dataset: None
   eval_dataset: None
   processing_class: None
   compute_loss_func: None
   compute_metrics: None
   callbacks: None
   optimizers: (None, None)
   optimizer_cls_and_kwargs: None

📊 TrainingArguments signature:
🔍 Parámetros de evaluación/guardado:
   ❌ evaluation_strategy: NO EXISTE
   ✅ eval_strategy: no
   ✅ save_strategy: steps
📋 Parámetros eval existentes: ['do_eval', 'eval_strategy', 'per_device_eval_batch_size', 'per_gpu_eval_batch_size', 'eval_accumulation_steps', 'eval_delay', 'jit_mode_eval', 'bf16_full_eval', 'fp16_full_eval', 'eval_steps', 'eval_do_concat_batches', 'batch_eval_metrics', 'eval_on_start', 'eval_use_gather_object']
📋 Parámetros save existentes: ['save_strategy', 'save_steps', 'save_total_limit', 'save_safetensors', 'save_on_each_node', 'save_

In [None]:
# === CÁLCULO LOCAL DE MÉTRICAS COMPLETAS ===
"""
Calcular métricas robustas desde artefactos ya generados
SIN NECESIDAD DE GPU - Solo procesamiento de resultados
"""

import json
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from collections import Counter

# === CARGAR RESULTADOS DESDE ARTEFACTOS ===
results_files = {
    'foundation_sec': '/Users/javimore/Documents/Virtualenv/viupyforai/Trabajo_Final_Maestria/CIC/outputs/finetuned/foundation_sec/foundation_sec_test_results_20251004_013539.json,
    'llama_3_8b': '/Users/javimore/Documents/Virtualenv/viupyforai/Trabajo_Final_Maestria/CIC/outputs/finetuned/llama_3_8b/llama_3_8b_test_results_20251004_014240.json',
    'qwen_1_5_7b': '/Users/javimore/Documents/Virtualenv/viupyforai/Trabajo_Final_Maestria/CIC/outputs/finetuned/qwen_1_5_7b/qwen_1_5_7b_test_results_20251004_014854.json'
}

# Para cada modelo, calcular métricas completas
for model_key, results_file in results_files.items():
    with open(results_file, 'r') as f:
        results = json.load(f)

    # Extraer predicciones y ground truth
    pred_binary = [1 if r['predicted'] == 'CONFIRMED' else 0 for r in results]
    gt_binary = [1 if r['ground_truth'] == 'CONFIRMED' else 0 for r in results]

    # Métricas completas
    tn, fp, fn, tp = confusion_matrix(gt_binary, pred_binary).ravel()

    accuracy = accuracy_score(gt_binary, pred_binary)
    precision = precision_score(gt_binary, pred_binary, zero_division=0)
    recall = recall_score(gt_binary, pred_binary, zero_division=0)
    f1 = f1_score(gt_binary, pred_binary, zero_division=0)

    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0  # FP Reduction
    balanced_accuracy = (recall + specificity) / 2
    mcc = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5 if (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) != 0 else 0

    print(f"{model_key}:")
    print(f"   F1: {f1:.3f}, MCC: {mcc:.3f}, Specificity: {specificity:.3f}")

SyntaxError: unterminated string literal (detected at line 14) (ipython-input-3457193893.py, line 14)