# Quinto notebook
En este último notebook se verá el reentrenamiento incremental solo con imágenes nuevas, rápido en CPU, con checkpoint por época y registro en MLflow (SQLite). Si no hay datos nuevos, se crea un run en MLflow con estado SKIPPED y no se entrena.
## Objetivos
1. Leer nuevas imágenes etiquetadas por el usuario desde `data/new_data/` (formato YOLO).
2. Entrenar SOLO con esas imágenes nuevas (rápido en CPU).
3. Evaluar objetivamente si mejoró usando un subset fijo de validación (val_loss_before vs val_loss_after).
4. Registrar en MLflow:
   - run incremental
   - nueva versión del modelo en Model Registry
   - tags de mejora
5. Si mejora, promover automáticamente a Production (archivando versiones anteriores).
6. Si no hay datos nuevos, no entrenar y registrar run SKIPPED.

In [1]:
"""
- Importa librerías necesarias para incremental retrain, evaluación y MLflow Registry.
"""

import os
import json
import time
import shutil
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Tuple

import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm

import mlflow
from mlflow.tracking import MlflowClient
import torch
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from mlflow.tracking import MlflowClient
import time

import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.transforms import functional as F


1. Carga de carpetas previas para el formato establecido

In [2]:
"""
- Encuentra PROJECT_ROOT a partir de data/processed/project_config.json.
- Carga project_config.json y labelmap.json.
- Define rutas estándar del proyecto.
"""

def find_project_root(start: Path, max_up: int = 8) -> Path:
    """Walk upwards from *start* until the project marker file is found.

    The marker is ``data/processed/project_config.json``. At most *max_up*
    directories are inspected, beginning with the resolved *start* itself.

    Returns:
        The first directory that contains the marker file.

    Raises:
        FileNotFoundError: if none of the inspected directories has the marker.
    """
    marker = Path("data") / "processed" / "project_config.json"
    candidate = start.resolve()
    remaining = max_up
    while remaining > 0:
        if (candidate / marker).exists():
            return candidate
        candidate = candidate.parent
        remaining -= 1
    raise FileNotFoundError("No se encontró data/processed/project_config.json. Ejecuta Notebook 01.")

# Resolve the project root from the current working directory and derive the
# processed-data folder from it.
PROJECT_ROOT = find_project_root(Path.cwd())
PROCESSED_DIR = (PROJECT_ROOT / "data" / "processed").resolve()

# Both JSON files live in data/processed.
PROJECT_CONFIG_PATH = (PROCESSED_DIR / "project_config.json").resolve()
LABELMAP_PATH = (PROCESSED_DIR / "labelmap.json").resolve()

with open(PROJECT_CONFIG_PATH, "r", encoding="utf-8") as f:
    project_config = json.load(f)

with open(LABELMAP_PATH, "r", encoding="utf-8") as f:
    labelmap = json.load(f)

# Values taken from project_config.json: the validation image directory and
# the class names; later cells use the position of each name in
# TARGET_CLASSES as its class index.
VAL_IMG_DIR = Path(project_config["val_dir"])
TARGET_CLASSES = project_config["target_classes"]

print("PROJECT_ROOT:", PROJECT_ROOT)
print("TARGET_CLASSES:", TARGET_CLASSES)


PROJECT_ROOT: C:\Users\Johnny\Desktop\IA-final
TARGET_CLASSES: ['person', 'car', 'airplane']


2. Nuevas carpetas para el manejo de datos provenientes de la interfaz

In [3]:
"""
- Define estructura para nuevos datos:
  - data/new_data/images/*.jpg|png
  - data/new_data/labels/*.txt (YOLO) mismo nombre que la imagen
  - data/new_data/used/ (archivo de lo ya usado)
  - data/new_data/manifest.json (para no repetir)
"""

# Folder layout for the user-labelled data: images, YOLO label files, an
# archive of already-consumed files and a manifest of used image names.
NEW_DATA_DIR = (PROJECT_ROOT / "data" / "new_data").resolve()
NEW_IMG_DIR = (NEW_DATA_DIR / "images").resolve()
NEW_LBL_DIR = (NEW_DATA_DIR / "labels").resolve()
NEW_USED_DIR = (NEW_DATA_DIR / "used").resolve()
MANIFEST_PATH = (NEW_DATA_DIR / "manifest.json").resolve()

# Create the folders up front so later globbing/moving never hits a missing dir.
for d in [NEW_DATA_DIR, NEW_IMG_DIR, NEW_LBL_DIR, NEW_USED_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print("NEW_IMG_DIR:", NEW_IMG_DIR)
print("NEW_LBL_DIR:", NEW_LBL_DIR)
print("NEW_USED_DIR:", NEW_USED_DIR)
print("MANIFEST_PATH:", MANIFEST_PATH)

print("\nFormato YOLO esperado por label:")
print("class_id x_center y_center width height  (todo normalizado 0..1)")
# Build the id legend from TARGET_CLASSES instead of hard-coding it, so the
# printed mapping always matches the class order the dataset code relies on.
print("class_id: " + ", ".join(f"{i}={name}" for i, name in enumerate(TARGET_CLASSES)))


NEW_IMG_DIR: C:\Users\Johnny\Desktop\IA-final\data\new_data\images
NEW_LBL_DIR: C:\Users\Johnny\Desktop\IA-final\data\new_data\labels
NEW_USED_DIR: C:\Users\Johnny\Desktop\IA-final\data\new_data\used
MANIFEST_PATH: C:\Users\Johnny\Desktop\IA-final\data\new_data\manifest.json

Formato YOLO esperado por label:
class_id x_center y_center width height  (todo normalizado 0..1)
class_id: 0=person, 1=car, 2=airplane


In [4]:
"""
Celda CONFIG GLOBAL (OBLIGATORIA):
- Define PROJECT_ROOT, rutas de dataset nuevo y rutas de modelos.
- Evita NameError al ejecutar el notebook con nbconvert.
"""

from pathlib import Path
import os

# Locate the project root with the marker-based search defined earlier in this
# notebook rather than assuming the working directory is <root>/notebooks.
# With that assumption, running from any other directory pointed every path
# below (and the directories created here) at the wrong location.
PROJECT_ROOT = find_project_root(Path.cwd())

DATA_DIR = PROJECT_ROOT / "data"
NEW_DATA_DIR = DATA_DIR / "new_data"
NEW_IMG_DIR = NEW_DATA_DIR / "images"
NEW_LBL_DIR = NEW_DATA_DIR / "labels"

# Checkpoints written and read by the training cells.
MODELS_DIR = PROJECT_ROOT / "models" / "local_checkpoints"
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Progress logs are written here.
LOGS_DIR = PROJECT_ROOT / "logs"
LOGS_DIR.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("NEW_IMG_DIR :", NEW_IMG_DIR)
print("NEW_LBL_DIR :", NEW_LBL_DIR)
print("MODELS_DIR  :", MODELS_DIR)


PROJECT_ROOT: c:\Users\Johnny\Desktop\IA-final
NEW_IMG_DIR : c:\Users\Johnny\Desktop\IA-final\data\new_data\images
NEW_LBL_DIR : c:\Users\Johnny\Desktop\IA-final\data\new_data\labels
MODELS_DIR  : c:\Users\Johnny\Desktop\IA-final\models\local_checkpoints


In [5]:
import json, time

RETRAIN_LOG = (LOGS_DIR / "retrain_progress.log").resolve()

def log_event(event: dict) -> None:
    """Append *event* to RETRAIN_LOG as one JSON object per line (JSONL).

    A ``"ts"`` field holding the current local time is added to the written
    record. The caller's dictionary is left untouched: the record is a shallow
    copy, so reusing the same dict for several events cannot leak a stale
    timestamp. Per the original notes, a frontend parses this file in real time.

    Args:
        event: JSON-serialisable mapping describing the event.
    """
    record = {**event, "ts": time.strftime("%Y-%m-%d %H:%M:%S")}
    with open(RETRAIN_LOG, "a", encoding="utf-8", errors="ignore") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")


In [6]:
from pathlib import Path
import time

# Use the marker-based root lookup instead of Path.cwd().parent: PROJECT_ROOT
# set here is reused by later cells (e.g. for the MLflow database path), so a
# working directory other than <root>/notebooks must not redirect it.
PROJECT_ROOT = find_project_root(Path.cwd())
LOG_FILE = PROJECT_ROOT / "logs" / "retrain_progress.log"
# parents=True keeps this from failing if intermediate directories are missing.
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

def log(msg):
    """Append *msg* to LOG_FILE as a ``[timestamp] message`` text line.

    NOTE(review): LOG_FILE and RETRAIN_LOG both resolve to
    logs/retrain_progress.log, so these plain-text lines are interleaved with
    the JSONL records written by log_event — confirm the reader tolerates that.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    with open(LOG_FILE, "a", encoding="utf-8") as handle:
        handle.write(f"[{stamp}] {msg}\n")


In [7]:
"""
- Lee manifest.json si existe.
- Detecta pares nuevos (imagen, label) que no se han usado.
- Ignora si falta el .txt o está vacío.
"""

def load_manifest(path: Path) -> dict:
    """Return the manifest stored at *path*, or a fresh empty one.

    When the file does not exist the result is ``{"used_files": []}``;
    otherwise the parsed JSON content is returned as-is.
    """
    if not path.exists():
        return {"used_files": []}
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)

def save_manifest(path: Path, manifest: dict) -> None:
    """Overwrite *path* with *manifest* as 2-space-indented UTF-8 JSON.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``).
    """
    with path.open("w", encoding="utf-8") as handle:
        handle.write(json.dumps(manifest, indent=2, ensure_ascii=False))

# Image names recorded in the manifest are never offered for training again.
manifest = load_manifest(MANIFEST_PATH)
used = set(manifest.get("used_files", []))

# Gather every image with a supported extension.
img_files = []
for pattern in ("*.jpg", "*.jpeg", "*.png"):
    img_files += list(NEW_IMG_DIR.glob(pattern))

# Keep only unused images that have a non-empty label file of the same stem.
candidates: List[Tuple[Path, Path]] = []
for img_path in sorted(img_files):
    lbl_path = NEW_LBL_DIR / f"{img_path.stem}.txt"
    # Checks run in this order so stat() is only called on existing labels.
    if img_path.name in used or not lbl_path.exists() or lbl_path.stat().st_size == 0:
        continue
    candidates.append((img_path, lbl_path))

print("Nuevos pares detectados:", len(candidates))
if candidates:
    first_img, first_lbl = candidates[0]
    print("Ejemplo:", first_img.name, first_lbl.name)


Nuevos pares detectados: 0


In [None]:
"""
Esta celda:
- Configura el incremental.
- Por defecto: congela backbone para rapidez y estabilidad.
"""

# All tensors and the model are placed on the CPU.
DEVICE = torch.device("cpu")

# Settings for one incremental retraining cycle; the whole dict is also logged
# to MLflow as run parameters.
INCR_CONFIG = {
    "batch_size": 2,  # batch size of the incremental DataLoader
    "num_workers": 0,  # DataLoader worker processes
    "epochs": 2,  # passes over the new images
    "learning_rate": 5e-5,  # AdamW learning rate
    "weight_decay": 1e-4,  # AdamW weight decay
    "train_backbone": False,  # False -> backbone parameters are frozen
    "max_new_images": 300,  # cap on (image, label) pairs used per cycle
    "eval_max_images": 20,  # size of the fixed validation subset
    "iou_eval_threshold": 0.5,  # NOTE(review): not referenced in the visible cells
    "score_threshold": 0.5,  # NOTE(review): not referenced in the visible cells
    "improvement_delta": 0.0,  # margin by which val loss must drop to count as improved
}

print("INCR_CONFIG:")
for k, v in INCR_CONFIG.items():
    print(f"{k}: {v}")


INCR_CONFIG:
batch_size: 2
num_workers: 0
epochs: 2
learning_rate: 5e-05
weight_decay: 0.0001
train_backbone: False
max_new_images: 300
eval_max_images: 20
iou_eval_threshold: 0.5
score_threshold: 0.5
improvement_delta: 0.0


In [9]:
"""
Esta celda:
- Configura MLflow SQLite.
- Si no hay nuevos datos, no entrena.
- Igual registra un run con status=SKIPPED.
"""

# MLflow tracking is backed by a SQLite file in the project root.
MLFLOW_DB = (PROJECT_ROOT / "mlflow_new.db").resolve()
mlflow.set_tracking_uri(f"sqlite:///{MLFLOW_DB.as_posix()}")

EXPERIMENT_NAME = "object_detection_coco_cpu"
mlflow.set_experiment(EXPERIMENT_NAME)

client = MlflowClient()

# Timestamped name for the (possibly skipped) run.
run_name = f"incr_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

# Without new (image, label) pairs there is nothing to train on: record a
# SKIPPED run with the configuration and stop the notebook here.
if len(candidates) == 0:
    with mlflow.start_run(run_name=run_name):
        mlflow.set_tag("stage", "incremental")
        mlflow.set_tag("status", "SKIPPED")
        mlflow.log_param("new_images", 0)
        mlflow.log_params(INCR_CONFIG)
        mlflow.log_artifact(str(PROJECT_CONFIG_PATH), artifact_path="artifacts")
        mlflow.log_artifact(str(LABELMAP_PATH), artifact_path="artifacts")
    print("No hay nuevas imágenes. Reentrenamiento omitido y registrado en MLflow.")
    # SystemExit aborts the rest of the execution so no training cell runs.
    raise SystemExit


No hay nuevas imágenes. Reentrenamiento omitido y registrado en MLflow.


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
"""
Esta celda:
- Convierte labels YOLO (normalizados) a cajas xyxy en píxeles.
- Construye Dataset para entrenamiento incremental.
- Mapea class_id YOLO 0..K-1 a label interno 1..K (0 es background).
"""

# Number of foreground classes; valid YOLO class ids are 0..K-1.
K = len(TARGET_CLASSES)

def yolo_to_xyxy(line: str, w: int, h: int) -> Tuple[int, float, float, float, float]:
    """Convert one YOLO label line into a pixel-space ``xyxy`` box.

    Args:
        line: ``"class_id x_center y_center width height"`` with the four
            geometry values normalised by the image size.
        w: image width in pixels.
        h: image height in pixels.

    Returns:
        ``(class_id, x1, y1, x2, y2)`` with x clamped to ``[0, w - 1]`` and
        y clamped to ``[0, h - 1]``.

    Raises:
        ValueError: if the line does not hold exactly five fields (or a field
            cannot be parsed as a number).
    """
    fields = line.strip().split()
    if len(fields) != 5:
        raise ValueError("Formato YOLO inválido: se esperaban 5 valores.")

    class_id = int(fields[0])
    cx, cy, box_w, box_h = (float(value) for value in fields[1:])

    def clamp(value: float, upper: float) -> float:
        # Keep the coordinate inside the image bounds.
        return max(0.0, min(value, upper))

    max_x = w - 1.0
    max_y = h - 1.0
    left = clamp((cx - box_w / 2.0) * w, max_x)
    top = clamp((cy - box_h / 2.0) * h, max_y)
    right = clamp((cx + box_w / 2.0) * w, max_x)
    bottom = clamp((cy + box_h / 2.0) * h, max_y)
    return class_id, left, top, right, bottom

class IncrementalYoloDetectionDataset(Dataset):
    """Detection dataset over ``(image_path, yolo_label_path)`` pairs.

    Each item is ``(image_tensor, target)`` where ``target`` holds the keys
    ``boxes``, ``labels``, ``image_id``, ``area`` and ``iscrowd``. YOLO class
    ids ``0..K-1`` are shifted to ``1..K`` because label 0 is background.
    """

    def __init__(self, pairs: List[Tuple[Path, Path]]):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    @staticmethod
    def _build_target(parsed, idx: int, num_classes: int) -> Dict[str, torch.Tensor]:
        """Turn parsed ``(cls, x1, y1, x2, y2)`` tuples into a target dict.

        Entries with a class id outside ``0..num_classes-1`` are skipped, as
        are boxes without positive width and height (torchvision detection
        models reject such boxes during training).
        """
        boxes, labels, areas = [], [], []
        for cls, x1, y1, x2, y2 in parsed:
            if cls < 0 or cls >= num_classes:
                continue
            if x2 <= x1 or y2 <= y1:
                continue
            boxes.append([x1, y1, x2, y2])
            labels.append(cls + 1)  # internal labels are 1..K
            areas.append((x2 - x1) * (y2 - y1))

        return {
            # reshape keeps the (N, 4) layout even when no box survived (N == 0).
            "boxes": torch.tensor(boxes, dtype=torch.float32).reshape(-1, 4),
            "labels": torch.tensor(labels, dtype=torch.int64),
            "image_id": torch.tensor([idx], dtype=torch.int64),
            "area": torch.tensor(areas, dtype=torch.float32),
            "iscrowd": torch.zeros(len(labels), dtype=torch.int64),
        }

    def __getitem__(self, idx: int):
        img_path, lbl_path = self.pairs[idx]

        # convert() returns an in-memory copy, so the file can be closed at once.
        with Image.open(img_path) as raw:
            img = raw.convert("RGB")
        w, h = img.size
        img_t = F.to_tensor(img)

        with open(lbl_path, "r", encoding="utf-8") as f:
            lines = [ln.strip() for ln in f if ln.strip()]

        parsed = [yolo_to_xyxy(ln, w, h) for ln in lines]
        return img_t, self._build_target(parsed, idx, K)

def collate_fn(batch):
    """Split a batch of ``(image, target)`` samples into two parallel lists.

    Images in a detection batch may differ in size, so they are returned as
    plain lists instead of being stacked.
    """
    image_column, target_column = tuple(zip(*batch))
    return [*image_column], [*target_column]


In [None]:
"""
Esta celda:
- Limita cuántas nuevas imágenes se usan por ciclo incremental.
- Construye DataLoader.
"""

# Cap the number of new pairs consumed in this cycle.
pairs = candidates[: min(len(candidates), INCR_CONFIG["max_new_images"])]

incr_ds = IncrementalYoloDetectionDataset(pairs)
# Shuffled loader over the new images only; collate_fn keeps images/targets
# as lists.
incr_loader = DataLoader(
    incr_ds,
    batch_size=INCR_CONFIG["batch_size"],
    shuffle=True,
    num_workers=INCR_CONFIG["num_workers"],
    collate_fn=collate_fn
)

print("Nuevas imágenes para reentrenar:", len(incr_ds))
print("Iteraciones por época:", len(incr_loader))


Nuevas imágenes para reentrenar: 3
Iteraciones por época: 2


In [None]:
"""
Esta celda:
- Busca el modelo a partir del MLflow Model Registry:
  - si hay versión en Production, usa esa
  - si no, usa el best_*.pt más reciente en local_checkpoints
- Construye modelo y carga state_dict.
"""

# Name under which model versions are looked up and registered in the MLflow
# Model Registry.
REGISTERED_MODEL_NAME = "frcnn_coco_cpu_person_car_airplane"

def build_model(num_classes: int):
    """Build a Faster R-CNN (ResNet50-FPN) with a fresh box predictor.

    The detector is created with ``weights=None`` and its box predictor is
    replaced by a ``FastRCNNPredictor`` that outputs *num_classes* classes.

    NOTE(review): only ``weights=None`` is passed; torchvision's
    ``weights_backbone`` default may still load ImageNet backbone weights —
    confirm this is intended for offline use.
    """
    detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=None)
    old_head = detector.roi_heads.box_predictor
    detector.roi_heads.box_predictor = FastRCNNPredictor(
        old_head.cls_score.in_features, num_classes
    )
    return detector

def find_latest_best_checkpoint(models_dir: Path) -> Path:
    """Return the most recently modified ``best_*.pt`` file in *models_dir*.

    Raises:
        FileNotFoundError: if no file matches ``best_*.pt``.
    """
    matches = list(models_dir.glob("best_*.pt"))
    if not matches:
        raise FileNotFoundError("No se encontró best_*.pt en models/local_checkpoints.")
    # Newest modification time wins.
    return max(matches, key=lambda ckpt: ckpt.stat().st_mtime)

def try_get_production_model_version(client: MlflowClient, name: str):
    """Return the newest registry version of *name* in stage ``Production``.

    The lookup is best-effort: any failure while querying or inspecting the
    registry yields ``None`` so the caller can fall back to local checkpoints.
    Unlike a silent swallow, the reason is printed so a broken registry is not
    mistaken for "no Production version yet".

    Args:
        client: MLflow client used to search model versions.
        name: registered model name.

    Returns:
        The model-version object with the highest version number among those
        whose ``current_stage`` is ``"Production"``, or ``None``.
    """
    try:
        versions = client.search_model_versions(f"name='{name}'")
        prod = [v for v in versions if getattr(v, "current_stage", "") == "Production"]
        if prod:
            # Take the newest version.
            return max(prod, key=lambda mv: int(mv.version))
    except Exception as exc:
        print("No se pudo consultar el Model Registry:", exc)
        return None
    return None

prod_mv = try_get_production_model_version(client, REGISTERED_MODEL_NAME)

BASE_CKPT_PATH = None
BASE_SOURCE = None

# NOTE(review): both branches load the newest local best_*.pt; the registry
# lookup only changes the BASE_SOURCE string recorded for traceability.
if prod_mv is not None:
    # Note: in this project the checkpoint is registered as an artifact.
    # To keep offline loading simple, local checkpoints remain the direct source.
    # The parent model version is still recorded for traceability.
    BASE_SOURCE = f"registry:/{REGISTERED_MODEL_NAME}/{prod_mv.version}"
    BASE_CKPT_PATH = find_latest_best_checkpoint(MODELS_DIR)
else:
    BASE_SOURCE = "local:best_latest"
    BASE_CKPT_PATH = find_latest_best_checkpoint(MODELS_DIR)

base_ckpt = torch.load(BASE_CKPT_PATH, map_location="cpu")

# The class count comes from the checkpoint; +1 accounts for the background
# label.
base_target_classes = base_ckpt["target_classes"]
NUM_CLASSES = len(base_target_classes) + 1

model = build_model(NUM_CLASSES)
model.load_state_dict(base_ckpt["model_state_dict"])
model.to(DEVICE)

# Freeze the backbone for speed
if not INCR_CONFIG["train_backbone"]:
    for p in model.backbone.parameters():
        p.requires_grad = False

# The box predictor always stays trainable.
for p in model.roi_heads.box_predictor.parameters():
    p.requires_grad = True

model.train()

print("Base source:", BASE_SOURCE)
print("BASE_CKPT_PATH:", BASE_CKPT_PATH.name)
print("train_backbone:", INCR_CONFIG["train_backbone"])


Base source: registry:/frcnn_coco_cpu_person_car_airplane/1
BASE_CKPT_PATH: best_frcnn_cpu_base_train_20260201_083448.pt
train_backbone: False


In [None]:
"""
Esta celda:
- Crea un DataLoader de validación fijo y pequeño (CPU-safe).
- Calcula val_loss_before con el modelo base (antes del incremental).
- Importante: en torchvision detection el loss se obtiene con model.train() sin grad.
"""

log("Iniciando evaluación de pérdida de validación (antes del incremental)")
log("Cargando JSON de validación reducido ...")

# Reduced COCO validation annotations stored in data/processed.
VAL_JSON = (PROCESSED_DIR / "coco_person_car_airplane_val.json").resolve()
log(f"VAL_JSON: {VAL_JSON}")

if not VAL_JSON.exists():
    raise FileNotFoundError("No existe VAL_JSON reducido. Ejecuta Notebook 02.")

with open(VAL_JSON, "r", encoding="utf-8") as f:
    val_coco = json.load(f)

log(f"Imágenes en JSON de validación: {len(val_coco.get('images', []))}")
log(f"Anotaciones en JSON de validación: {len(val_coco.get('annotations', []))}")

# Map each target class's category id (from labelmap) to the internal label
# 1..K, following the order of TARGET_CLASSES.
name_to_id = labelmap["name_to_id"]
target_cat_ids_local = [int(name_to_id[n]) for n in TARGET_CLASSES]
coco_to_internal_val = {cid: i + 1 for i, cid in enumerate(target_cat_ids_local)}

log(f"Clases objetivo (local ids): {coco_to_internal_val}")

class CocoValDataset(Dataset):
    """Fixed validation subset read from a COCO-style annotation dict.

    Only the first *max_images* entries of ``coco_json["images"]`` are used.
    Each item is ``(image_tensor, target)`` with the keys ``boxes`` (xyxy),
    ``labels``, ``image_id``, ``area`` and ``iscrowd``.
    """

    def __init__(self, images_dir: Path, coco_json: dict, max_images: int):
        self.images_dir = images_dir
        self.coco = coco_json
        self.images = self.coco["images"][:max_images]
        self.annotations = self.coco["annotations"]

        # Group annotations by the image they belong to.
        self.img_id_to_anns = {}
        for ann in self.annotations:
            self.img_id_to_anns.setdefault(ann["image_id"], []).append(ann)

        self.id_to_image = {img["id"]: img for img in self.images}
        self.image_ids = list(self.id_to_image.keys())

    def __len__(self):
        return len(self.image_ids)

    @staticmethod
    def _anns_to_target(anns, img_id: int, cat_to_internal: dict) -> dict:
        """Convert COCO annotations (xywh boxes) into a target dict.

        Annotations whose category is not in *cat_to_internal* are skipped, as
        are boxes without positive width and height (torchvision detection
        models reject such boxes when computing losses).
        """
        boxes, labels, areas, iscrowd = [], [], [], []
        for ann in anns:
            cid = int(ann["category_id"])
            if cid not in cat_to_internal:
                continue
            x, y, w, h = ann["bbox"]
            if w <= 0 or h <= 0:
                continue
            boxes.append([x, y, x + w, y + h])
            labels.append(cat_to_internal[cid])
            areas.append(ann.get("area", w * h))
            iscrowd.append(ann.get("iscrowd", 0))

        return {
            # reshape keeps the (N, 4) layout even for images without boxes.
            "boxes": torch.tensor(boxes, dtype=torch.float32).reshape(-1, 4),
            "labels": torch.tensor(labels, dtype=torch.int64),
            "image_id": torch.tensor([img_id], dtype=torch.int64),
            "area": torch.tensor(areas, dtype=torch.float32),
            "iscrowd": torch.tensor(iscrowd, dtype=torch.int64),
        }

    def __getitem__(self, idx: int):
        img_id = self.image_ids[idx]
        img_meta = self.id_to_image[img_id]
        img_path = self.images_dir / img_meta["file_name"]

        # convert() returns an in-memory copy, so the file can be closed at once.
        with Image.open(img_path) as raw:
            img = raw.convert("RGB")
        img_t = F.to_tensor(img)

        anns = self.img_id_to_anns.get(img_id, [])
        target = self._anns_to_target(anns, img_id, coco_to_internal_val)
        return img_t, target

@torch.no_grad()
def evaluate_loss_torchvision(model, data_loader) -> float:
    """Return the mean per-batch detection loss of *model* over *data_loader*.

    The model is switched to train mode for the pass because the call
    ``model(images, targets)`` is used to obtain the loss dict; gradients are
    disabled by the decorator. If the model was in eval mode on entry it is
    put back into eval mode afterwards. An empty loader yields 0.0.
    """
    log("Evaluando loss en modelo base (modo train sin grad)")
    restore_eval = not model.training
    model.train()

    running_loss = 0.0
    batches = 0

    progress = tqdm(data_loader, desc="val_loss_check", leave=False)
    for batch_images, batch_targets in progress:
        batch_images = [image.to(DEVICE) for image in batch_images]
        batch_targets = [
            {key: value.to(DEVICE) for key, value in target.items()}
            for target in batch_targets
        ]
        # Sum every component of the loss dict into one scalar.
        batch_loss = sum(model(batch_images, batch_targets).values())
        running_loss += float(batch_loss.item())
        batches += 1

    if restore_eval:
        model.eval()

    avg_loss = running_loss / max(1, batches)
    log(f"val_loss promedio calculado: {avg_loss:.6f}")
    return avg_loss

log("Creando DataLoader de validación (CPU-safe) ...")
# Fixed subset: the first eval_max_images images of the validation JSON.
eval_ds = CocoValDataset(
    VAL_IMG_DIR,
    val_coco,
    max_images=INCR_CONFIG["eval_max_images"]
)
log(f"Imágenes usadas para evaluación: {len(eval_ds)}")

# One image per batch, fixed order, no worker processes.
eval_loader = DataLoader(
    eval_ds,
    batch_size=1,
    shuffle=False,
    num_workers=0,
    collate_fn=collate_fn
)

# Baseline loss of the base model, measured before any incremental training.
log("Calculando val_loss_before ...")
val_loss_before = evaluate_loss_torchvision(model, eval_loader)

log(f"val_loss_check_before_incremental: {val_loss_before:.6f}")


                                                               

In [None]:
"""
Esta celda:
- Entrena SOLO con incr_loader (nuevas imágenes).
- Guarda checkpoint por época (siempre) + best (por train_loss_newdata).
- Loggea en MLflow run incremental.
- Escribe retrain_progress.log en JSONL para que el frontend muestre progreso por epoch.
"""

import json
import time
from datetime import datetime

# =========================
# LOG EVENT (JSONL)
# =========================
RETRAIN_LOG = (LOGS_DIR / "retrain_progress.log").resolve()

def log_event(event: dict) -> None:
    """Append *event* to RETRAIN_LOG as one JSON object per line (JSONL).

    A ``"ts"`` field holding the current local time is added to the written
    record. The caller's dictionary is left untouched: the record is a shallow
    copy, so reusing the same dict for several events cannot leak a stale
    timestamp. Per the original notes, the frontend parses these records for
    its progress bar, epoch, loss and ETA display.

    Args:
        event: JSON-serialisable mapping describing the event.
    """
    record = {**event, "ts": time.strftime("%Y-%m-%d %H:%M:%S")}
    with open(RETRAIN_LOG, "a", encoding="utf-8", errors="ignore") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")


# =========================
# OPTIMIZER
# =========================
# Only parameters left with requires_grad=True are handed to the optimizer, so
# frozen parameters are never updated.
trainable_params = [p for p in model.parameters() if p.requires_grad]

optimizer = torch.optim.AdamW(
    trainable_params,
    lr=INCR_CONFIG["learning_rate"],
    weight_decay=INCR_CONFIG["weight_decay"]
)

def train_one_epoch_incremental(model, data_loader, optimizer, epoch: int) -> float:
    """Run one optimisation pass over *data_loader* and return the mean loss.

    Args:
        model: detection model that returns a dict of losses when called with
            images and targets in train mode.
        data_loader: yields ``(images, targets)`` batches.
        optimizer: optimizer stepping the trainable parameters.
        epoch: epoch number, used only in the progress-bar label.

    Returns:
        Mean summed loss per batch (0.0 if the loader is empty).
    """
    model.train()
    loss_sum = 0.0
    steps = 0

    progress = tqdm(data_loader, desc=f"incr train e{epoch}", leave=False)
    for batch_images, batch_targets in progress:
        batch_images = [image.to(DEVICE) for image in batch_images]
        batch_targets = [
            {key: value.to(DEVICE) for key, value in target.items()}
            for target in batch_targets
        ]

        # Sum every component of the loss dict into one scalar.
        batch_loss = sum(model(batch_images, batch_targets).values())

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        loss_sum += float(batch_loss.item())
        steps += 1

    return loss_sum / max(1, steps)


# =========================
# RUN SETUP
# =========================
# Timestamped run name, reused in checkpoint and archive names.
incr_run_name = f"incr_train_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
best_train_loss = float("inf")
best_incr_ckpt_path = None
last_epoch_ckpt_path = None

# Save the list of new files used in this run
used_list_path = MODELS_DIR / f"new_files_{incr_run_name}.json"
with open(used_list_path, "w", encoding="utf-8") as f:
    json.dump([{"image": p[0].name, "label": p[1].name} for p in pairs], f, indent=2, ensure_ascii=False)


# =========================
# MLFLOW RUN
# =========================
with mlflow.start_run(run_name=incr_run_name) as run:
    incr_run_id = run.info.run_id

    # Run-level tags describing where this incremental run comes from.
    mlflow.set_tag("stage", "incremental")
    mlflow.set_tag("status", "TRAINED")
    mlflow.set_tag("parent_checkpoint", BASE_CKPT_PATH.name)
    mlflow.set_tag("parent_source", BASE_SOURCE)
    mlflow.set_tag("classes", ",".join(TARGET_CLASSES))
    mlflow.set_tag("registered_model_name", REGISTERED_MODEL_NAME)

    mlflow.log_params(INCR_CONFIG)
    mlflow.log_param("new_images", len(incr_ds))
    mlflow.log_metric("val_loss_before", val_loss_before)

    mlflow.log_artifact(str(used_list_path), artifact_path="artifacts")
    mlflow.log_artifact(str(PROJECT_CONFIG_PATH), artifact_path="artifacts")
    mlflow.log_artifact(str(LABELMAP_PATH), artifact_path="artifacts")

    # =========================
    # START EVENT
    # =========================
    log_event({
        "type": "start",
        "epochs_total": INCR_CONFIG["epochs"],
        "run_name": incr_run_name,
        "run_id": incr_run_id,
        "new_images": int(len(incr_ds)),
        "val_loss_before": float(val_loss_before) if val_loss_before is not None else None,
    })

    # =========================
    # TRAIN LOOP
    # =========================
    for epoch in range(1, INCR_CONFIG["epochs"] + 1):
        t0 = time.time()

        train_loss = train_one_epoch_incremental(model, incr_loader, optimizer, epoch)
        epoch_time = time.time() - t0

        mlflow.log_metric("train_loss_newdata", train_loss, step=epoch)
        mlflow.log_metric("epoch_time_sec", epoch_time, step=epoch)

        # A checkpoint is written after every epoch (always).
        epoch_ckpt_path = MODELS_DIR / f"epoch_{epoch}_incr_{incr_run_name}.pt"
        last_epoch_ckpt_path = epoch_ckpt_path

        ckpt_out = {
            "epoch": epoch,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "parent_checkpoint": str(BASE_CKPT_PATH),
            "parent_source": BASE_SOURCE,
            "incr_config": INCR_CONFIG,
            "target_classes": TARGET_CLASSES,
        }

        torch.save(ckpt_out, epoch_ckpt_path)
        mlflow.log_artifact(str(epoch_ckpt_path), artifact_path="checkpoints")

        print(f"Epoch {epoch}/{INCR_CONFIG['epochs']} | train_loss_newdata={train_loss:.4f} | time={epoch_time:.1f}s")

        # "Best" is tracked by training loss on the new data.
        if train_loss < best_train_loss:
            best_train_loss = train_loss
            best_incr_ckpt_path = MODELS_DIR / f"best_incr_{incr_run_name}.pt"
            torch.save({**ckpt_out, "best_train_loss_newdata": best_train_loss}, best_incr_ckpt_path)
            mlflow.log_artifact(str(best_incr_ckpt_path), artifact_path="checkpoints")

        # =========================
        # EPOCH EVENT (emitted inside the loop)
        # =========================
        log_event({
            "type": "epoch",
            "epoch": int(epoch),
            "epochs_total": int(INCR_CONFIG["epochs"]),
            "train_loss": float(train_loss),
            "val_loss": None,  # no per-epoch evaluation in this notebook
            "epoch_time_sec": float(epoch_time),
        })

    # =========================
    # POST-TRAIN METRICS
    # =========================
    mlflow.log_metric("best_train_loss_newdata", best_train_loss)

    # Evaluate AFTER training on the same fixed validation loader.
    # NOTE(review): this measures the final-epoch weights, while the artifact
    # registered below is the best-by-train-loss checkpoint — confirm they are
    # meant to coincide.
    val_loss_after = evaluate_loss_torchvision(model, eval_loader)
    mlflow.log_metric("val_loss_after", val_loss_after)

    # Improvement gate: the new model must beat the baseline validation loss
    # by at least improvement_delta.
    improvement_delta = float(INCR_CONFIG["improvement_delta"])
    improved = (val_loss_after + improvement_delta) < val_loss_before
    mlflow.set_tag("improved", str(improved))

    # =========================
    # MODEL REGISTRY
    # =========================
    client = MlflowClient()

    MODEL_NAME = REGISTERED_MODEL_NAME

    run_id = mlflow.active_run().info.run_id

    # Make sure best_incr_ckpt_path exists before building the model source.
    if best_incr_ckpt_path is None:
        # Fallback: if the loss never improved, use the last epoch checkpoint.
        best_incr_ckpt_path = last_epoch_ckpt_path

    model_source = f"runs:/{run_id}/checkpoints/{best_incr_ckpt_path.name}"

    registered = False
    promoted_version = None

    # Register and promote only when the validation loss improved; previously
    # this branch was unconditional, so regressions were promoted as well.
    # NOTE(review): a non-promoted best_incr_*.pt still matches best_*.pt in
    # the checkpoint folder — check how the next cycle picks its base.
    if improved:
        mv = mlflow.register_model(model_source, MODEL_NAME)
        registered = True
        promoted_version = mv.version
        print("Registered model:", MODEL_NAME, "version:", mv.version)

        # Promote to Production, archiving the versions currently there.
        try:
            client.transition_model_version_stage(
                name=MODEL_NAME,
                version=mv.version,
                stage="Production",
                archive_existing_versions=True
            )
            print("Promoted to Production:", mv.version)
        except Exception as e:
            print("Stage transition skipped:", str(e))
    else:
        print("No se registró nueva versión porque no mejoró.")

    # =========================
    # DONE EVENT
    # =========================
    log_event({
        "type": "done",
        "status": "OK",
        "best_train_loss": float(best_train_loss),
        "val_loss_after": float(val_loss_after),
        "registered": bool(registered),
        "production_version": int(promoted_version) if promoted_version is not None else None,
    })

print("Incremental terminado")
print("best_incr_ckpt_path:", best_incr_ckpt_path)
print("last_epoch_ckpt_path:", last_epoch_ckpt_path)


                                                            

Epoch 1/2 | train_loss_newdata=0.2347 | time=13.3s


                                                            

Epoch 2/2 | train_loss_newdata=0.1741 | time=9.9s
Incremental terminado
best_incr_ckpt_path: c:\Users\Johnny\Desktop\IA-final\models\local_checkpoints\best_incr_incr_train_20260202_125258.pt
last_epoch_ckpt_path: c:\Users\Johnny\Desktop\IA-final\models\local_checkpoints\epoch_2_incr_incr_train_20260202_125258.pt


In [None]:
"""
Esta celda:
- Calcula val_loss_after sobre el mismo eval_loader.
- Decide si mejoró comparando contra val_loss_before.
"""

# Re-evaluate on the same fixed loader used for val_loss_before.
val_loss_after = evaluate_loss_torchvision(model, eval_loader)
print("val_loss_check_after_incremental:", val_loss_after)

# Improved only if the loss dropped by more than improvement_delta.
improvement_delta = float(INCR_CONFIG["improvement_delta"])
improved = (val_loss_after + improvement_delta) < val_loss_before

print("Improved:", improved)
print("Delta used:", improvement_delta)
print("Before:", val_loss_before, "After:", val_loss_after)


                                                               

val_loss_check_after_incremental: 0.5725924365222455
Improved: False
Delta used: 0.0
Before: 0.5416763566434384 After: 0.5725924365222455




In [None]:
"""
Esta celda:
- Actualiza manifest.json para no reentrenar dos veces con las mismas imágenes.
- Mueve imágenes y labels a new_data/used/<run_name>/ para archivo.
"""

# Per-run archive: new_data/used/<run_name>/{images,labels}
archive_dir = (NEW_USED_DIR / incr_run_name).resolve()
archive_img, archive_lbl = ((archive_dir / sub).resolve() for sub in ("images", "labels"))
for folder in (archive_img, archive_lbl):
    folder.mkdir(parents=True, exist_ok=True)

# Mark each consumed image as used and move it, then its label, to the archive.
for image_file, label_file in pairs:
    used.add(image_file.name)
    for source, destination_dir in ((image_file, archive_img), (label_file, archive_lbl)):
        shutil.move(str(source), str(destination_dir / source.name))

# Persist the updated list so these files are not picked up again.
manifest["used_files"] = sorted(used)
save_manifest(MANIFEST_PATH, manifest)

print("Archivos marcados como usados y archivados en:", archive_dir)
print("Manifest actualizado:", MANIFEST_PATH)


Archivos marcados como usados y archivados en: C:\Users\Johnny\Desktop\IA-final\data\new_data\used\incr_train_20260202_125258
Manifest actualizado: C:\Users\Johnny\Desktop\IA-final\data\new_data\manifest.json
