# Fifth Notebook

In this final notebook, incremental retraining is performed using only newly labeled images, executed quickly on CPU, with per-epoch checkpoint saving and MLflow (SQLite) logging.  
If no new data is available, an MLflow run is created with status SKIPPED and no training is performed.

## Objectives

1. Read newly labeled user images from `data/new_data/` (YOLO format).
2. Train ONLY on those new images (fast CPU training).
3. Objectively evaluate improvement using a fixed validation subset (val_loss_before vs val_loss_after).
4. Log in MLflow:
   - incremental run
   - new model version in the Model Registry
   - improvement tags
5. If improvement is observed, automatically promote the model to Production (archiving previous versions).
6. If no new data is found, skip training and log a SKIPPED run.

In [None]:
"""
- Imports the necessary libraries for incremental retraining, evaluation, and MLflow Registry.
"""

import os
import json
import time
import shutil
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Tuple

import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm

import mlflow
from mlflow.tracking import MlflowClient
import torch
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from mlflow.tracking import MlflowClient
import time

import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.transforms import functional as F


1. Carga de carpetas previas para el formato establecido

In [None]:
"""
- Locates PROJECT_ROOT based on data/processed/project_config.json.
- Loads project_config.json and labelmap.json.
- Defines the project’s standard paths.
"""

def find_project_root(start: Path, max_up: int = 8) -> Path:
    """Walk upwards from *start* until the project marker file is found.

    The project root is the first directory, beginning with the resolved
    ``start`` itself, that contains ``data/processed/project_config.json``.
    At most ``max_up`` directories are inspected.

    Raises:
        FileNotFoundError: none of the inspected directories holds the marker.
    """
    marker = Path("data") / "processed" / "project_config.json"
    candidate = start.resolve()
    remaining = max_up
    while remaining > 0:
        if (candidate / marker).exists():
            return candidate
        candidate = candidate.parent
        remaining -= 1
    raise FileNotFoundError("No se encontró data/processed/project_config.json. Ejecuta Notebook 01.")

# Locate the project root and derive the processed-data paths from it.
PROJECT_ROOT = find_project_root(Path.cwd())
PROCESSED_DIR = PROJECT_ROOT.joinpath("data", "processed").resolve()

PROJECT_CONFIG_PATH = PROCESSED_DIR.joinpath("project_config.json").resolve()
LABELMAP_PATH = PROCESSED_DIR.joinpath("labelmap.json").resolve()

# Both files are read as UTF-8 JSON documents.
project_config = json.loads(PROJECT_CONFIG_PATH.read_text(encoding="utf-8"))
labelmap = json.loads(LABELMAP_PATH.read_text(encoding="utf-8"))

VAL_IMG_DIR = Path(project_config["val_dir"])
TARGET_CLASSES = project_config["target_classes"]

print("PROJECT_ROOT:", PROJECT_ROOT)
print("TARGET_CLASSES:", TARGET_CLASSES)


PROJECT_ROOT: C:\Users\Johnny\Desktop\IA-final
TARGET_CLASSES: ['person', 'car', 'airplane']


2. Nuevas carpetas para el manejo de datos provenientes de la interfaz

In [None]:
"""
- Defines the structure for new data:
  - data/new_data/images/*.jpg|png
  - data/new_data/labels/*.txt (YOLO format), same filename as the image
  - data/new_data/used/ (records of already used files)
  - data/new_data/manifest.json (to avoid duplicate processing)
"""

# Folder layout for data coming from the labeling interface.
NEW_DATA_DIR = PROJECT_ROOT.joinpath("data", "new_data").resolve()
NEW_IMG_DIR = NEW_DATA_DIR.joinpath("images").resolve()
NEW_LBL_DIR = NEW_DATA_DIR.joinpath("labels").resolve()
NEW_USED_DIR = NEW_DATA_DIR.joinpath("used").resolve()
MANIFEST_PATH = NEW_DATA_DIR.joinpath("manifest.json").resolve()

# Create every folder up front so later cells can glob/move without checks.
for folder in (NEW_DATA_DIR, NEW_IMG_DIR, NEW_LBL_DIR, NEW_USED_DIR):
    folder.mkdir(parents=True, exist_ok=True)

print("NEW_IMG_DIR:", NEW_IMG_DIR)
print("NEW_LBL_DIR:", NEW_LBL_DIR)
print("NEW_USED_DIR:", NEW_USED_DIR)
print("MANIFEST_PATH:", MANIFEST_PATH)

print("\nFormato YOLO esperado por label:")
print("class_id x_center y_center width height  (todo normalizado 0..1)")
print("class_id: 0=person, 1=car, 2=airplane")


NEW_IMG_DIR: C:\Users\Johnny\Desktop\IA-final\data\new_data\images
NEW_LBL_DIR: C:\Users\Johnny\Desktop\IA-final\data\new_data\labels
NEW_USED_DIR: C:\Users\Johnny\Desktop\IA-final\data\new_data\used
MANIFEST_PATH: C:\Users\Johnny\Desktop\IA-final\data\new_data\manifest.json

Formato YOLO esperado por label:
class_id x_center y_center width height  (todo normalizado 0..1)
class_id: 0=person, 1=car, 2=airplane


In [None]:
"""
GLOBAL CONFIG cell (MANDATORY):
- Defines PROJECT_ROOT, new dataset paths, and model paths.
- Prevents NameError when executing the notebook with nbconvert.
"""

from pathlib import Path
import os

# Locate the root through data/processed/project_config.json instead of
# assuming the notebook runs from <root>/notebooks: a cwd-based guess would
# silently create models/ and logs/ in the wrong place when the working
# directory differs (e.g. under nbconvert).
PROJECT_ROOT = find_project_root(Path.cwd())

DATA_DIR = PROJECT_ROOT / "data"
NEW_DATA_DIR = DATA_DIR / "new_data"
NEW_IMG_DIR = NEW_DATA_DIR / "images"
NEW_LBL_DIR = NEW_DATA_DIR / "labels"

# Checkpoints written by this notebook, and the base checkpoint it loads, live here.
MODELS_DIR = PROJECT_ROOT / "models" / "local_checkpoints"
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Progress logs are written under this folder.
LOGS_DIR = PROJECT_ROOT / "logs"
LOGS_DIR.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("NEW_IMG_DIR :", NEW_IMG_DIR)
print("NEW_LBL_DIR :", NEW_LBL_DIR)
print("MODELS_DIR  :", MODELS_DIR)


PROJECT_ROOT: c:\Users\Johnny\Desktop\IA-final
NEW_IMG_DIR : c:\Users\Johnny\Desktop\IA-final\data\new_data\images
NEW_LBL_DIR : c:\Users\Johnny\Desktop\IA-final\data\new_data\labels
MODELS_DIR  : c:\Users\Johnny\Desktop\IA-final\models\local_checkpoints


In [None]:
import json, time
"""
Writes events in JSONL format (one event per line).
The frontend parses them in real time.
"""

RETRAIN_LOG = LOGS_DIR.joinpath("retrain_progress.log").resolve()


def log_event(event: dict):
    """Append *event* to RETRAIN_LOG as one JSON document per line.

    A ``"ts"`` key holding the current local time is stored in ``event``
    itself (the caller's dict is mutated) before it is serialized.
    """
    event["ts"] = time.strftime("%Y-%m-%d %H:%M:%S")
    with open(RETRAIN_LOG, "a", encoding="utf-8", errors="ignore") as sink:
        sink.write(f"{json.dumps(event, ensure_ascii=False)}\n")


In [None]:
from pathlib import Path
import time
"""
Simple logging setup for incremental retraining.

- Defines PROJECT_ROOT and creates a logs directory if it does not exist.
- LOG_FILE stores progress messages in append mode.
- The log() function writes timestamped messages to help track execution.
"""

# Resolve the root from the project marker file rather than from the current
# working directory, so the log always lands in <root>/logs.
PROJECT_ROOT = find_project_root(Path.cwd())
LOG_FILE = PROJECT_ROOT / "logs" / "retrain_progress.log"
# parents=True: do not fail if an intermediate folder is missing.
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

def log(msg):
    """Append ``[timestamp] msg`` as one plain-text line to LOG_FILE.

    NOTE(review): LOG_FILE and the JSONL writer ``log_event`` appear to target
    the same retrain_progress.log file, so a JSONL consumer has to tolerate
    these plain-text lines — confirm against the frontend parser.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    with open(LOG_FILE, "a", encoding="utf-8") as sink:
        print(f"[{stamp}] {msg}", file=sink)


In [None]:
"""
- Reads manifest.json if it exists.
- Detects new (image, label) pairs that have not been used before.
- Ignores files if the corresponding .txt label is missing or empty.
"""

def load_manifest(path: Path) -> dict:
    """Return the manifest stored at *path*.

    When the file does not exist, a fresh manifest with an empty
    ``"used_files"`` list is returned instead.
    """
    if not path.exists():
        return {"used_files": []}
    return json.loads(path.read_text(encoding="utf-8"))

def save_manifest(path: Path, manifest: dict) -> None:
    """Overwrite *path* with *manifest* as indented, non-ASCII-preserving JSON."""
    with open(path, "w", encoding="utf-8") as sink:
        sink.write(json.dumps(manifest, indent=2, ensure_ascii=False))

manifest = load_manifest(MANIFEST_PATH)
used = set(manifest.get("used_files", []))

# Collect candidate images, one glob per supported extension.
img_files = [
    found
    for pattern in ("*.jpg", "*.jpeg", "*.png")
    for found in NEW_IMG_DIR.glob(pattern)
]

# A pair is valid when a non-empty label file shares the image's stem.
all_pairs: List[Tuple[Path, Path]] = []
for image_file in sorted(img_files):
    label_file = NEW_LBL_DIR / f"{image_file.stem}.txt"
    if label_file.exists() and label_file.stat().st_size > 0:
        all_pairs.append((image_file, label_file))

# "New" means the image file name is not recorded in the manifest yet.
new_pairs: List[Tuple[Path, Path]] = [
    pair for pair in all_pairs if pair[0].name not in used
]

print("Pares válidos totales en new_data:", len(all_pairs))
print("Nuevos pares detectados:", len(new_pairs))
if new_pairs:
    first_img, first_lbl = new_pairs[0]
    print("Ejemplo nuevo:", first_img.name, first_lbl.name)




Pares válidos totales en new_data: 7
Nuevos pares detectados: 7
Ejemplo nuevo: imagen_20260204_121828_imagen_2026-02-04_120732568.png imagen_20260204_121828_imagen_2026-02-04_120732568.txt


In [None]:
"""
This cell:
- Configures the incremental training process.
- By default, freezes the backbone for faster and more stable retraining.
"""

# All computation in this notebook is pinned to the CPU.
DEVICE = torch.device("cpu")

# Hyper-parameters and thresholds for the incremental run; the whole dict is
# logged to MLflow as run parameters.
INCR_CONFIG = dict(
    batch_size=2,
    num_workers=0,
    epochs=2,
    learning_rate=5e-5,
    weight_decay=1e-4,
    train_backbone=False,
    max_new_images=300,
    eval_max_images=20,
    iou_eval_threshold=0.5,
    score_threshold=0.5,
    improvement_delta=0.0,
)

print("INCR_CONFIG:")
for name, value in INCR_CONFIG.items():
    print(f"{name}: {value}")


INCR_CONFIG:
batch_size: 2
num_workers: 0
epochs: 2
learning_rate: 5e-05
weight_decay: 0.0001
train_backbone: False
max_new_images: 300
eval_max_images: 20
iou_eval_threshold: 0.5
score_threshold: 0.5
improvement_delta: 0.0


In [None]:
"""
This cell:
- Configures MLflow with SQLite.
- If NO new images are found:
  - Logs a run with status=SKIPPED
  - BUT does not stop the training process
- Training will continue using pairs_train (entire new_data set).
"""

# MLflow tracking and registry are backed by a local SQLite file in the project root.
MLFLOW_DB = (PROJECT_ROOT / "mlflow_new.db").resolve()
mlflow.set_tracking_uri(f"sqlite:///{MLFLOW_DB.as_posix()}")

EXPERIMENT_NAME = "object_detection_coco_cpu"
mlflow.set_experiment(EXPERIMENT_NAME)

client = MlflowClient()

run_name = f"incr_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

# ============================================================
# CASE: NO NEW IMAGES
# ============================================================
if not new_pairs:
    with mlflow.start_run(run_name=run_name):
        mlflow.set_tag("stage", "incremental")
        mlflow.set_tag("status", "SKIPPED")
        mlflow.log_param("new_images", 0)
        # `pairs_train` is only assigned in a later cell (as `all_pairs`), so
        # referencing it here raised NameError; use `all_pairs` directly.
        mlflow.log_param("train_images_total", len(all_pairs))
        mlflow.log_params(INCR_CONFIG)
        mlflow.log_artifact(str(PROJECT_CONFIG_PATH), artifact_path="artifacts")
        mlflow.log_artifact(str(LABELMAP_PATH), artifact_path="artifacts")

    print(
        "No hay nuevas imágenes, "
        "pero se continuará el entrenamiento con new_data existente."
    )




In [None]:
"""
This cell:
- Converts YOLO labels (normalized) into pixel-space xyxy bounding boxes.
- Builds the Dataset for incremental training.
- Maps YOLO class_id 0..K-1 to internal labels 1..K (0 is reserved for background).
"""

# Number of foreground classes; YOLO class ids outside 0..K-1 are skipped when labels are parsed.
K = len(TARGET_CLASSES)

def yolo_to_xyxy(line: str, w: int, h: int) -> Tuple[int, float, float, float, float]:
    """Convert one YOLO label line into a pixel-space corner box.

    Args:
        line: ``"class_id x_center y_center width height"`` with the four
            geometry values normalized by the image size.
        w: image width in pixels.
        h: image height in pixels.

    Returns:
        ``(class_id, x1, y1, x2, y2)`` where x values are clamped to
        ``[0, w - 1]`` and y values to ``[0, h - 1]``.

    Raises:
        ValueError: the line does not contain exactly five whitespace-separated
            fields, or a field cannot be parsed as a number.
    """
    fields = line.strip().split()
    if len(fields) != 5:
        raise ValueError("Formato YOLO inválido: se esperaban 5 valores.")

    cls = int(fields[0])
    xc, yc, bw, bh = (float(value) for value in fields[1:])

    def _clamp(value: float, upper: float) -> float:
        return max(0.0, min(value, upper))

    max_x = w - 1.0
    max_y = h - 1.0
    left = _clamp((xc - bw / 2.0) * w, max_x)
    top = _clamp((yc - bh / 2.0) * h, max_y)
    right = _clamp((xc + bw / 2.0) * w, max_x)
    bottom = _clamp((yc + bh / 2.0) * h, max_y)
    return cls, left, top, right, bottom

class IncrementalYoloDetectionDataset(Dataset):
    """Detection dataset over ``(image_path, yolo_label_path)`` pairs.

    Each item is ``(image_tensor, target)`` where ``target`` carries the keys
    ``boxes`` (pixel xyxy), ``labels`` (YOLO class id + 1, because 0 is
    reserved for background), ``image_id``, ``area`` and ``iscrowd``.
    """

    def __init__(self, pairs: List[Tuple[Path, Path]]):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    @staticmethod
    def _make_target(boxes, labels, areas, idx: int) -> dict:
        """Pack per-image annotation lists into the target tensor dict."""
        return {
            # Explicit (N, 4) shape: torch.tensor([]) alone would be shape (0,),
            # which torchvision detection models reject for box-less images.
            "boxes": torch.tensor(boxes, dtype=torch.float32).reshape(len(boxes), 4),
            "labels": torch.tensor(labels, dtype=torch.int64),
            "image_id": torch.tensor([idx], dtype=torch.int64),
            "area": torch.tensor(areas, dtype=torch.float32),
            # User-provided labels are never crowd regions.
            "iscrowd": torch.zeros(len(labels), dtype=torch.int64),
        }

    def __getitem__(self, idx: int):
        img_path, lbl_path = self.pairs[idx]

        # Close the file handle as soon as the pixels are decoded.
        with Image.open(img_path) as raw:
            img = raw.convert("RGB")
        w, h = img.size
        img_t = F.to_tensor(img)

        boxes, labels, areas = [], [], []

        with open(lbl_path, "r", encoding="utf-8") as f:
            lines = [ln.strip() for ln in f if ln.strip()]

        for ln in lines:
            cls, x1, y1, x2, y2 = yolo_to_xyxy(ln, w, h)
            if cls < 0 or cls >= K:
                continue
            # Clamping can collapse a box to zero width/height; torchvision
            # detection models require x2 > x1 and y2 > y1 in training mode.
            if x2 <= x1 or y2 <= y1:
                continue
            boxes.append([x1, y1, x2, y2])
            labels.append(cls + 1)  # internal labels are 1..K
            areas.append((x2 - x1) * (y2 - y1))

        return img_t, self._make_target(boxes, labels, areas, idx)

def collate_fn(batch):
    """Turn a list of ``(image, target)`` samples into ``(images, targets)`` lists.

    Detection samples have different sizes, so they are kept as Python lists
    instead of being stacked into one tensor.
    """
    image_column, target_column = tuple(zip(*batch))
    return [*image_column], [*target_column]


In [46]:
# Reporting only: unseen pairs, capped at max_new_images.
pairs_new = new_pairs[: INCR_CONFIG["max_new_images"]]

# Training set: every valid pair currently present in new_data.
pairs_train = all_pairs

incr_ds = IncrementalYoloDetectionDataset(pairs_train)
_incr_loader_kwargs = {
    "batch_size": INCR_CONFIG["batch_size"],
    "shuffle": True,
    "num_workers": INCR_CONFIG["num_workers"],
    "collate_fn": collate_fn,
}
incr_loader = DataLoader(incr_ds, **_incr_loader_kwargs)

print("Nuevas imágenes detectadas (para reporte):", len(pairs_new))
print("Total imágenes usadas para ENTRENAR (new_data completo):", len(pairs_train))
print("Iteraciones por época:", len(incr_loader))


Nuevas imágenes detectadas (para reporte): 7
Total imágenes usadas para ENTRENAR (new_data completo): 7
Iteraciones por época: 4


In [None]:
"""
This cell:
- Loads the model from the MLflow Model Registry:
  - If a Production version exists, it uses that one.
  - Otherwise, it falls back to the most recent best_*.pt in local_checkpoints.
- Rebuilds the model architecture and loads the corresponding state_dict.
"""

# Model Registry name used both to look up the Production version and to register new versions.
REGISTERED_MODEL_NAME = "frcnn_coco_cpu_person_car_airplane"

def build_model(num_classes: int):
    """Build a Faster R-CNN (ResNet-50 FPN) whose box head predicts *num_classes*.

    The detector is created with ``weights=None`` and its box predictor is
    replaced by a fresh ``FastRCNNPredictor`` with the same input width.
    ``num_classes`` is passed through unchanged, so it must already include
    the background class.
    """
    detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=None)
    head_width = detector.roi_heads.box_predictor.cls_score.in_features
    detector.roi_heads.box_predictor = FastRCNNPredictor(head_width, num_classes)
    return detector

def find_latest_best_checkpoint(models_dir: Path) -> Path:
    """Return the most recently modified ``best_*.pt`` file in *models_dir*.

    Raises:
        FileNotFoundError: the directory contains no ``best_*.pt`` file.
    """
    newest = max(
        models_dir.glob("best_*.pt"),
        key=lambda ckpt: ckpt.stat().st_mtime,
        default=None,
    )
    if newest is None:
        raise FileNotFoundError("No se encontró best_*.pt en models/local_checkpoints.")
    return newest

def try_get_production_model_version(client: "MlflowClient", name: str):
    """Return the highest-numbered version of *name* in stage "Production".

    This is a best-effort lookup: when the registry query (or reading a
    version's fields) raises, the error is printed and ``None`` is returned so
    the caller can fall back to another checkpoint source. ``None`` is also
    returned when no version is in the "Production" stage.

    Args:
        client: object exposing ``search_model_versions(filter_string)``.
        name: registered model name.
    """
    try:
        versions = client.search_model_versions(f"name='{name}'")
        in_production = [
            mv for mv in versions
            if getattr(mv, "current_stage", "") == "Production"
        ]
        if not in_production:
            return None
        # Compare numerically: version "10" is newer than version "2".
        return max(in_production, key=lambda mv: int(mv.version))
    except Exception as exc:
        # Keep the best-effort contract, but do not hide why the lookup failed.
        print(f"No se pudo consultar el Model Registry: {type(exc).__name__}: {exc}")
        return None

prod_mv = try_get_production_model_version(client, REGISTERED_MODEL_NAME)

BASE_CKPT_PATH = None
BASE_SOURCE = None

if prod_mv is not None:
    # Previously this branch loaded the newest local best_*.pt while tagging
    # the run as if the Production version had been used. Resolve the actual
    # Production checkpoint by the file name at the end of its source URI.
    # NOTE(review): assumes the registered artifact was also saved under the
    # same file name in MODELS_DIR — confirm for versions registered by other
    # notebooks.
    prod_source_uri = str(getattr(prod_mv, "source", "") or "")
    prod_ckpt_name = prod_source_uri.replace("\\", "/").rstrip("/").rsplit("/", 1)[-1]
    prod_local_ckpt = (MODELS_DIR / prod_ckpt_name) if prod_ckpt_name else None

    if prod_local_ckpt is not None and prod_local_ckpt.is_file():
        BASE_SOURCE = f"registry:/{REGISTERED_MODEL_NAME}/{prod_mv.version}"
        BASE_CKPT_PATH = prod_local_ckpt
    else:
        # Be honest in the run tags: the Production file is not available locally.
        print(
            f"Checkpoint de Production (v{prod_mv.version}) no encontrado en "
            f"{MODELS_DIR}; se usa el best_*.pt local más reciente."
        )
        BASE_SOURCE = "local:best_latest"
        BASE_CKPT_PATH = find_latest_best_checkpoint(MODELS_DIR)
else:
    BASE_SOURCE = "local:best_latest"
    BASE_CKPT_PATH = find_latest_best_checkpoint(MODELS_DIR)

base_ckpt = torch.load(BASE_CKPT_PATH, map_location="cpu")

# +1 because label 0 is reserved for background.
base_target_classes = base_ckpt["target_classes"]
NUM_CLASSES = len(base_target_classes) + 1

model = build_model(NUM_CLASSES)
model.load_state_dict(base_ckpt["model_state_dict"])
model.to(DEVICE)

# Optionally freeze the backbone so only the remaining layers are updated.
if not INCR_CONFIG["train_backbone"]:
    for p in model.backbone.parameters():
        p.requires_grad = False

# The box predictor is always trainable.
for p in model.roi_heads.box_predictor.parameters():
    p.requires_grad = True

model.train()

print("Base source:", BASE_SOURCE)
print("BASE_CKPT_PATH:", BASE_CKPT_PATH.name)
print("train_backbone:", INCR_CONFIG["train_backbone"])


Base source: registry:/frcnn_coco_cpu_person_car_airplane/13
BASE_CKPT_PATH: best_incr_incr_train_20260204_121034.pt
train_backbone: False


In [None]:

"""
This cell:
- Creates a small, fixed validation DataLoader (CPU-safe).
- Computes val_loss_before using the base model (before incremental training).
- Important: in torchvision detection, the loss is obtained with model.train() and no gradients.
"""

VAL_JSON = PROCESSED_DIR.joinpath("coco_person_car_airplane_val.json").resolve()
log(f"VAL_JSON: {VAL_JSON}")

if not VAL_JSON.exists():
    raise FileNotFoundError("No existe VAL_JSON reducido. Ejecuta Notebook 02.")

val_coco = json.loads(VAL_JSON.read_text(encoding="utf-8"))

log(f"Imágenes en JSON de validación: {len(val_coco.get('images', []))}")
log(f"Anotaciones en JSON de validación: {len(val_coco.get('annotations', []))}")

# Map category ids (looked up by class name in the labelmap) to internal
# labels 1..K, following the order of TARGET_CLASSES.
name_to_id = labelmap["name_to_id"]
target_cat_ids_local = [int(name_to_id[class_name]) for class_name in TARGET_CLASSES]
coco_to_internal_val = {
    cat_id: internal
    for internal, cat_id in enumerate(target_cat_ids_local, start=1)
}

log(f"Clases objetivo (local ids): {coco_to_internal_val}")

class CocoValDataset(Dataset):
    """Fixed validation subset read from a COCO-style annotation dict.

    Only the first ``max_images`` entries of ``coco_json["images"]`` are used.
    Annotations whose ``category_id`` is not a key of the module-level
    ``coco_to_internal_val`` mapping are ignored.
    """

    def __init__(self, images_dir: Path, coco_json: dict, max_images: int):
        self.images_dir = images_dir
        self.coco = coco_json
        self.images = self.coco["images"][:max_images]
        self.annotations = self.coco["annotations"]

        # image_id -> list of annotation dicts
        self.img_id_to_anns = {}
        for ann in self.annotations:
            self.img_id_to_anns.setdefault(ann["image_id"], []).append(ann)

        self.id_to_image = {img["id"]: img for img in self.images}
        self.image_ids = list(self.id_to_image.keys())

    def __len__(self):
        return len(self.image_ids)

    @staticmethod
    def _make_target(boxes, labels, areas, iscrowd, img_id: int) -> dict:
        """Pack per-image annotation lists into the target tensor dict."""
        return {
            # Explicit (N, 4) shape: torch.tensor([]) alone would be shape (0,),
            # which torchvision detection models reject for box-less images.
            "boxes": torch.tensor(boxes, dtype=torch.float32).reshape(len(boxes), 4),
            "labels": torch.tensor(labels, dtype=torch.int64),
            "image_id": torch.tensor([img_id], dtype=torch.int64),
            "area": torch.tensor(areas, dtype=torch.float32),
            "iscrowd": torch.tensor(iscrowd, dtype=torch.int64),
        }

    def __getitem__(self, idx: int):
        img_id = self.image_ids[idx]
        img_meta = self.id_to_image[img_id]
        img_path = self.images_dir / img_meta["file_name"]

        # Close the file handle as soon as the pixels are decoded.
        with Image.open(img_path) as raw:
            img = raw.convert("RGB")
        img_t = F.to_tensor(img)

        boxes, labels, areas, iscrowd = [], [], [], []

        for a in self.img_id_to_anns.get(img_id, []):
            cid = int(a["category_id"])
            if cid not in coco_to_internal_val:
                continue
            x, y, w, h = a["bbox"]  # COCO boxes are [x, y, width, height]
            # Zero-sized boxes would trip torchvision's positive width/height
            # check when the loss is computed in training mode.
            if w <= 0 or h <= 0:
                continue
            boxes.append([x, y, x + w, y + h])
            labels.append(coco_to_internal_val[cid])
            areas.append(a.get("area", w * h))
            iscrowd.append(a.get("iscrowd", 0))

        return img_t, self._make_target(boxes, labels, areas, iscrowd, img_id)

@torch.no_grad()
def evaluate_loss_torchvision(model, data_loader, seed: int = 0) -> float:
    """Return the mean per-batch detection loss of *model* over *data_loader*.

    torchvision detection models only return their loss dict in training
    mode, so the model is switched to ``train()`` for the pass (gradients are
    disabled by the decorator) and put back into eval mode afterwards if that
    is how it arrived, even when a batch raises.

    The training-mode losses subsample anchors/proposals randomly, so the RNG
    is forked and seeded with ``seed`` for the duration of the pass. Repeated
    calls (e.g. before vs. after retraining) then draw from the same random
    stream, and the caller's global RNG state is left untouched.

    Args:
        model: detection model called as ``model(images, targets)``.
        data_loader: yields ``(images, targets)`` batches.
        seed: RNG seed used only inside this evaluation pass.

    Returns:
        Sum of all loss terms averaged over batches; ``0.0`` for an empty loader.
    """
    log("Evaluando loss de validación (modo train sin grad)")
    was_training = model.training
    model.train()

    total = 0.0
    n = 0

    try:
        with torch.random.fork_rng():
            torch.manual_seed(seed)
            for images, targets in tqdm(data_loader, desc="val_loss_check", leave=False):
                images = [img.to(DEVICE) for img in images]
                targets = [{k: v.to(DEVICE) for k, v in t.items()} for t in targets]
                loss_dict = model(images, targets)
                losses = sum(loss for loss in loss_dict.values())
                total += float(losses.item())
                n += 1
    finally:
        # Restore the caller's mode even if a batch failed.
        if not was_training:
            model.eval()

    avg_loss = total / max(1, n)
    log(f"val_loss promedio calculado: {avg_loss:.6f}")
    return avg_loss

log("Creando DataLoader de validación (CPU-safe) ...")
eval_ds = CocoValDataset(VAL_IMG_DIR, val_coco, max_images=INCR_CONFIG["eval_max_images"])
log(f"Imágenes usadas para evaluación: {len(eval_ds)}")

# One image per batch, fixed order, no worker processes.
_eval_loader_kwargs = {
    "batch_size": 1,
    "shuffle": False,
    "num_workers": 0,
    "collate_fn": collate_fn,
}
eval_loader = DataLoader(eval_ds, **_eval_loader_kwargs)

# Baseline loss of the loaded model, measured before any incremental update.
log("Calculando val_loss_before ...")
val_loss_before = evaluate_loss_torchvision(model, eval_loader)

log(f"val_loss_check_before_incremental: {val_loss_before:.6f}")


                                                               

In [None]:
"""
This cell:
- Trains ONLY with incr_loader (new images).
- Saves a checkpoint every epoch (always) plus a best model (based on train_loss_newdata).
- Logs everything inside an incremental MLflow run.
"""
import json
import time
from datetime import datetime

# =========================
# LOG EVENT (JSONL)
# =========================
RETRAIN_LOG = LOGS_DIR.joinpath("retrain_progress.log").resolve()


def log_event(event: dict):
    """Append *event* to RETRAIN_LOG in JSONL form (one event per line).

    The current local time is stored under ``event["ts"]`` before the event is
    written, so the caller's dict is mutated. The training cell uses this for
    its "start", "epoch" and "done" events.
    """
    event["ts"] = time.strftime("%Y-%m-%d %H:%M:%S")
    with open(RETRAIN_LOG, "a", encoding="utf-8", errors="ignore") as sink:
        sink.write(f"{json.dumps(event, ensure_ascii=False)}\n")


# =========================
# OPTIMIZER
# =========================
# Only parameters left unfrozen are handed to the optimizer.
trainable_params = list(filter(lambda param: param.requires_grad, model.parameters()))

_adamw_settings = {
    "lr": INCR_CONFIG["learning_rate"],
    "weight_decay": INCR_CONFIG["weight_decay"],
}
optimizer = torch.optim.AdamW(trainable_params, **_adamw_settings)

def train_one_epoch_incremental(model, data_loader, optimizer, epoch: int) -> float:
    """Run one optimization pass over *data_loader* and return the mean batch loss.

    Every batch is moved to ``DEVICE``, the loss terms returned by the model
    are summed, and one optimizer step is taken. ``epoch`` only labels the
    progress bar. An empty loader yields ``0.0``.
    """
    model.train()
    running_total = 0.0
    batches_seen = 0

    progress = tqdm(data_loader, desc=f"incr train e{epoch}", leave=False)
    for batch_images, batch_targets in progress:
        batch_images = [image.to(DEVICE) for image in batch_images]
        batch_targets = [
            {key: value.to(DEVICE) for key, value in target.items()}
            for target in batch_targets
        ]

        loss_terms = model(batch_images, batch_targets)
        batch_loss = sum(loss_terms.values())

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        running_total += float(batch_loss.item())
        batches_seen += 1

    return running_total / max(1, batches_seen)


# =========================
# RUN SETUP
# =========================
incr_run_name = "incr_train_" + datetime.now().strftime("%Y%m%d_%H%M%S")
best_train_loss = float("inf")
best_incr_ckpt_path = None
last_epoch_ckpt_path = None

# Record which files count as NEW for this run (later logged as a run artifact).
used_list_path = MODELS_DIR / f"new_files_{incr_run_name}.json"
_new_file_records = [
    {"image": image_file.name, "label": label_file.name}
    for image_file, label_file in pairs_new
]
with open(used_list_path, "w", encoding="utf-8") as f:
    json.dump(_new_file_records, f, indent=2, ensure_ascii=False)


# =========================
# MLFLOW RUN
# =========================
with mlflow.start_run(run_name=incr_run_name) as run:
    incr_run_id = run.info.run_id

    mlflow.set_tag("stage", "incremental")
    mlflow.set_tag("status", "TRAINED")
    mlflow.set_tag("parent_checkpoint", BASE_CKPT_PATH.name)
    mlflow.set_tag("parent_source", BASE_SOURCE)
    mlflow.set_tag("classes", ",".join(TARGET_CLASSES))
    mlflow.set_tag("registered_model_name", REGISTERED_MODEL_NAME)

    mlflow.log_params(INCR_CONFIG)
    mlflow.log_param("new_images", len(pairs_new))
    mlflow.log_param("train_images_total", len(pairs_train))

    mlflow.log_metric("val_loss_before", val_loss_before)

    mlflow.log_artifact(str(used_list_path), artifact_path="artifacts")
    mlflow.log_artifact(str(PROJECT_CONFIG_PATH), artifact_path="artifacts")
    mlflow.log_artifact(str(LABELMAP_PATH), artifact_path="artifacts")

    # =========================
    # START EVENT
    # =========================
    log_event({
        "type": "start",
        "epochs_total": INCR_CONFIG["epochs"],
        "run_name": incr_run_name,
        "run_id": incr_run_id,
        "new_images": int(len(pairs_new)),
        "train_images_total": int(len(pairs_train)),
        "val_loss_before": float(val_loss_before) if val_loss_before is not None else None,
    })

    # =========================
    # TRAIN LOOP
    # =========================
    for epoch in range(1, INCR_CONFIG["epochs"] + 1):
        t0 = time.time()

        train_loss = train_one_epoch_incremental(model, incr_loader, optimizer, epoch)
        epoch_time = time.time() - t0

        mlflow.log_metric("train_loss_newdata", train_loss, step=epoch)
        mlflow.log_metric("epoch_time_sec", epoch_time, step=epoch)

        # A checkpoint is written for every epoch, improved or not.
        epoch_ckpt_path = MODELS_DIR / f"epoch_{epoch}_incr_{incr_run_name}.pt"
        last_epoch_ckpt_path = epoch_ckpt_path

        ckpt_out = {
            "epoch": epoch,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "parent_checkpoint": str(BASE_CKPT_PATH),
            "parent_source": BASE_SOURCE,
            "incr_config": INCR_CONFIG,
            "target_classes": TARGET_CLASSES,
        }

        torch.save(ckpt_out, epoch_ckpt_path)
        mlflow.log_artifact(str(epoch_ckpt_path), artifact_path="checkpoints")

        print(f"Epoch {epoch}/{INCR_CONFIG['epochs']} | train_loss_newdata={train_loss:.4f} | time={epoch_time:.1f}s")

        # "Best" is tracked on the training loss over new_data.
        if train_loss < best_train_loss:
            best_train_loss = train_loss
            best_incr_ckpt_path = MODELS_DIR / f"best_incr_{incr_run_name}.pt"
            torch.save({**ckpt_out, "best_train_loss_newdata": best_train_loss}, best_incr_ckpt_path)
            mlflow.log_artifact(str(best_incr_ckpt_path), artifact_path="checkpoints")

        # =========================
        # EPOCH EVENT
        # =========================
        log_event({
            "type": "epoch",
            "epoch": int(epoch),
            "epochs_total": int(INCR_CONFIG["epochs"]),
            "train_loss": float(train_loss),
            "val_loss": None,  # validation is not evaluated per epoch in this notebook
            "epoch_time_sec": float(epoch_time),
        })

    # =========================
    # POST-TRAIN METRICS
    # =========================
    mlflow.log_metric("best_train_loss_newdata", best_train_loss)

    # Same fixed validation subset as val_loss_before.
    val_loss_after = evaluate_loss_torchvision(model, eval_loader)
    mlflow.log_metric("val_loss_after", val_loss_after)

    # Improvement rule: the validation loss must drop by more than improvement_delta.
    improvement_delta = float(INCR_CONFIG["improvement_delta"])
    improved = (val_loss_after + improvement_delta) < val_loss_before
    mlflow.set_tag("improved", str(bool(improved)))
    mlflow.log_metric("val_loss_improvement", val_loss_before - val_loss_after)

    # =========================
    # MODEL REGISTRY
    # =========================
    client = MlflowClient()

    MODEL_NAME = REGISTERED_MODEL_NAME
    run_id = mlflow.active_run().info.run_id

    # Fall back to the last epoch checkpoint when no epoch ever counted as
    # "best" (e.g. the training loss was NaN in every epoch).
    if best_incr_ckpt_path is None:
        best_incr_ckpt_path = last_epoch_ckpt_path

    registered = False
    promoted_version = None

    # Register and promote only when validation improved. This was a hard-coded
    # `if True:`, which promoted every run to Production and left the
    # "no mejoró" branch unreachable.
    if improved and best_incr_ckpt_path is not None:
        model_source = f"runs:/{run_id}/checkpoints/{best_incr_ckpt_path.name}"
        mv = mlflow.register_model(model_source, MODEL_NAME)
        registered = True
        print("Registered model:", MODEL_NAME, "version:", mv.version)

        try:
            client.transition_model_version_stage(
                name=MODEL_NAME,
                version=mv.version,
                stage="Production",
                archive_existing_versions=True
            )
            # Only report a Production version once the transition succeeded.
            promoted_version = mv.version
            print("Promoted to Production:", mv.version)
        except Exception as e:
            print("Stage transition skipped:", str(e))
    else:
        print("No se registró nueva versión porque no mejoró.")

    # =========================
    # DONE EVENT
    # =========================
    log_event({
        "type": "done",
        "status": "OK",
        "best_train_loss": float(best_train_loss),
        "val_loss_after": float(val_loss_after),
        "improved": bool(improved),
        "registered": bool(registered),
        "production_version": int(promoted_version) if promoted_version is not None else None,
    })

    # Mark the files reported as new so the next execution does not count them again.
    manifest = load_manifest(MANIFEST_PATH)
    used_files = set(manifest.get("used_files", []))

    for img_path, _ in pairs_new:
        used_files.add(img_path.name)

    manifest["used_files"] = sorted(used_files)
    save_manifest(MANIFEST_PATH, manifest)

    print("Manifest actualizado. Nuevas marcadas como usadas:", len(pairs_new))


print("Incremental terminado")
print("best_incr_ckpt_path:", best_incr_ckpt_path)
print("last_epoch_ckpt_path:", last_epoch_ckpt_path)


                                                            

Epoch 1/2 | train_loss_newdata=0.4238 | time=28.5s


                                                            

Epoch 2/2 | train_loss_newdata=0.3497 | time=24.1s


                                                               

Registered model: frcnn_coco_cpu_person_car_airplane version: 14
Promoted to Production: 14
Manifest actualizado. Nuevas marcadas como usadas: 7
Incremental terminado
best_incr_ckpt_path: c:\Users\Johnny\Desktop\IA-final\models\local_checkpoints\best_incr_incr_train_20260204_130559.pt
last_epoch_ckpt_path: c:\Users\Johnny\Desktop\IA-final\models\local_checkpoints\epoch_2_incr_incr_train_20260204_130559.pt


Registered model 'frcnn_coco_cpu_person_car_airplane' already exists. Creating a new version of this model...
Created version '14' of model 'frcnn_coco_cpu_person_car_airplane'.
  client.transition_model_version_stage(


In [None]:
"""
This cell:
- Computes val_loss_after using the same eval_loader.
- Determines whether the model improved by comparing it against val_loss_before.
"""

# Re-measure on the same fixed validation loader used for val_loss_before.
val_loss_after = evaluate_loss_torchvision(model, eval_loader)
print("val_loss_check_after_incremental:", val_loss_after)

# Improvement means the loss dropped by more than the configured margin.
improvement_delta = float(INCR_CONFIG["improvement_delta"])
improved = val_loss_before > (val_loss_after + improvement_delta)

print("Improved:", improved)
print("Delta used:", improvement_delta)
print("Before:", val_loss_before, "After:", val_loss_after)


                                                               

val_loss_check_after_incremental: 0.6454900339245796
Improved: True
Delta used: 0.0
Before: 0.6709062796086073 After: 0.6454900339245796




In [None]:
"""
This cell:
- Updates manifest.json to prevent retraining twice on the same images.
- Moves images and labels to new_data/used/<run_name>/ for archiving.
"""

# Archive destination for this run: new_data/used/<run_name>/{images,labels}
archive_dir = (NEW_USED_DIR / incr_run_name).resolve()
archive_img = (archive_dir / "images").resolve()
archive_lbl = (archive_dir / "labels").resolve()
archive_img.mkdir(parents=True, exist_ok=True)
archive_lbl.mkdir(parents=True, exist_ok=True)

for img_path, lbl_path in new_pairs:
    used.add(img_path.name)
    # Skip files that are already gone so the cell can be re-run: previously a
    # second execution raised on the first moved file and never reached the
    # manifest save below.
    for src, dst_dir in ((img_path, archive_img), (lbl_path, archive_lbl)):
        if src.exists():
            shutil.move(str(src), str(dst_dir / src.name))

manifest["used_files"] = sorted(used)
save_manifest(MANIFEST_PATH, manifest)

print("Archivos marcados como usados y archivados en:", archive_dir)
print("Manifest actualizado:", MANIFEST_PATH)


Archivos marcados como usados y archivados en: C:\Users\Johnny\Desktop\IA-final\data\new_data\used\incr_train_20260204_130559
Manifest actualizado: C:\Users\Johnny\Desktop\IA-final\data\new_data\manifest.json
