# Segundo notebook
A continuacion se prepara un indice reproducble de las imaganes, como solo buscamos car, truck y airplane se enfocara unicamente en esas 

Este notebook:
- Lee las anotaciones COCO (train y opcionalmente val)
- Filtra solo las clases objetivo
- Selecciona un subconjunto limitado de imágenes por clase (para entrenar rápido)
- Genera archivos índice (manifests) que usará el Notebook 02


In [2]:
# --- IMPORTS ---
from pathlib import Path
import json
import random
from collections import defaultdict, Counter
from typing import Dict, List, Tuple, Set

import numpy as np


In [3]:
# --- RUTAS  ---
#solo para confirmar se declara las rutas establecidad previamente
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR.parent

COCO_ROOT = Path(r"C:\Users\Johnny\Desktop\IA\data\archive\coco2017")
COCO_ANN_DIR = COCO_ROOT / "annotations"
COCO_TRAIN_IMG_DIR = COCO_ROOT / "train2017"
COCO_VAL_IMG_DIR = COCO_ROOT / "val2017"

COCO_INSTANCES_TRAIN = COCO_ANN_DIR / "instances_train2017.json"
COCO_INSTANCES_VAL = COCO_ANN_DIR / "instances_val2017.json"

ARTIFACTS_DIR = PROJECT_ROOT / "artifacts"
DATASET_INDEX_DIR = ARTIFACTS_DIR / "dataset_index"

CLASS_MAP_PATH = ARTIFACTS_DIR / "class_map.json"
CONFIG_SNAPSHOT_PATH = ARTIFACTS_DIR / "config_snapshot.json"

print("PROJECT_ROOT:", PROJECT_ROOT)
print("COCO_ROOT:", COCO_ROOT)
print("DATASET_INDEX_DIR:", DATASET_INDEX_DIR)


PROJECT_ROOT: c:\Users\Johnny\Desktop\IA
COCO_ROOT: C:\Users\Johnny\Desktop\IA\data\archive\coco2017
DATASET_INDEX_DIR: c:\Users\Johnny\Desktop\IA\artifacts\dataset_index


1. Carga de configuraciones previas

In [4]:
#De igualmanera para evitar problemas se hace una validacion simple de carpetas, para evitar fallos mas que nada
# --- VALIDACIONES BÁSICAS ---
def assert_exists(p: Path, desc: str) -> None:
    if not p.exists():
        raise FileNotFoundError(f"Falta {desc}: {p}")

assert_exists(COCO_ROOT, "carpeta COCO_ROOT")
assert_exists(COCO_ANN_DIR, "carpeta annotations")
assert_exists(COCO_TRAIN_IMG_DIR, "carpeta train2017")
assert_exists(COCO_INSTANCES_TRAIN, "archivo instances_train2017.json")

assert_exists(ARTIFACTS_DIR, "carpeta artifacts (debe existir por el Notebook 00)")
assert_exists(CLASS_MAP_PATH, "class_map.json (debe existir por el Notebook 00)")
assert_exists(CONFIG_SNAPSHOT_PATH, "config_snapshot.json (debe existir por el Notebook 00)")

DATASET_INDEX_DIR.mkdir(parents=True, exist_ok=True)

print("Validaciones OK.")


Validaciones OK.


In [5]:
#Cargamos las limitaciones del dataset previo
# --- CARGA DE CONFIGURACIÓN DEL PROYECTO ---
with open(CONFIG_SNAPSHOT_PATH, "r", encoding="utf-8") as f:
    cfg = json.load(f)

with open(CLASS_MAP_PATH, "r", encoding="utf-8") as f:
    class_map = json.load(f)

target_classes = tuple(cfg["target_classes"])
max_train = int(cfg["max_images_per_class_train"])
max_val = int(cfg["max_images_per_class_val"])
seed = int(cfg["seed"])

random.seed(seed)
np.random.seed(seed)

print("Clases objetivo:", target_classes)
print("Límite train por clase:", max_train)
print("Límite val por clase:", max_val)
print("Seed:", seed)


Clases objetivo: ('car', 'airplane', 'truck')
Límite train por clase: 900
Límite val por clase: 300
Seed: 42


2. Configuracion del Dataset

In [6]:
#define funciones utilitarias para leer y organizar un dataset en formato COCO igualmanera usa el json creado previaene
# --- FUNCIONES PARA PARSEAR COCO ---
def load_coco(path_json: Path) -> Dict:
    with open(path_json, "r", encoding="utf-8") as f:
        return json.load(f)

def build_cat_maps(coco: Dict) -> Tuple[Dict[str, int], Dict[int, str]]:
    categories = coco.get("categories", [])
    name_to_id = {c["name"]: c["id"] for c in categories}
    id_to_name = {c["id"]: c["name"] for c in categories}
    return name_to_id, id_to_name

def build_image_map(coco: Dict) -> Dict[int, Dict]:
    images = coco.get("images", [])
    return {img["id"]: img for img in images}

def build_annotations_by_image(coco: Dict) -> Dict[int, List[Dict]]:
    ann_by_img = defaultdict(list)
    for ann in coco.get("annotations", []):
        ann_by_img[ann["image_id"]].append(ann)
    return ann_by_img


In [7]:

# --- CARGA COCO TRAIN  ---
coco_train = load_coco(COCO_INSTANCES_TRAIN)
train_name_to_id, train_id_to_name = build_cat_maps(coco_train)
train_images = build_image_map(coco_train)
train_ann_by_img = build_annotations_by_image(coco_train)

has_val_file = COCO_INSTANCES_VAL.exists()

if has_val_file:
    coco_val = load_coco(COCO_INSTANCES_VAL)
    val_name_to_id, val_id_to_name = build_cat_maps(coco_val)
    val_images = build_image_map(coco_val)
    val_ann_by_img = build_annotations_by_image(coco_val)
    print("OK: instances_val2017.json encontrado.")
else:
    coco_val = None
    val_name_to_id, val_id_to_name = {}, {}
    val_images, val_ann_by_img = {}, {}
    print("Aviso: no hay instances_val2017.json. Se hará split desde train.")


OK: instances_val2017.json encontrado.


In [8]:
#Validaciones de las clases coco necesarias, json y formatos correctos
# --- VALIDAR QUE LAS CLASES OBJETIVO EXISTEN EN COCO ---
missing = [c for c in target_classes if c not in train_name_to_id]
if missing:
    raise ValueError(f"Clases faltantes en COCO train categories: {missing}")

target_cat_ids_train = {train_name_to_id[c] for c in target_classes}

print("Category IDs (train):")
for c in target_classes:
    print("-", c, "->", train_name_to_id[c])


Category IDs (train):
- car -> 3
- airplane -> 5
- truck -> 8


3. Seleccion de imagenes 

In [9]:
#Este bloque construye, para cada clase objetivo, la lista de imágenes que la contienen ayuda a balancear, filtrar y depurar el dataset
# --- CONSTRUIR LISTAS DE CANDIDATOS POR CLASE (IMÁGENES QUE CONTIENEN LA CLASE) ---
def images_by_class(images_map: Dict[int, Dict],
                    ann_by_img: Dict[int, List[Dict]],
                    target_cat_ids: Set[int]) -> Dict[int, Set[int]]:
    class_to_imgs = defaultdict(set)
    for img_id, anns in ann_by_img.items():
        for a in anns:
            cid = a.get("category_id")
            if cid in target_cat_ids:
                class_to_imgs[cid].add(img_id)
    return class_to_imgs

train_class_to_imgs = images_by_class(train_images, train_ann_by_img, target_cat_ids_train)

print("Candidatos (train) por clase (imágenes únicas):")
for c in target_classes:
    cid = train_name_to_id[c]
    print("-", c, ":", f"{len(train_class_to_imgs[cid]):,}")


Candidatos (train) por clase (imágenes únicas):
- car : 12,251
- airplane : 2,986
- truck : 6,127


In [None]:
#esta seccion ayuda a evitar un desbalance de imagenes, como se vio en el primer notebook
#  existe una categoria con mas ejemplares y para evitar falsos positovos que solo identififique esa categora especifica a la fuerza se debe balancear el numero de imagenes
# --- SELECCIÓN BALANCEADA DE IMÁGENES (GREEDY) ---
def select_balanced_images(class_to_imgs: Dict[int, Set[int]],
                           cat_ids: List[int],
                           per_class_limit: int,
                           seed: int) -> List[int]:
    rng = random.Random(seed)
    remaining = {cid: set(imgs) for cid, imgs in class_to_imgs.items()}
    selected: List[int] = []
    selected_set: Set[int] = set()
    counts = {cid: 0 for cid in cat_ids}

    pool = set().union(*[remaining.get(cid, set()) for cid in cat_ids])
    pool = list(pool)
    rng.shuffle(pool)

    def img_classes(img_id: int) -> List[int]:
        present = []
        for cid in cat_ids:
            if img_id in remaining.get(cid, set()):
                present.append(cid)
        return present

    for img_id in pool:
        present = img_classes(img_id)
        if not present:
            continue

        can_help = any(counts[cid] < per_class_limit for cid in present)
        if not can_help:
            continue

        if img_id not in selected_set:
            selected.append(img_id)
            selected_set.add(img_id)

        for cid in present:
            if counts[cid] < per_class_limit:
                counts[cid] += 1

        if all(counts[cid] >= per_class_limit for cid in cat_ids):
            break

    return selected

cat_ids_order = [train_name_to_id[c] for c in target_classes]

selected_train_img_ids = select_balanced_images(
    train_class_to_imgs,
    cat_ids_order,
    per_class_limit=max_train,
    seed=seed
)

print("Imágenes seleccionadas (train):", len(selected_train_img_ids))
#imagenes seleccionas es del conjunto total de todo el datatset se entrenara solo con una fraccion del mismo

Imágenes seleccionadas (train): 2341


In [None]:
# --- CONTEO REAL POR CLASE EN EL SUBCONJUNTO SELECCIONADO ---
def count_presence_by_class(img_ids: List[int],
                            ann_by_img: Dict[int, List[Dict]],
                            cat_ids: List[int]) -> Dict[int, int]:
    counts = {cid: 0 for cid in cat_ids}
    img_set = set(img_ids)
    for img_id in img_set:
        anns = ann_by_img.get(img_id, [])
        present = set(a["category_id"] for a in anns)
        for cid in cat_ids:
            if cid in present:
                counts[cid] += 1
    return counts

train_presence_counts = count_presence_by_class(
    selected_train_img_ids, train_ann_by_img, cat_ids_order
)

print("Presencia por clase (train subset):")
for c in target_classes:
    cid = train_name_to_id[c]
    print("-", c, ":", train_presence_counts[cid])
    #Para el trainig se seleccionarios 1246 imagenes de autos, 900 de aviones y 994 de carros grandes/tractores


Presencia por clase (train subset):
- car : 1246
- airplane : 900
- truck : 993


4. Construccion de formato listo para YOLO

In [12]:
#Este bloque crea el conjunto de validación de forma segura y reproducible, usando VAL oficial si existe o generándolo desde TRAIN si no.
# --- ARMAR VAL: USAR VAL OFICIAL SI EXISTE, SI NO HACER SPLIT DESDE TRAIN ---
def build_subset_manifest(coco: Dict,
                          images_map: Dict[int, Dict],
                          ann_by_img: Dict[int, List[Dict]],
                          selected_img_ids: List[int],
                          target_cat_ids: Set[int]) -> Dict:
    selected_set = set(selected_img_ids)

    images_out = []
    annotations_out = []
    ann_id_counter = 1

    for img_id in selected_img_ids:
        if img_id not in images_map:
            continue
        images_out.append(images_map[img_id])

        for ann in ann_by_img.get(img_id, []):
            if ann["category_id"] in target_cat_ids:
                ann_copy = dict(ann)
                ann_copy["id"] = ann_id_counter
                ann_id_counter += 1
                annotations_out.append(ann_copy)

    categories_out = [c for c in coco.get("categories", []) if c["id"] in target_cat_ids]

    out = {
        "images": images_out,
        "annotations": annotations_out,
        "categories": categories_out
    }
    return out

if has_val_file:
    val_missing = [c for c in target_classes if c not in val_name_to_id]
    if val_missing:
        print("Aviso: en VAL faltan clases:", val_missing)
    target_cat_ids_val = {val_name_to_id[c] for c in target_classes if c in val_name_to_id}

    val_class_to_imgs = images_by_class(val_images, val_ann_by_img, target_cat_ids_val)
    val_cat_ids_order = [val_name_to_id[c] for c in target_classes if c in val_name_to_id]

    selected_val_img_ids = select_balanced_images(
        val_class_to_imgs,
        val_cat_ids_order,
        per_class_limit=max_val,
        seed=seed
    )

    val_manifest = build_subset_manifest(
        coco_val,
        val_images,
        val_ann_by_img,
        selected_val_img_ids,
        target_cat_ids_val
    )

    print("Imágenes seleccionadas (val oficial):", len(selected_val_img_ids))

else:
    val_ratio = 0.2
    shuffled = selected_train_img_ids[:]
    random.shuffle(shuffled)

    val_size = max(1, int(len(shuffled) * val_ratio))
    selected_val_img_ids = shuffled[:val_size]
    selected_train_img_ids = shuffled[val_size:]

    train_manifest = build_subset_manifest(
        coco_train,
        train_images,
        train_ann_by_img,
        selected_train_img_ids,
        target_cat_ids_train
    )

    val_manifest = build_subset_manifest(
        coco_train,
        train_images,
        train_ann_by_img,
        selected_val_img_ids,
        target_cat_ids_train
    )

    print("Split hecho desde train.")
    print("Train images:", len(selected_train_img_ids))
    print("Val images:", len(selected_val_img_ids))


Imágenes seleccionadas (val oficial): 550


In [13]:
# --- SI EXISTE VAL OFICIAL, CREAR TAMBIÉN EL MANIFEST TRAIN ---
if has_val_file:
    train_manifest = build_subset_manifest(
        coco_train,
        train_images,
        train_ann_by_img,
        selected_train_img_ids,
        target_cat_ids_train
    )

print("Train manifest:", len(train_manifest["images"]), "images,", len(train_manifest["annotations"]), "anns")
print("Val manifest:", len(val_manifest["images"]), "images,", len(val_manifest["annotations"]), "anns")


Train manifest: 2341 images, 7862 anns
Val manifest: 550 images, 1895 anns


Alista los datos para ser utilizados por los siguientes notebooks

In [14]:
# --- GUARDAR ÍNDICES / MANIFESTS PARA EL NOTEBOOK 02 ---
TRAIN_INDEX_PATH = DATASET_INDEX_DIR / "subset_train_coco.json"
VAL_INDEX_PATH = DATASET_INDEX_DIR / "subset_val_coco.json"
META_PATH = DATASET_INDEX_DIR / "subset_meta.json"

with open(TRAIN_INDEX_PATH, "w", encoding="utf-8") as f:
    json.dump(train_manifest, f)

with open(VAL_INDEX_PATH, "w", encoding="utf-8") as f:
    json.dump(val_manifest, f)

meta = {
    "target_classes": list(target_classes),
    "train_images_count": len(train_manifest["images"]),
    "train_annotations_count": len(train_manifest["annotations"]),
    "val_images_count": len(val_manifest["images"]),
    "val_annotations_count": len(val_manifest["annotations"]),
    "max_images_per_class_train": max_train,
    "max_images_per_class_val": max_val,
    "seed": seed,
    "coco_root": str(COCO_ROOT),
    "train_images_dir": str(COCO_TRAIN_IMG_DIR),
    "val_images_dir": str(COCO_VAL_IMG_DIR),
    "train_instances_json": str(COCO_INSTANCES_TRAIN),
    "val_instances_json": str(COCO_INSTANCES_VAL) if has_val_file else None
}

with open(META_PATH, "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

print("Guardado:")
print("-", TRAIN_INDEX_PATH)
print("-", VAL_INDEX_PATH)
print("-", META_PATH)


Guardado:
- c:\Users\Johnny\Desktop\IA\artifacts\dataset_index\subset_train_coco.json
- c:\Users\Johnny\Desktop\IA\artifacts\dataset_index\subset_val_coco.json
- c:\Users\Johnny\Desktop\IA\artifacts\dataset_index\subset_meta.json


In [None]:
#estos sn los datos que estan listos para ser usados por los siguientes notebooks 
def present_counts(manifest: Dict) -> Dict[str, int]:
    cats = {c["id"]: c["name"] for c in manifest.get("categories", [])}
    ann_counts = Counter()
    img_presence = defaultdict(set)

    for ann in manifest.get("annotations", []):
        cid = ann["category_id"]
        ann_counts[cats[cid]] += 1
        img_presence[cats[cid]].add(ann["image_id"])

    out = {}
    for name in sorted(ann_counts.keys()):
        out[f"{name}_bbox_count"] = int(ann_counts[name])
        out[f"{name}_image_presence"] = int(len(img_presence[name]))
    return out

print("Train subset stats:")
for k, v in present_counts(train_manifest).items():
    print("-", k, ":", v)

print("\nVal subset stats:")
for k, v in present_counts(val_manifest).items():
    print("-", k, ":", v)


Train subset stats:
- airplane_bbox_count : 1572
- airplane_image_presence : 900
- car_bbox_count : 4602
- car_image_presence : 1246
- truck_bbox_count : 1688
- truck_image_presence : 993

Val subset stats:
- airplane_bbox_count : 143
- airplane_image_presence : 97
- car_bbox_count : 1337
- car_image_presence : 378
- truck_bbox_count : 415
- truck_image_presence : 250
