# Preparación del dataset para YOLOv8

In [3]:
import os
import pandas as pd
import shutil
from sklearn.model_selection import train_test_split

# 1. Rutas
image_base_path = '../data/processed/'   
csv_path = '../data/dataset.csv'      

# 2. Cargar y preparar datos
df = pd.read_csv(csv_path)
df = df[df["Bars"] >= 0].copy()
df["has_bar"] = df["Bars"].apply(lambda x: 1 if x > 0 else 0)
df["image_path"] = df["name"].apply(lambda x: os.path.join(image_base_path, f"{x}.png"))

# 3. Tomar muestra balanceada pequeña
sample_size = 1000  # por clase
sample_df = df.groupby("has_bar", group_keys=False).apply(lambda x: x.sample(n=sample_size, random_state=42))

# 4. División estratificada
train_df, val_df = train_test_split(sample_df, test_size=0.2, stratify=sample_df["has_bar"], random_state=42)

# 5. Crear carpetas destino (limpiar si ya existen)
for subset in ['train', 'val']:
    img_dir = f'galaxy_yolov8/images/{subset}'
    lbl_dir = f'galaxy_yolov8/labels/{subset}'
    shutil.rmtree(img_dir, ignore_errors=True)
    shutil.rmtree(lbl_dir, ignore_errors=True)
    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(lbl_dir, exist_ok=True)

# 6. Copiar imágenes y generar etiquetas
def process_subset(subset_df, subset_name):
    for _, row in subset_df.iterrows():
        img_name = os.path.basename(row['image_path'])
        image_dst = f'galaxy_yolov8/images/{subset_name}/{img_name}'
        label_dst = f'galaxy_yolov8/labels/{subset_name}/{os.path.splitext(img_name)[0]}.txt'

        shutil.copy(row['image_path'], image_dst)

        if row['has_bar'] == 1:
            with open(label_dst, 'w') as f:
                f.write('0 0.5 0.5 1.0 1.0\n')
        else:
            open(label_dst, 'w').close()  # etiqueta vacía

process_subset(train_df, 'train')
process_subset(val_df, 'val')

# 7. Crear archivo YAML compatible con YOLOv8
yaml_content = """train: galaxy_yolov8/images/train
val: galaxy_yolov8/images/val
nc: 1
names: ['bar']
"""
with open('galaxy_yolov8/galaxy_dataset.yaml', 'w') as f:
    f.write(yaml_content)

# 8. Verificación
print("Dataset balanceado y listo para YOLOv8.")
print("Total muestras por clase:")
print(sample_df["has_bar"].value_counts())
print("\nTrain class distribution:")
print(train_df["has_bar"].value_counts())
print(train_df["has_bar"].value_counts(normalize=True))
print("\nVal class distribution:")
print(val_df["has_bar"].value_counts())
print(val_df["has_bar"].value_counts(normalize=True))


  sample_df = df.groupby("has_bar", group_keys=False).apply(lambda x: x.sample(n=sample_size, random_state=42))


Dataset balanceado y listo para YOLOv8.
Total muestras por clase:
has_bar
0    1000
1    1000
Name: count, dtype: int64

Train class distribution:
has_bar
1    800
0    800
Name: count, dtype: int64
has_bar
1    0.5
0    0.5
Name: proportion, dtype: float64

Val class distribution:
has_bar
0    200
1    200
Name: count, dtype: int64
has_bar
0    0.5
1    0.5
Name: proportion, dtype: float64


In [4]:
!pip install ultralytics --quiet


In [9]:
from ultralytics import YOLO

model = YOLO('yolov8m.pt')

model.train(
    data='galaxy_yolov8/galaxy_dataset.yaml',
    imgsz=416,
    epochs=10,
    batch=64,
    device='mps',  # ← para usar tu GPU Apple Silicon
    name='bar_detector_yolov8',
    cache=True
)


New https://pypi.org/project/ultralytics/8.3.142 available 😃 Update with 'pip install -U ultralytics'
Ultralytics 8.3.141 🚀 Python-3.10.16 torch-2.7.0 MPS (Apple M2 Max)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=64, bgr=0.0, box=7.5, cache=True, cfg=None, classes=None, close_mosaic=10, cls=0.5, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=galaxy_yolov8/galaxy_dataset.yaml, degrees=0.0, deterministic=True, device=mps, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=10, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=416, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolov8m.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=bar_detector_yolov83, nbs=64, nms=False, opset=Non

[34m[1mtrain: [0mScanning /Users/isaidoropeza/Documents/MNA_Proyecto_Integrador/Baseline/i[0m

[34m[1mtrain: [0mNew cache created: /Users/isaidoropeza/Documents/MNA_Proyecto_Integrador/Baseline/itesm-mna-barred-galaxies-main/notebooks/galaxy_yolov8/labels/train.cache







[34m[1mtrain: [0mCaching images (0.8GB RAM): 100%|██████████| 1600/1600 [00:00<00:00, 4100[0m

[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 1260.4±153.0 MB/s, size: 138.2 KB)



[34m[1mval: [0mScanning /Users/isaidoropeza/Documents/MNA_Proyecto_Integrador/Baseline/ite[0m

[34m[1mval: [0mNew cache created: /Users/isaidoropeza/Documents/MNA_Proyecto_Integrador/Baseline/itesm-mna-barred-galaxies-main/notebooks/galaxy_yolov8/labels/val.cache







[34m[1mval: [0mCaching images (0.2GB RAM): 100%|██████████| 400/400 [00:00<00:00, 4054.99i[0m


Plotting labels to runs/detect/bar_detector_yolov83/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.002, momentum=0.9) with parameter groups 77 weight(decay=0.0), 84 weight(decay=0.0005), 83 bias(decay=0.0)
Image sizes 416 train, 416 val
Using 0 dataloader workers
Logging results to [1mruns/detect/bar_detector_yolov83[0m
Starting training for 10 epochs...
Closing dataloader mosaic

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       1/10      22.9G     0.2198      1.495       1.01         28        416: 1
                 Class     Images  Instances      Box(P          R      mAP50  m



                 Class     Images  Instances      Box(P          R      mAP50  m



                 Class     Images  Instances      Box(P          R      mAP50  m

                   all        400        200      0.138      0.715      0.387      0.293






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       2/10        23G     0.2371     0.9136      0.984         37        416: 1
                 Class     Images  Instances      Box(P          R      mAP50  m



                 Class     Images  Instances      Box(P          R      mAP50  m


                   all        400        200          0          0          0          0

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       3/10        23G       0.25     0.9705      1.005         34        416: 1
                 Class     Images  Instances      Box(P          R      mAP50  m

                   all        400        200     0.0334          1     0.0338     0.0207






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       4/10        23G     0.2438     0.9032      1.002         31        416: 1
                 Class     Images  Instances      Box(P          R      mAP50  m



                 Class     Images  Instances      Box(P          R      mAP50  m


                   all        400        200    0.00545       0.21    0.00323    0.00143

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       5/10        23G     0.1713     0.8272     0.9479         29        416: 1
                 Class     Images  Instances      Box(P          R      mAP50  m


                   all        400        200      0.178      0.925      0.375      0.195

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       6/10        23G     0.1263     0.7969     0.9351         36        416: 1
                 Class     Images  Instances      Box(P          R      mAP50  m

                   all        400        200      0.507          1      0.512      0.328






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       7/10        23G     0.1177     0.7836     0.9194         32        416: 1
                 Class     Images  Instances      Box(P          R      mAP50  m

                   all        400        200        0.5          1      0.594      0.594






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       8/10        23G    0.09663     0.7741     0.9211         36        416: 1
                 Class     Images  Instances      Box(P          R      mAP50  m

                   all        400        200      0.499      0.995      0.583      0.583






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       9/10        23G    0.07705     0.7715     0.9022         38        416: 1
                 Class     Images  Instances      Box(P          R      mAP50  m

                   all        400        200        0.5          1      0.604      0.604






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      10/10        23G    0.05694     0.7423     0.9131         32        416: 1
                 Class     Images  Instances      Box(P          R      mAP50  m

                   all        400        200      0.542      0.935      0.631      0.631






10 epochs completed in 0.521 hours.
Optimizer stripped from runs/detect/bar_detector_yolov83/weights/last.pt, 52.0MB
Optimizer stripped from runs/detect/bar_detector_yolov83/weights/best.pt, 52.0MB

Validating runs/detect/bar_detector_yolov83/weights/best.pt...
Ultralytics 8.3.141 🚀 Python-3.10.16 torch-2.7.0 MPS (Apple M2 Max)
Model summary (fused): 92 layers, 25,840,339 parameters, 0 gradients, 78.7 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  m


                   all        400        200        0.5          1      0.617      0.617
Speed: 0.6ms preprocess, 14.4ms inference, 0.0ms loss, 12.2ms postprocess per image
Results saved to [1mruns/detect/bar_detector_yolov83[0m


ultralytics.utils.metrics.DetMetrics object with attributes:

ap_class_index: array([0])
box: ultralytics.utils.metrics.Metric object
confusion_matrix: <ultralytics.utils.metrics.ConfusionMatrix object at 0x32b48cdc0>
curves: ['Precision-Recall(B)', 'F1-Confidence(B)', 'Precision-Confidence(B)', 'Recall-Confidence(B)']
curves_results: [[array([          0,    0.001001,    0.002002,    0.003003,    0.004004,    0.005005,    0.006006,    0.007007,    0.008008,    0.009009,     0.01001,    0.011011,    0.012012,    0.013013,    0.014014,    0.015015,    0.016016,    0.017017,    0.018018,    0.019019,     0.02002,    0.021021,    0.022022,    0.023023,
          0.024024,    0.025025,    0.026026,    0.027027,    0.028028,    0.029029,     0.03003,    0.031031,    0.032032,    0.033033,    0.034034,    0.035035,    0.036036,    0.037037,    0.038038,    0.039039,     0.04004,    0.041041,    0.042042,    0.043043,    0.044044,    0.045045,    0.046046,    0.047047,
          0.048048,    