# Preparación del dataset para YOLOv5

In [2]:
import os
import pandas as pd
import shutil
from sklearn.model_selection import train_test_split

# 1. Paths
image_base_path = '../data/processed/'
csv_path = '../data/dataset.csv'

# 2. Load and prepare the data
df = pd.read_csv(csv_path)
# Keep only rows whose "Bars" value is non-negative (this also drops NaN rows).
df = df[df["Bars"] >= 0].copy()
if df.empty:
    raise ValueError(f"No rows with Bars >= 0 found in {csv_path}")
# Binary target: 1 when "Bars" > 0, otherwise 0 (vectorized, no row-wise apply).
df["has_bar"] = (df["Bars"] > 0).astype(int)
df["image_path"] = df["name"].apply(lambda x: os.path.join(image_base_path, f"{x}.png"))

# 3. Draw a small class-balanced sample
sample_size = 1000  # requested rows per class
# Cap the per-class size at the rarest class so sampling cannot raise and the
# sample stays balanced even when a class has fewer than `sample_size` rows.
n_per_class = min(sample_size, int(df["has_bar"].value_counts().min()))
if n_per_class < sample_size:
    print(f"Warning: only {n_per_class} rows available per class (requested {sample_size}).")
# Sample each class separately with the same seed and concatenate. This picks
# the same rows as groupby(...).apply(lambda x: x.sample(...)) without applying
# over the grouping column, which pandas has deprecated.
sample_df = pd.concat(
    [group.sample(n=n_per_class, random_state=42) for _, group in df.groupby("has_bar")]
)

# 4. Stratified train/validation split (80/20)
train_df, val_df = train_test_split(sample_df, test_size=0.2, stratify=sample_df["has_bar"], random_state=42)

# 5. Create the destination folders
# NOTE: existing folders are reused (exist_ok=True) and never emptied, so files
# from a previous run with a different sample remain in place. Delete
# galaxy_yolo/ manually before re-running with other sampling settings.
for subset in ['train', 'val']:
    os.makedirs(f'galaxy_yolo/images/{subset}', exist_ok=True)
    os.makedirs(f'galaxy_yolo/labels/{subset}', exist_ok=True)

# 6. Copy images and write labels
def process_subset(subset_df, subset_name, output_root='galaxy_yolo'):
    """Copy the images of one split and write one YOLO label file per image.

    For every row, the file at ``image_path`` is copied to
    ``<output_root>/images/<subset_name>/`` and a ``.txt`` file with the same
    stem is written to ``<output_root>/labels/<subset_name>/``.

    Rows with ``has_bar == 1`` get the single label line
    ``0 0.5 0.5 1.0 1.0`` (class 0, then a centered box spanning the whole
    image in YOLO's normalized ``x_center y_center width height`` format).
    All other rows get an empty label file.

    Args:
        subset_df: DataFrame with the columns ``image_path`` and ``has_bar``.
        subset_name: Name of the split folder, e.g. ``'train'`` or ``'val'``.
        output_root: Root folder of the generated dataset.

    Raises:
        FileNotFoundError: If a source image does not exist (from ``shutil.copy``).
    """
    image_dir = os.path.join(output_root, 'images', subset_name)
    label_dir = os.path.join(output_root, 'labels', subset_name)
    # Create the targets here so the function does not depend on another cell
    # having created them first.
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(label_dir, exist_ok=True)

    # Iterate over the two needed columns directly; cheaper than iterrows().
    for image_path, has_bar in zip(subset_df['image_path'], subset_df['has_bar']):
        img_name = os.path.basename(image_path)
        stem = os.path.splitext(img_name)[0]

        shutil.copy(image_path, os.path.join(image_dir, img_name))

        # Opening in 'w' mode always creates (or truncates) the label file;
        # it stays empty unless the row is labelled as barred.
        with open(os.path.join(label_dir, f'{stem}.txt'), 'w') as f:
            if has_bar == 1:
                f.write('0 0.5 0.5 1.0 1.0\n')

process_subset(train_df, 'train')
process_subset(val_df, 'val')

# 7. Write the dataset YAML
# Recent YOLOv5 versions resolve a missing or relative `path` against the
# yolov5 repository folder, not the notebook's working directory. An absolute
# dataset root makes train/val resolve correctly wherever train.py is started.
dataset_root = os.path.abspath('galaxy_yolo')
# Single-quoted YAML scalars take backslashes literally (safe for Windows
# paths); a literal single quote is escaped by doubling it.
yaml_root = dataset_root.replace("'", "''")
yaml_content = f"""path: '{yaml_root}'
train: images/train
val: images/val
nc: 1
names: ['bar']
"""
with open('galaxy_yolo/galaxy_dataset.yaml', 'w') as f:
    f.write(yaml_content)

# 8. Verification: class counts and proportions of the sample and both splits
print("Dataset reducido preparado.")
print("Total muestras por clase:")
print(sample_df["has_bar"].value_counts())
print("\nTrain class distribution:")
print(train_df["has_bar"].value_counts())
print(train_df["has_bar"].value_counts(normalize=True))
print("\nVal class distribution:")
print(val_df["has_bar"].value_counts())
print(val_df["has_bar"].value_counts(normalize=True))


  sample_df = df.groupby("has_bar", group_keys=False).apply(lambda x: x.sample(n=sample_size, random_state=42))


Dataset reducido preparado.
Total muestras por clase:
has_bar
0    1000
1    1000
Name: count, dtype: int64

Train class distribution:
has_bar
1    800
0    800
Name: count, dtype: int64
has_bar
1    0.5
0    0.5
Name: proportion, dtype: float64

Val class distribution:
has_bar
0    200
1    200
Name: count, dtype: int64
has_bar
0    0.5
1    0.5
Name: proportion, dtype: float64


In [4]:
# Fetch the YOLOv5 repository once. `git clone` fails when the target folder
# already exists, so skip it on re-runs to keep the cell idempotent.
if not os.path.isdir('yolov5'):
    !git clone https://github.com/ultralytics/yolov5


Cloning into 'yolov5'...
remote: Enumerating objects: 17483, done.[K
remote: Counting objects: 100% (110/110), done.[K
remote: Compressing objects: 100% (80/80), done.[K
remote: Total 17483 (delta 80), reused 30 (delta 30), pack-reused 17373 (from 3)[K
Receiving objects: 100% (17483/17483), 16.39 MiB | 2.71 MiB/s, done.
Resolving deltas: 100% (11985/11985), done.


In [7]:
# Use the %pip magic (not !pip) so the packages are installed into the
# environment of the running kernel rather than whichever pip is first on PATH.
%pip install -r yolov5/requirements.txt

Collecting gitpython>=3.1.30 (from -r yolov5/requirements.txt (line 5))
  Downloading GitPython-3.1.44-py3-none-any.whl.metadata (13 kB)
Collecting opencv-python>=4.1.1 (from -r yolov5/requirements.txt (line 8))
  Using cached opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl.metadata (20 kB)
Collecting thop>=0.1.1 (from -r yolov5/requirements.txt (line 14))
  Using cached thop-0.1.1.post2209072238-py3-none-any.whl.metadata (2.7 kB)
Collecting tqdm>=4.66.3 (from -r yolov5/requirements.txt (line 17))
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting ultralytics>=8.2.34 (from -r yolov5/requirements.txt (line 18))
  Using cached ultralytics-8.3.141-py3-none-any.whl.metadata (37 kB)
Collecting pandas>=1.1.4 (from -r yolov5/requirements.txt (line 27))
  Downloading pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting seaborn>=0.11.0 (from -r yolov5/requirements.txt (line 28))
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Col

In [3]:
# Train YOLOv5 on the reduced galaxy dataset, starting from the yolov5m.pt
# weights: 416 px images, batch size 64, 10 epochs, on CPU. The run is named
# bar_detector_sample, and --cache without a value selects RAM caching
# (logged as cache=ram).
!python yolov5/train.py \
  --data galaxy_yolo/galaxy_dataset.yaml --weights yolov5m.pt \
  --img 416 --batch 64 --epochs 10 \
  --device cpu --name bar_detector_sample \
  --cache


[34m[1mtrain: [0mweights=yolov5m.pt, cfg=, data=galaxy_yolo/galaxy_dataset.yaml, hyp=yolov5/data/hyps/hyp.scratch-low.yaml, epochs=10, batch_size=64, imgsz=416, rect=False, resume=False, nosave=False, noval=False, noautoanchor=False, noplots=False, evolve=None, evolve_population=yolov5/data/hyps, resume_evolve=None, bucket=, cache=ram, image_weights=False, device=cpu, multi_scale=False, single_cls=False, optimizer=SGD, sync_bn=False, workers=8, project=yolov5/runs/train, name=bar_detector_sample, exist_ok=False, quad=False, cos_lr=False, label_smoothing=0.0, patience=100, freeze=[0], save_period=-1, seed=0, local_rank=-1, entity=None, upload_dataset=False, bbox_interval=-1, artifact_alias=latest, ndjson_console=False, ndjson_file=False
[34m[1mgithub: [0mup to date with https://github.com/ultralytics/yolov5 ✅
YOLOv5 🚀 v7.0-419-gcd44191c Python-3.10.16 torch-2.7.0 CPU

[34m[1mhyperparameters: [0mlr0=0.01, lrf=0.01, momentum=0.937, weight_decay=0.0005, warmup_epochs=3.0, warmup_m