In [2]:
import os
import pandas as pd
import random

# --- CONFIG ---
csv_file = "/mnt/d/seaqr/train_label.csv"
images_dir = "/mnt/d/seaqr/images/train/kolomverse"   # base dir of images
labels_dir = "/mnt/d/seaqr/labels/train/kolomverse"   # base dir of labels
train_txt = "data/train.txt"
val_txt = "data/val.txt"
val_split = 0.1   # 10% of images go to val

# TODO: subset of data used. Only images whose folder index (second path
# component of the CSV "image" value) is below this number are processed.
# Set to None to process the whole CSV.
subset_max_folder = 2

# Set to an int to get the same train/val split on every run. None keeps the
# module-level `random` state (a different split per run unless seeded elsewhere).
split_seed = None

# When False the label .txt files are NOT (re)written; only the train/val
# lists are produced. Set to True to generate the YOLO label files.
write_labels = False

class_map = {
    "ship": 0,
    "buoy": 1,
    "fishnet buoy": 2,
    "lighthouse": 3,
    "wind farm": 4,
    # whale = 5 handled separately
    # NOTE(review): nothing in this cell handles it; any label missing from
    # this map is reported as unknown and dropped below.
}


def index_images(base_dir):
    """Return a dict mapping file name -> full path for files under *base_dir*.

    The tree is walked once. When the same file name appears in several
    folders, the first one yielded by ``os.walk`` wins.
    """
    index = {}
    for root, _, files in os.walk(base_dir):
        for name in files:
            index.setdefault(name, os.path.join(root, name))
    return index


def in_subset(image_key, max_folder):
    """Tell whether the CSV image key belongs to the processed subset.

    Assumes keys look like ``<dataset>/<folder-index>/.../<file>``; a key
    without a numeric second component raises IndexError/ValueError.
    ``max_folder=None`` disables the filter.
    """
    if max_folder is None:
        return True
    return int(image_key.split("/")[1]) < max_folder


def yolo_label_line(cls, xmin, ymin, xmax, ymax, img_w, img_h):
    """Format one box as a ``cls x_center y_center w h`` line.

    Pixel corner coordinates are normalized by the image size. Returns None
    when the image size is not positive or the box has no positive extent.
    The checks are written as ``not (x > 0)`` so NaN values are rejected too.
    """
    if not (img_w > 0 and img_h > 0):
        return None

    w = (xmax - xmin) / img_w
    h = (ymax - ymin) / img_h
    if not (w > 0 and h > 0):
        return None

    x_center = ((xmin + xmax) / 2) / img_w
    y_center = ((ymin + ymax) / 2) / img_h
    return f"{cls} {x_center:.6f} {y_center:.6f} {w:.6f} {h:.6f}"


def write_path_list(list_path, image_paths):
    """Write one image path per line to *list_path*, creating its folder.

    An empty list produces an empty file rather than a single blank line.
    """
    out_dir = os.path.dirname(list_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(list_path, "w", encoding="utf-8") as f:
        f.writelines(f"{path}\n" for path in image_paths)


# The guard is true when run as a notebook cell or a script, and names bound
# inside the `if` remain module-level, so later cells can still use them.
if __name__ == "__main__":
    os.makedirs(labels_dir, exist_ok=True)

    # --- READ CSV ---
    df = pd.read_csv(csv_file)

    required_cols = ["image", "width", "height", "xmin", "ymin", "xmax", "ymax", "label"]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"CSV missing column: {', '.join(missing_cols)}")

    # --- COLLECT ALL IMAGES & LABELS ---
    # Scan the image tree once instead of once per image.
    image_index = index_images(images_dir)
    all_image_paths = []
    skipped = 0

    for image_key, group in df.groupby("image"):
        # `continue`, not `break`: the filter must not depend on the order
        # in which groupby yields the keys.
        if not in_subset(image_key, subset_max_folder):
            continue

        image_name = image_key.split("/")[-1]

        # locate the image anywhere under images_dir
        found_path = image_index.get(image_name)
        if not found_path:
            print(f"⚠️ Image {image_name} not found under {images_dir}, skipping")
            skipped += 1
            continue

        # Image size is taken from the first row of the group.
        img_w = group["width"].iloc[0]
        img_h = group["height"].iloc[0]
        if not (img_w > 0 and img_h > 0):
            print(f"⚠️ Invalid image size {img_w}x{img_h} for {image_name}, skipping")
            continue

        # prepare label lines
        lines = []
        box_rows = zip(
            group["label"], group["xmin"], group["ymin"], group["xmax"], group["ymax"]
        )
        for raw_label, xmin, ymin, xmax, ymax in box_rows:
            label = str(raw_label).strip()
            if label not in class_map:
                print(f"⚠️ Unknown class {label} in {image_name}, skipping")
                continue

            line = yolo_label_line(class_map[label], xmin, ymin, xmax, ymax, img_w, img_h)
            if line is None:
                print(f"⚠️ Invalid box in {image_name}, skipping")
                continue

            lines.append(line)

        # skip if no boxes
        if not lines:
            continue

        # label file location mirrors the image's subfolder structure
        rel_dir = os.path.relpath(os.path.dirname(found_path), images_dir)
        label_subdir = os.path.join(labels_dir, rel_dir)
        os.makedirs(label_subdir, exist_ok=True)

        label_path = os.path.join(label_subdir, os.path.splitext(image_name)[0] + ".txt")
        if write_labels:
            with open(label_path, "w", encoding="utf-8") as f:
                f.write("\n".join(lines))

        all_image_paths.append(found_path)

    # --- SPLIT TRAIN/VAL ---
    rng = random if split_seed is None else random.Random(split_seed)
    rng.shuffle(all_image_paths)
    val_size = int(len(all_image_paths) * val_split)
    val_images = all_image_paths[:val_size]
    train_images = all_image_paths[val_size:]

    write_path_list(train_txt, train_images)
    write_path_list(val_txt, val_images)

    print(f"✅ Done. {len(train_images)} train, {len(val_images)} val images processed.")
    print(f"⚠️ {skipped} images listed in CSV but not found on disk.")
    if write_labels:
        print("Labels saved under:", labels_dir)
    else:
        print("Label writing disabled (write_labels=False); no label files written under:", labels_dir)



✅ Done. 3123 train, 346 val images processed.
⚠️ 0 images listed in CSV but not found on disk.
Labels saved under: /mnt/d/seaqr/labels/train/kolomverse
