# ðŸ“¦ Prepare `dataset/` at repo root (robust, repo-root aware)

This notebook downloads **dataset.zip** from Google Drive, extracts it to  
`<repo>/dataset/` (`soccer_vision/dataset`), fixes any nested paths created by the
notebookâ€™s working directory, and validates the expected structure:

```

soccer_vision/
â”œâ”€â”€ dataset/
    â”œâ”€â”€train/
        â”œâ”€â”€ images/
        â”œâ”€â”€ labels/
    â”œâ”€â”€ val/           (optional; can be created later by the training script)
```

## Install/imports

In [14]:
import os, zipfile, shutil, sys
from pathlib import Path

# Install gdown if needed
try:
    import gdown  # noqa: F401
except Exception:
    %pip -q install gdown
import gdown

## Configuration

- `ROOT_NAME`: the folder name of your repo.
- `GDRIVE_DATASET_ID`: the Google Drive **file id** for `dataset.zip`.

In [15]:
ROOT_NAME = "soccer_vision"  # repo folder name
GDRIVE_DATASET_ID = "13c8_NnjszJBDuLAHCr8jKq3eBP7Pocez"  # <-- your dataset.zip file id

## Helper functions

- `resolve_repo_root()` finds the repo root no matter where this notebook runs.
- `dataset_looks_ok()` checks expected folders.
- `tree()` prints a small directory tree for quick inspection.

In [16]:
def resolve_repo_root() -> Path:
    """Find the repo root named ROOT_NAME, regardless of where this notebook runs."""
    env = os.getenv("SOCCER_VISION_ROOT")
    if env:
        p = Path(env).expanduser().resolve()
        if p.name != ROOT_NAME:
            print(f"[WARN] SOCCER_VISION_ROOT is set to {p}, but folder name != {ROOT_NAME}")
        return p

    here = Path.cwd().resolve()
    if here.name == ROOT_NAME and (here / "notebooks").exists():
        return here
    if here.name == "notebooks" and here.parent.name == ROOT_NAME:
        return here.parent
    for p in [here, *here.parents]:
        if p.name == ROOT_NAME:
            return p
    for p in [here, *here.parents]:
        if (p / "notebooks").exists():
            return p
    return here

def dataset_looks_ok(ds: Path) -> bool:
    return (ds / "train" / "images").exists() and (ds / "train" / "labels").exists()

def tree(path: Path, max_files=5):
    print(f"\n[Tree] {path.resolve()}")
    for root, dirs, files in os.walk(path):
        indent = "  " * (len(Path(root).relative_to(path).parts))
        print(f"{indent}{Path(root).name}/")
        for d in sorted(dirs):
            print(f"{indent}  {d}/")
        for i, f in enumerate(sorted(files)):
            if i >= max_files:
                print(f"{indent}  ... (+{len(files)-max_files} more)")
                break
            print(f"{indent}  {f}")


## Locate repo root & define key paths

In [4]:
from soccer_vision.notebooks.modules.paths import find_repo_root

BASE = find_repo_root()
print(f"[BASE] {BASE}")

DATASET_DIR = BASE / "dataset"       # expected final location
DATASET_ZIP = BASE / "dataset.zip"   # temporary zip path

[BASE] Z:\Proyectos\Robotica\footbot\soccer_vision


## Fix a previously nested location

If a prior run placed the dataset under `notebooks/soccer_vision/dataset`, this
moves it back to `<repo>/dataset`.

In [18]:
nested = BASE / "notebooks" / ROOT_NAME / "dataset"
if nested.exists() and not DATASET_DIR.exists():
    print(f"[FIX] Moving nested dataset -> {DATASET_DIR}")
    DATASET_DIR.parent.mkdir(parents=True, exist_ok=True)
    shutil.move(str(nested), str(DATASET_DIR))
    # Attempt to remove the now-empty wrapper dir
    try:
        shutil.rmtree((BASE / "notebooks" / ROOT_NAME), ignore_errors=True)
    except Exception as e:
        print("[WARN] Could not remove nested wrapper folder:", e)

## Download `dataset.zip` from Google Drive (only if missing)

Requires the file to be shared as **Anyone with the link**.

In [19]:
if DATASET_DIR.exists() and dataset_looks_ok(DATASET_DIR):
    print("[INFO] dataset/ already exists, skipping download.")
else:
    print("[DL] Downloading dataset.zip â€¦")
    url = f"https://drive.google.com/uc?id={GDRIVE_DATASET_ID}"
    ok = gdown.download(url, str(DATASET_ZIP), quiet=False)
    if not ok:
        sys.exit("[ERR] gdown failed. Make sure the Drive file is public (Anyone with the link).")

[DL] Downloading dataset.zip â€¦


Downloading...
From (original): https://drive.google.com/uc?id=13c8_NnjszJBDuLAHCr8jKq3eBP7Pocez
From (redirected): https://drive.google.com/uc?id=13c8_NnjszJBDuLAHCr8jKq3eBP7Pocez&confirm=t&uuid=5df764ce-c6dd-460c-b71f-332a6234a749
To: Z:\Proyectos\Robotica\footbot\dataset.zip
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 61.6M/61.6M [00:01<00:00, 31.8MB/s]


## Unzip & normalize to `<repo>/dataset`

In [20]:
if not (DATASET_DIR.exists() and dataset_looks_ok(DATASET_DIR)):
    print("[UNZIP] Extracting â€¦")
    with zipfile.ZipFile(DATASET_ZIP, "r") as z:
        z.extractall(BASE)

    if not dataset_looks_ok(DATASET_DIR):
        # Find a folder that contains train/images and train/labels and move it to dataset/
        candidates = []
        for p in BASE.iterdir():
            if p.is_dir() and (p / "train" / "images").exists() and (p / "train" / "labels").exists():
                candidates.append(p)
        if not candidates:
            sys.exit("[ERR] Could not locate a folder with train/images and train/labels after unzip.")

        pick = next((c for c in candidates if c.name == "dataset"), candidates[0])
        if pick != DATASET_DIR:
            if DATASET_DIR.exists():
                shutil.rmtree(DATASET_DIR)
            shutil.move(str(pick), str(DATASET_DIR))
    
    print("[UNZIP] Finished")

[UNZIP] Extracting â€¦
[UNZIP] Finished


## Clean up temporary files

In [21]:
print("[CLEAN] Deleting dataset.zip (if present)")
try:
    DATASET_ZIP.unlink()
except FileNotFoundError:
    pass

[CLEAN] Deleting dataset.zip (if present)


## Final validation

We should now have `soccer_vision/dataset/train/images` and `soccer_vision/dataset/train/labels`.

In [22]:
if not dataset_looks_ok(DATASET_DIR):
    sys.exit("[ERR] dataset/ is missing expected subfolders (train/images and train/labels).")

print("[OK] Dataset ready at:", DATASET_DIR)
tree(DATASET_DIR)

[OK] Dataset ready at: Z:\Proyectos\Robotica\footbot\dataset

[Tree] Z:\Proyectos\Robotica\footbot\dataset
dataset/
  train/
  classes.txt
  notes.json
  train/
    images/
    labels/
    images/
      00b200b7-photo_2025-10-12_16-30-16_aug_2.jpg
      01aef225-photo_2025-10-12_16-30-49_aug_2.jpg
      02147359-photo_2025-10-12_22-01-54_aug_3.jpg
      02260e77-photo_2025-10-12_22-01-58_aug_1.jpg
      0392df9f-photo_2025-10-12_16-28-24_aug_3.jpg
      ... (+507 more)
    labels/
      00b200b7-photo_2025-10-12_16-30-16_aug_2.txt
      01aef225-photo_2025-10-12_16-30-49_aug_2.txt
      02147359-photo_2025-10-12_22-01-54_aug_3.txt
      02260e77-photo_2025-10-12_22-01-58_aug_1.txt
      0392df9f-photo_2025-10-12_16-28-24_aug_3.txt
      ... (+507 more)


## Install Ultralytics (YOLO)
Only needed the first time you run this environment.

In [23]:
%pip -q install ultralytics torch

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


## Training parameters
Adjust these, then run the next cell to retrain.


In [24]:
MODEL_BACKBONE   = "yolo11s.pt"
EPOCHS           = 60
IMG_SIZE         = 640
BATCH_SIZE       = 16
DEVICE           = "0"            # GPU id like "0" or "cpu"
TRAIN_PCT        = 0.90           # used only if val/ is empty and --no-split is False
NO_SPLIT         = False          # set True if you don't want main.py to create/move a val split
PATIENCE         = 20             # early stopping patience
WORKERS          = 8              # dataloader workers
SEED             = 0              # for reproducible split & training seed
RUN_NAME         = None           # if None -> defaults to "<backbone_stem>_train"
OUT_SUBDIR       = None           # if None -> defaults to "<backbone_stem>" under soccer_vision/models
FINAL_WEIGHTS    = "soccer_yolo.pt"  # file name saved in models/<OUT_SUBDIR>/
CONF_FOR_VALPREV = 0.25           # quick val prediction confidence


## Retrain the model
This calls the project script so all runs/models land **inside** `soccer_vision/`.

In [25]:
# Make sure we can import notebooks.modules.* no matter the current CWD
from pathlib import Path
import sys

here = Path.cwd().resolve()
repo = here if (here.name == "soccer_vision") else (here.parent if here.name == "notebooks" else None)
if not repo:
    # walk up until we find soccer_vision/
    for p in [here, *here.parents]:
        if (p / "soccer_vision" / "notebooks" / "modules").exists():
            repo = p / "soccer_vision"
            break

assert repo and (repo / "notebooks" / "modules").exists(), f"Could not find modules under {here}"
if str(repo) not in sys.path:
    sys.path.insert(0, str(repo))  # add repo root to sys.path

print("[REPO]", repo)

[REPO] Z:\Proyectos\Robotica\footbot\soccer_vision


In [None]:
# Live logs training
from soccer_vision.notebooks.modules.logging_utils import get_logger, log
from soccer_vision.notebooks.modules.paths import find_repo_root
from soccer_vision.notebooks.modules.train import train_yolo

# Optional: INFO (or DEBUG) level
get_logger()  # already INFO by default

BASE = find_repo_root()
SV_DIR = BASE  # shorthand

# ---- knobs to tweak from notebook ----
MODEL_BACKBONE   = SV_DIR / "yolo11s.pt"   # or "yolo11s.pt" to auto-download
EPOCHS           = 60
IMG_SIZE         = 640
BATCH_SIZE       = 16
DEVICE           = "0"                     # "0" for first GPU; "cpu" to force CPU
WORKERS          = 8
SEED             = 0
PATIENCE         = 20
TRAIN_PCT        = 0.90
COPY_SPLIT       = False
NO_SPLIT         = False
FINAL_WEIGHTS    = "soccer_yolo.pt"
CONF_FOR_VALPREV = 0.25
RUN_NAME         = "yolo11s_train"
OUT_SUBDIR       = "yolo11s"               # models/<OUT_SUBDIR>

result = train_yolo(
    base_dir=SV_DIR,
    model=MODEL_BACKBONE,
    epochs=EPOCHS,
    imgsz=IMG_SIZE,
    batch=BATCH_SIZE,
    device=DEVICE,
    workers=WORKERS,
    seed=SEED,
    patience=PATIENCE,
    train_pct=TRAIN_PCT,
    copy_split=COPY_SPLIT,
    final_name=FINAL_WEIGHTS,
    conf_for_valprev=CONF_FOR_VALPREV,
    run_name=RUN_NAME,
    out_subdir=OUT_SUBDIR,
    no_split=NO_SPLIT,
    logger=log,
)

print("\n--- RESULT ---")
print("Best Weights :", result.best_weights)
print("Run Dir      :", result.run_dir)
print("Artifacts    :", result.artifacts_dir)
print("Val Predicts :", result.val_pred_dir)
print("Device Used  :", result.device_used)


ModuleNotFoundError: No module named 'notebooks'