In [1]:
# [C00] Keep Kaggle's default torch (do NOT install torch). Only pin numpy.
!pip -q install --no-cache-dir --upgrade --force-reinstall "numpy==2.0.2"

[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m60.9/60.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m19.2/19.2 MB[0m [31m273.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.26.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
google-colab 1.0.0 requires google-auth==2.38.0, but you have google-auth 2.47.0 which is incompatible.
google-colab 1.0.0 requires jupyter-server==2.14.0, but you have jupyter-server 2.12.5 which is incompatible.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is in

In [2]:
# [C01] Install Ultralytics without upgrading torch/cuda stack
!pip -q uninstall -y ultralytics || true
!pip -q install --no-cache-dir "ultralytics==8.4.11" --no-deps
!pip -q install pyyaml tqdm

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.2/1.2 MB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h

In [3]:
# [C01b] Verify GPU + torch compatibility
import torch, numpy as np, ultralytics

print("numpy:", np.__version__)
print("ultralytics:", ultralytics.__version__)
print("torch:", torch.__version__)
print("cuda available:", torch.cuda.is_available())

if torch.cuda.is_available():
    name = torch.cuda.get_device_name(0)
    cap = torch.cuda.get_device_capability(0)
    print("GPU0:", name, "capability:", cap)
    # T4 should be (7,5). P100 is (6,0) which fails with your torch 2.10 build.

Creating new Ultralytics Settings v0.0.6 file ‚úÖ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
numpy: 2.0.2
ultralytics: 8.4.11
torch: 2.8.0+cu126
cuda available: True
GPU0: Tesla T4 capability: (7, 5)


In [4]:
# [C01c] Use only GPU0
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [5]:
# [C02] Imports + config

import os, glob, shutil
import yaml
from tqdm import tqdm

# Unified class order for the final merged dataset
TARGET_CLASSES = ["player", "goalkeeper", "referee", "ball"]

# Common label name normalization/remapping
NAME_MAP = {
    # players
    "player": "player",
    "players": "player",

    # referees
    "referee": "referee",
    "ref": "referee",
    "refree": "referee",

    # goalkeepers
    "goalkeeper": "goalkeeper",
    "goalie": "goalkeeper",
    "goal_keeper": "goalkeeper",
    "goalkeeper1": "goalkeeper",
    "goalkeeper2": "goalkeeper",
    "goalkeeper_1": "goalkeeper",
    "goalkeeper_2": "goalkeeper",
    "goalkeepera": "goalkeeper",
    "goalkeeperb": "goalkeeper",

    # ball
    "ball": "ball",
}

OUT_ROOT = "/kaggle/working/football_merged"

In [6]:
# [C03] Helper functions (YAML + dataset root discovery)

def norm(s: str) -> str:
    return str(s).strip().lower().replace(" ", "").replace("-", "").replace("__", "_")

def read_yaml(path):
    with open(path, "r") as f:
        return yaml.safe_load(f)

def ensure_dir(p):
    os.makedirs(p, exist_ok=True)

def list_data_yaml_roots(base_dir: str):
    """Return directories that contain a data.yaml somewhere under base_dir."""
    roots = []
    for dirpath, dirnames, filenames in os.walk(base_dir):
        if "data.yaml" in filenames:
            roots.append(dirpath)
    # sort shallow-first
    roots.sort(key=lambda p: len(p.split(os.sep)))
    return roots

def find_yolo_splits(ds_root: str):
    """Return list of split names that exist ('train','valid','val','test') with images folder."""
    splits = []
    for split in ["train", "valid", "val", "test"]:
        if os.path.exists(os.path.join(ds_root, split, "images")):
            splits.append(split)
    return splits

In [7]:
# [C04] Inspect Kaggle inputs (confirm folders)

print("Kaggle inputs:", os.listdir("/kaggle/input"))

for name in ["football-players-detection", "footballplayers-v1i-yolov8", "d"]:
    p = f"/kaggle/input/{name}"
    print("\n===", p, "===")
    if os.path.exists(p):
        print("exists. top-level:", os.listdir(p)[:30])
    else:
        print("NOT FOUND")

Kaggle inputs: ['footballplayers-v1i-yolov8', 'football-players-detection']

=== /kaggle/input/football-players-detection ===
exists. top-level: ['football-players-detection']

=== /kaggle/input/footballplayers-v1i-yolov8 ===
exists. top-level: ['FootballPlayers.v1i.yolov8 (2)', 'FootballPlayers.v1i.yolov8', 'FootballPlayers.v4-2025oct27_soccerintial.yolov8', 'FootballPlayers.v1i.yolov8 (1)']

=== /kaggle/input/d ===
NOT FOUND


In [8]:
# [C05] Collect all dataset roots (all data.yaml under those inputs)
# This handles the case where footballplayers-v1i-yolov8 has multiple subfolders each with its own data.yaml.

INPUT_DATASETS = [
    "/kaggle/input/football-players-detection",
    "/kaggle/input/footballplayers-v1i-yolov8",
    "/kaggle/input/d",
]

all_roots = []
for base in INPUT_DATASETS:
    if os.path.exists(base):
        roots = list_data_yaml_roots(base)
        print(f"\nFound {len(roots)} data.yaml roots under {base}:")
        for r in roots[:20]:
            print(" -", r)
        all_roots.extend(roots)

print("\nTOTAL data.yaml roots:", len(all_roots))


Found 1 data.yaml roots under /kaggle/input/football-players-detection:
 - /kaggle/input/football-players-detection/football-players-detection

Found 4 data.yaml roots under /kaggle/input/footballplayers-v1i-yolov8:
 - /kaggle/input/footballplayers-v1i-yolov8/FootballPlayers.v1i.yolov8 (2)
 - /kaggle/input/footballplayers-v1i-yolov8/FootballPlayers.v1i.yolov8
 - /kaggle/input/footballplayers-v1i-yolov8/FootballPlayers.v4-2025oct27_soccerintial.yolov8
 - /kaggle/input/footballplayers-v1i-yolov8/FootballPlayers.v1i.yolov8 (1)

TOTAL data.yaml roots: 5


In [9]:
# [C06] Merge function (YOLO dataset with data.yaml)
# This merges by label names, not by class IDs, so it works across datasets with different class orders.

def merge_yolo_dataset(ds_root: str, out_root: str, dataset_tag: str):
    data_yaml = os.path.join(ds_root, "data.yaml")
    if not os.path.exists(data_yaml):
        raise FileNotFoundError(f"Missing data.yaml in {ds_root}")

    data = read_yaml(data_yaml)

    # names can be list or dict
    names = data.get("names", None)
    if names is None:
        raise ValueError(f"No 'names' in {data_yaml}")

    if isinstance(names, dict):
        # keys might be strings in some exports
        max_k = max(int(k) for k in names.keys())
        src_names = [names[str(i)] if str(i) in names else names[i] for i in range(max_k + 1)]
    else:
        src_names = list(names)

    # build src class id -> target class id mapping (or None to drop)
    srcid_to_tgtid = {}
    for src_id, src_name in enumerate(src_names):
        key = norm(src_name)
        mapped = NAME_MAP.get(key, None)
        if mapped is None:
            srcid_to_tgtid[src_id] = None
        else:
            srcid_to_tgtid[src_id] = TARGET_CLASSES.index(mapped)

    splits = find_yolo_splits(ds_root)
    if not splits:
        # Some datasets put images/labels directly without split folders; skip them for now.
        print(f"[WARN] No train/valid/test splits found in {ds_root}. Skipping.")
        return 0

    merged_images = 0

    for split in splits:
        img_dir = os.path.join(ds_root, split, "images")
        lbl_dir = os.path.join(ds_root, split, "labels")

        out_split = "valid" if split in ["valid", "val"] else split
        out_img = os.path.join(out_root, out_split, "images")
        out_lbl = os.path.join(out_root, out_split, "labels")
        ensure_dir(out_img); ensure_dir(out_lbl)

        images = glob.glob(os.path.join(img_dir, "*.*"))
        for img_path in tqdm(images, desc=f"Merging {dataset_tag}:{os.path.basename(ds_root)}:{split}", leave=False):
            base = os.path.splitext(os.path.basename(img_path))[0]
            lbl_path = os.path.join(lbl_dir, base + ".txt")

            new_base = f"{dataset_tag}__{os.path.basename(ds_root)}__{base}"
            new_img_path = os.path.join(out_img, new_base + os.path.splitext(img_path)[1])
            shutil.copy2(img_path, new_img_path)

            new_lbl_path = os.path.join(out_lbl, new_base + ".txt")
            out_lines = []

            if os.path.exists(lbl_path):
                with open(lbl_path, "r") as f:
                    for line in f:
                        parts = line.strip().split()
                        if len(parts) < 5:
                            continue
                        src_cls = int(parts[0])
                        tgt_cls = srcid_to_tgtid.get(src_cls, None)
                        if tgt_cls is None:
                            continue
                        parts[0] = str(tgt_cls)
                        out_lines.append(" ".join(parts))

            # write label file (can be empty)
            with open(new_lbl_path, "w") as f:
                f.write("\n".join(out_lines))

            merged_images += 1

    return merged_images

In [10]:
# [C07] Perform merge (clears output folder first)

if os.path.exists(OUT_ROOT):
    shutil.rmtree(OUT_ROOT)
ensure_dir(OUT_ROOT)

total = 0
for i, r in enumerate(all_roots):
    tag = f"ds{i:02d}"
    try:
        n = merge_yolo_dataset(r, OUT_ROOT, dataset_tag=tag)
        total += n
    except Exception as e:
        print(f"[SKIP] root={r} error={e}")

print("\nMerged images total:", total)
print("Merged output:", OUT_ROOT)

                                                                                                                      


Merged images total: 3040
Merged output: /kaggle/working/football_merged




In [11]:
# [C08] Write final merged data.yaml (the one YOLO will train on)

DATA_YAML = os.path.join(OUT_ROOT, "data.yaml")
final_yaml = {
    "path": OUT_ROOT,
    "train": "train/images",
    "val": "valid/images",
    "names": {i: n for i, n in enumerate(TARGET_CLASSES)}
}
with open(DATA_YAML, "w") as f:
    yaml.safe_dump(final_yaml, f, sort_keys=False)

print(open(DATA_YAML).read())

path: /kaggle/working/football_merged
train: train/images
val: valid/images
names:
  0: player
  1: goalkeeper
  2: referee
  3: ball



In [12]:
# [C09] Sanity check: counts + class distribution

from collections import Counter

def count_files(pat): 
    return len(glob.glob(pat))

print("train images:", count_files(os.path.join(OUT_ROOT, "train/images/*.*")))
print("train labels:", count_files(os.path.join(OUT_ROOT, "train/labels/*.txt")))
print("valid images:", count_files(os.path.join(OUT_ROOT, "valid/images/*.*")))
print("valid labels:", count_files(os.path.join(OUT_ROOT, "valid/labels/*.txt")))

# class distribution from a sample of label files (fast)
label_files = glob.glob(os.path.join(OUT_ROOT, "train/labels/*.txt"))
cls_counts = Counter()
for lf in label_files[:5000]:
    with open(lf, "r") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) >= 1 and parts[0].isdigit():
                cls_counts[int(parts[0])] += 1

print("\nSample class counts (from up to 5000 label files):")
for k in range(len(TARGET_CLASSES)):
    print(k, TARGET_CLASSES[k], ":", cls_counts.get(k, 0))

# If ball is extremely low compared to players, ball detection will be weak. (We can fix later by adding a dedicated soccer-ball dataset.)

train images: 2677
train labels: 2677
valid images: 251
valid labels: 251

Sample class counts (from up to 5000 label files):
0 player : 47730
1 goalkeeper : 1776
2 referee : 5660
3 ball : 2092


In [13]:
# [C10] Train detector (YOLOv8m)

from ultralytics import YOLO

model = YOLO("yolov8m.pt")  # strong baseline on P100

model.train(
    data=DATA_YAML,
    imgsz=960,        # helps small objects + wide shots
    epochs=80,
    batch=8,          # if OOM: set batch=4; if plenty memory: try 12/16
    device=0,
    workers=2,
    patience=15,
    close_mosaic=10,
    optimizer="AdamW",
    lr0=0.003,
    cos_lr=True,
    project="runs",
    name="football_yolov8m_merged",
)

[KDownloading https://github.com/ultralytics/assets/releases/download/v8.4.0/yolov8m.pt to 'yolov8m.pt': 100% ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 49.7MB 209.9MB/s 0.2s0.2s<0.0s
Ultralytics 8.4.11 üöÄ Python-3.12.12 torch-2.8.0+cu126 CUDA:0 (Tesla T4, 14913MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, angle=1.0, augment=False, auto_augment=randaugment, batch=8, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=True, cutmix=0.0, data=/kaggle/working/football_merged/data.yaml, degrees=0.0, deterministic=True, device=0, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, end2end=None, epochs=80, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=960, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.003, lrf=0.01, mask_ratio=4, max_det=300, mix

ultralytics.utils.metrics.DetMetrics object with attributes:

ap_class_index: array([0, 1, 2, 3])
box: ultralytics.utils.metrics.Metric object
confusion_matrix: <ultralytics.utils.metrics.ConfusionMatrix object at 0x7e8938c9fcb0>
curves: ['Precision-Recall(B)', 'F1-Confidence(B)', 'Precision-Confidence(B)', 'Recall-Confidence(B)']
curves_results: [[array([          0,    0.001001,    0.002002,    0.003003,    0.004004,    0.005005,    0.006006,    0.007007,    0.008008,    0.009009,     0.01001,    0.011011,    0.012012,    0.013013,    0.014014,    0.015015,    0.016016,    0.017017,    0.018018,    0.019019,     0.02002,    0.021021,    0.022022,    0.023023,
          0.024024,    0.025025,    0.026026,    0.027027,    0.028028,    0.029029,     0.03003,    0.031031,    0.032032,    0.033033,    0.034034,    0.035035,    0.036036,    0.037037,    0.038038,    0.039039,     0.04004,    0.041041,    0.042042,    0.043043,    0.044044,    0.045045,    0.046046,    0.047047,
          0

In [14]:
# [C11] Find best.pt robustly (don‚Äôt assume a fixed path)
import glob

candidates = glob.glob("/kaggle/working/**/best.pt", recursive=True)
print("Found best.pt candidates:")
for c in candidates:
    print(" -", c)

assert len(candidates) > 0, "No best.pt found. Check training output dirs."
BEST_PT = candidates[0]  # pick first; you can choose specific if multiple
print("\nUsing BEST_PT =", BEST_PT)

Found best.pt candidates:
 - /kaggle/working/runs/detect/runs/football_yolov8m_merged/weights/best.pt

Using BEST_PT = /kaggle/working/runs/detect/runs/football_yolov8m_merged/weights/best.pt


In [15]:
 # [C12] Copy best.pt into a simple output path
import shutil, os

out_weights_dir = "/kaggle/working/exported_weights"
os.makedirs(out_weights_dir, exist_ok=True)

dst = os.path.join(out_weights_dir, "best.pt")
shutil.copy2(BEST_PT, dst)

print("Copied to:", dst)

Copied to: /kaggle/working/exported_weights/best.pt


In [16]:
# [C13] Zip all training outputs (runs/) + exported weights into one file for 1-click download
import os, zipfile
from pathlib import Path

ZIP_PATH = "/kaggle/working/football_training_outputs.zip"

paths_to_zip = [
    "/kaggle/working/runs",              # all training logs/plots/weights
    "/kaggle/working/exported_weights",  # best.pt copied here (from C12)
]

def zip_dir(zipf, folder, arc_prefix):
    folder = Path(folder)
    for p in folder.rglob("*"):
        if p.is_file():
            zipf.write(p, arcname=str(Path(arc_prefix) / p.relative_to(folder)))

with zipfile.ZipFile(ZIP_PATH, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=6) as z:
    for p in paths_to_zip:
        if os.path.exists(p):
            zip_dir(z, p, arc_prefix=os.path.basename(p))
        else:
            print(f"[WARN] Skipping missing path: {p}")

print("Created:", ZIP_PATH)
print("Size (MB):", round(os.path.getsize(ZIP_PATH) / (1024 * 1024), 2))

Created: /kaggle/working/football_training_outputs.zip
Size (MB): 145.33


In [17]:
# [C14] Prepare a small folder (best.pt + data.yaml + class names) to publish as a Kaggle Dataset
import os, shutil

bundle_dir = "/kaggle/working/model_bundle"
os.makedirs(bundle_dir, exist_ok=True)

# copy weights
shutil.copy2("/kaggle/working/exported_weights/best.pt", os.path.join(bundle_dir, "best.pt"))

# copy merged data.yaml (useful for consistent class mapping)
shutil.copy2("/kaggle/working/football_merged/data.yaml", os.path.join(bundle_dir, "data.yaml"))

# write class list
with open(os.path.join(bundle_dir, "classes.txt"), "w") as f:
    f.write("\n".join(TARGET_CLASSES) + "\n")

print("Model bundle ready at:", bundle_dir)
print("Files:", os.listdir(bundle_dir))

Model bundle ready at: /kaggle/working/model_bundle
Files: ['data.yaml', 'best.pt', 'classes.txt']
