# 🌊 YOLO-UDD v2.0 - Kaggle Training

**Simple 6-Step Training - No Crashes, No Loops!** ⚡

## 📋 Before You Start:
1. **Enable GPU**: Settings → Accelerator → **GPU T4 x2** → Save
2. **Dataset**: Google Drive link already configured (automatic download)
3. **Run**: Execute cells 1-6 in order OR click "Run All"

## ⏱️ Training Info:
- **Time**: ~10 hours (100 epochs)
- **Expected mAP**: 70-72%
- **No restarts needed!** ✅

---

## Cell 1: Environment Setup

In [None]:
# Complete environment setup
import os
import sys

print("="*70)
print("🔧 CELL 1: Environment Setup")
print("="*70)

# Check and fix NumPy version FIRST (before any other imports)
print("\n[Step 1/3] Checking NumPy version...")
try:
    import numpy as np
    numpy_ver = np.__version__
    
    if numpy_ver.startswith('2.'):
        print(f"  ⚠️  NumPy {numpy_ver} detected (will cause TensorFlow crashes)")
        print("  🔧 Installing NumPy 1.26.4...")
        
        # Use pip directly with quiet mode
        !pip uninstall -y numpy > /dev/null 2>&1
        !pip install -q numpy==1.26.4
        
        print("  ✅ NumPy 1.26.4 installed")
        print("  ℹ️  If you see import errors later, just re-run this cell")
    else:
        print(f"  ✅ NumPy {numpy_ver} OK")
except Exception as e:
    print(f"  ⚠️  NumPy check issue: {e}")
    print("  📦 Installing NumPy 1.26.4...")
    !pip install -q numpy==1.26.4
    print("  ✅ Installed")

# Setup directories
print("\n[Step 2/3] Setting up directories...")
WORK_DIR = '/kaggle/working'
REPO_DIR = f'{WORK_DIR}/YOLO-UDD-v2.0'

os.chdir(WORK_DIR)
print(f"  ✅ Working directory: {WORK_DIR}")

# Clone repository
print("\n[Step 3/3] Cloning repository...")
if os.path.exists(REPO_DIR):
    import shutil
    shutil.rmtree(REPO_DIR)

!git clone -q https://github.com/kshitijkhede/YOLO-UDD-v2.0.git

if os.path.exists(REPO_DIR):
    os.chdir(REPO_DIR)
    if REPO_DIR not in sys.path:
        sys.path.insert(0, REPO_DIR)
    print(f"  ✅ Repository ready: {REPO_DIR}")
else:
    raise Exception("Clone failed!")

print("\n" + "="*70)
print("✅ Cell 1 Complete - Environment Ready!")
print("="*70)

## Cell 2: Verify & Install Dependencies

In [None]:
# Verify setup and install dependenciesimport osimport torchprint("="*70)print("�� CELL 2: Verification & Dependencies")print("="*70)# Verify repository structureprint("\n[Step 1/4] Verifying repository...")required = ['models/', 'scripts/', 'utils/', 'configs/', 'scripts/train.py']all_ok = Truefor item in required:    if os.path.exists(item):        print(f"  ✅ {item}")    else:        print(f"  ❌ {item} MISSING")        all_ok = Falseif not all_ok:    raise Exception("Repository incomplete! Re-run Cell 1")# Check GPUprint("\n[Step 2/4] Checking GPU...")if torch.cuda.is_available():    print(f"  ✅ GPU: {torch.cuda.get_device_name(0)}")    print(f"  ✅ Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")else:    print("  ❌ NO GPU! Enable: Settings → GPU T4 x2")    raise RuntimeError("GPU required!")# Verify NumPy versionprint("\n[Step 3/4] Verifying NumPy...")import numpy as npprint(f"  📦 NumPy version: {np.__version__}")if np.__version__.startswith('2.'):    print(f"  ❌ ERROR: NumPy {np.__version__} still active!")    print(f"  🔧 FIX: Re-run Cell 1, then restart kernel")    raise RuntimeError("NumPy 2.x detected! 
Re-run Cell 1 and restart kernel.")else:    print(f"  ✅ NumPy {np.__version__} OK")# Install dependencies - FORCE REINSTALL to rebuild against NumPy 1.xprint("\n[Step 4/4] Installing dependencies (takes ~3 min)...")print("  🔧 Force reinstalling packages to rebuild against NumPy 1.26.4...")# Uninstall problematic packages first!pip uninstall -y tensorboard tensorflow keras scikit-learn matplotlib > /dev/null 2>&1# Install with force reinstall to rebuild against NumPy 1.x!pip install -q --force-reinstall --no-cache-dir tensorboard scikit-learn matplotlib seaborn!pip install -q torch>=2.0.0 torchvision>=0.15.0 albumentations>=1.3.0 \    opencv-python-headless>=4.7.0 pycocotools>=2.0.6 tqdm pyyamlprint("  ✅ All dependencies installed and rebuilt against NumPy 1.26.4")print("\n" + "="*70)print("✅ Cell 2 Complete - System Ready!")print("="*70)

## Cell 3: Setup Dataset

**Dataset will download automatically from Google Drive (~170 MB, takes 2-3 min)**

Alternative: add your own dataset to this notebook on Kaggle, set `USE_KAGGLE_DATASET = True`, and point `KAGGLE_DATASET_PATH` at it.

In [None]:
# Dataset setup
import os

print("="*70)
print("📦 CELL 3: Dataset Setup")
print("="*70)

# ============================================
# CONFIGURATION
# ============================================
USE_KAGGLE_DATASET = False  # Set True if you added dataset to Kaggle
KAGGLE_DATASET_PATH = '/kaggle/input/trashcan-dataset'

USE_GDRIVE = True  # ✅ Default: Download from Google Drive
GDRIVE_FILE_ID = '10PCbGqgVi0-XQn0EfGTTfSjwNS0JXR99'
# ============================================

DATASET_PATH = None

if USE_KAGGLE_DATASET:
    print("\n📂 Using Kaggle Dataset...")
    if os.path.exists(KAGGLE_DATASET_PATH):
        if os.path.isfile(KAGGLE_DATASET_PATH):
            print("  📦 Extracting...")
            !unzip -q {KAGGLE_DATASET_PATH} -d /kaggle/working/
            DATASET_PATH = '/kaggle/working/trashcan'
        else:
            trashcan = os.path.join(KAGGLE_DATASET_PATH, 'trashcan')
            DATASET_PATH = trashcan if os.path.exists(trashcan) else KAGGLE_DATASET_PATH
        print(f"  ✅ Dataset: {DATASET_PATH}")
    else:
        print(f"  ❌ NOT FOUND: {KAGGLE_DATASET_PATH}")

elif USE_GDRIVE:
    print("\n☁️  Downloading from Google Drive...")
    print("  📦 Installing gdown...")
    !pip install -q gdown
    
    print("  ⬇️  Downloading dataset (~170 MB, 2-3 min)...")
    !gdown --id {GDRIVE_FILE_ID} -O /kaggle/working/trashcan.zip --quiet
    
    if os.path.exists('/kaggle/working/trashcan.zip'):
        size = os.path.getsize('/kaggle/working/trashcan.zip') / 1024 / 1024
        print(f"  ✅ Downloaded: {size:.1f} MB")
        
        print("  📦 Extracting...")
        !unzip -q /kaggle/working/trashcan.zip -d /kaggle/working/
        
        if os.path.exists('/kaggle/working/trashcan'):
            DATASET_PATH = '/kaggle/working/trashcan'
            print(f"  ✅ Dataset: {DATASET_PATH}")
        else:
            print("  ❌ Extraction failed")
    else:
        print("  ❌ Download failed")
else:
    print("\n❌ No method selected! Set USE_KAGGLE_DATASET or USE_GDRIVE = True")

# Verify dataset
print("\n" + "="*70)
if DATASET_PATH and os.path.exists(DATASET_PATH):
    print(f"✅ DATASET READY: {DATASET_PATH}")
    
    # Count images
    if os.path.exists(os.path.join(DATASET_PATH, 'images')):
        for split in ['train', 'val', 'test']:
            img_path = os.path.join(DATASET_PATH, 'images', split)
            if os.path.exists(img_path):
                count = len([f for f in os.listdir(img_path) if f.endswith(('.jpg', '.png'))])
                print(f"  📁 {split}: {count:,} images")
    
    print("\n" + "="*70)
    print("✅ Cell 3 Complete - Dataset Ready!")
    print("="*70)
else:
    print("❌ DATASET NOT READY!")
    raise Exception("Dataset setup failed!")

## Cell 4: Build & Test Model

In [None]:
# Cell 4: build the YOLO-UDD model from the cloned repository and run one
# forward pass on random input as a smoke test.
import os
import sys
import torch

print("="*70)
print("🏗️  CELL 4: Build Model")
print("="*70)

# Ensure correct paths
REPO_DIR = '/kaggle/working/YOLO-UDD-v2.0'
NUM_CLASSES = 22   # TrashCAN has 22 classes
IMG_SIZE = 640     # side length of the square dummy input used below

if not os.path.isdir(REPO_DIR):
    raise RuntimeError(f"Repository not found: {REPO_DIR} - re-run Cell 1")

# The model package is imported from the repo root, so the root must be both
# the working directory and on sys.path.
os.chdir(REPO_DIR)
if REPO_DIR not in sys.path:
    sys.path.insert(0, REPO_DIR)

print("\n[Step 1/2] Building model...")
from models.yolo_udd import build_yolo_udd

model = build_yolo_udd(num_classes=NUM_CLASSES)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

total_params = sum(p.numel() for p in model.parameters())
print(f"  ✅ Model: YOLO-UDD v2.0")
print(f"  ✅ Classes: {NUM_CLASSES}")
print(f"  ✅ Device: {device}")
print(f"  ✅ Parameters: {total_params:,}")

# Test forward pass
print("\n[Step 2/2] Testing model...")
x = torch.randn(1, 3, IMG_SIZE, IMG_SIZE).to(device)
with torch.no_grad():
    # The model is unpacked as a (predictions, turbidity score) pair.
    # NOTE(review): .item() below assumes the score is a single-element
    # tensor for a batch of one - confirm against models/yolo_udd.py.
    predictions, turb_score = model(x)

print(f"  ✅ Forward pass successful")
print(f"  ✅ Turbidity score: {turb_score.item():.4f}")

print("\n" + "="*70)
print("✅ Cell 4 Complete - Model Ready!")
print("="*70)

## Cell 5: Start Training ⚡

**⏱️ This will take ~10 hours for 100 epochs**

Training automatically:
- Saves checkpoints every epoch
- Shows a progress bar while training
- Saves best model as `best.pt`
- Results saved to `/kaggle/working/runs/train/`

In [None]:
# Configure and start training
import os

print("="*70)
print("🚀 CELL 5: TRAINING")
print("="*70)

# Training configuration
EPOCHS = 100
BATCH_SIZE = 8
LEARNING_RATE = 0.01
SAVE_DIR = '/kaggle/working/runs/train'

print(f"\n📊 Configuration:")
print(f"  • Epochs: {EPOCHS}")
print(f"  • Batch Size: {BATCH_SIZE}")
print(f"  • Learning Rate: {LEARNING_RATE}")
print(f"  • Dataset: {DATASET_PATH}")
print(f"  • Save Dir: {SAVE_DIR}")
print(f"\n⏱️  Estimated Time: ~10 hours")
print(f"🎯 Expected mAP: 70-72%")

os.makedirs(SAVE_DIR, exist_ok=True)

print("\n" + "="*70)
print("🚀 TRAINING STARTING...")
print("="*70)
print()

# Run training with correct arguments
!python scripts/train.py \
    --config configs/train_config.yaml \
    --data-dir {DATASET_PATH} \
    --epochs {EPOCHS} \
    --batch-size {BATCH_SIZE} \
    --lr {LEARNING_RATE} \
    --save-dir {SAVE_DIR}

## Cell 6: Check Results & Download

In [None]:
# Cell 6: list everything under the training output directory and report
# where the best checkpoint (best.pt) is.
import os

print("="*70)
print("📊 CELL 6: Results")
print("="*70)

# Reuse SAVE_DIR from the training cell when it is defined; fall back to the
# same default so this cell still works after a kernel restart or reconnect.
SAVE_DIR = globals().get('SAVE_DIR', '/kaggle/working/runs/train')

if os.path.exists(SAVE_DIR):
    print(f"\n📁 Results: {SAVE_DIR}\n")

    best_candidates = []

    # List files as an indented tree
    for root, dirs, files in os.walk(SAVE_DIR):
        dirs.sort()  # in-place sort gives os.walk a deterministic order
        rel = os.path.relpath(root, SAVE_DIR)
        # Depth below SAVE_DIR: 0 for the root itself, 1 for its children, ...
        level = 0 if rel == os.curdir else rel.count(os.sep) + 1
        indent = '  ' * level
        print(f"{indent}{os.path.basename(root)}/")
        sub_indent = '  ' * (level + 1)
        for file in sorted(files):
            full_path = os.path.join(root, file)
            size = os.path.getsize(full_path) / (1024*1024)
            print(f"{sub_indent}{file} ({size:.1f} MB)")
            if file == 'best.pt':
                best_candidates.append(full_path)

    # Check for best checkpoint. os.walk is top-down, so a best.pt directly
    # in SAVE_DIR is found first; otherwise a nested one is used.
    best_pt = best_candidates[0] if best_candidates else None
    if best_pt is not None:
        size = os.path.getsize(best_pt) / (1024*1024)
        print("\n" + "="*70)
        print("✅ TRAINING COMPLETE!")
        print("="*70)
        print(f"\n🏆 Best Model: {best_pt}")
        print(f"📦 Size: {size:.1f} MB")
        print(f"\n📥 DOWNLOAD: Check 'Output' section in right sidebar →")
        print(f"🎯 Expected Performance: 70-72% mAP@50:95")
        print(f"\n🎉 Success! Model ready for deployment!")
        print("="*70)
    else:
        print("\n⚠️  best.pt not found - check if training completed")
else:
    print(f"\n❌ Results not found: {SAVE_DIR}")
    print("Training may have failed or not started.")