# 🌊 YOLO-UDD v2.0 - Kaggle Training (CLEAN VERSION)

**Simple 10-Step Training Pipeline** ⚡

## 📋 Before You Start:
1. **Enable GPU**: Settings → Accelerator → **GPU T4 x2** → Save
2. **Add Dataset**: 
   - Upload TrashCAN dataset as Kaggle Dataset OR
   - Use Google Drive link (already configured below)
3. **Run All Cells**: Run Cells 1-3. If Cell 3 downgrades NumPy, restart the kernel and run all cells again from Cell 1

## ⏱️ Training Time:
- **100 epochs**: ~10 hours
- **Expected mAP**: 70-72%

---

## Cell 1: Clone Repository

In [None]:
# Clone YOLO-UDD v2.0 repository
import os
import sys

WORK_DIR = '/kaggle/working'
REPO_DIR = f'{WORK_DIR}/YOLO-UDD-v2.0'

print("Cloning repository...")
os.chdir(WORK_DIR)

# Remove old directory if exists
if os.path.exists(REPO_DIR):
    import shutil
    shutil.rmtree(REPO_DIR)

# Clone from GitHub
!git clone https://github.com/kshitijkhede/YOLO-UDD-v2.0.git

# Setup paths
os.chdir(REPO_DIR)
sys.path.insert(0, REPO_DIR)

print(f"✅ Repository ready at: {REPO_DIR}")

## Cell 2: Verify Structure

In [None]:
# Check that the expected files/folders exist in the current working directory
import os

print("Checking repository structure...\n")

# Relative path -> human-readable description (paths resolve against the cwd)
required = {
    'models/': 'Model architecture',
    'scripts/': 'Training scripts', 
    'utils/': 'Utility functions',
    'configs/': 'Configuration files',
    'data/': 'Dataset handling',
    'requirements.txt': 'Dependencies',
    'scripts/train.py': 'Training script'
}

# Report every entry, and remember whether any of them was missing
all_ok = True
for path, desc in required.items():
    found = os.path.exists(path)
    print(f"✅ {path} - {desc}" if found else f"❌ {path} - MISSING!")
    all_ok = all_ok and found

print(
    "\n✅ Repository structure verified!"
    if all_ok
    else "\n❌ Some files missing! Re-run Cell 1."
)

## Cell 3: Fix NumPy Compatibility ⚠️ CRITICAL

**⚠️ YOU MUST RESTART KERNEL AFTER THIS CELL!**

Kaggle ships NumPy 2.x, which crashes TensorFlow. This cell downgrades NumPy to 1.26.4 to fix that.

In [None]:
# FIX NumPy compatibility issue
import numpy as np
import sys

print(f"Current NumPy version: {np.__version__}\n")

if np.__version__.startswith('2.'):
    print("⚠️  NumPy 2.x detected - This will crash TensorFlow!")
    print("Fixing by downgrading to NumPy 1.26.4...\n")
    
    # Uninstall NumPy 2.x
    !{sys.executable} -m pip uninstall -y numpy
    
    # Install NumPy 1.26.4
    !{sys.executable} -m pip install 'numpy==1.26.4' --force-reinstall --no-cache-dir
    
    print("\n" + "="*60)
    print("✅ NumPy 1.26.4 installed!")
    print("="*60)
    print("\n🔴 STOP! RESTART KERNEL NOW! 🔴")
    print("\nSteps:")
    print("1. Click: Session → Restart Session")
    print("2. Run ALL cells again from Cell 1")
    print("3. This cell will show 'NumPy 1.x OK' after restart")
    print("4. Then continue to Cell 4")
    print("\n💡 Why? NumPy is loaded in memory. Restart loads new version.")
    print("="*60)
    
    # Force stop execution
    raise SystemExit("⛔ RESTART KERNEL NOW!")
else:
    print(f"✅ NumPy 1.x OK ({np.__version__})")
    print("✅ Continue to Cell 4!")

## Cell 4: Check GPU

In [None]:
# Report the CUDA device PyTorch sees, or abort if there is none
import torch

print("Checking GPU...\n")

# Guard first: without CUDA there is nothing to report, so stop here
if not torch.cuda.is_available():
    print("❌ NO GPU DETECTED!")
    print("\nFix: Settings → Accelerator → GPU T4 x2 → Save")
    raise RuntimeError("GPU required for training!")

# Details are read from device index 0
gpu_name = torch.cuda.get_device_name(0)
gpu_count = torch.cuda.device_count()
gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1024**3  # bytes -> GiB

for line in (
    f"✅ GPU: {gpu_name}",
    f"✅ Count: {gpu_count}",
    f"✅ Memory: {gpu_mem:.1f} GB",
    f"✅ CUDA: {torch.version.cuda}",
    f"✅ PyTorch: {torch.__version__}",
):
    print(line)

## Cell 5: Install Dependencies

In [None]:
# Install required packages
print("Installing dependencies...\n")

!pip install -q torch>=2.0.0 torchvision>=0.15.0
!pip install -q albumentations>=1.3.0
!pip install -q opencv-python-headless>=4.7.0
!pip install -q pycocotools>=2.0.6
!pip install -q tensorboard>=2.12.0
!pip install -q tqdm pyyaml scikit-learn matplotlib seaborn

print("\n✅ All dependencies installed!")

## Cell 6: Setup Dataset

**Choose ONE method:**

### Option A: Kaggle Dataset (Recommended)
1. Upload TrashCAN dataset to Kaggle Datasets
2. Add to notebook: "+ Add Data"
3. Set `USE_KAGGLE_DATASET = True`
4. Update `KAGGLE_DATASET_PATH`

### Option B: Google Drive (Easiest - Already Configured!)
1. Just set `USE_GDRIVE = True`
2. File ID already configured below
3. Downloads automatically (~2-3 min)

In [None]:
# Dataset configuration
import os

# ============================================
# CHOOSE ONE METHOD (Set to True)
# ============================================

# Option A: Kaggle Dataset (Permanent)
USE_KAGGLE_DATASET = False
KAGGLE_DATASET_PATH = '/kaggle/input/trashcan-dataset'  # Update if needed

# Option B: Google Drive (Already configured!)
USE_GDRIVE = True  # ✅ SET THIS TO TRUE
GDRIVE_FILE_ID = '10PCbGqgVi0-XQn0EfGTTfSjwNS0JXR99'  # ✅ Already set!

# ============================================
# Automatic setup
# ============================================

DATASET_PATH = None

if USE_KAGGLE_DATASET:
    print("Using Kaggle Dataset...")
    
    if os.path.exists(KAGGLE_DATASET_PATH):
        # Check if ZIP or folder
        if os.path.isfile(KAGGLE_DATASET_PATH):
            print("Extracting ZIP...")
            !unzip -q {KAGGLE_DATASET_PATH} -d /kaggle/working/
            DATASET_PATH = '/kaggle/working/trashcan'
        else:
            # Check for trashcan subdirectory
            trashcan_path = os.path.join(KAGGLE_DATASET_PATH, 'trashcan')
            DATASET_PATH = trashcan_path if os.path.exists(trashcan_path) else KAGGLE_DATASET_PATH
        
        print(f"✅ Dataset at: {DATASET_PATH}")
    else:
        print(f"❌ NOT FOUND: {KAGGLE_DATASET_PATH}")
        print("\nSetup:")
        print("1. Upload dataset to Kaggle Datasets")
        print("2. Add to notebook: '+ Add Data'")
        print("3. Update path above")

elif USE_GDRIVE:
    print("Using Google Drive...")
    print("Installing gdown...")
    !pip install -q gdown
    
    print(f"\nDownloading dataset (File ID: {GDRIVE_FILE_ID})...")
    !gdown --id {GDRIVE_FILE_ID} -O /kaggle/working/trashcan.zip
    
    if os.path.exists('/kaggle/working/trashcan.zip'):
        size_mb = os.path.getsize('/kaggle/working/trashcan.zip') / 1024 / 1024
        print(f"✅ Downloaded: {size_mb:.1f} MB")
        
        print("Extracting...")
        !unzip -q /kaggle/working/trashcan.zip -d /kaggle/working/
        
        if os.path.exists('/kaggle/working/trashcan'):
            DATASET_PATH = '/kaggle/working/trashcan'
            print(f"✅ Extracted to: {DATASET_PATH}")
        else:
            print("❌ Extraction failed")
    else:
        print("❌ Download failed!")
        print("Check: File ID, sharing settings, internet enabled")

else:
    print("❌ NO METHOD SELECTED!")
    print("Set USE_KAGGLE_DATASET or USE_GDRIVE to True above")

# Verify dataset
print("\n" + "="*60)
if DATASET_PATH and os.path.exists(DATASET_PATH):
    print(f"✅ DATASET READY: {DATASET_PATH}\n")
    
    # Show structure
    for item in ['images', 'annotations']:
        path = os.path.join(DATASET_PATH, item)
        if os.path.exists(path):
            if item == 'images':
                # Count subdirectories
                for subdir in ['train', 'val', 'test']:
                    subpath = os.path.join(path, subdir)
                    if os.path.exists(subpath):
                        count = len([f for f in os.listdir(subpath) if f.endswith(('.jpg', '.png'))])
                        print(f"  📁 {item}/{subdir}: {count:,} images")
            else:
                # Count annotation files
                for subdir in ['train', 'val', 'test']:
                    subpath = os.path.join(path, subdir)
                    if os.path.exists(subpath):
                        files = [f for f in os.listdir(subpath) if f.endswith('.json')]
                        print(f"  📁 {item}/{subdir}: {len(files)} annotation files")
    
    print("\n✅ Dataset structure verified!")
else:
    print("❌ DATASET NOT READY!")
    print("Fix: Choose a method above and re-run this cell")

print("="*60)

## Cell 7: Test Model Loading

In [None]:
# Smoke test: import the model, build it, and push one random batch through it
import os
import sys
import torch

# The model package is imported relative to the repository root
REPO_DIR = '/kaggle/working/YOLO-UDD-v2.0'
os.chdir(REPO_DIR)
if REPO_DIR not in sys.path:
    sys.path.insert(0, REPO_DIR)

print("Building YOLO-UDD v2.0 model...\n")

# Deferred import: only resolvable once REPO_DIR is on sys.path
from models.yolo_udd import build_yolo_udd

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = build_yolo_udd(num_classes=22).to(device)  # TrashCAN has 22 classes

# Tally parameter counts from a single snapshot of the parameter list
param_info = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(size for size, _ in param_info)
trainable_params = sum(size for size, needs_grad in param_info if needs_grad)

for line in (
    f"✅ Model: YOLO-UDD v2.0",
    f"✅ Classes: 22",
    f"✅ Device: {device}",
    f"✅ Parameters: {total_params:,} total, {trainable_params:,} trainable",
):
    print(line)

# One 640x640 RGB sample of random noise, with gradients disabled
print("\nTesting forward pass...")
x = torch.randn(1, 3, 640, 640).to(device)
with torch.no_grad():
    # The model is unpacked as a (predictions, turbidity score) pair
    predictions, turb_score = model(x)

print(f"✅ Forward pass OK!")
print(f"✅ Turbidity score: {turb_score.item():.4f}")
print(f"✅ Detection scales: {len(predictions)}")

## Cell 8: Configure Training

In [None]:
# Define the training hyperparameters and make sure the output folder exists
import os

# Hyperparameters (consumed by the training cell)
EPOCHS = 100
BATCH_SIZE = 8
LEARNING_RATE = 0.01
SAVE_DIR = '/kaggle/working/runs/train'

_rule = "="*60

print(_rule)
print("Training Configuration")
print(_rule)
for _label, _value in (
    ("Epochs", EPOCHS),
    ("Batch Size", BATCH_SIZE),
    ("Learning Rate", LEARNING_RATE),
):
    print(f"{_label}: {_value}")
# DATASET_PATH comes from the dataset setup cell
print(f"Dataset: {DATASET_PATH}")
print(f"Save Directory: {SAVE_DIR}")
print(f"\nEstimated Time: ~10 hours")
print(f"Expected mAP: 70-72%")
print(_rule)

# exist_ok=True keeps re-runs of this cell from failing on an existing folder
os.makedirs(SAVE_DIR, exist_ok=True)
print(f"\n✅ Configuration ready!")

## Cell 9: Start Training ⚡

**⏱️ This will take ~10 hours**

Training will:
- Save checkpoints automatically
- Show progress in real-time
- Save best model as `best.pt`
- Save results to `/kaggle/working/runs/train/`

In [None]:
# Start training with CORRECT arguments
print("="*60)
print("🚀 STARTING TRAINING")
print("="*60)
print(f"Training for {EPOCHS} epochs (~10 hours)")
print(f"Expected mAP: 70-72%")
print("="*60)
print()

# Run training script - FIXED ARGUMENTS!
!python scripts/train.py \
    --config configs/train_config.yaml \
    --data-dir {DATASET_PATH} \
    --epochs {EPOCHS} \
    --batch-size {BATCH_SIZE} \
    --lr {LEARNING_RATE} \
    --save-dir {SAVE_DIR}

## Cell 10: Check Results & Download

In [None]:
# Check training results
#
# Lists everything under SAVE_DIR, looks for the best checkpoint, and prints a
# closing banner that matches the outcome.
import os

print("="*60)
print("Training Results")
print("="*60)

# Path of the best checkpoint once located; stays None if none is found
best_pt = None

if os.path.exists(SAVE_DIR):
    print(f"\n📁 Results: {SAVE_DIR}\n")
    
    # List all files
    for root, dirs, files in os.walk(SAVE_DIR):
        level = root.replace(SAVE_DIR, '').count(os.sep)
        indent = '  ' * level
        print(f"{indent}{os.path.basename(root)}/")
        sub_indent = '  ' * (level + 1)
        for fname in files:
            full_path = os.path.join(root, fname)
            size_mb = os.path.getsize(full_path) / (1024*1024)
            print(f"{sub_indent}{fname} ({size_mb:.1f} MB)")
            # os.walk is top-down, so a best.pt directly in SAVE_DIR is seen
            # first; one saved in a subfolder is still picked up rather than
            # being reported as missing.
            if fname == 'best.pt' and best_pt is None:
                best_pt = full_path
    
    # Check for best checkpoint
    if best_pt is not None:
        size_mb = os.path.getsize(best_pt) / (1024*1024)
        print(f"\n✅ BEST MODEL: {best_pt} ({size_mb:.1f} MB)")
        print("\n📥 Download from Output section!")
        print("\n🎉 Training complete! Expected mAP: 70-72%")
    else:
        print("\n⚠️  best.pt not found - training may have failed")
else:
    print(f"\n❌ Results directory not found: {SAVE_DIR}")
    print("Training may not have started or failed early.")

print("="*60)

# Closing banner: only announce success when a checkpoint actually exists
print("\n" + "="*60)
if best_pt is not None:
    print("🎉 ALL DONE!")
    print("="*60)
    print("Next steps:")
    print("1. Download best.pt from Output folder")
    print("2. Use for evaluation or inference")
    print("3. Expected performance: 70-72% mAP@50:95")
else:
    print("⚠️  TRAINING DID NOT PRODUCE best.pt")
    print("="*60)
    print("Next steps:")
    print("1. Check the output of Cell 9 for errors")
    print("2. Fix the issue and re-run Cell 9")
print("="*60)