# 🌊 YOLO-UDD v2.0 - Underwater Debris Detection (KAGGLE)

**Complete Training Pipeline on Kaggle with GPU** ⚡

## 🚀 Quick Start:
1. **Upload Dataset**: Add TrashCAN dataset as Kaggle Dataset
2. **Enable GPU**: Settings → Accelerator → GPU T4 x2 → Save
3. **Run All**: Run all cells sequentially
4. **Download Results**: Download trained model from Output folder

## ⚙️ Configuration:
- **Epochs**: 100 (reduced for faster training ~10 hours)
- **Batch Size**: 8
- **Classes**: 22 (matches TrashCAN dataset)
- **Expected mAP**: 70-72%

---

## Step 1: Setup Environment

In [None]:
# Clone repository
import os
import sys

# Kaggle uses /kaggle/working directory
WORK_DIR = '/kaggle/working'
REPO_DIR = f'{WORK_DIR}/YOLO-UDD-v2.0'

print("="*60)
print("Step 1: Cloning Repository")
print("="*60)

# Ensure we're in working directory
try:
    os.chdir(WORK_DIR)
    print(f"✓ Changed to working directory: {os.getcwd()}")
except Exception as e:
    print(f"✗ Error changing directory: {e}")
    raise

# Remove existing directory if present
if os.path.exists(REPO_DIR):
    import shutil
    shutil.rmtree(REPO_DIR)
    print("✓ Cleaned existing directory")

# Clone repository
print("\nCloning repository from GitHub...")
!git clone https://github.com/kshitijkhede/YOLO-UDD-v2.0.git

# Verify clone succeeded
if not os.path.exists(REPO_DIR):
    print(f"\n✗ ERROR: Repository not cloned!")
    print(f"   Expected location: {REPO_DIR}")
    raise FileNotFoundError("Failed to clone repository. Please check internet connection and repository URL.")

# Change to repo directory
try:
    os.chdir(REPO_DIR)
    print(f"\n✓ Changed to repository directory: {os.getcwd()}")
except Exception as e:
    print(f"\n✗ Error changing to repo directory: {e}")
    raise

# Add to Python path
if REPO_DIR not in sys.path:
    sys.path.insert(0, REPO_DIR)
    print(f"✓ Added to Python path: {REPO_DIR}")

# Verify we're in the right place
print(f"\n✓ Current directory: {os.getcwd()}")
print(f"✓ Python path includes: {REPO_DIR}")
print("="*60)

In [None]:
# Report which of the expected repository directories and files exist,
# relative to the current working directory.
import os

divider = "=" * 60
print(divider)
print("📂 Repository Structure")
print(divider)

required_dirs = ['models', 'scripts', 'data', 'utils', 'configs']
required_files = ['requirements.txt', 'models/__init__.py', 'scripts/train.py']


def _mark(path):
    """Return a check mark when *path* exists, a cross otherwise."""
    if os.path.exists(path):
        return "✓"
    return "✗"


# Directories first, printed with a trailing slash.
for folder in required_dirs:
    print(f"{_mark(folder)} {folder}/")

print()

# Then the individual files.
for filename in required_files:
    print(f"{_mark(filename)} {filename}")

print(divider)

In [None]:
# Diagnostic cell: show the cwd, the head of sys.path, and whether the
# `models` package directory is visible from the cwd.
import os
import sys

divider = "=" * 60
print(divider)
print("🔍 Module Import Diagnostics")
print(divider)

print(f"\nCurrent working directory:")
print(f"  {os.getcwd()}")

print(f"\nPython sys.path (first 3 entries):")
for position, entry in enumerate(sys.path[:3], start=1):
    print(f"  {position}. {entry}")

print(f"\nChecking for models module:")
models_path = os.path.join(os.getcwd(), 'models')

if not os.path.exists(models_path):
    print(f"  ✗ models/ directory NOT FOUND!")
    print(f"  ✗ Expected at: {models_path}")
    print(f"\n  Available directories:")
    # Names come from the cwd listing, so the relative isdir() check is valid.
    visible_dirs = [name for name in os.listdir(os.getcwd()) if os.path.isdir(name)]
    for name in visible_dirs:
        print(f"    📁 {name}/")
else:
    print(f"  ✓ models/ directory exists at: {models_path}")
    has_init = os.path.exists(os.path.join(models_path, '__init__.py'))
    has_model = os.path.exists(os.path.join(models_path, 'yolo_udd.py'))
    if has_init:
        print(f"  ✓ models/__init__.py exists")
    if has_model:
        print(f"  ✓ models/yolo_udd.py exists")

print(divider)

In [None]:
# Recovery cell: make sure the cwd is the repository checkout, that it is on
# sys.path, and that the model source files the notebook imports are present.
import os
import sys

divider = "=" * 60
print(divider)
print("🔧 Fixing Module Import Path")
print(divider)

current_dir = os.getcwd()
print(f"\nCurrent directory: {current_dir}")

# If the cwd does not look like the checkout, probe the usual locations.
if 'YOLO-UDD-v2.0' not in current_dir:
    print("\n⚠️  Not in YOLO-UDD-v2.0 directory!")

    possible_paths = [
        '/kaggle/working/YOLO-UDD-v2.0',
        '/kaggle/YOLO-UDD-v2.0',
        os.path.join(os.getcwd(), 'YOLO-UDD-v2.0')
    ]

    # First candidate that exists wins; None means nothing was found.
    located = next((candidate for candidate in possible_paths if os.path.exists(candidate)), None)
    if located is None:
        print("✗ Could not find YOLO-UDD-v2.0 directory!")
        print("  Please re-run the clone cell (Cell 3)")
    else:
        os.chdir(located)
        current_dir = os.getcwd()
        print(f"✓ Changed to: {current_dir}")

# Ensure repo is in Python path
if current_dir not in sys.path:
    sys.path.insert(0, current_dir)
    print(f"✓ Added to sys.path: {current_dir}")

# Verify models can be imported
print("\n🔍 Verifying module availability...")
models_path = os.path.join(current_dir, 'models')

if not os.path.exists(models_path):
    print(f"  ✗ models/ NOT FOUND at: {models_path}")
    print("\n  Available directories:")
    for name in os.listdir(current_dir):
        if os.path.isdir(os.path.join(current_dir, name)):
            print(f"    📁 {name}/")
    print("\n❌ Repository clone failed!")
    print("   → Re-run Cell 3 (Clone Repository)")
else:
    print(f"  ✓ models/ exists at: {models_path}")

    # Source files the notebook expects inside models/.
    required_files = ['__init__.py', 'yolo_udd.py', 'psem.py', 'sdwh.py', 'tafm.py']
    missing = []
    for name in required_files:
        if os.path.exists(os.path.join(models_path, name)):
            print(f"  ✓ {name}")
        else:
            print(f"  ✗ {name} MISSING!")
            missing.append(name)
    all_present = not missing

    if all_present:
        print("\n✅ All model files present - import should work!")
    else:
        print("\n❌ Some files missing - clone may be incomplete")
        print("   → Re-run Cell 3 (Clone Repository)")

print(divider)

## CRITICAL FIX: NumPy Compatibility

**⚠️ IMPORTANT**: Kaggle has NumPy 2.x by default, but TensorFlow/scikit-learn require NumPy 1.x.
This fix prevents training crashes!

In [None]:
# ============================================================
# CRITICAL FIX: Force NumPy 1.x Installation
# ============================================================

print("="*60)
print("🔧 FIXING NumPy Compatibility Issue")
print("="*60)

# Check current NumPy version
import numpy as np
current_version = np.__version__
print(f"\n📌 Current NumPy version: {current_version}")

if current_version.startswith('2.'):
    print("\n⚠️  NumPy 2.x detected - this WILL crash TensorFlow/scikit-learn!")
    print("Forcing downgrade to NumPy 1.x...\n")
    
    # Force uninstall NumPy 2.x
    import sys
    !{sys.executable} -m pip uninstall -y numpy
    
    # Install NumPy 1.x with force reinstall
    !{sys.executable} -m pip install 'numpy==1.26.4' --force-reinstall --no-cache-dir
    
    # Verify the fix worked
    print("\n" + "="*60)
    print("✅ Verifying Fix...")
    print("="*60)
    print("✓ NumPy 1.26.4 has been installed!")
    print("✓ SUCCESS! Training will now work without crashes.")
    print("\n⚠️  IMPORTANT: You MUST restart the kernel now!")
    print("   Click: Kernel → Restart Kernel")
    print("   Then run all cells again from Cell 1.")
else:
    print(f"✓ NumPy 1.x already installed - no fix needed!")
    print("✓ Training should work correctly.")

print("="*60)

In [None]:
# Report the CUDA device PyTorch sees and abort the notebook when there is none.
import torch

divider = "=" * 60
print(divider)
print("🔥 GPU Status Check")
print(divider)

# Fail fast: without a CUDA device the run stops here.
if not torch.cuda.is_available():
    print("✗ GPU NOT AVAILABLE!")
    print("⚠️  Please enable GPU: Settings → Accelerator → GPU T4 x2 → Save")
    raise RuntimeError("GPU not available. Training will be extremely slow on CPU.")

print(f"✓ GPU Available: {torch.cuda.get_device_name(0)}")
print(f"✓ GPU Count: {torch.cuda.device_count()}")
print(f"✓ CUDA Version: {torch.version.cuda}")
print(f"✓ PyTorch Version: {torch.__version__}")

# total_memory is reported in bytes; convert to GiB for display.
device_props = torch.cuda.get_device_properties(0)
gpu_mem = device_props.total_memory / 1024**3
print(f"✓ GPU Memory: {gpu_mem:.1f} GB")

print(divider)

## Step 2: Install Dependencies

In [None]:
# Install required packages
print("Installing dependencies...\n")

# Install from requirements.txt
!pip install -q torch>=2.0.0 torchvision>=0.15.0
!pip install -q albumentations>=1.3.0
!pip install -q opencv-python-headless>=4.7.0
!pip install -q pycocotools>=2.0.6
!pip install -q tensorboard>=2.12.0
!pip install -q tqdm pyyaml
!pip install -q scikit-learn matplotlib seaborn

print("\n✓ All dependencies installed successfully!")

## Step 3: Setup Dataset

**Choose ONE of the following methods:**

### **METHOD 1: Kaggle Dataset (Recommended)**
1. Go to: https://www.kaggle.com/datasets
2. Click "New Dataset"
3. Upload TrashCAN dataset ZIP
4. Add to notebook: "Add Data" → Search for your dataset
5. Update `KAGGLE_DATASET_PATH` in the cell below

### **METHOD 2: Google Drive (Alternative - Easiest)**
1. Upload your TrashCAN dataset folder to Google Drive
2. Share the folder/file publicly
3. Get the file ID from the share link
4. Update `GDRIVE_FILE_ID` in the cell below
5. Set `USE_GDRIVE = True`

### **METHOD 3: Direct Upload in Notebook**
1. ZIP your dataset locally
2. Upload ZIP to Kaggle notebook directly (< 500MB recommended)
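3. Keep `USE_KAGGLE_DATASET = True` and set `KAGGLE_DATASET_PATH` in the cell below to the path of the uploaded ZIP (the cell defines no `USE_LOCAL_UPLOAD` flag; a path that points to a ZIP file is extracted to `/kaggle/working/` automatically)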

In [None]:
# Dataset source selection. Enable exactly one method below; the rest of the
# cell resolves DATASET_PATH from that choice.
import os

# ------------------------------------------------------------
# User configuration
# ------------------------------------------------------------

# Method 1 (recommended): a dataset attached to the notebook under /kaggle/input.
USE_KAGGLE_DATASET = True  # True -> read from KAGGLE_DATASET_PATH
KAGGLE_DATASET_PATH = '/kaggle/input/trashcan-dataset'  # change to match your dataset's name

# Method 2 (fallback): download a ZIP from Google Drive on every run.
USE_GDRIVE = False  # True -> download GDRIVE_FILE_ID
GDRIVE_FILE_ID = '10PCbGqgVi0-XQn0EfGTTfSjwNS0JXR99'  # Drive file ID of the dataset ZIP

# ------------------------------------------------------------
# Automatic setup starts here
# ------------------------------------------------------------

# Stays None until one of the methods resolves a usable directory.
DATASET_PATH = None

divider = "=" * 60
print(divider)
print("📦 Dataset Setup")
print(divider)

# METHOD 1: Kaggle Dataset
if USE_KAGGLE_DATASET:
    print("\n🌟 Using Kaggle Dataset (Permanent Storage)...")
    print("   ✅ Fast: Instant access (no download)")
    print("   ✅ Persistent: Never deleted")
    print("   ✅ Free: No quota usage")
    
    # Check if dataset exists
    if os.path.exists(KAGGLE_DATASET_PATH):
        # Check if it's the ZIP or extracted folder
        if os.path.isfile(KAGGLE_DATASET_PATH):
            print(f"\n✓ Found ZIP at: {KAGGLE_DATASET_PATH}")
            print("Extracting...")
            !unzip -q {KAGGLE_DATASET_PATH} -d /kaggle/working/
            
            if os.path.exists('/kaggle/working/trashcan'):
                DATASET_PATH = '/kaggle/working/trashcan'
                print(f"✓ Extracted to: {DATASET_PATH}")
        elif os.path.isdir(KAGGLE_DATASET_PATH):
            # Check for trashcan subdirectory
            trashcan_path = os.path.join(KAGGLE_DATASET_PATH, 'trashcan')
            if os.path.exists(trashcan_path):
                DATASET_PATH = trashcan_path
            else:
                DATASET_PATH = KAGGLE_DATASET_PATH
            print(f"✓ Using dataset at: {DATASET_PATH}")
    else:
        print(f"\n❌ Dataset NOT FOUND at: {KAGGLE_DATASET_PATH}")
        print("\n📝 FIRST TIME SETUP REQUIRED:")
        print("   1. Go to: https://www.kaggle.com/datasets")
        print("   2. Click: '+ New Dataset'")
        print("   3. Upload: trashcan.zip (170 MB)")
        print("   4. Title: 'TrashCAN Dataset'")
        print("   5. Click: 'Create'")
        print("   6. In this notebook: '+ Add Data' → Search 'TrashCAN'")
        print("   7. Update KAGGLE_DATASET_PATH above if needed")
        print("\n   Available datasets:")
        if os.path.exists('/kaggle/input'):
            for item in os.listdir('/kaggle/input'):
                print(f"     📁 /kaggle/input/{item}")

# METHOD 2: Google Drive
elif USE_GDRIVE:
    print("\n🔄 Using Google Drive (Downloads Each Time)...")
    print("   ⚠️  Slower: 2-3 min download")
    print("   ⚠️  Temporary: Deleted after session")
    print("   ⚠️  Quota: Uses internet quota")
    print("\n💡 TIP: Consider using Kaggle Dataset instead!")
    
    print("\nInstalling gdown...")
    !pip install -q gdown
    
    print(f"\nDownloading dataset from Google Drive...")
    print(f"File ID: {GDRIVE_FILE_ID}")
    !gdown --id {GDRIVE_FILE_ID} -O /kaggle/working/trashcan.zip
    
    if os.path.exists('/kaggle/working/trashcan.zip'):
        file_size = os.path.getsize('/kaggle/working/trashcan.zip') / 1024 / 1024
        print(f"\n✓ Downloaded: {file_size:.1f} MB")
        print("Extracting...")
        !unzip -q /kaggle/working/trashcan.zip -d /kaggle/working/
        
        if os.path.exists('/kaggle/working/trashcan'):
            DATASET_PATH = '/kaggle/working/trashcan'
            print(f"✓ Extracted to: {DATASET_PATH}")
        else:
            print("❌ Extraction failed - trashcan folder not found")
            print("   Check ZIP structure")
    else:
        print("❌ Download failed! Check:")
        print("  1. File ID is correct")
        print("  2. File sharing: 'Anyone with link can view'")
        print("  3. Internet is enabled in Kaggle settings")

else:
    print("\n❌ NO METHOD SELECTED!")
    print("   Please set either USE_KAGGLE_DATASET or USE_GDRIVE to True above")

# Summarise the resolved dataset directory, or explain how to recover when
# no usable directory was resolved.
print("\n" + "="*60)

dataset_ready = bool(DATASET_PATH) and os.path.exists(DATASET_PATH)

if not dataset_ready:
    print("❌ DATASET NOT READY!")
    print("\nPlease:")
    print("1. Choose ONE method above (set to True)")
    print("2. Follow the setup instructions")
    print("3. Re-run this cell")
else:
    print(f"✅ DATASET READY: {DATASET_PATH}")
    print("\n📂 Dataset structure:")

    # One line per top-level entry: child count for folders, size for files.
    for entry in os.listdir(DATASET_PATH):
        entry_path = os.path.join(DATASET_PATH, entry)
        if not os.path.isdir(entry_path):
            size_kb = os.path.getsize(entry_path) / 1024
            print(f"  📄 {entry} ({size_kb:.1f} KB)")
            continue
        child_count = len(os.listdir(entry_path))
        print(f"  📁 {entry}/ ({child_count} items)")

    # Spot-check the two sub-folders this cell looks for.
    images_dir = os.path.join(DATASET_PATH, 'images')
    annotations_dir = os.path.join(DATASET_PATH, 'annotations')

    if os.path.exists(images_dir):
        img_count = sum(1 for name in os.listdir(images_dir) if name.endswith('.jpg'))
        print(f"\n✅ Found {img_count:,} images")
    if os.path.exists(annotations_dir):
        ann_files = os.listdir(annotations_dir)
        print(f"✅ Found {len(ann_files)} annotation files")

print("="*60)


## Step 4: Build Model

In [None]:
# Instantiate the detector, report its size, and run one dummy forward pass
# as a smoke test.
from models.yolo_udd import build_yolo_udd
import torch

divider = "=" * 60
print(divider)
print("🏗️  Building YOLO-UDD v2.0 Model")
print(divider)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 22 output classes, per the TrashCAN configuration stated in this notebook.
model = build_yolo_udd(num_classes=22)
model = model.to(device)

print(f"✓ Model built successfully")
print(f"✓ Device: {device}")
print(f"✓ Number of classes: 22")

# Walk the parameters once, then derive both totals from the same list.
param_stats = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(count for count, _ in param_stats)
trainable_params = sum(count for count, needs_grad in param_stats if needs_grad)
print(f"✓ Total parameters: {total_params:,}")
print(f"✓ Trainable parameters: {trainable_params:,}")

# Smoke test with a single random 3x640x640 input; gradients are not needed.
print("\n🧪 Testing forward pass...")
x = torch.randn(1, 3, 640, 640).to(device)
with torch.no_grad():
    outputs = model(x)
# The model call is unpacked into (predictions, turb_score), as before.
predictions, turb_score = outputs

print(f"✓ Forward pass successful!")
print(f"✓ Turbidity Score: {turb_score.item():.4f}")
print(f"✓ Detection scales: {len(predictions)}")

print(divider)

## Step 5: Training Configuration

In [None]:
# Hyperparameters passed to scripts/train.py by the training cell.
EPOCHS = 100  # down from 300 to shorten the run (about 10 h instead of 30 h)
BATCH_SIZE = 8
LEARNING_RATE = 0.01
NUM_WORKERS = 2
SAVE_DIR = '/kaggle/working/runs/train'

divider = "=" * 60
print(divider)
print("⚙️  Training Configuration")
print(divider)
# Echo every setting, including the dataset path resolved by the dataset cell.
print(
    f"Epochs: {EPOCHS}",
    f"Batch Size: {BATCH_SIZE}",
    f"Learning Rate: {LEARNING_RATE}",
    f"Number of Workers: {NUM_WORKERS}",
    f"Save Directory: {SAVE_DIR}",
    f"Dataset Path: {DATASET_PATH}",
    sep="\n",
)
print(divider)

# Make sure the output directory exists; `os` comes from an earlier cell.
os.makedirs(SAVE_DIR, exist_ok=True)
print(f"\n✓ Save directory created: {SAVE_DIR}")

## Step 6: Start Training

**⏱️ Estimated Time**: ~10 hours for 100 epochs on T4 GPU

**💡 Tips**:
- Training will save checkpoints automatically
- You can monitor progress in real-time
- Results saved to `/kaggle/working/runs/train/`
- Download best checkpoint from Output folder after training

In [None]:
# Start training
print("="*60)
print("🚀 Starting Training...")
print("="*60)
print(f"Training for {EPOCHS} epochs (~10 hours)")
print(f"Expected mAP: 70-72%")
print("="*60)

# Run training script
!python scripts/train.py \
    --config configs/train_config.yaml \
    --data-dir {DATASET_PATH} \
    --epochs {EPOCHS} \
    --batch-size {BATCH_SIZE} \
    --learning-rate {LEARNING_RATE} \
    --num-workers {NUM_WORKERS} \
    --save-dir {SAVE_DIR}

## Step 7: Download Results

After training completes, download the trained model checkpoint.

In [None]:
# List everything the training run wrote under SAVE_DIR and point out the
# best checkpoint when it exists.
import os

divider = "=" * 60
print(divider)
print("📊 Training Results")
print(divider)

if not os.path.exists(SAVE_DIR):
    print(f"✗ Results directory not found: {SAVE_DIR}")
else:
    print(f"\n📁 Results directory: {SAVE_DIR}")
    print("\nContents:")

    # Indented tree: depth is the number of separators below SAVE_DIR.
    for folder, _subdirs, filenames in os.walk(SAVE_DIR):
        depth = folder.replace(SAVE_DIR, '').count(os.sep)
        print(f"{'  ' * depth}{os.path.basename(folder)}/")
        file_indent = '  ' * (depth + 1)
        for filename in filenames:
            size_mb = os.path.getsize(os.path.join(folder, filename)) / (1024*1024)
            print(f"{file_indent}{filename} ({size_mb:.1f} MB)")

    # This cell looks for the best checkpoint at SAVE_DIR/best.pt.
    best_checkpoint = os.path.join(SAVE_DIR, 'best.pt')
    if not os.path.exists(best_checkpoint):
        print("\n⚠️  Best checkpoint not found. Check if training completed successfully.")
    else:
        checkpoint_mb = os.path.getsize(best_checkpoint) / (1024*1024)
        print(f"\n✓ Best checkpoint: {best_checkpoint} ({checkpoint_mb:.1f} MB)")
        print("\n📥 Download this file from the Output section!")

print(divider)

## 🎉 Training Complete!

### Next Steps:
1. **Download Checkpoint**: Download `best.pt` from Output folder
2. **Evaluate Model**: Run evaluation script locally with downloaded checkpoint
3. **Test Detections**: Test on new images

### Expected Results:
- mAP@50:95: **70-72%** (22 classes)
- Training Time: **~10 hours** (100 epochs)
- Checkpoint Size: **~200-300 MB**

---

**📧 Issues?** Check the GitHub repository: https://github.com/kshitijkhede/YOLO-UDD-v2.0