In [1]:

import subprocess
import sys
import os

print("🚀 Setting up custom environment for YOLOv8 training...")

# All packages are installed into a dedicated directory (instead of the
# kernel's own site-packages) and later put first on sys.path.
custom_site_packages = "/kaggle/working/custom_packages"
os.makedirs(custom_site_packages, exist_ok=True)


def _pip_install_to_target(requirement, target_dir):
    """Install or upgrade one requirement into ``target_dir`` using pip.

    Args:
        requirement: Requirement string handed to ``pip install``.
        target_dir: Directory passed to pip's ``--target`` option.

    Returns:
        tuple[bool, str]: ``(True, stderr)`` when pip exited with status 0,
        ``(False, stderr)`` otherwise. ``stderr`` is pip's captured error
        output so the caller can decide how much of it to show.
    """
    completed = subprocess.run(
        [
            sys.executable, "-m", "pip", "install",
            "--target", target_dir,
            "--upgrade", requirement,
        ],
        capture_output=True,
        text=True,
    )
    return completed.returncode == 0, completed.stderr


# Install packages to custom directory
print("📦 Installing packages to avoid conflicts...")
packages = [
    "ultralytics",
    "roboflow",
    "opencv-python-headless",
    "torch",
    "torchvision",
    "torchaudio",
    "pillow",
    "matplotlib",
    "seaborn",
    "wandb",
    "tensorboard",
]

# Every requirement that pip could not install, reported once at the end.
failed_installs = []

for package in packages:
    print(f"Installing {package}...")
    ok, pip_stderr = _pip_install_to_target(package, custom_site_packages)
    if ok:
        print(f"✅ {package} installed successfully")
    else:
        failed_installs.append(package)
        print(f"❌ Failed to install {package}")
        print("Error:", pip_stderr)

# Install additional dependencies
print("\n📚 Installing additional dependencies...")
deps = ["scipy", "requests", "tqdm", "psutil", "py-cpuinfo", "pyyaml"]
for dep in deps:
    ok, pip_stderr = _pip_install_to_target(dep, custom_site_packages)
    if not ok:
        # Still best-effort (the loop keeps going), but the failure is no
        # longer swallowed silently.
        failed_installs.append(dep)
        print(f"⚠️ Could not install {dep}")
        print("Error:", pip_stderr)

print(f"\n✅ Custom packages installed in: {custom_site_packages}")
if failed_installs:
    print(f"⚠️ Packages that failed to install: {', '.join(failed_installs)}")

# Function to setup environment
def setup_custom_env():
    """Put the custom package directory at the front of ``sys.path``.

    The directory is taken from the module-level ``custom_site_packages``.
    It is prepended only when it is not already listed, so repeated calls
    do not create duplicate entries. A confirmation line is printed on
    every call.
    """
    already_listed = custom_site_packages in sys.path
    if not already_listed:
        # Slice assignment at position 0 is equivalent to insert(0, ...).
        sys.path[:0] = [custom_site_packages]
    print("🔧 Custom environment activated!")

# Test installations: import each key package from the custom directory and
# record either its version or the reason the import failed.
import importlib  # local to this step; used instead of exec() for dynamic imports

print("\n🧪 Testing installations...")
setup_custom_env()

# Display name -> status line shown in the summary below.
test_results = {}

# (importable module name, display name)
test_packages = [
    ('cv2', 'OpenCV'),
    ('roboflow', 'Roboflow'),
    ('torch', 'PyTorch'),
    ('ultralytics', 'Ultralytics')
]

for module, name in test_packages:
    try:
        imported_module = importlib.import_module(module)
    except Exception as e:
        # A broken install can fail with more than ImportError (for example
        # OSError from a missing shared library), so catch broadly and report.
        test_results[name] = f"❌ {str(e)[:50]}"
    else:
        # Fall back to a plain marker for packages without __version__.
        version = getattr(imported_module, "__version__", "imported")
        test_results[name] = f"✅ {version}"

print("\n📋 Installation Results:")
for name, result in test_results.items():
    print(f"  {name}: {result}")

print("\n" + "="*60)
print("🎉 ENVIRONMENT SETUP COMPLETE!")
print("="*60)
print("Next: Run Cell 2 to download your dataset from Roboflow")

🚀 Setting up custom environment for YOLOv8 training...
📦 Installing packages to avoid conflicts...
Installing ultralytics...


✅ ultralytics installed successfully
Installing roboflow...


✅ roboflow installed successfully
Installing opencv-python-headless...


✅ opencv-python-headless installed successfully
Installing torch...


✅ torch installed successfully
Installing torchvision...


✅ torchvision installed successfully
Installing torchaudio...


In [None]:
# Setup custom environment
import sys
custom_site_packages = "/kaggle/working/custom_packages"
if custom_site_packages not in sys.path:
    sys.path.insert(0, custom_site_packages)

import os
from roboflow import Roboflow

# NOTE: the former `!pip install roboflow` line was removed. It ran after the
# import above, so it could never fix a missing package, and it installed into
# the default site-packages rather than the custom directory from Cell 1.

print("📥 Downloading dataset from Roboflow...")

# SECURITY: the API key is read from the environment instead of being stored
# in the notebook. Set ROBOFLOW_API_KEY before running this cell.
# NOTE(review): a key was previously committed in this cell and should be
# treated as leaked - revoke/rotate it in the Roboflow settings.
ROBOFLOW_API_KEY = os.environ.get("ROBOFLOW_API_KEY", "")
WORKSPACE_NAME = "aims-ipbxa"        # Replace with your workspace name
PROJECT_NAME = "activity-cswy1-weawp"           # Replace with your project name
VERSION_NUMBER = 2                      # Replace with your dataset version

# Initialize Roboflow
try:
    if not ROBOFLOW_API_KEY:
        raise ValueError("ROBOFLOW_API_KEY environment variable is not set")

    rf = Roboflow(api_key=ROBOFLOW_API_KEY)
    # Deliberately do not echo any part of the key into the cell output.
    print("✅ Connected to Roboflow")

    # Get project
    project = rf.workspace(WORKSPACE_NAME).project(PROJECT_NAME)
    print(f"✅ Found project: {PROJECT_NAME}")

    # Download dataset
    dataset_path = "/kaggle/working/dataset"
    dataset = project.version(VERSION_NUMBER).download("yolov8", location=dataset_path)
    print(f"✅ Dataset downloaded to: {dataset_path}")

    # Walk the dataset once: print an indented tree (at most 5 files per
    # directory) and remember the first data.yaml encountered.
    print("\n📁 Dataset structure:")
    data_yaml_path = None
    for root, dirs, files in os.walk(dataset_path):
        relative_root = os.path.relpath(root, dataset_path)
        level = 0 if relative_root == os.curdir else relative_root.count(os.sep) + 1
        indent = ' ' * 2 * level
        print(f'{indent}{os.path.basename(root)}/')
        subindent = ' ' * 2 * (level + 1)
        for file in files[:5]:  # Show first 5 files in each directory
            print(f'{subindent}{file}')
        if len(files) > 5:
            print(f'{subindent}... and {len(files) - 5} more files')
        if data_yaml_path is None and 'data.yaml' in files:
            data_yaml_path = os.path.join(root, 'data.yaml')

    if data_yaml_path:
        print(f"\n✅ Found data.yaml at: {data_yaml_path}")

        # Read and display data.yaml content
        with open(data_yaml_path, 'r') as f:
            yaml_content = f.read()
        print("\n📄 data.yaml content:")
        print(yaml_content)

        # Later cells and the training subprocess read this variable.
        os.environ["DATA_YAML_PATH"] = data_yaml_path
        print(f"\n💾 Data YAML path saved to environment: {data_yaml_path}")
    else:
        print("❌ data.yaml not found in dataset!")

except Exception as e:
    print(f"❌ Error downloading dataset: {str(e)}")
    print("\n💡 Make sure to:")
    print("1. Set the ROBOFLOW_API_KEY environment variable to your API key")
    print("2. Replace WORKSPACE_NAME with your workspace name")
    print("3. Replace PROJECT_NAME with your project name")
    print("4. Set correct VERSION_NUMBER")
    print("\n🔗 Get your API key from: https://roboflow.com/settings/api")

print("\n" + "="*60)
print("Next: Run Cell 3 to create the training script")
print("="*60)

In [None]:
%%writefile train_yolov8_ddp.py
import sys
import os

# Directory holding the packages installed by the notebook; it has to be on
# sys.path before ultralytics / torch are imported below.
custom_packages = '/kaggle/working/custom_packages'
if not any(entry == custom_packages for entry in sys.path):
    sys.path[:0] = [custom_packages]

# Guarded import: on failure, print the reason and the module search path,
# then exit with status 1.
try:
    from ultralytics import YOLO
    import torch
    print("✅ Ultralytics and PyTorch imported successfully")
    print(f"PyTorch version: {torch.__version__}")
    has_cuda = torch.cuda.is_available()
    print(f"CUDA available: {has_cuda}")
    gpu_total = torch.cuda.device_count()
    print(f"Number of GPUs: {gpu_total}")
    if has_cuda:
        # One line per visible GPU with its reported device name.
        for gpu_index in range(gpu_total):
            gpu_label = torch.cuda.get_device_name(gpu_index)
            print(f"GPU {gpu_index}: {gpu_label}")
except ImportError as import_error:
    print(f"❌ Failed to import required packages: {import_error}")
    print("Current Python path:")
    for search_dir in sys.path:
        print(f"  {search_dir}")
    sys.exit(1)

def main():
    """Train YOLOv8m on the dataset referenced by ``DATA_YAML_PATH``.

    The dataset YAML path is read from the ``DATA_YAML_PATH`` environment
    variable (default ``/kaggle/working/dataset/data.yaml``). The
    ``yolov8m.pt`` weights are loaded and ``model.train`` is called with the
    configuration assembled below.

    Returns:
        int: ``0`` when training finished, ``1`` when the dataset YAML is
        missing or training raised an exception. The value is used as the
        process exit status so a launcher can tell success from failure.
    """
    print("🚀 Starting YOLOv8m DDP training...")

    # Environment setup for better logging
    os.environ["PYTHONUNBUFFERED"] = "1"
    os.environ["ULTRALYTICS_VERBOSE"] = "1"

    # Get data.yaml path from environment or use default
    data_yaml = os.environ.get("DATA_YAML_PATH", "/kaggle/working/dataset/data.yaml")

    if not os.path.exists(data_yaml):
        print(f"❌ Data YAML file not found at: {data_yaml}")
        print("Please check the path and run the dataset download cell first.")
        return 1

    print(f"📊 Using dataset: {data_yaml}")

    # Choose devices from what is actually visible: two GPUs (DDP) when
    # available, otherwise a single GPU, otherwise CPU. With two or more GPUs
    # this is the same [0, 1] that was previously hard-coded.
    gpu_count = torch.cuda.device_count()
    if gpu_count >= 2:
        device = [0, 1]
    elif gpu_count == 1:
        device = 0
    else:
        device = 'cpu'

    # Load YOLOv8m model
    print("📥 Loading YOLOv8m model...")
    model = YOLO("yolov8m.pt")  # This will download the pretrained weights

    print("🔧 Training configuration:")
    config = {
        'data': data_yaml,
        'epochs': 20,           # Adjust based on your needs
        'imgsz': 800,           # Image size
        'batch': 44,            # Total batch size (will be split across GPUs)
        'device': device,       # [0, 1] on a dual-GPU machine, see above
        'workers': 8,           # Data loading workers
        'project': '/kaggle/working/runs',  # Where to save results
        'name': 'yolov8m_ddp', # Experiment name
        'save': True,
        'save_period': 10,      # Save checkpoint every 10 epochs
        'plots': True,
        'verbose': True,
        'patience': 10,         # Early stopping patience
        'lr0': 0.01,           # Initial learning rate
        'weight_decay': 0.0005,
        'warmup_epochs': 3,
        'box': 7.5,            # Box loss weight
        'cls': 0.5,            # Class loss weight
        'dfl': 1.5,            # Distribution focal loss weight
        'pose': 12.0,          # Pose loss weight (if using pose model)
        'kobj': 1.0,           # Keypoint object loss weight
        'label_smoothing': 0.0,
        'nbs': 64,             # Nominal batch size
        'overlap_mask': True,
        'mask_ratio': 4,
        'dropout': 0.0,
        'val': True,           # Validate during training
        'split': 'val',        # Dataset split to use for validation
        'resume': False,       # Resume from last checkpoint
        'amp': True,           # Automatic mixed precision
        'fraction': 1.0,       # Dataset fraction to use
        'profile': False,      # Profile ONNX and TensorRT speeds
        'freeze': None,        # Freeze layers: backbone=10, all=24
        'multi_scale': False,  # Multi-scale training
        'optimizer': 'auto',   # Optimizer (SGD, Adam, AdamW, NAdam, RAdam, RMSProp)
        'cos_lr': False,       # Use cosine learning rate scheduler
        'close_mosaic': 10,    # Disable mosaic augmentation for final epochs
        'single_cls': False,   # Train multi-class data as single-class
        'rect': False,         # Rectangular training
        'deterministic': True, # Force deterministic augmentation
        # 'sync_bn': True,       # Use SyncBatchNorm, only available in DDP mode
        'exist_ok': False,     # Overwrite existing experiment
        'seed': 0,             # Global training seed
        # 'local_rank': -1,      # Automatic DDP Multi-GPU argument, do not modify
        'cache': False,        # True/ram, disk or False. Use cache for data loading
        'visualize': False,    # Visualize features
        'augment': True,       # Apply image augmentation to prediction sources
        'agnostic_nms': False, # Class-agnostic NMS
        'retina_masks': False, # Use high-resolution segmentation masks
    }

    for key, value in config.items():
        print(f"  {key}: {value}")

    print("\n🏋️ Starting training...")
    if isinstance(device, list):
        print("This will use both T4 GPUs with DDP automatically")
    else:
        print(f"⚠️ Fewer than 2 GPUs visible; training on device: {device}")

    try:
        # Start training
        results = model.train(**config)

        print("\n🎉 Training completed successfully!")
        # With exist_ok=False a rerun may land in a suffixed directory, so
        # prefer the directory the trainer reports when it is available.
        trainer = getattr(model, "trainer", None)
        save_dir = getattr(trainer, "save_dir", None) or f"{config['project']}/{config['name']}"
        print(f"Results saved to: {save_dir}")

        # Display some results
        if results:
            print("\n📈 Training Results:")
            if hasattr(results, 'results_dict'):
                for key, value in results.results_dict.items():
                    if isinstance(value, (int, float)):
                        print(f"  {key}: {value:.4f}")

    except Exception as e:
        print(f"❌ Training failed: {str(e)}")
        import traceback
        traceback.print_exc()
        # Signal the failure to the parent process instead of exiting 0.
        return 1

    return 0

if __name__ == "__main__":
    sys.exit(main())

In [None]:
# Setup custom environment
import sys
custom_site_packages = "/kaggle/working/custom_packages"
if custom_site_packages not in sys.path:
    sys.path.insert(0, custom_site_packages)

import torch
import os
import subprocess


def _count_dir_entries(directory):
    """Return the number of entries in ``directory``, or 0 if it is not a directory."""
    return len(os.listdir(directory)) if os.path.isdir(directory) else 0


print("🔍 System and GPU Information")
print("=" * 50)

gpu_count = torch.cuda.device_count()

# Check CUDA and PyTorch
print("🐍 Python & PyTorch Info:")
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Number of GPUs: {gpu_count}")

# GPU details
if torch.cuda.is_available():
    print("\n🎮 GPU Details:")
    for i in range(gpu_count):
        gpu_name = torch.cuda.get_device_name(i)
        gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
        print(f"  GPU {i}: {gpu_name} ({gpu_memory:.1f} GB)")

        # The memory queries take an explicit device index, so the kernel's
        # current device is left untouched (no torch.cuda.set_device call).
        memory_allocated = torch.cuda.memory_allocated(i) / 1024**3
        memory_reserved = torch.cuda.memory_reserved(i) / 1024**3
        print(f"    Memory allocated: {memory_allocated:.2f} GB")
        print(f"    Memory reserved: {memory_reserved:.2f} GB")
        # Total minus what this process has reserved; other processes on the
        # same GPU are not accounted for.
        print(f"    Memory free: {gpu_memory - memory_reserved:.2f} GB")

# Check NVIDIA-SMI
print("\n🖥️ NVIDIA System Info:")
try:
    result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
except OSError as e:
    # Raised when the executable is missing or cannot be started.
    print(f"❌ nvidia-smi command failed: {e}")
else:
    if result.returncode == 0:
        print(result.stdout)
    else:
        print("❌ nvidia-smi not available")

# Check dataset path
data_yaml_path = os.environ.get("DATA_YAML_PATH")
if data_yaml_path and os.path.exists(data_yaml_path):
    print(f"\n✅ Dataset ready at: {data_yaml_path}")

    # Count entries in the image folders located next to data.yaml
    dataset_dir = os.path.dirname(data_yaml_path)
    train_count = _count_dir_entries(os.path.join(dataset_dir, 'train', 'images'))
    val_count = _count_dir_entries(os.path.join(dataset_dir, 'valid', 'images'))
    test_count = _count_dir_entries(os.path.join(dataset_dir, 'test', 'images'))

    print(f"📊 Dataset Statistics:")
    print(f"  Training images: {train_count}")
    print(f"  Validation images: {val_count}")
    print(f"  Test images: {test_count}")
    print(f"  Total images: {train_count + val_count + test_count}")
else:
    print("⚠️ Dataset not found. Please run the dataset download cell first.")

# Test DDP availability
print("\n🔗 Distributed Training Check:")
if gpu_count > 1:
    print(f"✅ Multiple GPUs detected: {gpu_count} GPUs")
    print("✅ DDP (Distributed Data Parallel) will be used automatically")

    # Smoke test: allocate a tensor on each of the first two GPUs.
    device0 = torch.device('cuda:0')
    device1 = torch.device('cuda:1')

    tensor0 = torch.randn(1000, 1000, device=device0)
    tensor1 = torch.randn(1000, 1000, device=device1)

    print("✅ Successfully created tensors on both GPUs")

    # Clean up
    del tensor0, tensor1
    torch.cuda.empty_cache()
elif gpu_count == 1:
    print("⚠️ Only 1 GPU detected. DDP will not be used.")
else:
    print("⚠️ No GPU detected. DDP will not be used.")

print("\n" + "="*60)
print("🚀 Ready to start training!")
print("Run Cell 5 to begin YOLOv8m training with DDP")
print("="*60)

In [None]:
import subprocess
import sys
import os

print("🚀 Starting YOLOv8m DDP Training...")
print("=" * 50)

CUSTOM_PACKAGES_DIR = "/kaggle/working/custom_packages"
TRAIN_SCRIPT = "/kaggle/working/train_yolov8_ddp.py"
RUNS_ROOT = "/kaggle/working/runs"   # matches 'project' in the training script
RUN_NAME = "yolov8m_ddp"             # matches 'name' in the training script


def _find_latest_run_dir(search_roots, run_name):
    """Return the newest directory whose name starts with ``run_name``.

    Args:
        search_roots: Directories to look in; missing ones are skipped.
        run_name: Prefix of the run directory name. Suffixed variants such as
            ``<run_name>2`` are matched as well.

    Returns:
        str | None: Path of the most recently modified match, or ``None``.
    """
    matches = []
    for root in search_roots:
        if not os.path.isdir(root):
            continue
        for entry in os.listdir(root):
            candidate = os.path.join(root, entry)
            if entry.startswith(run_name) and os.path.isdir(candidate):
                matches.append(candidate)
    return max(matches, key=os.path.getmtime) if matches else None


# Check if dataset is ready
data_yaml_path = os.environ.get("DATA_YAML_PATH")
if not data_yaml_path or not os.path.exists(data_yaml_path):
    print("❌ Dataset not found! Please run the dataset download cell first.")
    print("Expected path:", data_yaml_path)
    # Raise instead of exit(1): inside a Jupyter kernel exit() asks the kernel
    # to shut down and does not stop the cell, so the code below kept running.
    raise FileNotFoundError(f"DATA_YAML_PATH does not point to an existing file: {data_yaml_path!r}")

print(f"✅ Dataset found: {data_yaml_path}")

# Set up environment for the training process
env = os.environ.copy()
env["DATA_YAML_PATH"] = data_yaml_path
# Prepend to any existing PYTHONPATH instead of replacing it.
env["PYTHONPATH"] = os.pathsep.join(
    part for part in (CUSTOM_PACKAGES_DIR, env.get("PYTHONPATH")) if part
)
env["CUDA_VISIBLE_DEVICES"] = "0,1"  # Use both GPUs
# The child's stdout is a pipe, which Python block-buffers by default;
# unbuffered mode is what makes the line-by-line streaming below live.
env["PYTHONUNBUFFERED"] = "1"

# Check if training script exists
if not os.path.exists(TRAIN_SCRIPT):
    print("❌ Training script not found! Please run Cell 3 first.")
    raise FileNotFoundError(f"Training script not found: {TRAIN_SCRIPT}")

print("✅ Training script found")
print("🔧 Environment setup:")
print(f"  Data YAML: {data_yaml_path}")
print(f"  Python Path: {env.get('PYTHONPATH')}")
print(f"  CUDA Devices: {env.get('CUDA_VISIBLE_DEVICES')}")

# Start training
print("\n🏋️ Launching training process...")
print("This may take several hours depending on your dataset size and epochs.")
print("Training will use both T4 GPUs automatically with DDP.")

process = None
try:
    # Run the training script with stderr merged into stdout
    process = subprocess.Popen(
        [sys.executable, TRAIN_SCRIPT],
        env=env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
        cwd="/kaggle/working",
    )

    # Stream output in real-time; the pipe is closed when the loop ends
    with process.stdout:
        for line in process.stdout:
            print(line.rstrip())

    # Wait for completion
    return_code = process.wait()

    if return_code == 0:
        print("\n🎉 Training completed successfully!")

        # The training script sets project=RUNS_ROOT, so the run directory is
        # expected directly below it; the "detect" subfolder is also checked
        # in case the project setting is removed from the script.
        results_dir = _find_latest_run_dir(
            [RUNS_ROOT, os.path.join(RUNS_ROOT, "detect")], RUN_NAME
        )
        if results_dir:
            print(f"✅ Results saved to: {results_dir}")

            # List key files
            key_files = ['weights/best.pt', 'weights/last.pt', 'results.png', 'confusion_matrix.png']
            print("\n📁 Key output files:")
            for file in key_files:
                full_path = os.path.join(results_dir, file)
                if os.path.exists(full_path):
                    print(f"  ✅ {file}")
                else:
                    print(f"  ❌ {file} (not found)")
        else:
            print("⚠️ Results directory not found")
    else:
        print(f"\n❌ Training failed with return code: {return_code}")

except KeyboardInterrupt:
    print("\n⚠️ Training interrupted by user")
    if process is not None and process.poll() is None:
        # Ask the child to stop, then reap it so it is not left as a zombie.
        process.terminate()
        try:
            process.wait(timeout=30)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()

except Exception as e:
    print(f"\n❌ Error during training: {str(e)}")
    import traceback
    traceback.print_exc()

print("\n" + "="*60)
print("Training process completed!")
print("Check the output above for results and any errors.")
print("="*60)