# YOLOv8 Malaria Detection - Colab Pro Optimized
## Clinical Parasite Detection Training Pipeline

**Optimized for Colab Pro V100/A100 GPUs**
- Enhanced batch sizes and full training cycles
- Contour-based heuristic that auto-generates bounding boxes (pseudo-labels, not expert annotations)
- Production-ready model export

## 1. Environment Setup & GPU Check

In [None]:
# Check GPU and install dependencies
!nvidia-smi
import torch
print(f"🚀 CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"🎯 GPU: {torch.cuda.get_device_name(0)}")
    print(f"💾 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

In [None]:
# Install packages
# `!` hands the line to the notebook's shell; `-q` keeps pip's output quiet.
!pip install ultralytics kagglehub wandb opencv-python matplotlib seaborn scikit-learn -q
# NOTE: this message is printed unconditionally; it does not verify that pip succeeded.
print("✅ All packages installed")

In [None]:
# Import libraries
import os, shutil, random, cv2, numpy as np, matplotlib.pyplot as plt
from pathlib import Path
from ultralytics import YOLO
import kagglehub, yaml, time, zipfile
from PIL import Image
from datetime import datetime

# Set seeds
# Seed the Python, NumPy and PyTorch RNGs with the same fixed value so that
# the random.shuffle / random.sample calls later in the notebook are repeatable.
# `torch` is imported in the GPU-check cell, not in this cell's import list.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
print("📚 Libraries imported")

## 2. Colab Pro Configuration

In [None]:
# Training hyper-parameters tuned for Colab Pro; consumed when the training
# arguments are assembled further down.
CONFIG = {
    # Schedule and data-loading
    'epochs': 100,
    'batch_size': 32,
    'image_size': 640,
    'workers': 8,
    # Optimisation
    'lr0': 0.01,
    'weight_decay': 0.0005,
    'patience': 25,
    'optimizer': 'AdamW',
    # Dataset caching mode handed to the trainer
    'cache': 'ram',
}

batch_size, epoch_count = CONFIG['batch_size'], CONFIG['epochs']
print(f"⚙️ Batch: {batch_size}, Epochs: {epoch_count}")

## 3. Dataset Download & Preparation

In [None]:
# Download Kaggle dataset
# kagglehub fetches the "cell-images-for-detecting-malaria" dataset; the value
# returned is used below as the local directory to search for class folders.
print("📥 Downloading dataset...")
kaggle_path = kagglehub.dataset_download("iarunava/cell-images-for-detecting-malaria")
print(f"✅ Downloaded to: {kaggle_path}")

In [None]:
def generate_bbox(image_path, padding=0.15):
    """Estimate a YOLO-format bounding box for the dominant object in an image.

    The image is converted to grayscale, contrast-enhanced with CLAHE, blurred
    and adaptively thresholded. The bounding rectangle of the largest external
    contour (among those covering more than 1% of the image) is padded on
    every side and converted to normalised YOLO coordinates.

    Args:
        image_path: Path of the image file, in a form accepted by
            ``cv2.imread``.
        padding: Fraction of the detected box width/height added on each side.

    Returns:
        A ``(center_x, center_y, width, height)`` tuple normalised to the
        image size. Width and height are limited to ``[0.1, 0.8]`` and the
        centre is shifted, if necessary, so the box lies entirely inside the
        image. When the image cannot be read or no sufficiently large contour
        is found, the centred fallback ``(0.5, 0.5, 0.85, 0.85)`` is returned.
    """
    fallback = (0.5, 0.5, 0.85, 0.85)

    image = cv2.imread(image_path)
    if image is None:
        return fallback

    h, w = image.shape[:2]
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # CLAHE enhancement
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(gray)
    blurred = cv2.GaussianBlur(enhanced, (5, 5), 0)

    # Adaptive threshold
    thresh = cv2.adaptiveThreshold(
        blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )

    # Find contours and drop the ones smaller than 1% of the image area
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    min_area = (w * h) * 0.01
    valid_contours = [c for c in contours if cv2.contourArea(c) > min_area]
    if not valid_contours:
        return fallback

    largest = max(valid_contours, key=cv2.contourArea)
    x, y, bbox_w, bbox_h = cv2.boundingRect(largest)

    # Pad each edge independently and clip to the image. Working with corners
    # keeps the box from drifting right/down when the left/top padding is
    # truncated at the image border.
    pad_x, pad_y = int(bbox_w * padding), int(bbox_h * padding)
    x1 = max(0, x - pad_x)
    y1 = max(0, y - pad_y)
    x2 = min(w, x + bbox_w + pad_x)
    y2 = min(h, y + bbox_h + pad_y)

    # YOLO format: normalised size limited to [0.1, 0.8] ...
    norm_w = max(0.1, min(0.8, (x2 - x1) / w))
    norm_h = max(0.1, min(0.8, (y2 - y1) / h))

    # ... and the centre constrained so that centre +/- size/2 stays in [0, 1].
    center_x = (x1 + x2) / 2 / w
    center_y = (y1 + y2) / 2 / h
    center_x = max(norm_w / 2, min(1 - norm_w / 2, center_x))
    center_y = max(norm_h / 2, min(1 - norm_h / 2, center_y))

    return (center_x, center_y, norm_w, norm_h)

print("🔧 Bbox generation function ready")

In [None]:
# Lay out the YOLO directory tree: <root>/<split>/{images,labels}
yolo_path = Path("yolo_malaria_pro")
for split in ('train', 'val', 'test'):
    for subdir in ("images", "labels"):
        # exist_ok lets the cell be re-run without failing on existing folders
        (yolo_path / split / subdir).mkdir(parents=True, exist_ok=True)

print(f"📁 Created structure: {yolo_path}")

In [None]:
# Find and process dataset
# Walk the download directory until a folder holding both class sub-folders is found.
kaggle_path = Path(kaggle_path)
cell_images_path = None

for root, dirs, _files in os.walk(kaggle_path):
    if 'Parasitized' in dirs and 'Uninfected' in dirs:
        cell_images_path = Path(root)
        break

# Fail with a clear message instead of a TypeError on `None / class_name` below.
if cell_images_path is None:
    raise FileNotFoundError(
        f"No directory containing 'Parasitized' and 'Uninfected' found under {kaggle_path}"
    )

print(f"📊 Found images at: {cell_images_path}")

# Process classes
# Parasitized images get class id 0; uninfected images get None, which the
# conversion step turns into an empty label file (no objects in the image).
all_files = []
for class_name in ['Parasitized', 'Uninfected']:
    class_path = cell_images_path / class_name
    # glob order depends on the filesystem; sort so the seeded shuffle below
    # yields the same split on every machine.
    class_files = sorted(class_path.glob('*.png'))
    print(f"   {class_name}: {len(class_files)} images")

    class_id = 0 if class_name == 'Parasitized' else None
    all_files.extend((img_path, class_id) for img_path in class_files)

if not all_files:
    raise FileNotFoundError(f"No .png images found under {cell_images_path}")

# Split dataset: 70% train, 20% val, 10% test
random.shuffle(all_files)
total = len(all_files)
train_end = int(total * 0.70)
val_end = int(total * 0.90)

splits = {
    'train': all_files[:train_end],
    'val': all_files[train_end:val_end],
    'test': all_files[val_end:]
}

print(f"📈 Split: Train={len(splits['train'])}, Val={len(splits['val'])}, Test={len(splits['test'])}")

In [None]:
# Convert files to YOLO format
# Each source image is copied to <split>/images/<split>_<index>.png and gets a
# label file with the same stem in <split>/labels/.
for split_name, files in splits.items():
    print(f"🔄 Processing {split_name}...")

    images_dir = yolo_path / split_name / "images"
    labels_dir = yolo_path / split_name / "labels"

    for i, (img_path, class_id) in enumerate(files):
        if i % 2000 == 0 and i > 0:
            print(f"   Progress: {i}/{len(files)}")

        stem = f"{split_name}_{i:06d}"

        # Copy image
        shutil.copy2(img_path, images_dir / f"{stem}.png")

        # Create label: one "class cx cy w h" line for a labelled image,
        # an empty file for a background (class_id is None) image.
        label_path = labels_dir / f"{stem}.txt"
        if class_id is not None:
            bbox = generate_bbox(str(img_path))
            label_text = f"{class_id} {bbox[0]:.6f} {bbox[1]:.6f} {bbox[2]:.6f} {bbox[3]:.6f}\n"
        else:
            label_text = ""
        # Always (re)write the file: a plain touch() would keep a stale box
        # left behind by a previous run that placed a different image here.
        label_path.write_text(label_text)

    print(f"✅ {split_name} complete")

print("🎉 Dataset conversion complete!")

In [None]:
# Create data.yaml
# Build the dataset config as a mapping and let PyYAML serialise it, so the
# absolute path is quoted/escaped correctly whatever characters it contains.
data_config = {
    'path': str(yolo_path.absolute()),
    'train': 'train/images',
    'val': 'val/images',
    'test': 'test/images',
    'nc': 1,
    'names': ['malaria_parasite'],
    # Stats (informational keys)
    'total_images': total,
    'converted_on': datetime.now().isoformat(),
}
# sort_keys=False keeps the keys in the order written above.
yaml_content = yaml.safe_dump(data_config, sort_keys=False)

yaml_path = yolo_path / "malaria_data.yaml"
with open(yaml_path, 'w') as f:
    f.write(yaml_content)

print(f"📄 Config created: {yaml_path}")

## 4. Visualize Dataset Samples

In [None]:
# Visualize samples with bounding boxes
# Shows up to six random training images in a 2x3 grid and overlays every box
# found in the matching label file.
import matplotlib.patches as patches

train_images = list((yolo_path / "train" / "images").glob("*.png"))
# random.sample raises ValueError when asked for more items than are available.
samples = random.sample(train_images, min(6, len(train_images)))

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for i, img_path in enumerate(samples):
    # Context manager releases the file handle once the pixels are drawn.
    with Image.open(img_path) as image:
        w, h = image.size
        axes[i].imshow(image)
    axes[i].set_title(f"Sample {i+1}")
    axes[i].axis('off')

    # Load label (an empty or missing file means no boxes for this image)
    label_path = yolo_path / "train" / "labels" / f"{img_path.stem}.txt"
    if not label_path.exists():
        continue
    with open(label_path, 'r') as f:
        for line in f:
            parts = line.split()
            if len(parts) < 5:
                continue
            center_x, center_y, width, height = map(float, parts[1:5])

            # Normalised centre/size -> top-left corner and size in pixels
            x = (center_x - width/2) * w
            y = (center_y - height/2) * h
            box_w = width * w
            box_h = height * h

            rect = patches.Rectangle((x, y), box_w, box_h,
                                   linewidth=2, edgecolor='red', facecolor='none')
            axes[i].add_patch(rect)
            axes[i].text(x, y-5, 'Parasite', color='red', fontsize=10, weight='bold')

# Blank out panels that received no image
for unused_ax in axes[len(samples):]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()
print("📊 Sample visualization complete")

## 5. Initialize & Train YOLOv8 Model

In [None]:
# Initialize model
# Load the YOLOv8n checkpoint through the Ultralytics YOLO wrapper.
model = YOLO('yolov8n.pt')
# Sum element counts over all parameters of the wrapped module for a size report.
print(f"📦 Model: {sum(p.numel() for p in model.model.parameters()):,} parameters")

In [None]:
# Keyword arguments for model.train(); tunable values come from CONFIG,
# the rest are fixed for this notebook.
train_args = dict(
    data=str(yaml_path),
    epochs=CONFIG['epochs'],
    batch=CONFIG['batch_size'],
    imgsz=CONFIG['image_size'],
    workers=CONFIG['workers'],
    cache=CONFIG['cache'],
    # First GPU when CUDA is available, otherwise CPU
    device=0 if torch.cuda.is_available() else 'cpu',
    lr0=CONFIG['lr0'],
    weight_decay=CONFIG['weight_decay'],
    patience=CONFIG['patience'],
    optimizer=CONFIG['optimizer'],
    cos_lr=True,
    amp=True,
    project='malaria_detection_pro',
    name='yolov8n_colab_pro',
    exist_ok=True,
    plots=True,
    save_period=10,
)

print("⚙️ Training config ready")
for shown_key in ('epochs', 'batch', 'lr0', 'optimizer'):
    print(f"   {shown_key}: {train_args[shown_key]}")

In [None]:
# Start training
# Wall-clock timing brackets the train() call; keep the statement order intact.
print(f"🏁 Training started at {datetime.now().strftime('%H:%M:%S')}")
start_time = time.time()

# Unpack the prepared argument dict as keyword arguments.
results = model.train(**train_args)

# Elapsed seconds, reported in hours.
training_time = time.time() - start_time
print(f"🏆 Training completed in {training_time/3600:.1f} hours")
# Path of the best checkpoint as recorded by the trainer.
print(f"💾 Best model: {model.trainer.best}")

## 6. Evaluate & Export Model

In [None]:
# Reload the best checkpoint and score it on the held-out test split
best_model = YOLO(model.trainer.best)
test_results = best_model.val(data=str(yaml_path), split='test')

box_metrics = test_results.box
metric_rows = (
    ("mAP50", box_metrics.map50),
    ("mAP50-95", box_metrics.map),
    ("Precision", box_metrics.mp),
    ("Recall", box_metrics.mr),
)

print("🎯 Test Results:")
for metric_label, metric_value in metric_rows:
    print(f"   {metric_label}: {metric_value:.4f}")

In [None]:
# Export models
# Each format is attempted independently (best effort): a failure is reported
# and the remaining formats are still exported. Successful outputs are recorded
# in `exports` (format name -> exported path) for the packaging step.
print("📦 Exporting models...")

exports = {}
export_jobs = (
    # `simplify` is the ONNX graph-simplification switch; `optimize` is the
    # TorchScript mobile optimisation and does not optimise an ONNX graph.
    ('onnx', 'ONNX', {'simplify': True}),
    ('torchscript', 'TorchScript', {}),
)

for export_format, display_name, extra_args in export_jobs:
    try:
        exported_path = best_model.export(format=export_format, **extra_args)
    except Exception as e:
        print(f"❌ {display_name} failed: {e}")
    else:
        exports[export_format] = exported_path
        print(f"✅ {display_name}: {exported_path}")

## 7. Test Inference & Visualization

In [None]:
# Run the best model on up to six test images and draw its detections
test_images = list((yolo_path / "test" / "images").glob("*.png"))[:6]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for i, img_path in enumerate(test_images):
    # Inference on a single image; the first result holds its detections
    results = best_model(str(img_path))
    detections = results[0].boxes

    # OpenCV loads BGR; convert to RGB for matplotlib
    image = cv2.cvtColor(cv2.imread(str(img_path)), cv2.COLOR_BGR2RGB)

    if len(detections) > 0:
        corner_boxes = detections.xyxy.cpu().numpy()
        scores = detections.conf.cpu().numpy()

        for corner_box, score in zip(corner_boxes, scores):
            x1, y1, x2, y2 = corner_box.astype(int)
            cv2.rectangle(image, (x1, y1), (x2, y2), (255, 0, 0), 2)
            cv2.putText(image, f'Parasite {score:.2f}', (x1, y1-10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

    panel = axes[i]
    panel.imshow(image)
    panel.set_title(f"Test {i+1} - {len(detections)} detections")
    panel.axis('off')

plt.tight_layout()
plt.show()
print("🔍 Inference testing complete")

## 8. Create Results Package

In [None]:
# Bundle weights, exported models, dataset config, training plots/metrics and
# a README summary into a single timestamped zip archive.
zip_name = f"malaria_detection_pro_{datetime.now().strftime('%Y%m%d_%H%M')}.zip"

with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # Best checkpoint first, then whichever exports actually exist on disk
    zipf.write(model.trainer.best, 'models/best_model.pt')

    for format_name, path in exports.items():
        if not path or not os.path.exists(path):
            continue
        zipf.write(path, f'models/best_model.{format_name}')

    # Dataset config
    zipf.write(yaml_path, 'config/malaria_data.yaml')

    # Plots and metrics saved in the training run directory
    results_dir = Path(model.trainer.save_dir)
    for plot_png in results_dir.glob('*.png'):
        zipf.write(plot_png, f'results/{plot_png.name}')

    metrics_csv = results_dir / 'results.csv'
    if metrics_csv.exists():
        zipf.write(metrics_csv, 'results/training_metrics.csv')

    # README with test metrics and the key training settings
    summary = f"""# Malaria Detection Results - Colab Pro

## Performance
- mAP50: {test_results.box.map50:.4f}
- mAP50-95: {test_results.box.map:.4f}
- Precision: {test_results.box.mp:.4f}
- Recall: {test_results.box.mr:.4f}

## Configuration
- Model: YOLOv8n
- Epochs: {CONFIG['epochs']}
- Batch: {CONFIG['batch_size']}
- Optimizer: {CONFIG['optimizer']}

Generated: {datetime.now().isoformat()}
"""
    zipf.writestr('README.md', summary)

print(f"📦 Results package: {zip_name}")
print(f"📊 Final mAP50: {test_results.box.map50:.4f}")
print("🎉 Training pipeline complete!")