<a href="https://colab.research.google.com/github/jakubstenc/Ecuador_meeting/blob/main/pollen_viability.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🌸 Pollen Viability Deep Learning Pipeline (YOLOv8)

**Project:** Automated counting of viable (stained) vs. non-viable (pale) pollen grains.
**Author:** Jakub Štenc
**Model:** YOLOv8 (Medium/Large)

This pipeline performs three main functions:
1.  **Data Management:** Safely merges new annotations (from Roboflow) and splits them into Train/Validation sets.
2.  **Training:** Retrains the model using specific biological augmentations (rotation, color invariance).
3.  **Inference:** Detects pollen in new microscope images using dynamic resolution switching.

---
### 🛠️ Step 1: Environment Setup
Run this cell to mount Google Drive and install the necessary computer vision libraries.
* **Mounts:** `/content/drive`
* **Installs:** `ultralytics` (YOLOv8)

# 1. Install yolo from Ultralytics, import libraries, mount google drive and locate the folders

In [1]:
# --- SETUP CELL ---
# Run this once at the start of every session

# 1. Mount Google Drive (to access your data)
from google.colab import drive
drive.mount('/content/drive')

# 2. Install YOLO (The computer forgets this when you close the tab)
!pip install ultralytics -q

# 3. Import libraries
from ultralytics import YOLO
import os

# Load the pretrained starting checkpoint.
# 'yolov8l.pt' is the "Large" YOLOv8 variant that this notebook trains from.
# For a quick smoke test of the pipeline, 'yolov8n.pt' (Nano, smallest &
# fastest) is a cheaper choice; 'yolov8m.pt' (Medium) and 'yolov8x.pt'
# (Extra Large) are the other sizes to consider when trading speed for accuracy.
model = YOLO('yolov8l.pt')

print("Setup complete. Ready to train!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.1/1.1 MB[0m [31m65.6 MB/s[0m eta [36m0:00:00[0m
[?25hCreating new Ultralytics Settings v0.0.6 file ‚úÖ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8l.pt to 'yolov8l.pt': 100% ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 83.7MB 38.9MB/s 2.2s
Setup complete. Ready to train!




### Updating the training dataset

In [4]:
import os
import shutil
import zipfile
import datetime
import glob
import random
from tqdm import tqdm

# --- CONFIGURATION ---
# 1. Search for Zip
search_root = '/content/drive/MyDrive/Pollen_viability/staged_area'

# 2. Main Dataset Folders (Now we track BOTH)
dataset_root = '/content/drive/MyDrive/Pollen_viability/datasets/pollen_v1'
train_dir = os.path.join(dataset_root, 'train')
val_dir = os.path.join(dataset_root, 'val')

# 3. Settings
VAL_SPLIT_RATIO = 0.15  # 15% goes to Validation
temp_dir = '/content/temp_roboflow_process'

print("üöÄ Starting Smart Split & Merge (Train + Val)...")

# --- A. FIND THE ZIP FILE ---
# The export may have been dropped either directly into the staging folder or
# into its "labels" sub-folder, so both locations are searched.
possible_zips = glob.glob(os.path.join(search_root, "*.zip")) + \
                glob.glob(os.path.join(search_root, "labels", "*.zip"))

if not possible_zips:
    print(f"‚ùå Error: No .zip files found in {search_root}")
    raise FileNotFoundError("Stopping. Please check uploaded zip location.")

# glob returns matches in arbitrary order, so choose deterministically:
# the most recently modified archive is treated as the newest upload.
possible_zips.sort(key=os.path.getmtime, reverse=True)
if len(possible_zips) > 1:
    print(f"   Note: {len(possible_zips)} zip files found; using the most recently modified one.")

roboflow_zip = possible_zips[0]
print(f"1Ô∏è‚É£ Found zip file: {os.path.basename(roboflow_zip)}")

# --- B. CLEANUP & UNZIP ---
if os.path.exists(temp_dir): shutil.rmtree(temp_dir)
os.makedirs(temp_dir, exist_ok=True)

print(f"   Unzipping to temp workspace...")
with zipfile.ZipFile(roboflow_zip, 'r') as z:
    z.extractall(temp_dir)

# --- C. HUNT DOWN NEW DATA ---
found_pairs = [] # We store (image_path, label_path) tuples

print("   Scanning extracted files...")
# Helper to find matching label for an image
def find_label(img_path, search_dir):
    """Return the path of the ``.txt`` label that belongs to ``img_path``.

    The label is looked up by base name (``foo.jpg`` -> ``foo.txt``):

    1. First in the sibling ``labels`` folder of the image's own folder
       (``<split>/images/foo.jpg`` -> ``<split>/labels/foo.txt``), so an image
       is never paired with a same-named label from a different split.
    2. Otherwise anywhere below ``search_dir``, visiting sub-folders in
       sorted order so the result does not depend on directory listing order.

    Args:
        img_path: Path of the image file.
        search_dir: Root folder to search when no sibling label exists.

    Returns:
        The label path, or ``None`` when no matching ``.txt`` file is found.
    """
    base_name = os.path.splitext(os.path.basename(img_path))[0]
    label_name = base_name + ".txt"

    # Fast path: avoids walking the whole tree once per image.
    split_dir = os.path.dirname(os.path.dirname(img_path))
    sibling = os.path.join(split_dir, "labels", label_name)
    if os.path.isfile(sibling):
        return sibling

    # Fallback: look for .txt with same base name in the whole tree
    for r, d, f in os.walk(search_dir):
        d.sort()  # in-place sort makes os.walk descend in a stable order
        if label_name in f:
            return os.path.join(r, label_name)
    return None

# Walk the extracted archive and pair every image with its label file.
for folder, _subdirs, names in os.walk(temp_dir):
    for name in names:
        # Files prefixed with "._" are skipped
        # (presumably macOS resource-fork stubs - verify if this matters).
        if name.startswith('._'):
            continue
        if not name.lower().endswith(('.jpg', '.png', '.jpeg')):
            continue

        img_full_path = os.path.join(folder, name)
        lbl_full_path = find_label(img_full_path, temp_dir)
        if not lbl_full_path:
            continue  # images without a label file are not collected
        found_pairs.append((img_full_path, lbl_full_path))

print(f"   Found {len(found_pairs)} valid Image+Label pairs in zip.")

# --- D. DUPLICATE CHECK ---
print("\n2Ô∏è‚É£ Checking for Duplicates...")
# Get list of ALL current files to avoid collisions
existing_files = set()
for split in [train_dir, val_dir]:
    img_dir = os.path.join(split, 'images')
    if os.path.exists(img_dir):
        existing_files.update(os.listdir(img_dir))

# Keep only pairs whose image file name is not already in the dataset.
unique_pairs = []
skipped_count = 0

for img_path, lbl_path in found_pairs:
    fname = os.path.basename(img_path)
    if fname in existing_files:
        skipped_count += 1
        # Optional: Print duplicate names
        # print(f"   Skipping duplicate: {fname}")
        continue

    unique_pairs.append((img_path, lbl_path))
    # Register the name immediately: if the zip holds the same file name twice
    # (e.g. once per split folder), the second copy is now counted as a
    # duplicate instead of overwriting the first or landing in both
    # train and val after the random split.
    existing_files.add(fname)

if skipped_count > 0:
    print(f"   ‚ö†Ô∏è Skipped {skipped_count} images that already exist in Train or Val.")
else:
    print("   ‚úÖ No duplicates found.")

if len(unique_pairs) == 0:
    raise RuntimeError("No new unique images to add! Stopping.")

# --- E. SPLIT & MERGE ---
print("\n3Ô∏è‚É£ Splitting and Merging...")

# Backup (Backs up the whole pollen_v1 folder structure)
backup_name = f"dataset_backup_{datetime.datetime.now().strftime('%Y%m%d_%H%M')}"
backup_path = f"/content/drive/MyDrive/Pollen_viability/backups/{backup_name}"
shutil.make_archive(backup_path, 'zip', dataset_root)
print(f"   (Backup saved to backups/{backup_name}.zip)")

# Randomize
random.shuffle(unique_pairs)

# Calculate Split Index
split_idx = int(len(unique_pairs) * VAL_SPLIT_RATIO)
val_batch = unique_pairs[:split_idx]
train_batch = unique_pairs[split_idx:]

print(f"   Adding {len(train_batch)} to TRAIN.")
print(f"   Adding {len(val_batch)} to VAL.")

def move_batch(batch, destination_dir):
    """Copy every (image, label) pair of ``batch`` into ``destination_dir``.

    Images are written to ``<destination_dir>/images`` and labels to
    ``<destination_dir>/labels``; both folders are created when missing.
    Despite the name, the source files are copied with ``shutil.copy2`` and
    stay where they are.
    """
    targets = {
        'images': os.path.join(destination_dir, 'images'),
        'labels': os.path.join(destination_dir, 'labels'),
    }
    for folder in targets.values():
        os.makedirs(folder, exist_ok=True)

    progress = tqdm(batch, desc=f"Moving to {os.path.basename(destination_dir)}")
    for img, lbl in progress:
        # Image first, then its label, each keeping its original file name.
        for src, kind in ((img, 'images'), (lbl, 'labels')):
            shutil.copy2(src, os.path.join(targets[kind], os.path.basename(src)))

# Execute Move
move_batch(train_batch, train_dir)
move_batch(val_batch, val_dir)

print("\n‚úÖ SUCCESS! Dataset updated safely.")
print(f"   Total added: {len(unique_pairs)}")
print("   You can delete the zip file from staged_area now.")

üöÄ Starting Smart Split & Merge (Train + Val)...
1Ô∏è‚É£ Found tzip file: Viable_pollen.v10i.yolov8.zip
   Unzipping to temp workspace...
   Scanning extracted files...
   Found 140 valid Image+Label pairs in zip.

2Ô∏è‚É£ Checking for Duplicates...
   ‚ö†Ô∏è Skipped 140 images that already exist in Train or Val.


RuntimeError: No new unique images to add! Stopping.

#### Adding the empty data


In [5]:
import os
import cv2
import numpy as np
import shutil
from tqdm import tqdm

# --- CONFIGURATION ---
# 1. INPUT: Folder where you put your small smudge crops
input_smudges_dir = '/content/drive/MyDrive/Pollen_viability/smudges_raw'

# 2. STAGING: Where we generate the canvas images first
staging_dir = '/content/drive/MyDrive/Pollen_viability/datasets/pollen_v1/train_negatives'
os.makedirs(os.path.join(staging_dir, 'images'), exist_ok=True)
os.makedirs(os.path.join(staging_dir, 'labels'), exist_ok=True)

# 3. DESTINATION: Your actual training dataset
train_img_dir = '/content/drive/MyDrive/Pollen_viability/datasets/pollen_v1/train/images'
train_lbl_dir = '/content/drive/MyDrive/Pollen_viability/datasets/pollen_v1/train/labels'

# Settings
CANVAS_SIZE = 640
BACKGROUND_COLOR = (245, 245, 245) # Light Gray

print("üß™ Starting Smudge Synthesis & Integration...")

if not os.path.exists(input_smudges_dir) or not os.listdir(input_smudges_dir):
    print(f"‚ùå Error: Folder '{input_smudges_dir}' is empty or missing.")
else:
    # Compare extensions case-insensitively so crops saved as e.g. "smudge.JPG"
    # are not silently left out.
    files = [f for f in os.listdir(input_smudges_dir) if f.lower().endswith(('.jpg', '.png', '.jpeg'))]
    print(f"   Found {len(files)} smudge source files.")

    generated_count = 0

    # --- PHASE 1: GENERATE IMAGES ---
    # Each smudge crop is pasted onto the centre of a plain square canvas and
    # saved to staging together with an EMPTY label file (no objects).
    for fname in tqdm(files, desc="Synthesizing"):
        # Unique name based on original filename to prevent collisions
        base_name = os.path.splitext(fname)[0]
        new_name = f"neg_syn_{base_name}.jpg"
        label_name = f"neg_syn_{base_name}.txt"

        output_img_path = os.path.join(staging_dir, 'images', new_name)
        output_lbl_path = os.path.join(staging_dir, 'labels', label_name)

        # Load Smudge (cv2.imread returns None when the file cannot be decoded)
        smudge = cv2.imread(os.path.join(input_smudges_dir, fname))
        if smudge is None:
            tqdm.write(f"   Could not read '{fname}', skipping.")
            continue
        h, w = smudge.shape[:2]

        # Create Canvas
        canvas = np.full((CANVAS_SIZE, CANVAS_SIZE, 3), BACKGROUND_COLOR, dtype=np.uint8)

        # Shrink (keeping aspect ratio) only when the crop does not fit
        if w > CANVAS_SIZE or h > CANVAS_SIZE:
            scale = min(CANVAS_SIZE/w, CANVAS_SIZE/h)
            smudge = cv2.resize(smudge, (0,0), fx=scale, fy=scale)
            h, w = smudge.shape[:2]

        # Center Paste
        x_off = (CANVAS_SIZE - w) // 2
        y_off = (CANVAS_SIZE - h) // 2
        canvas[y_off:y_off+h, x_off:x_off+w] = smudge

        # Save to Staging. The label is written only after the image was
        # stored successfully, so a failed write cannot leave a label file
        # without an image.
        if not cv2.imwrite(output_img_path, canvas):
            tqdm.write(f"   Could not write '{output_img_path}', skipping.")
            continue
        with open(output_lbl_path, 'w'):
            pass # Empty label

        generated_count += 1

    # --- PHASE 2: INTEGRATE TO TRAIN ---
    print(f"\nüì¶ Phase 2: Integrating {generated_count} samples into Training Set...")

    added_count = 0
    skipped_count = 0

    # Everything currently in staging is considered, including files left
    # there by earlier runs.
    generated_files = os.listdir(os.path.join(staging_dir, 'images'))

    for img_file in generated_files:
        # splitext swaps only the final extension; str.replace('.jpg', '.txt')
        # would also rewrite a '.jpg' that occurs inside the base name.
        lbl_file = os.path.splitext(img_file)[0] + '.txt'

        # Source Paths (Staging)
        src_img = os.path.join(staging_dir, 'images', img_file)
        src_lbl = os.path.join(staging_dir, 'labels', lbl_file)

        # Dest Paths (Training)
        dst_img = os.path.join(train_img_dir, img_file)
        dst_lbl = os.path.join(train_lbl_dir, lbl_file)

        # Already integrated by a previous run
        if os.path.exists(dst_img):
            skipped_count += 1
            continue

        # Never move an image without its label: that would leave an
        # unlabeled image in the training set if the label move failed.
        if not os.path.exists(src_lbl):
            print(f"   Skipping {img_file}: staged label file is missing.")
            continue

        # Move it!
        shutil.move(src_img, dst_img)
        shutil.move(src_lbl, dst_lbl)
        added_count += 1

    print("-" * 30)
    print(f"‚úÖ Integration Complete!")
    print(f"   üÜï Added:   {added_count} new images")
    print(f"   ‚è≠Ô∏è Skipped: {skipped_count} duplicates (already in training)")
    print(f"   üìÇ Training set updated: {train_img_dir}")

üß™ Starting Smudge Synthesis & Integration...
   Found 40 smudge source files.


Synthesizing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 40/40 [00:43<00:00,  1.09s/it]


üì¶ Phase 2: Integrating 40 samples into Training Set...
------------------------------
‚úÖ Integration Complete!
   üÜï Added:   0 new images
   ‚è≠Ô∏è Skipped: 40 duplicates (already in training)
   üìÇ Training set updated: /content/drive/MyDrive/Pollen_viability/datasets/pollen_v1/train/images





### 🧹 Step 2: Dataset Health Check & Cleaning
**"Garbage In, Garbage Out."** These scripts ensure your dataset is clean before training.

1.  **Quarantine Unlabeled Images:** Moves images without corresponding `.txt` files to a quarantine folder (prevents them from acting as "negative samples").
2.  **Check & Purge Labels:** Reports badly formatted label files and deletes orphan labels (`.txt` files with no matching image) from both Training and Validation sets.
3.  **Fix Drive Errors:** Scans for and removes `.gdoc` files that Google Drive sometimes creates by mistake.

*Run these cells sequentially to sanitize your data folders.*

### Cleaning the training dataset from images without labels

In [6]:
import os
import shutil

# --- CONFIGURATION ---
# Your training folders
train_img_dir = '/content/drive/MyDrive/Pollen_viability/datasets/pollen_v1/train/images'
train_lbl_dir = '/content/drive/MyDrive/Pollen_viability/datasets/pollen_v1/train/labels'

# Where to put the "problem" images
quarantine_dir = '/content/drive/MyDrive/Pollen_viability/datasets/pollen_v1/quarantine_unlabeled'
os.makedirs(quarantine_dir, exist_ok=True)

print("Scanning for unlabeled images...")

moved_count = 0
# Extensions are compared case-insensitively so that images such as
# "slide.JPG" are checked as well.
image_files = [f for f in os.listdir(train_img_dir) if f.lower().endswith(('.jpg', '.png', '.jpeg'))]

for img_file in image_files:
    # Construct the expected label filename
    # R Analogy: paste0(tools::file_path_sans_ext(img_file), ".txt")
    label_file = os.path.splitext(img_file)[0] + ".txt"
    label_path = os.path.join(train_lbl_dir, label_file)

    # Images that do have a label stay where they are
    if os.path.exists(label_path):
        continue

    print(f"‚ö†Ô∏è No label found for: {img_file}. Moving to quarantine.")

    # Move the image out of the training set
    src_path = os.path.join(train_img_dir, img_file)
    dst_path = os.path.join(quarantine_dir, img_file)
    shutil.move(src_path, dst_path)
    moved_count += 1

print(f"\n--- Cleanup Complete ---")
if moved_count > 0:
    print(f"‚úÖ Moved {moved_count} unlabeled images to: {quarantine_dir}")
    print("Your training folder is now safe!")
else:
    print("All images have labels. You are good to go!")

Scanning for unlabeled images...

--- Cleanup Complete ---
All images have labels. You are good to go!


### Checking for corrupted labels

In [7]:
import os
import shutil

# --- CONFIGURATION ---
# Check both Train and Val folders
folders_to_check = [
    '/content/drive/MyDrive/Pollen_viability/datasets/pollen_v1/train/labels',
    '/content/drive/MyDrive/Pollen_viability/datasets/pollen_v1/val/labels'
]

print("üßπ Starting Google Doc Purge...")

total_deleted = 0
affected_images = []

# Scan every label folder and delete Google-Doc placeholder files, recording
# which labels have to be re-uploaded.
for folder in folders_to_check:
    print(f"\nScanning: {folder}")

    if not os.path.exists(folder):
        print(f"Skipping {folder} (does not exist)")
        continue

    files = os.listdir(folder)

    for f in files:
        file_path = os.path.join(folder, f)

        # Check 1: Explicit extension
        if f.endswith('.gdoc'):
            print(f"   ‚ùå Found .gdoc file: {f} -> DELETING.")
            os.remove(file_path)
            total_deleted += 1
            # Record the image name that needs a fix.
            # Only the trailing '.gdoc' is swapped (assuming jpg, check your ext).
            affected_images.append(f[:-len('.gdoc')] + '.jpg')
            continue

        # Check 2: Implicit conversion (File has no extension but is a gdoc link)
        # Sometimes Drive removes the extension entirely!
        if not os.path.isfile(file_path) or f.endswith('.txt'):
            continue

        try:
            # Peek inside to see if it looks like JSON/HTML
            with open(file_path, 'r', errors='ignore') as check:
                header = check.read(100)

            # NOTE: deliberately broad heuristic - any non-.txt file whose
            # first 100 characters contain "google" or "{" is removed.
            # The file is deleted only after it has been closed.
            if "google" in header.lower() or "{" in header:
                print(f"   ‚ùå Found hidden GDoc link: {f} -> DELETING.")
                os.remove(file_path)
                total_deleted += 1
                affected_images.append(f + ".jpg")
        except OSError as err:
            # Best effort: report the unreadable/undeletable file and go on,
            # without swallowing unrelated errors such as KeyboardInterrupt.
            print(f"   Could not inspect {f}: {err}")

print(f"\n--- PURGE COMPLETE ---")
print(f"Deleted {total_deleted} corrupted files.")

if total_deleted > 0:
    print("\n‚ö†Ô∏è ACTION REQUIRED:")
    print("The following labels were corrupted and deleted. You must RE-UPLOAD them as valid .txt files:")
    for img in affected_images[:10]: # Print first 10
        print(f" - Label for: {img}")
    if len(affected_images) > 10:
        print(f" ... and {len(affected_images)-10} others.")

    print("\nüí° TIP: When re-uploading, zip them first ('labels.zip') and unzip in Colab!")
else:
    print("No .gdoc files found. Your labels might be clean (or the corruption is different).")

üßπ Starting Google Doc Purge...

Scanning: /content/drive/MyDrive/Pollen_viability/datasets/pollen_v1/train/labels

Scanning: /content/drive/MyDrive/Pollen_viability/datasets/pollen_v1/val/labels

--- PURGE COMPLETE ---
Deleted 0 corrupted files.
No .gdoc files found. Your labels might be clean (or the corruption is different).


### Checking label format and that every training label has a matching image


In [8]:
import os
import glob

# --- CONFIGURATION ---
label_dir = '/content/drive/MyDrive/Pollen_viability/datasets/pollen_v1/train/labels'
image_dir = '/content/drive/MyDrive/Pollen_viability/datasets/pollen_v1/train/images'

print(f"üïµÔ∏è‚Äç‚ôÄÔ∏è Inspecting labels in: {label_dir}")

txt_files = glob.glob(os.path.join(label_dir, "*.txt"))
print(f"Found {len(txt_files)} .txt files.")

errors_found = 0

# 1. Check a sample of files for FORMAT errors
# Only the first line of the first 10 label files is inspected, to save time.
# NOTE(review): the synthetic negative samples ("neg_syn_*") are written with
# empty label files on purpose, so an EMPTY FILE report for those is expected.
print("\n--- FORMAT CHECK ---")
for txt_file in txt_files[:10]:
    # Resolved before any I/O so every message below can name the file.
    filename = os.path.basename(txt_file)

    try:
        with open(txt_file, 'r') as f:
            content = f.read().strip()
    except (OSError, UnicodeDecodeError) as e:
        print(f"‚ùå CRITICAL ERROR reading {filename}: {e}")
        errors_found += 1
        continue

    # Error A: Empty File
    if not content:
        print(f"‚ùå EMPTY FILE: {filename}")
        errors_found += 1
        continue

    first_line = content.split('\n')[0].strip()

    # Error B: Wrong Separator (Commas)
    if "," in first_line:
        print(f"‚ùå COMMA DETECTED (Needs spaces): {filename} -> '{first_line}'")
        errors_found += 1

    # Error C: a row needs at least "class x y w h"
    parts = first_line.split()
    if len(parts) < 5:
        print(f"‚ùå MALFORMED LINE (Not enough columns): {filename} -> '{first_line}'")
        errors_found += 1
        continue

    try:
        # check x, y, w, h
        coords = [float(p) for p in parts[1:5]]
    except ValueError as e:
        print(f"‚ùå CRITICAL ERROR reading {filename}: {e}")
        errors_found += 1
        continue

    # Error D: Coordinates not Normalized (must lie within 0-1, so negative
    # values are flagged as well as values above 1)
    if any(c < 0.0 or c > 1.0 for c in coords):
        print(f"‚ùå COORDINATES OUT OF RANGE (Must be 0-1): {filename} -> {coords}")
        errors_found += 1

# 2. Check for FILENAME MISMATCHES (Case Sensitivity)
print("\n--- MATCHING CHECK ---")
images = os.listdir(image_dir)
# Clean extensions to compare base names
img_bases = {os.path.splitext(f)[0]: f for f in images}

mismatches = 0
for txt_file in txt_files:
    base_name = os.path.splitext(os.path.basename(txt_file))[0]

    # Check if exact match exists
    if base_name not in img_bases:
        print(f"‚ö†Ô∏è ORPHAN LABEL: {base_name}.txt exists, but no matching image found!")
        mismatches += 1

if errors_found == 0 and mismatches == 0:
    print("\n‚úÖ GREAT NEWS: Your labels look perfect!")
else:
    print(f"\n‚ö†Ô∏è FOUND ISSUES: {errors_found} format errors, {mismatches} mismatch errors.")


###### DELETING ORPHANS ######

# SET THIS TO TRUE TO ACTUALLY DELETE FILES
# (when False, orphan labels are only reported)
DELETE_ORPHANS = True

print(f"üïµÔ∏è‚Äç‚ôÄÔ∏è Inspecting labels in: {label_dir}")

txt_files = glob.glob(os.path.join(label_dir, "*.txt"))
print(f"Found {len(txt_files)} .txt files.")

# --- MATCHING & PURGE CHECK ---
print("\n--- CHECKING FOR ORPHANS ---")
images = os.listdir(image_dir)
# Create a set of valid image names (without extension) for fast lookup.
# Extensions are compared case-insensitively: otherwise the label of an image
# such as "x.JPG" would look like an orphan and be deleted below.
img_bases = {os.path.splitext(f)[0] for f in images if f.lower().endswith(('.jpg', '.png', '.jpeg'))}

mismatches = 0
deleted_count = 0

for txt_file in txt_files:
    # Get the filename without extension (e.g., "flower_01")
    base_name = os.path.splitext(os.path.basename(txt_file))[0]

    # Check if this base name exists in our image list
    if base_name not in img_bases:
        if DELETE_ORPHANS:
            print(f"üóëÔ∏è DELETING ORPHAN: {base_name}.txt (No matching image)")
            os.remove(txt_file)
            deleted_count += 1
        else:
            print(f"‚ö†Ô∏è FOUND ORPHAN: {base_name}.txt (Set DELETE_ORPHANS=True to remove)")

        mismatches += 1

print(f"\n--- SUMMARY ---")
if mismatches == 0:
    print("‚úÖ All labels have matching images.")
else:
    if DELETE_ORPHANS:
        print(f"üßπ Purged {deleted_count} orphan label files.")
    else:
        print(f"‚ö†Ô∏è Found {mismatches} orphan labels. No files were deleted.")

üïµÔ∏è‚Äç‚ôÄÔ∏è Inspecting labels in: /content/drive/MyDrive/Pollen_viability/datasets/pollen_v1/train/labels
Found 338 .txt files.

--- FORMAT CHECK ---

--- MATCHING CHECK ---

‚úÖ GREAT NEWS: Your labels look perfect!
üïµÔ∏è‚Äç‚ôÄÔ∏è Inspecting labels in: /content/drive/MyDrive/Pollen_viability/datasets/pollen_v1/train/labels
Found 338 .txt files.

--- CHECKING FOR ORPHANS ---

--- SUMMARY ---
‚úÖ All labels have matching images.


### Checking that every validation label has a matching image + deleting orphans


In [9]:
import os
import glob

# --- CONFIGURATION ---
# CHANGED: Pointing to 'val' folders now
label_dir = '/content/drive/MyDrive/Pollen_viability/datasets/pollen_v1/val/labels'
image_dir = '/content/drive/MyDrive/Pollen_viability/datasets/pollen_v1/val/images'

# SET THIS TO TRUE TO ACTUALLY DELETE FILES
DELETE_ORPHANS = True

print(f"üïµÔ∏è‚Äç‚ôÄÔ∏è Inspecting Validation labels in: {label_dir}")

txt_files = glob.glob(os.path.join(label_dir, "*.txt"))
print(f"Found {len(txt_files)} .txt files.")

# --- MATCHING & PURGE CHECK ---
print("\n--- CHECKING FOR ORPHANS ---")
images = os.listdir(image_dir)
# Create a set of valid image names (without extension) for fast lookup.
# Extensions are compared case-insensitively: otherwise the label of an image
# such as "x.JPG" would look like an orphan and be deleted below.
img_bases = {os.path.splitext(f)[0] for f in images if f.lower().endswith(('.jpg', '.png', '.jpeg'))}

mismatches = 0
deleted_count = 0

for txt_file in txt_files:
    # Get the filename without extension (e.g., "flower_01")
    base_name = os.path.splitext(os.path.basename(txt_file))[0]

    # Check if this base name exists in our image list
    if base_name not in img_bases:
        if DELETE_ORPHANS:
            print(f"üóëÔ∏è DELETING ORPHAN: {base_name}.txt (No matching image)")
            os.remove(txt_file)
            deleted_count += 1
        else:
            print(f"‚ö†Ô∏è FOUND ORPHAN: {base_name}.txt (Set DELETE_ORPHANS=True to remove)")

        mismatches += 1

print(f"\n--- SUMMARY ---")
if mismatches == 0:
    print("‚úÖ All validation labels have matching images.")
else:
    if DELETE_ORPHANS:
        print(f"üßπ Purged {deleted_count} orphan label files from Validation.")
    else:
        print(f"‚ö†Ô∏è Found {mismatches} orphan labels. No files were deleted.")

üïµÔ∏è‚Äç‚ôÄÔ∏è Inspecting Validation labels in: /content/drive/MyDrive/Pollen_viability/datasets/pollen_v1/val/labels
Found 61 .txt files.

--- CHECKING FOR ORPHANS ---

--- SUMMARY ---
‚úÖ All validation labels have matching images.


### 👁️ Step 4: Ground Truth Visualization
**Critical Sanity Check:** Before training, we must verify that the "Teacher" (the labels) is correct.
This script draws the bounding boxes from your `.txt` files onto the images.

* **Check Output:** Go to `datasets/pollen_v1/check_train_labels_visual`.
* **Verify:**
    * **Green Box** = Viable (Class 0) -> Should encompass dark/stained grains.
    * **Red Box** = Non-Viable (Class 1) -> Should encompass pale/transparent grains.

In [None]:
import os
import cv2
from tqdm import tqdm

# --- CONFIGURATION ---
# Base path to your dataset
base_dataset_dir = '/content/drive/MyDrive/Pollen_viability/datasets/pollen_v1'

# We will check both splits
splits_to_check = ['train', 'val']

# Visualization Settings
COLOR_MAP = {
    0: (0, 255, 0),    # Class 0 (Viable) = Green
    1: (0, 0, 255)     # Class 1 (Non-Viable) = Red
}
BOX_THICKNESS = 2

print("Starting Visual Inspection for ALL data...")

for split in splits_to_check:
    print(f"\n--- Processing {split.upper()} set ---")

    # Define input paths
    img_dir = os.path.join(base_dataset_dir, split, 'images')
    lbl_dir = os.path.join(base_dataset_dir, split, 'labels')

    # Define output path (e.g., check_training_labels, check_val_labels)
    output_dir = os.path.join(base_dataset_dir, f'check_{split}_labels_visual')
    os.makedirs(output_dir, exist_ok=True)

    # Get all images
    if not os.path.exists(img_dir):
        print(f"Skipping {split} (folder not found)")
        continue

    # Case-insensitive extension match so "x.JPG" / "x.PNG" are visualized too.
    image_files = [f for f in os.listdir(img_dir) if f.lower().endswith(('.jpg', '.png', '.jpeg'))]
    print(f"Saving images to: {output_dir}")

    # Loop with progress bar
    for img_file in tqdm(image_files):
        img_path = os.path.join(img_dir, img_file)

        # Find matching label
        label_file = os.path.splitext(img_file)[0] + ".txt"
        label_path = os.path.join(lbl_dir, label_file)

        # Read Image
        img = cv2.imread(img_path)
        if img is None: continue
        h_img, w_img = img.shape[:2]

        # Draw Boxes if label exists
        if os.path.exists(label_path):
            with open(label_path, 'r') as f:
                lines = f.readlines()

            # Draw one rectangle per label row; valid_label records whether at
            # least one row could be drawn.
            valid_label = False
            for line in lines:
                parts = line.strip().split()
                if len(parts) < 5: continue

                try:
                    cls_id = int(parts[0])
                    values = [float(v) for v in parts[1:]]
                except ValueError:
                    continue # Skip corrupt lines

                if len(values) > 5 and len(values) % 2 == 0:
                    # Row holds x/y pairs (a polygon): draw its bounding box
                    # instead of silently dropping the row.
                    # NOTE(review): assumes rows with more than 6 columns are
                    # polygons, as in the Ultralytics label loader - confirm.
                    xs, ys = values[0::2], values[1::2]
                    left, top, right, bottom = min(xs), min(ys), max(xs), max(ys)
                else:
                    # Plain box row: x_center y_center width height. Extra
                    # trailing columns are ignored rather than failing the
                    # unpack.
                    x_center, y_center, w, h = values[:4]
                    left, top = x_center - w/2, y_center - h/2
                    right, bottom = x_center + w/2, y_center + h/2

                # Convert YOLO normalized -> Pixels
                x1 = int(left * w_img)
                y1 = int(top * h_img)
                x2 = int(right * w_img)
                y2 = int(bottom * h_img)

                # Draw Rectangle (unknown class ids are drawn in white)
                color = COLOR_MAP.get(cls_id, (255, 255, 255))
                cv2.rectangle(img, (x1, y1), (x2, y2), color, BOX_THICKNESS)

                # Draw ID: "V" / "NV" for the two known classes, the raw id
                # otherwise so an unexpected class is not shown as "NV".
                label_text = {0: "V", 1: "NV"}.get(cls_id, str(cls_id))
                cv2.putText(img, label_text, (x1, y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)
                valid_label = True

            # If label file was empty
            if not valid_label and os.path.getsize(label_path) == 0:
                 cv2.putText(img, "EMPTY LABEL FILE", (20, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)

        else:
            # Mark MISSING labels clearly
            cv2.putText(img, "NO LABEL FOUND", (20, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
            # Optional: Draw a red border around the whole image to verify it's a negative sample
            cv2.rectangle(img, (0,0), (w_img, h_img), (0,0,255), 10)

        # Save the annotated image
        cv2.imwrite(os.path.join(output_dir, img_file), img)

print("\nDone! Please check these folders in your Drive:")
print(f"1. {os.path.join(base_dataset_dir, 'check_train_labels_visual')}")
print(f"2. {os.path.join(base_dataset_dir, 'check_val_labels_visual')}")

Starting Visual Inspection for ALL data...

--- Processing TRAIN set ---
Saving images to: /content/drive/MyDrive/Pollen_viability/datasets/pollen_v1/check_train_labels_visual


 20%|‚ñà‚ñâ        | 67/338 [03:52<17:52,  3.96s/it]

### ⚙️ Step 5: Create the Dataset Configuration (`data.yaml`)
This cell generates the `data.yaml` file required by YOLO. It defines the paths to your Train/Val folders and the class names.

* **Classes:** `['viable', 'non_viable']`
* **Path:** Points to your Google Drive dataset location.

In [None]:
import yaml
import os # Import os module

# Define the content of the data.yaml file
data_config = {
    # PATHS: Points to where we just moved your files
    'path': '/content/drive/MyDrive/Pollen_viability/datasets/pollen_v1',
    'train': 'train/images',
    'val': 'val/images',

    # CLASSES:
    # IMPORTANT: These must match your annotation IDs (0, 1, etc.)
    # Example: If your .txt file says "0 0.5 0.5...", "0" corresponds to the first name here.
    'nc': 2,  # Number of classes
    'names': ['viable', 'non_viable'] # Change these names to match your data!
}

# Save it to the dataset folder
yaml_path = '/content/drive/MyDrive/Pollen_viability/datasets/pollen_v1/data.yaml'

# Ensure the directory exists before writing the file
os.makedirs(os.path.dirname(yaml_path), exist_ok=True)

# Write the configuration as block-style YAML.
with open(yaml_path, 'w') as f:
    yaml.dump(data_config, f, default_flow_style=False)

print(f"Configuration file created at: {yaml_path}")
print("--- Content ---")
# Read the file back through a context manager so the handle is closed
# right away instead of being left open.
with open(yaml_path) as f:
    print(f.read())

### 🏋️‍♀️ Step 6: Train the Model
This process will take 1-3 hours depending on the model size.

**Biological Hyperparameters:**
We use specific augmentations to make the model robust to microscope variations:
* `degrees=180`: Pollen has no "up" or "down" (rotational invariance).
* `flipud=0.5`: Vertical flipping enabled.
* `hsv_s=0.1`: **Low Saturation Noise**. We limit color augmentation because color (Dark vs. Pale) is the primary viability signal.
* `patience=50`: Early Stopping is enabled. If the model stops improving for 50 epochs, training stops to prevent overfitting.

In [74]:
# Train the model
# data: Points to the yaml file we just created
# epochs: Upper limit on training cycles through the data; `patience` below
#         stops the run earlier once the model no longer improves
# imgsz: Image size (640 is standard for YOLO)
# We pass these arguments directly to the train() function.
results = model.train(
    data='/content/drive/MyDrive/Pollen_viability/datasets/pollen_v1/data.yaml',
    epochs=500,
    patience=50,       # Stop if no improvement for 50 epochs
    imgsz=640,
    batch=32, # Images per batch; lower this if the GPU runs out of memory
    name='pollen_v1_aug', # Run name (results folder) so we don't overwrite the old one

    # --- AUGMENTATION PARAMETERS ---
    degrees=180,        # Rotate image randomly between -180 and +180 (Pollen has no "up")
    flipud=0.5,         # 50% chance to flip Up-Down (Pollen doesn't care about gravity)
    fliplr=0.5,         # 50% chance to flip Left-Right
    scale=0.1,
       # Zoom in/out by up to 10% (simulates different crop sizes); a value to experiment with
    hsv_h=0.005,        # Adjust Hue slightly (Color variation)
    hsv_s=0.1,          # Keep saturation jitter low: stain colour (dark vs. pale) is the viability signal
    hsv_v=0.6,          # Adjust Value/Brightness (Microscope lighting changes)
    iou=0.45,           # NOTE(review): presumably the IoU threshold used for NMS during validation - confirm in the Ultralytics docs

    # Advanced Mixes
    mosaic=1.0,         # (Default) Stitches 4 images together. Great for small objects.
    mixup=0,          # MixUp (blending 2 images together) is switched off here
)

Ultralytics 8.3.235 üöÄ Python-3.12.12 torch-2.9.0+cu126 CUDA:0 (NVIDIA L4, 22693MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=/cont\ent/drive/MyDrive/Pollen_viability/datasets/pollen_v1/data.yaml, degrees=180, deterministic=True, device=None, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=1500, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.5, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.005, hsv_s=0.1, hsv_v=0.6, imgsz=640, int8=False, iou=0.45, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0, mode=train, model=yolov8l.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=pollen_v1_aug13, nbs=64, nms=False, opset=None, optimize=False, optimizer=auto,

  data='/cont\ent/drive/MyDrive/Pollen_viability/datasets/pollen_v1/data.yaml',


RuntimeError: Dataset '/cont\ent/drive/MyDrive/Pollen_viability/datasets/pollen_v1/data.yaml' error ‚ùå '/cont\ent/drive/MyDrive/Pollen_viability/datasets/pollen_v1/data.yaml' does not exist

### 💾 Step 7: Save the "Golden" Model
The training results are temporarily stored in Colab's memory (`/content/runs/...`). This script copies the **best** performing weights to your Google Drive for permanent storage.

* **Destination:** `Pollen_viability/trained_models/`
* **Naming:** Includes the date (e.g., `pollen_yolo_aug_2025-12-10.pt`) so you can track versions.

**Code (GitHub):**
The repository contains the *pipeline* (notebooks, scripts, config).
* `pollen_viability.ipynb`: Main training/inference workflow.
* `datasets/data.yaml`: Configuration map.

**Artifacts (Google Drive):**
Due to file size constraints, trained model weights (`.pt` files) are stored in Google Drive, not GitHub.
* Location: `Pollen_viability/trained_models/`
* Naming Convention: `pollen_yolo_aug_YYYY-MM-DD.pt`

**Reproducibility:**
To reproduce results, clone the GitHub repo, mount Drive, and point the `model = YOLO()` function to the specific `.pt` file in Drive.

In [None]:
import shutil
import os
from datetime import datetime

def unique_destination(path):
    """Return a destination path that does not overwrite an existing file.

    If *path* is free it is returned unchanged. Otherwise a numeric suffix is
    inserted before the extension (``model.pt`` -> ``model_1.pt``,
    ``model_2.pt``, ...) and the first unused candidate is returned.
    """
    if not os.path.exists(path):
        return path
    stem, ext = os.path.splitext(path)
    counter = 1
    while os.path.exists(f"{stem}_{counter}{ext}"):
        counter += 1
    return f"{stem}_{counter}{ext}"


# 1. Define where your weights are NOW (Temporary Colab storage)
# Note: Check that the run folder ('pollen_v1_aug6' here) is the one produced by
# your training "name=" parameter; repeated runs may end up in differently
# numbered folders (the training log above shows e.g. 'pollen_v1_aug13').
source_weight = '/content/runs/detect/pollen_v1_aug6/weights/best.pt'

# 2. Define where you want them PERMANENTLY (Google Drive)
# The file name carries the date; a second save on the same day gets a numeric
# suffix (see unique_destination) so an earlier model is never overwritten.
date_str = datetime.now().strftime("%Y-%m-%d")
dest_folder = '/content/drive/MyDrive/Pollen_viability/trained_models'
dest_filename = f'pollen_yolo_aug_{date_str}.pt'
dest_path = os.path.join(dest_folder, dest_filename)

# 3. Copy the file
if os.path.exists(source_weight):
    os.makedirs(dest_folder, exist_ok=True)
    # Resolve the final name only after the folder exists, right before copying
    dest_path = unique_destination(dest_path)
    shutil.copy(source_weight, dest_path)
    print(f"‚úÖ Model saved safely to: {dest_path}")
    print("You can now download this file or load it from Drive anytime.")
else:
    print(f"‚ùå Could not find {source_weight}. Did the training finish?")

‚úÖ Model saved safely to: /content/drive/MyDrive/Pollen_viability/trained_models/pollen_yolo_aug_2025-12-10.pt
You can now download this file or load it from Drive anytime.


### 🔬 Step 8: Routine Detection Tool
**Use this section to process new experiments.**

1.  **Input:** Upload your raw microscope images to `detect_images/`.
2.  **Run:** Execute the cell below.
3.  **Output:**
    * **CSV:** `pollen_counts_universal.csv` (Contains Viable/Non-Viable counts for every image).
    * **Visuals:** Annotated images saved in `detected/`.

**Dynamic Resolution Logic:**
The script automatically detects whether an image is a **Full Slide Scan** (width or height greater than 1000 px) or a **Crop** (both sides at most 1000 px).
* **Full Slide:** Uses High-Res inference (`imgsz=1600`) to find small grains.
* **Crop:** Uses Standard inference (`imgsz=640`) to avoid scaling artifacts.

In [None]:
import os
import cv2
import pandas as pd
from ultralytics import YOLO

# --- CONFIGURATION ---
# NOTE: model_path points into the temporary Colab run folder; once the session
# has been reset, point it at the copy saved to Drive in Step 7 instead.
model_path = '/content/runs/detect/pollen_v1_aug6/weights/best.pt'
image_dir = '/content/drive/MyDrive/Pollen_viability/detect_images'
save_image_dir = '/content/drive/MyDrive/Pollen_viability/detected'
output_csv_path = '/content/drive/MyDrive/Pollen_viability/pollen_counts_universal.csv'

# --- THRESHOLDS (Set them here!) ---
CONF_VIABLE = 0.60
CONF_NON_VIABLE = 0.45

# Visualization Config (OpenCV colours are in BGR order)
COLOR_VIABLE = (0, 200, 0)      # Green
COLOR_NON_VIABLE = (0, 0, 200)  # Red
BOX_THICKNESS = 4               # Thinned slightly for clarity
LABEL_HEIGHT = 20               # Height in px of the filled label strip

# Accepted image extensions; compared case-insensitively so '.JPG' is not skipped
IMAGE_EXTENSIONS = ('.jpg', '.png', '.jpeg')

# Fixed column layout, so the CSV is well-formed even when no image was processed
CSV_COLUMNS = ['image_id', 'viable_pollen', 'non_viable_pollen',
               'Total Objects', 'resolution_mode']

# Per-class rules: class id -> (count key, name used in messages,
# keep-threshold, floor above which a rejected detection is still reported)
CLASS_RULES = {
    0: ('viable', 'Viable', CONF_VIABLE, 0.5),
    1: ('non_viable', 'Non-Viable', CONF_NON_VIABLE, 0.2),
}

os.makedirs(save_image_dir, exist_ok=True)
model = YOLO(model_path)

print("--- STARTING DEBUG RUN ---")
print(f"Viable Threshold: {CONF_VIABLE}")
print(f"Non-Viable Threshold: {CONF_NON_VIABLE}")

data_rows = []
image_files = [f for f in os.listdir(image_dir)
               if f.lower().endswith(IMAGE_EXTENSIONS)]

for img_file in image_files:
    img_path = os.path.join(image_dir, img_file)

    # 1. READ IMAGE
    # cv2.imread signals failure by returning None instead of raising, so a
    # corrupt or unreadable file is skipped rather than aborting the batch.
    img_array = cv2.imread(img_path)
    if img_array is None:
        print(f"[{img_file}] SKIPPED: could not read image")
        continue
    height, width = img_array.shape[:2]

    # 2. DYNAMIC LOGIC
    # Large images (either side > 1000 px) are run at high resolution,
    # everything else at the standard size.
    inference_size = 1600 if (width > 1000 or height > 1000) else 640

    # 3. RUN INFERENCE (Low conf to catch everything, filter later)
    results = model(img_path, verbose=False, imgsz=inference_size, conf=0.1)

    boxes = results[0].boxes
    keep_indices = []

    final_counts = {'viable': 0, 'non_viable': 0}

    if boxes is not None and len(boxes) > 0:
        # Convert to numpy for easy looping
        cls_ids = boxes.cls.cpu().numpy().astype(int)
        confs = boxes.conf.cpu().numpy()

        for i, (cls_id, conf) in enumerate(zip(cls_ids, confs)):
            rule = CLASS_RULES.get(int(cls_id))
            if rule is None:
                # Classes other than 0/1 are neither counted nor drawn
                continue
            count_key, class_name, threshold, report_floor = rule
            conf = float(conf)  # Ensure it's a standard python float

            if conf > threshold:
                final_counts[count_key] += 1
                keep_indices.append(i)
            elif conf > report_floor:
                # Print when we REJECT something that is close
                print(f"[{img_file}] REJECTED {class_name} with score {conf:.2f} (Threshold {threshold})")

    # 4. DRAWING LOOP
    for idx in keep_indices:
        x1, y1, x2, y2 = boxes.xyxy[idx].cpu().numpy().astype(int)
        cls_id = int(boxes.cls[idx])
        conf = float(boxes.conf[idx])

        color = COLOR_VIABLE if cls_id == 0 else COLOR_NON_VIABLE
        label = "V" if cls_id == 0 else "NV"

        cv2.rectangle(img_array, (x1, y1), (x2, y2), color, BOX_THICKNESS)

        label_text = f"{label} {conf:.2f}"
        (text_w, _), _ = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)
        # The label normally sits just above the box; for boxes touching the top
        # edge it is clamped to y=0 so it stays inside the image.
        label_top = max(y1 - LABEL_HEIGHT, 0)
        cv2.rectangle(img_array, (x1, label_top),
                      (x1 + text_w, label_top + LABEL_HEIGHT), color, -1)
        cv2.putText(img_array, label_text, (x1, label_top + LABEL_HEIGHT - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

    # Save
    save_path = os.path.join(save_image_dir, img_file)
    if not cv2.imwrite(save_path, img_array):
        # imwrite returns False on failure instead of raising
        print(f"[{img_file}] WARNING: could not write annotated image to {save_path}")

    # Log Data ('resolution_mode' stores the inference size that was used)
    row = {
        'image_id': img_file,
        'viable_pollen': final_counts['viable'],
        'non_viable_pollen': final_counts['non_viable'],
        'Total Objects': final_counts['viable'] + final_counts['non_viable'],
        'resolution_mode': inference_size
    }
    data_rows.append(row)

# Save CSV
# Passing explicit columns keeps sort_values from raising KeyError on an empty run
df_results = pd.DataFrame(data_rows, columns=CSV_COLUMNS).sort_values(by='image_id')
df_results.to_csv(output_csv_path, index=False)
print("\nProcessing Complete.")

--- STARTING DEBUG RUN ---
Viable Threshold: 0.6
Non-Viable Threshold: 0.45
[5-5-J_2x_B.jpg] REJECTED Non-Viable with score 0.33 (Threshold 0.45)
[5-5-J_2x_B.jpg] REJECTED Non-Viable with score 0.28 (Threshold 0.45)
[6-1-F_2x_D.jpg] REJECTED Viable with score 0.58 (Threshold 0.6)
[6-3-I_2x_B.jpg] REJECTED Non-Viable with score 0.44 (Threshold 0.45)
[6-1-F_2x_A.jpg] REJECTED Non-Viable with score 0.30 (Threshold 0.45)
[6-1-F_2x_A.jpg] REJECTED Non-Viable with score 0.24 (Threshold 0.45)
[6-1-F_2x_A.jpg] REJECTED Non-Viable with score 0.24 (Threshold 0.45)
[6-5-A_2x_A.jpg] REJECTED Viable with score 0.59 (Threshold 0.6)
[6-5-A_2x_A.jpg] REJECTED Viable with score 0.55 (Threshold 0.6)
[2-9-A_4x_A.jpg] REJECTED Non-Viable with score 0.32 (Threshold 0.45)
[2-9-A_4x_A.jpg] REJECTED Non-Viable with score 0.23 (Threshold 0.45)
[2-9-A_4x_D.jpg] REJECTED Non-Viable with score 0.31 (Threshold 0.45)
[6-5-A_2x_B.jpg] REJECTED Viable with score 0.55 (Threshold 0.6)
[6-5-A_2x_B.jpg] REJECTED Viable w