In [None]:
!pip install boto3
!pip install ultralytics

In [None]:
import os
import re
from datetime import datetime, timezone
import csv
import json
from pathlib import Path
import pandas as pd
import boto3
import shutil
import random
from pathlib import Path
import cv2
import numpy as np
from pathlib import Path
from ultralytics import YOLO

In [None]:
# Module-level CSV locations for this notebook.
# NOTE(review): main() assigns its own local CSV_PATH / CSV_OUTPUT, so these
# module-level values are not the ones main() reads or writes.
CSV_PATH = '/content/image_segmentation_grey.csv'  # CSV with filtering criteria
CSV_OUTPUT = '/content/image_boat_count.csv'  # presumably the per-image boat-count output; confirm against caller

In [None]:
"""
Automatic Boat Detection with Enhanced Filtering
- Prevents double-counting with NMS tuning
- Detects tiny boats (10-20px) and large boats
- Filters: top 25% (mountains/sky) and bottom 20% of the frame by default (no right-edge filter is implemented)
- Processes only daytime images
"""

def load_daytime_images(csv_path):
    """
    Collect the filenames of all rows labelled as daytime.

    Each CSV row is expected to carry the filename in its first field and the
    time-of-day label in its third field. Rows with fewer than three fields
    are ignored, and the label is compared case-insensitively after trimming
    whitespace (so a header row is skipped unless its third field is 'day').

    Args:
        csv_path: Path to the CSV file to read

    Returns:
        set of (whitespace-stripped) filenames whose label equals 'day'
    """
    with open(csv_path, 'r') as handle:
        day_names = {
            record[0].strip()
            for record in csv.reader(handle)
            if len(record) >= 3 and record[2].strip().lower() == 'day'
        }

    print(f"Loaded {len(day_names)} daytime images from CSV")
    return day_names


def is_in_exclusion_zone(box, width, height, exclude_top_percent=25, exclude_bottom_percent=20):
    """
    Check if bounding box is in any exclusion zone

    The two zones use different reference points on purpose:
    - top zone: the box is excluded when its vertical CENTER lies above the
      top boundary;
    - bottom zone: the box is excluded as soon as its BOTTOM EDGE crosses the
      bottom boundary.
    The top zone is tested first, so a box matching both is reported as
    "top_zone".

    Args:
        box: dict with 'xyxy' key containing [x1, y1, x2, y2]
        width: image width (accepted for interface symmetry; no horizontal
            exclusion is applied)
        height: image height
        exclude_top_percent: percentage of top to exclude
        exclude_bottom_percent: percentage of bottom to exclude

    Returns:
        (is_excluded, reason) tuple; reason is "top_zone", "bottom_zone" or
        None when the box is kept
    """
    # Only the vertical extent matters for the zone checks.
    _, y1, _, y2 = box['xyxy']

    # Calculate boundaries (in pixels from the top of the image)
    top_threshold = height * (exclude_top_percent / 100)
    bottom_threshold = height * (1 - exclude_bottom_percent / 100)

    y_center = (y1 + y2) / 2

    if y_center < top_threshold:
        return True, "top_zone"
    if y2 > bottom_threshold:  # Check bottom edge instead of center
        return True, "bottom_zone"
    return False, None


def calculate_iou(box1, box2):
    """
    Intersection-over-Union of two axis-aligned boxes.

    Both arguments are dicts whose 'xyxy' entry holds [x1, y1, x2, y2].
    Returns 0.0 when the boxes do not overlap or when the union area is not
    positive.
    """
    ax1, ay1, ax2, ay2 = box1['xyxy']
    bx1, by1, bx2, by2 = box2['xyxy']

    # Corners of the overlapping rectangle (if any)
    left, top = max(ax1, bx1), max(ay1, by1)
    right, bottom = min(ax2, bx2), min(ay2, by2)

    if right < left or bottom < top:
        return 0.0

    overlap = (right - left) * (bottom - top)

    area_a = (ax2 - ax1) * (ay2 - ay1)
    area_b = (bx2 - bx1) * (by2 - by1)
    combined = area_a + area_b - overlap

    if combined > 0:
        return overlap / combined
    return 0.0


def apply_custom_nms(boxes, iou_threshold=0.4):
    """
    Greedy Non-Maximum Suppression used to avoid counting one boat twice.

    Boxes are visited from highest to lowest confidence. Each visited box is
    kept, and every remaining box whose IoU with it is not below
    ``iou_threshold`` is moved to the suppressed list.

    Args:
        boxes: list of box dicts with 'xyxy' and 'conf' keys
        iou_threshold: IoU at or above which two boxes count as duplicates

    Returns:
        tuple: (kept_boxes, suppressed_boxes)
    """
    if len(boxes) == 0:
        return [], []

    # Highest confidence first; the input list itself is left untouched.
    candidates = sorted(boxes, key=lambda item: item['conf'], reverse=True)

    kept = []
    dropped = []

    while len(candidates) > 0:
        best, rest = candidates[0], candidates[1:]
        kept.append(best)

        # Split the rest into survivors (low overlap) and duplicates.
        survivors = []
        for other in rest:
            if calculate_iou(best, other) < iou_threshold:
                survivors.append(other)
            else:
                dropped.append(other)

        candidates = survivors

    return kept, dropped


def detect_boats_yolo(model,
    image_folder,
    output_folder,
    csv_path=None,
    export_format='yolo',
    exclude_top_percent=25,
    exclude_bottom_percent=20,
    confidence_threshold=0.05,  # Very low for tiny boats (increase if too many false positives)
    iou_threshold=0.5,  # NMS threshold for duplicate detection
    imgsz=1920,  # Larger image size critical for 10-20px boats
    verbose=False  # Opt-in per-image status lines and final statistics
):
    """
    Detect boats using YOLO with enhanced filtering and NMS

    For every .jpg/.png/.jpeg file in ``image_folder`` the model is run, boat
    detections (class id 8) are filtered by vertical exclusion zones,
    de-duplicated with ``apply_custom_nms``, exported in the requested
    annotation format and drawn onto a copy of the image that is written to
    ``<output_folder>/images``.

    Args:
        model: Loaded detection model; it is invoked as
            ``model(img, conf=..., iou=..., imgsz=..., ...)`` and its results
            must expose ``.boxes`` with ``cls``, ``xyxy`` and ``conf``
        image_folder: Input folder with images
        output_folder: Output folder for annotations ('labels' and 'images'
            subfolders are created inside it)
        csv_path: Path to CSV with daytime/nighttime labels (optional). When
            given, only images listed as daytime are processed - including
            the case where the CSV lists no daytime image at all
        export_format: 'yolo', 'coco', or 'voc'
        exclude_top_percent: Percentage of top to exclude (mountains/sky)
        exclude_bottom_percent: Percentage of bottom to exclude
        confidence_threshold: Minimum confidence (0.05 aggressive, 0.15 conservative)
        iou_threshold: IoU threshold for the custom NMS (0.3=aggressive, 0.6=conservative)
        imgsz: Image size for inference (larger = better tiny boat detection)
        verbose: When True, print one status line per image plus a summary
            of the filter/size statistics at the end (default: False)

    Returns:
        int: total number of boats kept across all processed images
    """

    # Load daytime filter if CSV provided
    daytime_images = None
    if csv_path:
        daytime_images = load_daytime_images(csv_path)

    # Create output directories
    Path(output_folder).mkdir(parents=True, exist_ok=True)
    labels_folder = Path(output_folder) / 'labels'
    images_folder = Path(output_folder) / 'images'
    labels_folder.mkdir(exist_ok=True)
    images_folder.mkdir(exist_ok=True)

    annotations = []
    image_id = 0
    total_detections = 0
    skipped_nighttime = 0

    # Tracking statistics
    stats = {
        'filtered_low_conf': 0,
        'filtered_top_zone': 0,
        'filtered_bottom_zone': 0,
        'filtered_nms': 0,
        'tiny_boats': 0,  # <= 20px max dimension
        'small_boats': 0,  # 21-50px
        'medium_boats': 0,  # 51-100px
        'large_boats': 0   # > 100px
    }

    # Box colors by size category (OpenCV uses BGR order).
    # Built once instead of once per drawn box.
    color_map = {
        'tiny': (0, 255, 0),      # Green
        'small': (255, 255, 0),    # Cyan
        'medium': (255, 128, 0),   # Blue
        'large': (255, 0, 255)     # Magenta
    }

    # Process all images
    image_files = list(Path(image_folder).glob('*.jpg')) + \
                  list(Path(image_folder).glob('*.png')) + \
                  list(Path(image_folder).glob('*.jpeg'))

    print(f"\nProcessing {len(image_files)} image...")

    for img_path in image_files:
        # Skip nighttime images if CSV filter is active.
        # Compare against None explicitly: an EMPTY daytime set still means
        # "filter active" and must skip every image rather than none.
        if daytime_images is not None and img_path.name not in daytime_images:
            skipped_nighttime += 1
            continue

        # Read image
        img = cv2.imread(str(img_path))
        if img is None:
            print(f"Warning: Could not read {img_path.name}")
            continue

        height, width = img.shape[:2]

        # Run inference with settings optimized for tiny boats
        # Very low conf threshold and larger image size
        results = model(
            img,
            conf=confidence_threshold,
            iou=0.9,  # YOLO's built-in NMS - very high to keep almost all detections
            imgsz=imgsz,
            max_det=300,  # Explicit cap on detections per image
            agnostic_nms=False,  # Class-specific NMS
            verbose=False
        )

        # Extract boat detections (class 8 in COCO)
        raw_boxes = []
        for result in results:
            for box in result.boxes:
                class_id = int(box.cls[0])
                if class_id == 8:  # boat class
                    x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
                    confidence = float(box.conf[0])

                    raw_boxes.append({
                        'xyxy': [float(x1), float(y1), float(x2), float(y2)],
                        'conf': confidence,
                        'class': class_id
                    })

        # Apply region-based filtering
        filtered_boxes = []
        for box in raw_boxes:
            excluded, reason = is_in_exclusion_zone(
                box, width, height,
                exclude_top_percent,
                exclude_bottom_percent
            )

            if excluded:
                stats[f'filtered_{reason}'] += 1
            else:
                filtered_boxes.append(box)

        # Apply custom NMS to prevent double-counting
        final_boxes, suppressed_boxes = apply_custom_nms(filtered_boxes, iou_threshold)
        stats['filtered_nms'] += len(suppressed_boxes)

        # Categorize boats by size (largest side of the box, in pixels)
        for box in final_boxes:
            x1, y1, x2, y2 = box['xyxy']
            box_width = x2 - x1
            box_height = y2 - y1
            max_dim = max(box_width, box_height)

            if max_dim <= 20:
                stats['tiny_boats'] += 1
                box['size_category'] = 'tiny'
            elif max_dim <= 50:
                stats['small_boats'] += 1
                box['size_category'] = 'small'
            elif max_dim <= 100:
                stats['medium_boats'] += 1
                box['size_category'] = 'medium'
            else:
                stats['large_boats'] += 1
                box['size_category'] = 'large'

        total_detections += len(final_boxes)

        # Print status (opt-in so parallel callers with a progress bar stay quiet)
        if verbose:
            status = f"{img_path.name}: {len(final_boxes)} boats"
            if len(final_boxes) > 0:
                size_counts = {}
                for box in final_boxes:
                    cat = box['size_category']
                    size_counts[cat] = size_counts.get(cat, 0) + 1
                size_str = ", ".join([f"{count} {cat}" for cat, count in sorted(size_counts.items())])
                status += f" ({size_str})"

            filters = []
            if len(suppressed_boxes) > 0:
                filters.append(f"{len(suppressed_boxes)} duplicates")
            if filters:
                status += f" [filtered: {', '.join(filters)}]"
            print(status)

        # Export annotations
        if export_format == 'yolo':
            export_yolo_format(img_path, final_boxes, width, height, labels_folder)
        elif export_format == 'coco':
            export_coco_format(img_path, final_boxes, width, height, image_id, annotations)
        elif export_format == 'voc':
            export_voc_format(img_path, final_boxes, width, height, labels_folder)

        # Save annotated image with color-coded boxes by size
        annotated_img = img.copy()

        # Draw exclusion zones first (as transparent overlays)
        overlay = annotated_img.copy()

        # Top zone (mountains/sky) - red
        top_threshold = int(height * (exclude_top_percent / 100))
        cv2.rectangle(overlay, (0, 0), (width, top_threshold), (0, 0, 255), -1)

        # Bottom zone - orange
        bottom_threshold = int(height * (1 - exclude_bottom_percent / 100))
        cv2.rectangle(overlay, (0, bottom_threshold), (width, height), (0, 165, 255), -1)

        # Blend overlay with original (20% overlay, 80% image), in place
        cv2.addWeighted(overlay, 0.2, annotated_img, 0.8, 0, annotated_img)

        # Draw boundary lines
        cv2.line(annotated_img, (0, top_threshold), (width, top_threshold), (0, 0, 255), 2)
        cv2.line(annotated_img, (0, bottom_threshold), (width, bottom_threshold), (0, 165, 255), 2)

        # Draw boat bounding boxes with color by size
        for box in final_boxes:
            x1, y1, x2, y2 = [int(v) for v in box['xyxy']]

            # Color code by size: tiny=green, small=cyan, medium=blue, large=magenta
            color = color_map.get(box['size_category'], (0, 255, 0))

            cv2.rectangle(annotated_img, (x1, y1), (x2, y2), color, 2)

            # Add label with confidence and size
            box_w, box_h = x2 - x1, y2 - y1
            label = f"{box['size_category']} {box['conf']:.2f} ({int(max(box_w, box_h))}px)"

            # Background for text
            (text_w, text_h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.4, 1)
            cv2.rectangle(annotated_img, (x1, y1 - text_h - 4), (x1 + text_w, y1), color, -1)
            cv2.putText(annotated_img, label, (x1, y1 - 2),
                       cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 0, 0), 1)

        # Add legend
        legend_y = 30
        cv2.putText(annotated_img, f"Boats: {len(final_boxes)}", (10, legend_y),
                   cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

        cv2.imwrite(str(images_folder / img_path.name), annotated_img)
        image_id += 1

    # Save COCO JSON if needed
    if export_format == 'coco':
        save_coco_json(annotations, output_folder)

    if verbose:
        print(f"Skipped {skipped_nighttime} images not listed as daytime")
        print(f"Filter/size statistics: {stats}")

    return total_detections

def export_yolo_format(img_path, boxes, width, height, labels_folder):
    """
    Write one YOLO label file for an image.

    The file is named after the image stem (``<stem>.txt``) inside
    ``labels_folder`` and holds one line per box:
    ``class x_center y_center width height``, all normalized by the image
    size and formatted with six decimals. The class id is always written as 0.
    """
    def _as_row(entry):
        # Pixel corners -> normalized center/size.
        left, top, right, bottom = entry['xyxy']
        cx = ((left + right) / 2) / width
        cy = ((top + bottom) / 2) / height
        norm_w = (right - left) / width
        norm_h = (bottom - top) / height
        return f"0 {cx:.6f} {cy:.6f} {norm_w:.6f} {norm_h:.6f}\n"

    target = labels_folder / f"{img_path.stem}.txt"

    with open(target, 'w') as out:
        for entry in boxes:
            out.write(_as_row(entry))


def export_coco_format(img_path, boxes, width, height, image_id, annotations):
    """
    Append one COCO-style annotation dict per box to ``annotations``.

    The list is mutated in place; nothing is returned. Each annotation uses
    ``[x, y, width, height]`` for 'bbox', category id 1, and an id derived
    from ``image_id * 1000 + position of the box``. ``img_path``, ``width``
    and ``height`` are accepted but not used here.
    """
    id_base = image_id * 1000

    for position, entry in enumerate(boxes):
        left, top, right, bottom = entry['xyxy']
        span_x = right - left
        span_y = bottom - top

        annotations.append({
            'id': id_base + position,
            'image_id': image_id,
            'category_id': 1,
            'bbox': [float(left), float(top), float(span_x), float(span_y)],
            'area': float(span_x * span_y),
            'iscrowd': 0,
            'score': entry['conf']
        })


def save_coco_json(annotations, output_folder):
    """
    Dump the collected annotations to ``<output_folder>/annotations.json``.

    The document has the three top-level COCO keys. Only 'annotations' and a
    single 'boat' category are populated.
    """
    destination = Path(output_folder) / 'annotations.json'

    # NOTE(review): 'images' is always written empty, so annotation
    # 'image_id' values have no matching image record in this file.
    payload = {
        'images': [],
        'annotations': annotations,
        'categories': [{'id': 1, 'name': 'boat'}]
    }

    with destination.open('w') as handle:
        json.dump(payload, handle, indent=2)


def export_voc_format(img_path, boxes, width, height, labels_folder):
    """
    Write a Pascal VOC style XML annotation for one image.

    The file is stored as ``<labels_folder>/<image stem>.xml`` and contains
    the image filename, its size (depth fixed to 3) and one 'boat' object per
    box, with corner coordinates truncated to integers.
    """
    import xml.etree.ElementTree as ET

    def _child(parent, tag, text):
        # Create <tag>text</tag> under parent.
        ET.SubElement(parent, tag).text = text

    annotation = ET.Element('annotation')
    _child(annotation, 'filename', img_path.name)

    size_node = ET.SubElement(annotation, 'size')
    _child(size_node, 'width', str(width))
    _child(size_node, 'height', str(height))
    _child(size_node, 'depth', '3')

    for entry in boxes:
        left, top, right, bottom = entry['xyxy']

        object_node = ET.SubElement(annotation, 'object')
        _child(object_node, 'name', 'boat')

        corners = ET.SubElement(object_node, 'bndbox')
        for tag, value in (('xmin', left), ('ymin', top), ('xmax', right), ('ymax', bottom)):
            _child(corners, tag, str(int(value)))

    destination = labels_folder / f"{img_path.stem}.xml"
    ET.ElementTree(annotation).write(destination)

# Get Filtered Image

In [None]:
import pandas as pd

def get_and_remove_day_image(csv_path, segment_column='segment', image_column=None):
    """
    Pop the first 'day' row from a CSV file and return its image name.

    The CSV is loaded, the first row whose ``segment_column`` equals 'day' is
    selected, that row is dropped, and the file is rewritten in place
    (without the index column).

    Args:
        csv_path: Path to local CSV file (read and then overwritten)
        segment_column: Column name containing 'day'/'night' (default: 'segment')
        image_column: Column name containing image filenames. When None, the
            first of 'image', 'filename', 'file', 'image_name', 'image_path'
            present in the CSV is used, falling back to the first column

    Returns:
        str: Image filename, or None if no day images found (in which case
        the CSV is left untouched)
    """
    frame = pd.read_csv(csv_path)

    print(f"Loaded CSV with {len(frame)} rows")

    day_rows = frame.loc[frame[segment_column] == 'day']

    if day_rows.shape[0] == 0:
        print("No day images found in CSV")
        return None

    print(f"Found {len(day_rows)} day images")

    picked = day_rows.iloc[0]

    # Resolve the filename column only when the caller did not name one.
    if image_column is None:
        known_names = ('image', 'filename', 'file', 'image_name', 'image_path')
        present = [name for name in known_names if name in frame.columns]
        image_column = present[0] if present else frame.columns[0]

    image_name = picked[image_column]

    # picked.name is the index label of the selected row.
    frame = frame.drop(picked.name)
    frame.to_csv(csv_path, index=False)

    remaining_day = len(frame[frame[segment_column] == 'day'])
    print(f"Removed image '{image_name}' from CSV")
    print(f"CSV now has {len(frame)} rows ({remaining_day} day images remaining)")

    return image_name

## Parse Timestamp

In [None]:
def parse_timestamp_from_filename(filename):
    """
    Extract and parse timestamp from filename.

    The filename must contain ``_<YYYYMMDD>T<HHMMSS>.<mmm>Z.`` (for example
    ``AXISQ6074EPTZACCC8EACA584_20230901T210530.000Z.jpg``). The value is
    reformatted as-is; no timezone conversion is applied and the millisecond
    part is dropped.

    Args:
        filename: Image filename

    Returns:
        str: timestamp formatted as 'YYYY-MM-DD HH:MM:SS', or None when the
        filename has no timestamp in the expected layout or the digits do
        not form a valid date/time (e.g. month 13)
    """
    TIMESTAMP_PATTERN = r'_(\d{8}T\d{6}\.\d{3}Z)\.'
    match = re.search(TIMESTAMP_PATTERN, filename)
    if not match:
        return None

    try:
        dt = datetime.strptime(match.group(1), '%Y%m%dT%H%M%S.%fZ')
    except ValueError:
        # The regex only checks digit counts; impossible calendar values are
        # treated like a missing timestamp instead of aborting the caller.
        return None

    return dt.strftime('%Y-%m-%d %H:%M:%S')


## Create CSV

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import multiprocessing as mp
import os
import shutil
import boto3
import pandas as pd
from pathlib import Path
from ultralytics import YOLO
import threading
from tqdm import tqdm


# Module-level CSV locations (same values as the constants defined in the
# earlier configuration cell).
# NOTE(review): main() below assigns its own local CSV_PATH / CSV_OUTPUT, so
# these module-level values are not the ones main() reads or writes.
CSV_PATH = '/content/image_segmentation_grey.csv'  # CSV with filtering criteria
CSV_OUTPUT = '/content/image_boat_count.csv'  # presumably the per-image boat-count output; confirm against caller

def get_all_day_images(csv_path, filename_col=None, segment_col='segment', segment_val='day', limit=None):
    """
    Extract all day image filenames from CSV

    Args:
        csv_path: Path to CSV file
        filename_col: Name of column containing filenames (None = auto-detect
            by common column names, falling back to the first column)
        segment_col: Name of column containing time segment. When None, or
            when the column is not present in the CSV, no filtering is done
            and every row is used
        segment_val: Value indicating day segment (default: 'day')
        limit: Maximum number of filenames to return (None or a value <= 0
            means no limit)

    Returns:
        list: filenames in CSV order; an empty list if anything goes wrong
        while reading or filtering (the error is printed, not raised)
    """
    try:
        df = pd.read_csv(csv_path)

        print(f"\n📋 CSV Info: {len(df)} rows, Columns: {list(df.columns)}")

        # Auto-detect filename column if not specified
        if filename_col is None:
            possible_names = ['filename', 'file_name', 'image', 'image_name',
                            'image_path', 'path', 'name', 'file']
            filename_col = next((col for col in df.columns
                               if col.lower() in possible_names), df.columns[0])

        print(f"✓ Using filename column: '{filename_col}'")

        # Filter by segment if specified
        if segment_col and segment_col in df.columns:
            day_images = df[df[segment_col] == segment_val][filename_col].tolist()
            print(f"✓ Filtered for '{segment_val}' in '{segment_col}'")
        else:
            day_images = df[filename_col].tolist()
            print(f"ℹ️  No segment filter applied")

        if limit is not None and limit > 0:
            original_count = len(day_images)
            day_images = day_images[:limit]
            print(f"🔒 Limited from {original_count} to {len(day_images)} images")

        print(f"✓ Processing {len(day_images)} images")

        # Show sample filenames
        if day_images:
            print(f"\n📝 Sample filenames:")
            for fname in day_images[:3]:
                print(f"   - {fname}")

        return day_images

    except Exception as e:
        # Best-effort loader: callers treat an empty list as "nothing to do".
        print(f"✗ Error: {e}")
        return []


def process_single_image(file_name, s3, bucket_name, folder_prefix,
                        model, base_image_folder, output_folder, segment):
    """
    Download one image from S3, count its boats, and clean up afterwards.

    The image is placed in a scratch folder named after the calling thread's
    ident (``<base_image_folder>/thread_<ident>``) so concurrent workers do
    not see each other's files. That folder is removed again whether or not
    processing succeeded.

    Returns:
        dict with 'filename', 'timestamp', 'segment' and 'boat_count', or
        None if any step raised (the error is printed).
    """
    # NOTE(review): `model` is the same object for every worker thread;
    # confirm the model implementation tolerates concurrent calls.
    worker_dir = os.path.join(
        base_image_folder, f"thread_{threading.current_thread().ident}"
    )
    os.makedirs(worker_dir, exist_ok=True)

    download_target = os.path.join(worker_dir, os.path.basename(file_name))
    object_key = f"{folder_prefix}{file_name}"

    try:
        # Raises if the object is missing, before any download is attempted.
        s3.head_object(Bucket=bucket_name, Key=object_key)
        s3.download_file(bucket_name, object_key, download_target)

        # The scratch folder holds only this image, so the folder-based
        # detector effectively processes a single file.
        boats_found = detect_boats_yolo(
            model,
            image_folder=worker_dir,
            output_folder=output_folder,
            csv_path=None,
            export_format='yolo',
            exclude_top_percent=25,
            exclude_bottom_percent=25,
            confidence_threshold=0.10,
            iou_threshold=0.3,
            imgsz=1920
        )

        return {
            'filename': file_name,
            'timestamp': parse_timestamp_from_filename(file_name),
            'segment': segment,
            'boat_count': boats_found
        }

    except Exception as e:
        print(f"\n✗ Failed {file_name}: {e}")
        return None

    finally:
        # Scratch folder is always discarded, success or failure.
        if os.path.exists(worker_dir):
            try:
                shutil.rmtree(worker_dir)
            except Exception as e:
                print(f"Warning: Could not cleanup {worker_dir}: {e}")


def main():
    """
    Main execution function

    Runs the parallel pipeline: copy the segmentation CSV, list its 'day'
    images, download and analyse each one on a thread pool, and write one
    row per successfully processed image to the output CSV.

    S3 settings are read from environment variables so that no credentials
    or bucket names live in the notebook:
        S3_BUCKET_NAME    bucket holding the images (required)
        S3_FOLDER_PREFIX  key prefix prepended to each filename (default '')
        S3_ENDPOINT_URL   custom endpoint, if any (default: boto3's default)
    Credentials are resolved by boto3's standard lookup chain.
    """

    # ============== CONFIGURATION ==============
    # Define your paths here
    CSV_PATH = "image_segmentation_grey.csv"  # UPDATE THIS
    CSV_OUTPUT = "boat_detection_results.csv"  # UPDATE THIS

    original_path = Path(CSV_PATH)
    CSV_PATH_DUP = str(original_path.parent / f"{original_path.stem}_dup{original_path.suffix}")

    # S3 Configuration (see docstring)
    bucket_name = os.environ.get('S3_BUCKET_NAME')
    folder_prefix = os.environ.get('S3_FOLDER_PREFIX', '')
    endpoint_url = os.environ.get('S3_ENDPOINT_URL') or None
    IMAGE_FOLDER = './CCSS_images/'
    OUTPUT_FOLDER = "./annotated_boats"

    # Processing Configuration
    segment = 'day'
    max_workers = 8  # Tune this: 4-8 for balanced, 10+ for network-heavy

    # Column order of the output CSV (also used when no image succeeded)
    result_columns = ['filename', 'timestamp', 'segment', 'boat_count']

    # ============== SETUP ==============
    print("="*60)
    print("🚀 Starting Parallel Boat Detection Pipeline")
    print("="*60)

    # Fail early with a clear message instead of a NameError deep in the pool
    if not bucket_name:
        print("❌ S3_BUCKET_NAME is not set; cannot download images")
        return

    # Copy CSV for backup
    shutil.copy2(CSV_PATH, CSV_PATH_DUP)
    print(f"✓ Created backup CSV: {CSV_PATH_DUP}")

    try:
        # Initialize S3 client
        s3 = boto3.client('s3', endpoint_url=endpoint_url)
        print("✓ S3 client initialized")

        # Load YOLO model (once, shared across threads)
        print("⏳ Loading YOLO model...")
        model = YOLO('yolo11m.pt')
        print("✓ YOLO model loaded")

        # Create folders
        os.makedirs(IMAGE_FOLDER, exist_ok=True)
        os.makedirs(OUTPUT_FOLDER, exist_ok=True)
        print(f"✓ Folders created: {IMAGE_FOLDER}, {OUTPUT_FOLDER}")

        # ============== GET FILE LIST ==============
        file_list = get_all_day_images(CSV_PATH_DUP)

        if not file_list:
            print("❌ No files to process!")
            return

        print(f"\n📋 Processing {len(file_list)} images with {max_workers} parallel workers")
        print("="*60)

        # ============== PARALLEL PROCESSING ==============
        results = []
        failed_files = []

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all tasks
            futures = {
                executor.submit(
                    process_single_image,
                    fname,
                    s3,
                    bucket_name,
                    folder_prefix,
                    model,
                    IMAGE_FOLDER,
                    OUTPUT_FOLDER,
                    segment
                ): fname
                for fname in file_list
            }

            # Process results with progress bar
            for future in tqdm(as_completed(futures), total=len(file_list),
                              desc="Processing images", unit="img"):
                result = future.result()
                if result:
                    results.append(result)
                else:
                    # Track failed files
                    failed_files.append(futures[future])

        # ============== SAVE RESULTS ==============
        print("\n" + "="*60)
        print("💾 Saving results...")

        # Explicit columns keep the header (and 'boat_count') present even
        # when every image failed.
        df = pd.DataFrame(results, columns=result_columns)
        df.to_csv(CSV_OUTPUT, index=False)

        # ============== SUMMARY ==============
        print("\n" + "="*60)
        print("✅ PROCESSING COMPLETE!")
        print("="*60)
        print(f"📄 Output CSV: {CSV_OUTPUT}")
        print(f"📊 Total images processed: {len(file_list)}")
        print(f"✓ Successful: {len(results)}")
        print(f"✗ Failed: {len(failed_files)}")

        if failed_files:
            print(f"\n⚠️  Failed files:")
            for f in failed_files[:10]:  # Show first 10
                print(f"   - {f}")
            if len(failed_files) > 10:
                print(f"   ... and {len(failed_files) - 10} more")

        print(f"\n📈 Results preview:")
        print(df.head(10))

        print(f"\n📊 Boat count statistics:")
        print(df['boat_count'].describe())

    finally:
        # Cleanup backup CSV on every exit path (success, early return, error)
        try:
            os.remove(CSV_PATH_DUP)
            print(f"\n✓ Cleaned up backup CSV")
        except OSError as e:
            print(f"\n⚠️  Could not remove backup CSV {CSV_PATH_DUP}: {e}")

    print("\n" + "="*60)
    print("🎉 All done!")
    print("="*60)


if __name__ == "__main__":
    main()

In [None]:
"""
Display single image with boat detection boxes
"""

import cv2
import matplotlib.pyplot as plt
from pathlib import Path


def read_yolo_labels(label_path, img_width, img_height):
    """
    Load a YOLO label file and return its boxes in pixel coordinates.

    Every usable line has the form ``class x_center y_center width height``
    with the four geometry values normalized to 0-1. Lines with fewer than
    five fields are skipped. A missing file yields an empty list.

    Returns:
        list of (x1, y1, x2, y2, class_id) tuples with integer corners
    """
    if not label_path.exists():
        return []

    pixel_boxes = []

    with open(label_path, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split()
            if len(fields) < 5:
                continue

            class_id = int(fields[0])
            # Scale normalized values back to pixels.
            center_x = float(fields[1]) * img_width
            center_y = float(fields[2]) * img_height
            box_w = float(fields[3]) * img_width
            box_h = float(fields[4]) * img_height

            # Center/size -> corner coordinates (truncated to int).
            pixel_boxes.append((
                int(center_x - box_w / 2),
                int(center_y - box_h / 2),
                int(center_x + box_w / 2),
                int(center_y + box_h / 2),
                class_id,
            ))

    return pixel_boxes


def draw_boxes_on_image(img, boxes, color=(0, 255, 0), thickness=2):
    """
    Return a copy of ``img`` with each box outlined and tagged "boat".

    ``boxes`` holds (x1, y1, x2, y2, class_id) tuples; the class id is not
    shown. The input image is left unmodified.
    """
    canvas = img.copy()

    for left, top, right, bottom, _class_id in boxes:
        cv2.rectangle(canvas, (left, top), (right, bottom), color, thickness)

        # Caption sits 10px above the box's top-left corner.
        cv2.putText(canvas, "boat", (left, top - 10),
                   cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    return canvas


def display_single_image(base_folder, image_filename):
    """
    Show one image together with the boxes stored in its YOLO label file.

    Args:
        base_folder: Path to folder containing 'images' and 'labels' subfolders
        image_filename: Name of the image file (e.g., 'image001.jpg')

    Prints a message and returns without plotting when the image is missing
    or cannot be decoded.
    """
    root = Path(base_folder)
    img_path = root / 'images' / image_filename

    if not img_path.exists():
        print(f"Image not found: {img_path}")
        return

    img = cv2.imread(str(img_path))
    if img is None:
        print(f"Failed to read image: {img_path}")
        return

    height, width = img.shape[:2]

    # The label file shares the image's stem.
    label_path = root / 'labels' / f"{img_path.stem}.txt"
    boxes = read_yolo_labels(label_path, width, height)

    # Draw only when there is something to draw, then convert BGR -> RGB
    # because matplotlib expects RGB.
    frame = draw_boxes_on_image(img, boxes, color=(0, 255, 0)) if boxes else img
    img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    if boxes:
        title_text = f'{image_filename}\n{len(boxes)} boat(s) detected'
        print(f"Displaying image with {len(boxes)} boat detection(s)")
    else:
        title_text = f'{image_filename}\nNo boats detected'
        print(f"Displaying image with no boat detections")

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.imshow(img_rgb)
    ax.axis('off')
    ax.set_title(title_text, fontsize=12, fontweight='bold')
    plt.tight_layout()
    plt.show()


# Example usage: render one annotated image from the detection output folder.
# Runs when this cell/script is executed directly (not when imported).
if __name__ == "__main__":
    # Example: display specific image
    # The file must exist under '<base_folder>/images'; otherwise
    # display_single_image prints "Image not found" and returns.
    image_filename = "AXISQ6074EPTZACCC8EACA584_20230901T000001.000Z.jpg"

    display_single_image('/content/annotated_boats',image_filename)

In [None]:
"""
Display daytime images split by boat detection status
"""

import cv2
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
import csv

def load_daytime_images(csv_path):
    """
    Read the segmentation CSV and collect the daytime image filenames.

    Expected row layout (first field = filename, third field = label):
        filename,timestamp,time_of_day
        AXISQ6074EPTZACCC8EACA584_20230901T164001.000Z.jpg,2023-09-01 16:40:01.000 UTC,day

    Rows with fewer than three fields are ignored; the label comparison is
    case-insensitive and ignores surrounding whitespace.

    Returns:
        set of filenames whose label equals 'day'
    """
    selected = set()

    with open(csv_path, 'r') as source:
        for fields in csv.reader(source):
            if len(fields) < 3:
                continue
            if fields[2].strip().lower() != 'day':
                continue
            selected.add(fields[0].strip())

    print(f"Loaded {len(selected)} daytime images from CSV")
    return selected


def read_yolo_labels(label_path, img_width, img_height):
    """
    Parse a YOLO label file into pixel-space corner boxes.

    Input lines look like ``class x_center y_center width height`` where the
    geometry is normalized to 0-1. Lines with fewer than five fields are
    ignored, and a non-existent file produces an empty list.

    Returns:
        list of (x1, y1, x2, y2, class_id) tuples with integer corners
    """
    found = []

    if label_path.exists():
        with open(label_path, 'r') as label_file:
            for text in label_file:
                tokens = text.strip().split()
                if len(tokens) >= 5:
                    label_class = int(tokens[0])
                    # De-normalize center and size.
                    mid_x = float(tokens[1]) * img_width
                    mid_y = float(tokens[2]) * img_height
                    span_x = float(tokens[3]) * img_width
                    span_y = float(tokens[4]) * img_height

                    # Corner coordinates, truncated to whole pixels.
                    left = int(mid_x - span_x / 2)
                    top = int(mid_y - span_y / 2)
                    right = int(mid_x + span_x / 2)
                    bottom = int(mid_y + span_y / 2)

                    found.append((left, top, right, bottom, label_class))

    return found


def draw_boxes_on_image(img, boxes, color=(0, 255, 0), thickness=2):
    """
    Outline every box on a copy of the image and caption it "boat".

    ``boxes`` is an iterable of (x1, y1, x2, y2, class_id) tuples. The
    original ``img`` is not modified; the annotated copy is returned.
    """
    annotated = img.copy()
    caption = "boat"

    for entry in boxes:
        x_min, y_min, x_max, y_max, _unused_class = entry
        top_left = (x_min, y_min)
        cv2.rectangle(annotated, top_left, (x_max, y_max), color, thickness)

        # Caption is placed slightly above the top-left corner.
        cv2.putText(annotated, caption, (x_min, y_min - 10),
                   cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    return annotated


def display_daytime_split(base_folder, num_with_boats=20, num_without_boats=20,
                          csv_path=None):
    """
    Display images split into two grids: with and without boat detections.

    An image counts as "with boats" when its label file in ``labels/``
    contains at least one line with five or more whitespace-separated
    fields, which is the rule ``read_yolo_labels`` uses to accept a box.
    Blank or truncated label lines therefore do not count as detections.

    Args:
        base_folder: Path to folder containing 'images' and 'labels' subfolders
        num_with_boats: Number of images with boats to display
        num_without_boats: Number of images without boats to display
        csv_path: Optional path to the CSV with daytime/nighttime labels.
            When given, only images whose file name is in the set returned
            by ``load_daytime_images(csv_path)`` are considered. When
            omitted (the default), every image in the folder is used.

    Returns:
        None. Prints a message and returns early if either subfolder is
        missing.
    """
    base_path = Path(base_folder)
    images_path = base_path / 'images'
    labels_path = base_path / 'labels'

    if not images_path.exists():
        print(f"Images folder not found: {images_path}")
        return

    if not labels_path.exists():
        print(f"Labels folder not found: {labels_path}")
        return

    # Get all image files
    all_image_files = [
        path
        for pattern in ('*.jpg', '*.png', '*.jpeg')
        for path in images_path.glob(pattern)
    ]

    # Filter for daytime only when a time-of-day CSV was supplied
    if csv_path is not None:
        daytime_images = load_daytime_images(csv_path)
        image_files = [f for f in all_image_files if f.name in daytime_images]
        print(f"Found {len(image_files)} daytime images (filtered from {len(all_image_files)} total)")
    else:
        image_files = all_image_files
        print(f"Found {len(image_files)} images (no daytime filter applied)")

    # Categorize images
    images_with_boats = []
    images_without_boats = []

    for img_path in sorted(image_files):
        label_path = labels_path / f"{img_path.stem}.txt"

        # A label file only signals boats if it holds at least one
        # well-formed box line; empty files and blank lines do not count.
        has_boats = False
        if label_path.exists():
            with open(label_path, 'r') as f:
                has_boats = any(len(line.split()) >= 5 for line in f)

        if has_boats:
            images_with_boats.append(img_path)
        else:
            images_without_boats.append(img_path)

    print(f"\nDaytime images with boats: {len(images_with_boats)}")
    print(f"Daytime images without boats: {len(images_without_boats)}")

    # Limit to requested numbers
    images_with_boats = images_with_boats[:num_with_boats]
    images_without_boats = images_without_boats[:num_without_boats]

    # Display images with boats
    if images_with_boats:
        print(f"\n=== DISPLAYING {len(images_with_boats)} IMAGES WITH BOATS ===")
        display_image_grid(images_with_boats, labels_path,
                           title="DAYTIME IMAGES WITH BOATS",
                           color=(0, 255, 0))

    # Display images without boats
    if images_without_boats:
        print(f"\n=== DISPLAYING {len(images_without_boats)} IMAGES WITHOUT BOATS ===")
        display_image_grid(images_without_boats, labels_path,
                           title="DAYTIME IMAGES WITHOUT BOATS",
                           color=(255, 0, 0))


def display_image_grid(image_files, labels_path, title="Images", color=(0, 255, 0)):
    """
    Display images in a grid (five per row) with their label boxes drawn.

    Args:
        image_files: Sequence of image paths exposing ``stem`` and ``name``
            (e.g. ``pathlib.Path`` objects).
        labels_path: Directory (``pathlib.Path``) holding one
            ``<image stem>.txt`` label file per image.
        title: Figure title.
        color: Box/label color forwarded to ``draw_boxes_on_image``.

    Returns:
        None. Nothing is plotted when ``image_files`` is empty. Images that
        OpenCV cannot read leave a blank cell in the grid.
    """
    n = len(image_files)
    if n == 0:
        # A grid with zero rows cannot be created
        print("No images to display")
        return

    cols = 5  # 5 images per row
    rows = (n + cols - 1) // cols

    # squeeze=False always returns a 2-D axes array, so the single-row case
    # needs no special handling before flattening.
    fig, axes = plt.subplots(rows, cols, figsize=(20, 4 * rows), squeeze=False)
    fig.suptitle(title, fontsize=16, fontweight='bold', y=0.995)
    axes = axes.flatten()

    # Display each image
    for ax, img_path in zip(axes, image_files):
        # Hide the frame first so an unreadable image leaves a clean blank
        # cell instead of an empty axes with ticks.
        ax.axis('off')

        img = cv2.imread(str(img_path))
        if img is None:
            continue

        height, width = img.shape[:2]

        # Read corresponding label file
        label_path = labels_path / f"{img_path.stem}.txt"
        boxes = read_yolo_labels(label_path, width, height)

        # Draw boxes if present
        if boxes:
            img = draw_boxes_on_image(img, boxes, color=color)
            title_text = f'{img_path.name}\n{len(boxes)} boats'
        else:
            title_text = f'{img_path.name}\nNo boats'

        # OpenCV loads BGR; matplotlib expects RGB
        ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        ax.set_title(title_text, fontsize=8)

    # Hide unused subplots
    for ax in axes[n:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()


def get_detection_summary(base_folder, csv_path):
    """
    Print summary statistics of daytime boat detections.

    Only images whose file name appears in the set returned by
    ``load_daytime_images(csv_path)`` are counted. A detection is a label
    line with five or more whitespace-separated fields, the same rule
    ``read_yolo_labels`` uses, so blank lines do not inflate the totals.

    Args:
        base_folder: Path to folder containing 'images' and 'labels' subfolders
        csv_path: Path to the CSV with daytime/nighttime labels

    Returns:
        None. The summary is printed to stdout.
    """
    base_path = Path(base_folder)
    images_path = base_path / 'images'
    labels_path = base_path / 'labels'

    # Load daytime filter
    daytime_images = load_daytime_images(csv_path)

    # Get all image files
    all_image_files = [
        path
        for pattern in ('*.jpg', '*.png', '*.jpeg')
        for path in images_path.glob(pattern)
    ]

    # Filter for daytime
    image_files = [f for f in all_image_files if f.name in daytime_images]

    total_daytime = len(image_files)
    with_boats = 0
    total_boat_count = 0

    for img_path in image_files:
        label_path = labels_path / f"{img_path.stem}.txt"
        if not label_path.exists():
            continue

        with open(label_path, 'r') as f:
            boat_count = sum(1 for line in f if len(line.split()) >= 5)

        total_boat_count += boat_count
        if boat_count > 0:
            with_boats += 1

    # Every image without at least one detection, label file or not
    without_boats = total_daytime - with_boats

    print("\n" + "="*60)
    print("DAYTIME BOAT DETECTION SUMMARY")
    print("="*60)
    print(f"Total daytime images: {total_daytime}")
    # Percentages are undefined for an empty image set
    if total_daytime > 0:
        print(f"Images with boats: {with_boats} ({with_boats/total_daytime*100:.1f}%)")
        print(f"Images without boats: {without_boats} ({without_boats/total_daytime*100:.1f}%)")
    print(f"Total boat detections: {total_boat_count}")
    if with_boats > 0:
        print(f"Average boats per image (when present): {total_boat_count/with_boats:.2f}")
    print("="*60)


# ============================================================
# USAGE EXAMPLES
# ============================================================

if __name__ == "__main__":

    # Print the daytime detection statistics first
    get_detection_summary("annotated_boats",
                          csv_path="image_segmentation_grey.csv")

    # Then show the split view, capped at 100 images with boats
    # and 100 images without boats
    display_daytime_split("annotated_boats",
                          num_with_boats=100,
                          num_without_boats=100)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import rcParams

# Set professional style
plt.style.use('seaborn-v0_8-darkgrid')
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Arial', 'Helvetica', 'DejaVu Sans']
rcParams['font.size'] = 11

# Read the CSV file
df = pd.read_csv('combined_boat_data.csv')  # Replace with your actual filename

# Extract the timestamp part (after the underscore, before the file extension)
# e.g. "..._20230901T164001.000Z.jpg" -> "20230901T164001"
df['timestamp'] = df['filename'].str.extract(r'_(\d{8}T\d{6})')[0]

# Convert to datetime (filenames without a match become NaT)
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y%m%dT%H%M%S')

# Extract date components
df['month'] = df['timestamp'].dt.month
# 'week' is the ISO week number, so 'year' must be the ISO year as well.
# With the calendar year, the last days of December that fall in ISO week 1
# of the following year would be labelled "<this year>-W01" and sorted
# before the rest of December.
df['year'] = df['timestamp'].dt.isocalendar().year
df['week'] = df['timestamp'].dt.isocalendar().week
df['day_of_week'] = df['timestamp'].dt.day_name()
df['day_num'] = df['timestamp'].dt.dayofweek

# Filter for June and December (by calendar month)
june_data = df[df['month'] == 6].copy()
dec_data = df[df['month'] == 12].copy()

# Day configuration
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_short = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# Modern, vibrant color palette
day_colors = {
    'Monday': '#3498db',      # Bright blue
    'Tuesday': '#e74c3c',     # Vibrant red
    'Wednesday': '#2ecc71',   # Fresh green
    'Thursday': '#f39c12',    # Golden orange
    'Friday': '#9b59b6',      # Purple
    'Saturday': '#1abc9c',    # Turquoise
    'Sunday': '#e67e22'       # Deep orange
}

def create_professional_histogram(data, month_name, color_scheme='blue'):
    """Plot weekly boat totals for one month as bars grouped by weekday.

    One bar group is drawn per (year, week) pair found in ``data``; each
    group holds seven bars (Monday-Sunday) whose heights are the summed
    ``boat_count`` for that day. After the figure is shown, a per-week text
    summary is printed.

    Args:
        data: DataFrame with ``year``, ``week``, ``day_of_week`` (full
            English day names) and ``boat_count`` columns. It is not
            modified.
        month_name: Label used in the title and the printed headings.
            ``'June'`` selects the green title color; any other value gets
            the orange one.
        color_scheme: Currently unused; kept so existing callers keep
            working.

    Returns:
        None. Prints "No data for <month_name>" and returns early when
        there is nothing to plot.
    """
    if len(data) == 0:
        print(f"No data for {month_name}")
        return

    # Distinct (year, week) pairs in chronological order. Labels are built
    # locally rather than stored in a new column, so the caller's DataFrame
    # is left untouched. int() keeps labels clean even if the columns were
    # upcast to float.
    week_keys = (data[['year', 'week']]
                 .dropna()
                 .drop_duplicates()
                 .sort_values(['year', 'week'])
                 .itertuples(index=False, name=None))
    weeks = [(year, week, f"{int(year)}-W{int(week):02d}")
             for year, week in week_keys]
    if not weeks:
        print(f"No data for {month_name}")
        return

    week_labels = [year_week for _, _, year_week in weeks]

    # Rows = weeks, columns = Monday..Sunday; days without data are 0
    data_matrix = np.array([
        data.loc[(data['year'] == year) & (data['week'] == week)]
            .groupby('day_of_week')['boat_count'].sum()
            .reindex(day_order, fill_value=0)
            .to_numpy()
        for year, week, _ in weeks
    ])
    n_weeks = len(week_labels)

    # Create figure with better proportions
    fig = plt.figure(figsize=(max(18, n_weeks * 1.5), 9))
    ax = fig.add_subplot(111)

    # Set background color
    fig.patch.set_facecolor('#f8f9fa')
    ax.set_facecolor('#ffffff')

    # Bar configuration
    bar_width = 0.115
    x = np.arange(n_weeks)

    # Plot bars for each day
    for day_idx, day in enumerate(day_order):
        # Center the seven bars of a group on the week's tick
        offset = (day_idx - 3) * bar_width
        bars = ax.bar(x + offset,
                      data_matrix[:, day_idx],
                      bar_width,
                      label=day_short[day_idx],
                      color=day_colors[day],
                      edgecolor='white',
                      linewidth=1.5,
                      alpha=0.85,
                      zorder=3)

        # Add value labels with better styling
        for bar in bars:
            height = bar.get_height()
            if height > 0:
                ax.text(bar.get_x() + bar.get_width()/2., height,
                        f'{int(height)}',
                        ha='center', va='bottom',
                        fontsize=9, fontweight='600',
                        color='#2c3e50')

    # Styling
    ax.set_xlabel('Week', fontsize=14, fontweight='bold', color='#2c3e50', labelpad=10)
    ax.set_ylabel('Boat Count', fontsize=14, fontweight='bold', color='#2c3e50', labelpad=10)

    # Title with better styling
    title_color = '#16a085' if month_name == 'June' else '#d35400'
    ax.set_title(f'{month_name} Weekly Boat Traffic by Day',
                 fontsize=20, fontweight='bold', color=title_color, pad=25)

    # X-axis configuration
    ax.set_xticks(x)
    ax.set_xticklabels(week_labels, rotation=45, ha='right', fontsize=11, color='#34495e')

    # Y-axis configuration
    ax.tick_params(axis='y', labelsize=11, colors='#34495e')

    # Legend styling
    legend = ax.legend(title='Day of Week',
                       loc='upper left',
                       fontsize=11,
                       title_fontsize=12,
                       frameon=True,
                       fancybox=True,
                       shadow=True,
                       ncol=7,
                       bbox_to_anchor=(0, 1.02, 1, 0.1),
                       mode='expand',
                       borderaxespad=0)
    legend.get_frame().set_facecolor('#ffffff')
    legend.get_frame().set_alpha(0.95)
    legend.get_frame().set_edgecolor('#bdc3c7')

    # Grid styling
    ax.grid(axis='y', alpha=0.3, linestyle='-', linewidth=0.8, color='#bdc3c7', zorder=0)
    ax.set_axisbelow(True)

    # Spines styling
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_color('#bdc3c7')
    ax.spines['bottom'].set_color('#bdc3c7')
    ax.spines['left'].set_linewidth(1.5)
    ax.spines['bottom'].set_linewidth(1.5)

    # Y-axis limits: leave headroom for the value labels, and avoid a
    # degenerate (0, 0) range when every count is zero
    y_max = data_matrix.max()
    ax.set_ylim(0, y_max * 1.2 if y_max > 0 else 1)

    plt.tight_layout()
    plt.show()

    # Print clean statistics
    print(f"\n{'─' * 90}")
    print(f"  {month_name.upper()} WEEKLY ANALYSIS")
    print(f"{'─' * 90}\n")

    for week_counts, (_, _, year_week) in zip(data_matrix, weeks):
        week_total = int(week_counts.sum())
        peak_day_idx = week_counts.argmax()
        peak_day = day_order[peak_day_idx]
        peak_count = int(week_counts[peak_day_idx])

        print(f"  📅 {year_week}")
        print(f"     Total: {week_total} boats  |  Peak: {peak_day} ({peak_count} boats)")

        # Compact daily breakdown (days with zero boats are omitted)
        daily = " | ".join(f"{short}: {int(count)}"
                           for short, count in zip(day_short, week_counts)
                           if count > 0)
        print(f"     {daily}\n")

# Report banner, then one weekly chart per month of interest
print("\n" + "═" * 90,
      "  🌊 BOAT TRAFFIC ANALYSIS - VISUAL REPORTS",
      "═" * 90,
      sep="\n")

create_professional_histogram(data=june_data, month_name='June', color_scheme='blue')
create_professional_histogram(data=dec_data, month_name='December', color_scheme='orange')

# Closing banner
print("═" * 90,
      "  ✅ Analysis Complete!",
      "═" * 90,
      sep="\n")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from datetime import datetime
from matplotlib import rcParams

# Apply the shared plot styling in one go
plt.style.use('seaborn-v0_8-darkgrid')
rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial', 'Helvetica', 'DejaVu Sans'],
})

# Load the merged detections table
df = pd.read_csv('combined_boat_data.csv')  # Your combined CSV

# Quick sanity report: row count, then column names followed by a blank line
print(f"Total rows loaded: {len(df)}",
      f"Columns: {df.columns.tolist()}\n",
      sep="\n")

# Extract timestamp from filename
# Supported filename formats, tried in this order:
#   ..._20230901T164001.000Z.jpg   YYYYMMDDTHHMMSS
#   20231215_143022.jpg            YYYYMMDD_HHMMSS
#   2023-12-15_14-30-22.jpg        YYYY-MM-DD_HH-MM-SS
#   20231215.jpg                   YYYYMMDD (date only)
# Each entry pairs a regex with the strptime format that parses the
# concatenation of its captured groups.
_TIMESTAMP_PATTERNS = (
    (re.compile(r'(\d{8})T(\d{6})'), '%Y%m%d%H%M%S'),
    (re.compile(r'(\d{8})_(\d{6})'), '%Y%m%d%H%M%S'),
    (re.compile(r'(\d{4})-(\d{2})-(\d{2})_(\d{2})-(\d{2})-(\d{2})'), '%Y%m%d%H%M%S'),
    (re.compile(r'(\d{8})'), '%Y%m%d'),
)


def extract_timestamp(filename):
    """Extract a timestamp from a filename in one of the supported formats.

    The formats in ``_TIMESTAMP_PATTERNS`` are tried in order, most specific
    first, so the time of day is kept whenever the filename carries one.

    Args:
        filename: File name (any object; it is converted with ``str``).

    Returns:
        A ``pandas.Timestamp`` for the first pattern whose digits parse as
        a valid date/time, or ``None`` if no pattern yields one.
    """
    text = str(filename)
    for pattern, fmt in _TIMESTAMP_PATTERNS:
        match = pattern.search(text)
        if match is None:
            continue
        try:
            return pd.to_datetime(''.join(match.groups()), format=fmt)
        except (ValueError, TypeError):
            # The digits had the right shape but are not a real date/time
            # (e.g. month 99); fall through to the next format.
            continue
    return None

# Apply timestamp extraction
df['timestamp'] = df['filename'].apply(extract_timestamp)

# Check how many timestamps were successfully extracted
valid_timestamps = df['timestamp'].notna().sum()
print(f"✓ Successfully extracted {valid_timestamps} timestamps from filenames")

if valid_timestamps == 0:
    print("\n⚠️  Could not extract timestamps from filenames.")
    print("Sample filenames:")
    print(df['filename'].head(10))
    print("\nPlease check the filename format!")
    # exit() is a helper meant for the interactive interpreter (and can shut
    # down a notebook kernel); raise SystemExit to stop the run explicitly.
    raise SystemExit(1)

# Remove rows without valid timestamps
df = df[df['timestamp'].notna()].copy()
# Make sure the column has a datetime dtype so the .dt accessor works
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Extract year-month for grouping
df['year_month'] = df['timestamp'].dt.to_period('M')

# Group by year-month and sum boat counts
monthly_counts = df.groupby('year_month')['boat_count'].sum().reset_index()

# Sort chronologically
monthly_counts = monthly_counts.sort_values('year_month')

# Convert period to string for plotting
monthly_counts['year_month_str'] = monthly_counts['year_month'].astype(str)

# Create the professional histogram
fig = plt.figure(figsize=(16, 8))
ax = fig.add_subplot(111)

# Set background colors
fig.patch.set_facecolor('#f8f9fa')
ax.set_facecolor('#ffffff')

# Create bars with gradient effect
bars = ax.bar(range(len(monthly_counts)),
              monthly_counts['boat_count'],
              color='#3498db',
              edgecolor='white',
              linewidth=2,
              alpha=0.85,
              zorder=3)

# Add value labels on bars
for bar, count in zip(bars, monthly_counts['boat_count']):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{int(count)}',
            ha='center', va='bottom',
            fontsize=10, fontweight='bold',
            color='#2c3e50')

# Styling
ax.set_xticks(range(len(monthly_counts)))
ax.set_xticklabels(monthly_counts['year_month_str'],
                   rotation=45, ha='right', fontsize=11, color='#34495e')
ax.set_xlabel('Month', fontsize=14, fontweight='bold', color='#2c3e50', labelpad=12)
ax.set_ylabel('Total Boat Count', fontsize=14, fontweight='bold', color='#2c3e50', labelpad=12)
ax.set_title('Monthly Boat Count Analysis',
             fontsize=20, fontweight='bold', color='#16a085', pad=25)

# Grid styling
ax.grid(axis='y', alpha=0.3, linestyle='-', linewidth=0.8, color='#bdc3c7', zorder=0)
ax.set_axisbelow(True)

# Spines styling
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_color('#bdc3c7')
ax.spines['bottom'].set_color('#bdc3c7')
ax.spines['left'].set_linewidth(1.5)
ax.spines['bottom'].set_linewidth(1.5)

# Y-axis configuration
ax.tick_params(axis='y', labelsize=11, colors='#34495e')
y_max = monthly_counts['boat_count'].max()
# Leave headroom for the value labels; avoid a degenerate (0, 0) range when
# no boats were counted at all
ax.set_ylim(0, y_max * 1.15 if y_max > 0 else 1)

plt.tight_layout()
plt.show()

# Print detailed statistics
print("\n" + "═" * 80)
print("  📊 MONTHLY BOAT COUNT STATISTICS")
print("═" * 80)
print(f"\n  Total months analyzed: {len(monthly_counts)}")
print(f"  Total boats detected: {monthly_counts['boat_count'].sum()}")
print(f"  Average boats per month: {monthly_counts['boat_count'].mean():.2f}")
print(f"  Median boats per month: {monthly_counts['boat_count'].median():.2f}")
print(f"  Peak month: {monthly_counts.loc[monthly_counts['boat_count'].idxmax(), 'year_month_str']} "
      f"with {monthly_counts['boat_count'].max()} boats")
print(f"  Lowest month: {monthly_counts.loc[monthly_counts['boat_count'].idxmin(), 'year_month_str']} "
      f"with {monthly_counts['boat_count'].min()} boats")

# Show top 5 busiest months
print(f"\n  🔥 Top 5 Busiest Months:")
print("  " + "─" * 50)
top_5 = monthly_counts.nlargest(5, 'boat_count')
for _, row in top_5.iterrows():
    print(f"     {row['year_month_str']}: {int(row['boat_count'])} boats")

# Show monthly breakdown as a text bar chart scaled to 40 characters
print(f"\n  📅 Monthly Breakdown:")
print("  " + "─" * 50)
for _, row in monthly_counts.iterrows():
    # Guard against division by zero when every month has zero boats
    bar_length = int((row['boat_count'] / y_max) * 40) if y_max > 0 else 0
    bar = "█" * bar_length
    print(f"     {row['year_month_str']}: {bar} {int(row['boat_count'])}")

print("\n" + "═" * 80)
print("  ✅ Analysis Complete!")
print("═" * 80)

In [None]:
import pandas as pd

# Merge the two detection result files into 'combined_boat_data.csv'.
# NOTE: the analysis cells that read 'combined_boat_data.csv' depend on this
# output, so this cell has to be run before them.

# Read the two CSV files
csv1 = pd.read_csv('boat_detection_results.csv')  # Has 'filename' column
csv2 = pd.read_csv('boat_detections_parallel.csv')  # Has 'image' column

# Display original columns
print("CSV 1 columns:", csv1.columns.tolist())
print("CSV 2 columns:", csv2.columns.tolist())
print(f"\nCSV 1 rows: {len(csv1)}")
print(f"CSV 2 rows: {len(csv2)}")

# Both inputs must carry the counts; fail with a clear message instead of a
# KeyError from the column selection below.
# (exit() is a helper meant for the interactive interpreter and can shut down
# a notebook kernel, so the run is stopped with SystemExit instead.)
if 'boat_count' not in csv1.columns:
    print("ERROR: 'boat_count' column not found in CSV 1")
    raise SystemExit(1)
if 'boat_count' not in csv2.columns:
    print("ERROR: 'boat_count' column not found in CSV 2")
    raise SystemExit(1)

# Standardize column names for both CSVs
# CSV1: keep only filename and boat_count columns
if 'filename' in csv1.columns:
    csv1_clean = csv1[['filename', 'boat_count']].copy()
else:
    print("ERROR: 'filename' column not found in CSV 1")
    raise SystemExit(1)

# CSV2: rename 'image' to 'filename' and keep only filename and boat_count
if 'image' in csv2.columns:
    csv2_clean = csv2[['image', 'boat_count']].copy()
    csv2_clean = csv2_clean.rename(columns={'image': 'filename'})
    print("\n✓ Renamed 'image' to 'filename' in CSV 2")
elif 'filename' in csv2.columns:
    csv2_clean = csv2[['filename', 'boat_count']].copy()
else:
    print("ERROR: Neither 'image' nor 'filename' column found in CSV 2")
    raise SystemExit(1)

print("✓ Keeping only 'filename' and 'boat_count' columns")

# Combine the two dataframes
combined_df = pd.concat([csv1_clean, csv2_clean], ignore_index=True)

print(f"\n✓ Combined CSV rows: {len(combined_df)}")
print(f"✓ Combined CSV columns: {combined_df.columns.tolist()}")

# Remove any duplicate rows (optional)
# NOTE(review): this only drops rows where BOTH filename and boat_count are
# identical. If the two files report different counts for the same image,
# both rows are kept and that image is counted twice downstream - confirm
# whether de-duplicating on 'filename' alone is wanted.
combined_df = combined_df.drop_duplicates()
print(f"✓ After removing duplicates: {len(combined_df)} rows")

# Display sample of combined data
print("\n" + "="*60)
print("SAMPLE OF COMBINED DATA (first 10 rows):")
print("="*60)
print(combined_df.head(10))

# Display summary statistics
print("\n" + "="*60)
print("SUMMARY STATISTICS:")
print("="*60)
print(f"Total boat count: {combined_df['boat_count'].sum()}")
print(f"Average boat count: {combined_df['boat_count'].mean():.2f}")
print(f"Max boat count: {combined_df['boat_count'].max()}")
print(f"Min boat count: {combined_df['boat_count'].min()}")

# Save the combined data to a new CSV file
output_filename = 'combined_boat_data.csv'
combined_df.to_csv(output_filename, index=False)

print(f"\n✅ SUCCESS! Combined data saved to: {output_filename}")
print("="*60)