# Automated ship detection

In [101]:
import os
import random
import time
from typing import List, Tuple
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from pathlib import Path
from collections import Counter
import re
import folium
from folium import plugins

In [102]:
# Global plotting configuration: apply matplotlib's 'seaborn-v0_8-darkgrid'
# style sheet and make 'husl' the default seaborn colour palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

In [128]:
# ------- PARAMETERS ----------
# Assumed side length (pixels) of all cropped input images
IMAGE_SIZE = 80
# Fixed side length (pixels) of the rough, proxy bounding box (e.g. 10 x 10 pixels)
PROXY_BOX_SIZE = 10
# Class ID for 'ship' in the detection model
SHIP_CLASS_ID = 1
# Side length (pixels) of the upscaled images
OUTPUT_SIZE = 640

# Directory containing the original cropped images
INPUT_PATH = "/home/jsancheg/git_environment/CalEnvAgency/data/raw/shipdata_2025/cropped_ship_dataset"
# Directory that receives the upscaled images
OUTPUT_PATH = "/home/jsancheg/git_environment/CalEnvAgency/data/processed/upscaled_images"
# Directory that receives the YOLO label files
LABEL_OUTPUT_DIR = "/home/jsancheg/git_environment/CalEnvAgency/data/processed/labeled_images"
# YOLO label line "<class_id> <x_center> <y_center> <width> <height>" (normalised
# coordinates) for a proxy box centred in the image. Derived from the constants
# above so it cannot drift from them; evaluates to "1 0.5 0.5 0.125 0.125".
YOLO_PROXY_LABEL_CONTENT = (
    f"{SHIP_CLASS_ID} 0.5 0.5 "
    f"{PROXY_BOX_SIZE / IMAGE_SIZE} {PROXY_BOX_SIZE / IMAGE_SIZE}"
)

In [132]:


def parse_filename(filename):
    """
    Parse an image filename to extract its metadata.

    Expected format: {label}__{scene_id}__{longitude}_{latitude}.png

    Args:
        filename: Base name of the image file (no directory part).

    Returns:
        A dict with the keys 'filename', 'label', 'label_name', 'scene_id',
        'longitude' and 'latitude', or None (after printing an error) when
        the name does not follow the expected format.
    """
    # Strip only a trailing '.png'; str.replace would also remove any
    # '.png' that happens to appear inside the name.
    stem = filename[:-len('.png')] if filename.endswith('.png') else filename

    try:
        parts = stem.split('__')
        label = int(parts[0])
        scene_id = parts[1]
        coords = parts[2].split('_')
        longitude = float(coords[0])
        latitude = float(coords[1])
    except (IndexError, ValueError) as e:
        # IndexError: a field is missing; ValueError: a field is not numeric.
        print(f"Error parsing {filename}: {e}")
        return None

    return {
        'filename': filename,
        'label': label,
        # Any label other than 1 is reported as 'no-ship'.
        'label_name': 'ship' if label == 1 else 'no-ship',
        'scene_id': scene_id,
        'longitude': longitude,
        'latitude': latitude,
    }

def load_dataset_metadata(dataset_path):
    """
    Load and parse the metadata of every '.png' image in the dataset.

    Args:
        dataset_path: Directory containing the image files.

    Returns:
        A DataFrame with one row per successfully parsed filename and the
        columns 'filename', 'label', 'label_name', 'scene_id', 'longitude'
        and 'latitude'. The columns are present even when no file could be
        parsed, so later column look-ups do not fail on an empty dataset.
    """
    print("="*88)
    print("SHIP DETECTION DATASET - EXPLORATORY DATA ANALYSIS")
    print("="*88)

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any seeded sampling) reproducible across runs.
    image_files = sorted(f for f in os.listdir(dataset_path) if f.endswith('.png'))
    total_files = len(image_files)

    # parse_filename returns None for names it cannot parse; skip those.
    metadata_list = []
    for filename in image_files:
        meta = parse_filename(filename)
        if meta:
            metadata_list.append(meta)

    columns = ['filename', 'label', 'label_name', 'scene_id', 'longitude', 'latitude']
    df = pd.DataFrame(metadata_list, columns=columns)
    print(f"\n Successfully loaded {len(df)} images from a total of {total_files}")
    return df

def analyze_class_distribution(df):
    """
    Analyze the distribution of ship vs no-ship classes.

    Prints per-class counts and the imbalance ratio, then saves a bar chart
    and a pie chart to 'class_distribution.png' in the working directory.

    Args:
        df: DataFrame with a 'label_name' column.

    Returns:
        The value counts of 'label_name' (a pandas Series indexed by class
        name). For an empty DataFrame the empty Series is returned and no
        figure is produced.
    """
    print("\n" + "="*80)
    print("1. CLASS DISTRIBUTION ANALYSIS")
    print("="*80)

    class_counts = df['label_name'].value_counts()
    total = len(df)
    print("\nClass Distribution:")

    if total == 0:
        print("  No images to analyze.")
        return class_counts

    for label, count in class_counts.items():
        percentage = (count / total) * 100
        print(f"  {label:10s}:  {count:3d}  images  ({percentage:5.2f}%)")

    # Calculate class imbalance ratio (majority count / minority count)
    ship_count = class_counts.get('ship', 0)
    no_ship_count = class_counts.get('no-ship', 0)
    minority = min(ship_count, no_ship_count)
    if minority == 0:
        # One class is absent: the ratio is undefined, treat it as maximal.
        imbalance_ratio = float('inf')
        print("\nClass Imbalance Ratio: undefined (one class has no images)")
    else:
        imbalance_ratio = max(ship_count, no_ship_count) / minority
        print(f"\nClass Imbalance Ratio: {imbalance_ratio:.2f}:1")

    if imbalance_ratio > 1.5:
        print("Warning: Significant class imbalance detected")
        print("Recommendation: Consider class weighting or data augmentation")

    # Visualization
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Colour by class name rather than by position, so a class keeps its
    # colour regardless of which one is the majority.
    palette = {'ship': '#e74c3c', 'no-ship': '#2ecc71'}
    colors = [palette.get(label, '#95a5a6') for label in class_counts.index]

    # Bar chart
    class_counts.plot(kind='bar', ax=axes[0], color=colors)
    axes[0].set_title('Class Distribution', fontsize=14, fontweight='bold')
    axes[0].set_xlabel('Class', fontsize=12)
    axes[0].set_ylabel('Number of Images', fontsize=12)
    axes[0].set_xticklabels(axes[0].get_xticklabels(), rotation=0)
    axes[0].grid(axis='y', alpha=0.3)

    # Pie chart
    axes[1].pie(class_counts.values, labels=class_counts.index, autopct="%1.1f%%",
                colors=colors, startangle=90)
    axes[1].set_title('Class Proportion', fontsize=14, fontweight='bold')

    plt.tight_layout()
    plt.savefig('class_distribution.png', dpi=300, bbox_inches='tight')
    print("\n Saved visualization: class_distribution.png")
    plt.close()

    return class_counts

def generate_proxy_label() -> str:
    """
    Build the normalised YOLO label line for a square image of side
    IMAGE_SIZE whose proxy bounding box of side PROXY_BOX_SIZE sits at the
    image centre.

    YOLO format: [class_id x_center_norm y_center_norm w_norm h_norm]

    Returns:
        The label line, with every coordinate formatted to three decimals.
    """
    # The box is centred, so both centre coordinates are half the image
    # size divided by the image size.
    center_norm = (IMAGE_SIZE / 2) / IMAGE_SIZE

    # The box is square, so width and height share one normalised extent.
    extent_norm = PROXY_BOX_SIZE / IMAGE_SIZE

    coordinates = (center_norm, center_norm, extent_norm, extent_norm)
    fields = [str(SHIP_CLASS_ID)]
    fields.extend(f"{value:.3f}" for value in coordinates)
    return " ".join(fields)
    
    

def analyze_spatial_distribution(df):
    """
    Summarise and plot where the images are located.

    Prints the longitude/latitude ranges and the mean position, then saves
    a two-panel figure (per-class scatter plot and hexbin density map) to
    'spatial_distribution.png'.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("2. SPATIAL DISTRIBUTION ANALYSIS")
    print(banner)

    lons = df['longitude']
    lats = df['latitude']

    print("\nGeographic Coverage:")
    print(f"  Longitude range: {lons.min():.4f} to {lons.max():.4f}")
    print(f"  Latitude range: {lats.min():.4f} to {lats.max():.4f}")

    # Mean position of all images
    center_lon = lons.mean()
    center_lat = lats.mean()
    print(f"  Center point: ({center_lon:.4f}, {center_lat:.4f})")

    fig, (scatter_ax, density_ax) = plt.subplots(1, 2, figsize=(16, 6))

    # Left panel: one scatter series per class
    class_colors = (('ship', '#e74c3c'), ('no-ship', '#2ecc71'))
    for class_name, marker_color in class_colors:
        members = df[df['label_name'] == class_name]
        scatter_ax.scatter(
            members['longitude'],
            members['latitude'],
            c=marker_color,
            label=class_name,
            alpha=0.6,
            s=50,
            edgecolors='black',
            linewidth=0.5,
        )

    scatter_ax.set_title('Geographic Distribution of Images', fontsize=14, fontweight='bold')
    scatter_ax.set_xlabel('Longitude', fontsize=12)
    scatter_ax.set_ylabel('Latitude', fontsize=12)
    scatter_ax.legend(loc='best')
    scatter_ax.grid(True, alpha=0.3)

    # Right panel: hexagonal-bin density of all images
    hexes = density_ax.hexbin(lons, lats, gridsize=30, cmap='YlOrRd', mincnt=1)
    density_ax.set_title('Spatial Density Heatmap', fontsize=14, fontweight='bold')
    density_ax.set_xlabel('Longitude', fontsize=12)
    density_ax.set_ylabel('Latitude', fontsize=12)
    plt.colorbar(hexes, ax=density_ax, label='Number of Images')

    plt.tight_layout()
    plt.savefig('spatial_distribution.png', dpi=300, bbox_inches='tight')
    print("\n Saved visualization: spatial_distribution.png")

    plt.close()

def analyze_spatial_distribution_map(df):
    """Analyze geographic distribution using an interactive OpenStreetMap map.

    Builds a folium map centred on the mean image position with:
      * a circle marker per image, grouped into 'Ships' and 'No-Ships',
      * an optional density heatmap layer,
      * an optional clustered-marker layer,
      * a layer control and a legend with per-class counts,
    and saves it to 'spatial_distribution_map.html' in the working directory.

    Args:
        df: DataFrame with 'latitude', 'longitude', 'label_name' and
            'scene_id' columns. Any label other than 'ship' is treated as
            'no-ship'.

    Returns:
        None. Nothing is written when `df` is empty.
    """
    import html  # local import: only used to escape text shown in popups

    print("\n" + "="*80)
    print("2B. SPATIAL DISTRIBUTION ON OPENSTREETMAP")
    print("="*80)

    if df.empty:
        # The mean of an empty column is NaN, which is not a usable map centre.
        print("\nNo image locations available. Skipping map generation.")
        return

    # Calculate center point for the map
    center_lat = df['latitude'].mean()
    center_lon = df['longitude'].mean()

    print(f"\nCreating interactive map centered at ({center_lat:.4f}, {center_lon:.4f})")

    # Create base map
    m = folium.Map(
        location=[center_lat, center_lon],
        zoom_start=11,
        tiles='OpenStreetMap'
    )

    # Add additional tile layers with proper attributions
    stadia_attr = 'Map tiles by Stadia Maps, under CC BY 3.0. Data by OpenStreetMap, under ODbL'
    folium.TileLayer(
        tiles='https://tiles.stadiamaps.com/tiles/stamen_terrain/{z}/{x}/{y}.png',
        attr=stadia_attr,
        name='Terrain'
    ).add_to(m)

    folium.TileLayer(
        tiles='https://tiles.stadiamaps.com/tiles/stamen_toner/{z}/{x}/{y}.png',
        attr=stadia_attr,
        name='Toner'
    ).add_to(m)

    folium.TileLayer(
        tiles='CartoDB positron',
        name='CartoDB Light'
    ).add_to(m)

    # Separate, individually toggleable layers for each class
    ship_group = folium.FeatureGroup(name='Ships', show=True)
    no_ship_group = folium.FeatureGroup(name='No-Ships', show=True)
    # Marker cluster layer (optional, for dense data)
    marker_cluster = plugins.MarkerCluster(name='Clustered View', show=False)

    ship_count = 0
    no_ship_count = 0
    heat_data = []

    # Single pass over the rows: every layer is fed from the same lat/lon,
    # so the layers cannot disagree about a marker's position.
    columns = ['latitude', 'longitude', 'label_name', 'scene_id']
    for lat, lon, label, scene_id in df[columns].itertuples(index=False, name=None):
        is_ship = label == 'ship'
        # Labels and scene ids originate from filenames; escape them before
        # embedding in HTML.
        safe_label = html.escape(str(label))
        safe_scene = html.escape(str(scene_id))

        # Create popup content
        popup_html = f"""
        <div style="font-family: Arial; font-size: 12px;">
            <b>Label:</b> {safe_label}<br>
            <b>Scene ID:</b> {safe_scene} <br>
            <b> Coordinates: </b><br>
            Lat: {lat:.6f}<br>
            Lon: {lon:.6f}
        </div>
        """

        circle_color = '#e74c3c' if is_ship else '#2ecc71'
        folium.CircleMarker(
            location=[lat, lon],
            radius=5,
            popup=folium.Popup(popup_html, max_width=250),
            color=circle_color,
            fill=True,
            fill_color=circle_color,
            fill_opacity=0.7,
            weight=1,
        ).add_to(ship_group if is_ship else no_ship_group)

        if is_ship:
            ship_count += 1
        else:
            no_ship_count += 1

        folium.Marker(
            location=[lat, lon],
            popup=f"{safe_label} <br> {safe_scene}",
            icon=folium.Icon(
                color='red' if is_ship else 'green',
                icon='ship' if is_ship else 'water',
                prefix='fa',
            ),
        ).add_to(marker_cluster)

        heat_data.append([lat, lon])

    # Add feature groups to map
    ship_group.add_to(m)
    no_ship_group.add_to(m)

    # Add heatmap layer for overall density
    plugins.HeatMap(
        heat_data,
        name='Density Heatmap',
        min_opacity=0.3,
        max_zoom=13,
        radius=15,
        blur=20,
        show=False
    ).add_to(m)

    marker_cluster.add_to(m)

    # Add layer control
    folium.LayerControl(collapse=False).add_to(m)

    # Add legend (fixed-position box in the bottom-right corner)
    legend_html = f"""
    <div style="position: fixed;
                bottom: 50px; right: 50px; width: 180px; height: 120px;
                background-color: white; border:2px solid grey; z-index:9999;
                font-size:14px; padding: 10px;">
            <p style="margin: 0; font-weight: bold;">Legend</p>
            <p style="margin: 5px 0;">
                <span style="color: #e74c3c; font-size: 20px;">●</span> Ships ({ship_count})
            </p>
            <p style="margin: 5px 0;">
                <span style="color: #2ecc71; font-size: 20px;">●</span> No Ships ({no_ship_count})
            </p>
            <p style="margin: 5px 0; font-size: 11px; color: #666;">
                Click markers for details
            </p>
    </div>
    """

    m.get_root().html.add_child(folium.Element(legend_html))

    # Save map
    map_file = 'spatial_distribution_map.html'
    m.save(map_file)

    print(f"  Ship locations: {ship_count}")
    print(f"  No-ship locations: {no_ship_count}")
    print(f"\n Saved interactive map: {map_file}")
    print("  Open the HTML file in a web browser to explore the map")
    print(" Features: Toggle layers, zoom, pan, click marker for details")

def upscale_positive_ships(input_dir, output_dir, df):
    """
    Filters the DataFrame for 'ship' labels, loads the corresponding images,
    resizes them to OUTPUT_SIZE x OUTPUT_SIZE, and saves them to the output path.

    Images that cannot be read, resized or written are reported and skipped;
    the final summary counts only the images that were actually saved.

    Args:
        input_dir: Path where the original images are stored.
        output_dir: Path where the upscaled images will be saved (created
            if it does not exist).
        df: DataFrame containing image metadata, including the 'label' and
            'filename' columns.

    Returns:
        None.
    """

    print("\n---- Starting image Upscaling Process ----")

    # 1. Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # 2. Filter for positive samples (images containing ships)
    positive_df = df[df['label'] == 1].reset_index(drop=True)
    num_to_upscale = len(positive_df)

    print(f"Found {num_to_upscale} positive samples ('ship' label) to upscale.")

    if num_to_upscale == 0:
        print("No positive samples found. Skipping upscaling.")
        return

    # 3. Iterate, Upscale, and Save
    start_time = time.time()
    num_upscaled = 0

    for index, filename in enumerate(positive_df['filename']):
        input_path = os.path.join(input_dir, filename)
        output_path = os.path.join(output_dir, filename)

        try:
            # The context manager closes the source file even if resizing fails.
            with Image.open(input_path) as img:
                # Upscale to OUTPUT_SIZE x OUTPUT_SIZE (YOLO input size)
                upscaled_img = img.resize(
                    (OUTPUT_SIZE, OUTPUT_SIZE), Image.Resampling.BICUBIC
                )
            upscaled_img.save(output_path)
        except Exception as e:
            # Best-effort batch job: report the failure and continue.
            print(f"ERROR processing {filename}: {e}. Skipping.")
            continue

        num_upscaled += 1
        if index % 10 == 0:
            print(f"[{index}/{num_to_upscale}] Upscaled and saved: {filename}")

    elapsed = time.time() - start_time
    print(
        f"\nSuccessfully upscaled {num_upscaled} of {num_to_upscale} images "
        f"to {OUTPUT_SIZE}x{OUTPUT_SIZE}px."
    )
    print(f"Total processing time: {elapsed:.2f} seconds.")
          
def generate_yolo_proxy_labels(image_dir, label_dir, label_content):
    """
    Write one YOLO label file per '.png' image found in `image_dir`.

    Each label file is placed in `label_dir` (created if needed), carries
    the image's base name with a '.txt' extension, and contains
    `label_content` followed by a newline. Write failures are reported and
    skipped.
    """
    print(f"\n--- Starting YOLO proxy Label Generation ---")

    # Make sure the destination exists before any file is written
    os.makedirs(label_dir, exist_ok=True)

    # Collect the upscaled images that need a label
    png_names = [entry for entry in os.listdir(image_dir) if entry.endswith('.png')]
    written = 0
    started_at = time.time()

    print(f"Found {len(png_names)}: upscale images to label.")

    for png_name in png_names:
        # "1__scene_001__long_lat.png" -> "1__scene_001__long_lat.txt"
        stem, _extension = os.path.splitext(png_name)
        target_path = os.path.join(label_dir, stem + '.txt')

        try:
            with open(target_path, 'w') as handle:
                handle.write(label_content + '\n')
        except Exception as err:
            print(f"ERROR writing label for {png_name}: {err}")
        else:
            written += 1

    finished_at = time.time()
    print(f"\nSuccessfully created {written} YOLO label files.")
    print(f"Labels saved in: '{label_dir}'")
    print(f"Total processing time: {finished_at - started_at:.2f} seconds.")
    


def analyze_scene_distribution(df):
    """Analyze distribution across different satellite scenes.

    Prints the number of scenes, images-per-scene statistics, the ten scenes
    with the most images and how many scenes contain only ships, only
    no-ships or both; then saves a histogram and a top-15 bar chart to
    'scene_distribution.png'.

    Args:
        df: DataFrame with 'scene_id' and 'label_name' columns.

    Returns:
        None. Nothing is plotted when `df` contains no scenes.
    """
    print("\n" + "="*80)
    print("3. SCENE DISTRIBUTION ANALYSIS")
    print("="*80)

    scene_counts = df['scene_id'].value_counts()
    print(f"\nTotal unique scenes: {len(scene_counts)}")

    if scene_counts.empty:
        print("No scenes to analyze.")
        return

    mean_count = scene_counts.mean()
    median_count = scene_counts.median()
    print(f"Images per scene - Mean: {mean_count:.1f}, Median: {median_count:.1f}")
    print(f"Images per scene - Min: {scene_counts.min()}, Max: {scene_counts.max()}")

    print("\nTop 10 scenes by image count:")
    for i, (scene_id, count) in enumerate(scene_counts.head(10).items(), 1):
        print(f" {i:2d}, {scene_id}:  {count} images")

    # Analyze class distribution per scene. reindex guarantees both class
    # columns exist even when one class is absent from the data.
    scene_class_dist = (
        df.groupby(['scene_id', 'label_name'])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=['ship', 'no-ship'], fill_value=0)
    )
    has_ship = scene_class_dist['ship'] > 0
    has_no_ship = scene_class_dist['no-ship'] > 0
    # Combine the masks first, then count: `a & b.sum()` would sum only `b`.
    print(f"\nScenes with only ships: {(has_ship & ~has_no_ship).sum()}")
    print(f"Scenes with only no-ships: {(has_no_ship & ~has_ship).sum()}")
    print(f"Scenes with both classes: {(has_ship & has_no_ship).sum()}")

    # Visualization
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Distribution of images per scene
    axes[0].hist(scene_counts.values, bins=30, color='#3498db', edgecolor='black', alpha=0.7)
    axes[0].set_title('Distribution of Images per Scene', fontsize=14, fontweight='bold')
    axes[0].set_xlabel('Number of Images', fontsize=12)
    axes[0].set_ylabel('Number of Scenes', fontsize=12)
    axes[0].axvline(mean_count, color='red', linestyle='--', linewidth=2,
                    label=f'Mean: {mean_count:.1f}')
    axes[0].axvline(median_count, color='green', linestyle='--', linewidth=1,
                    label=f'Median: {median_count:.1f}')
    axes[0].legend()
    axes[0].grid(axis='y', alpha=0.3)

    # Top scenes bar plot (scene ids longer than 20 characters are truncated)
    top_scenes = scene_counts.head(15)
    positions = range(len(top_scenes))
    tick_labels = [
        scene_id[:20] + '...' if len(scene_id) > 20 else scene_id
        for scene_id in top_scenes.index
    ]
    axes[1].barh(positions, top_scenes.values, color='#9b59b6')
    axes[1].set_yticks(positions)
    axes[1].set_yticklabels(tick_labels, fontsize=9)
    axes[1].set_xlabel('Number of Images', fontsize=12)
    axes[1].set_title('Top 15 scenes by Image Count', fontsize=14, fontweight='bold')
    axes[1].invert_yaxis()  # largest scene at the top
    axes[1].grid(axis='x', alpha=0.3)

    plt.tight_layout()
    plt.savefig('scene_distribution.png', dpi=300, bbox_inches='tight')
    print("\n Saved visualization: scene_distribution.png")
    plt.close()
    

  
def analyze_image_properties(dataset_path, df, sample_size = 50):
    """
    Analyze image properties like dimensions and pixel-intensity statistics.

    A random sample (fixed seed) of the images is opened; per-image size,
    channel count and intensity statistics are collected, summarised on
    stdout and plotted to 'image_properties.png' together with up to two
    example ship images.

    Args:
        dataset_path: Directory containing the image files.
        df: DataFrame with 'filename' and 'label_name' columns.
        sample_size: Maximum number of images to sample.

    Returns:
        DataFrame with one row per successfully read sample image. It is
        empty (and no figure is saved) when no image could be read.
    """
    # Ground resolution used for the coverage estimate (3 m per pixel).
    meters_per_pixel = 3

    print("\n" + "="*80)
    print("4. IMAGE PROPERTIES ANALYSIS")
    print("="*80)

    # Sample images for detailed analysis
    sample_df = df.sample(min(sample_size, len(df)), random_state=42)

    image_stats = []
    for _, row in sample_df.iterrows():
        img_path = os.path.join(dataset_path, row['filename'])
        try:
            # Context manager closes the file once the pixels are copied out.
            with Image.open(img_path) as img:
                width, height = img.width, img.height
                img_array = np.array(img)
        except Exception as e:
            print(f"Error processing {row['filename']}: {e}")
            continue

        image_stats.append({
            'filename': row['filename'],
            'label': row['label_name'],
            'width': width,
            'height': height,
            # A 2-D array has no channel axis: count it as one channel.
            'channels': img_array.shape[2] if len(img_array.shape) == 3 else 1,
            'mean_intensity': img_array.mean(),
            'std_intensity': img_array.std(),
            'min_intensity': img_array.min(),
            'max_intensity': img_array.max(),
        })
    stats_df = pd.DataFrame(image_stats)

    if stats_df.empty:
        print("\nNo sample image could be read. Skipping image property analysis.")
        return stats_df

    # Dimensions and channels are reported from the first sampled image.
    first_width = stats_df['width'].iloc[0]
    first_height = stats_df['height'].iloc[0]
    first_channels = stats_df['channels'].iloc[0]
    channel_note = " (RGB)" if first_channels == 3 else ""
    ground_width = first_width * meters_per_pixel
    ground_height = first_height * meters_per_pixel

    print(f"\nImage properties (based on {len(stats_df)} sample images):")
    print(f"  Image dimensions: {first_width}x{first_height} pixels")
    print(f"  Color channels: {first_channels}{channel_note}")
    print(f"  Pixel coverage: {ground_width}m x {ground_height}m = {(ground_width * ground_height) / 1000000:.2f} km²")

    ship_stats = stats_df[stats_df['label'] == 'ship']
    no_ship_stats = stats_df[stats_df['label'] == 'no-ship']

    print("\nIntensity Statistics:")
    print(f"  Mean intensity - Ship: {ship_stats['mean_intensity'].mean():.2f}")
    print(f"  Mean intensity - No-ship: {no_ship_stats['mean_intensity'].mean():.2f}")
    print(f"  Std intensity - Ship: {ship_stats['std_intensity'].mean():.2f}")
    print(f"  Std intensity - No-ship: {no_ship_stats['std_intensity'].mean():.2f}")

    # Visualization
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Top row: per-class histograms of mean and std intensity
    histogram_specs = (
        ('mean_intensity', 'Mean Pixel Intensity Distribution', 'Mean Intensity'),
        ('std_intensity', 'Std Pixel Intensity Distribution', 'Std Intensity'),
    )
    for col, (column, title, xlabel) in enumerate(histogram_specs):
        ax = axes[0, col]
        ship_stats[column].hist(ax=ax, bins=20, color='#e74c3c', alpha=0.7, label='Ship')
        no_ship_stats[column].hist(ax=ax, bins=20, color='#2ecc71', alpha=0.7, label='No-ship')
        ax.set_title(title, fontsize=12, fontweight='bold')
        ax.set_xlabel(xlabel, fontsize=10)
        ax.set_ylabel('Frequency', fontsize=10)
        ax.legend()
        ax.grid(axis='y', alpha=0.3)

    # Bottom row: up to two example ship images. Hide both axes first so a
    # missing or unreadable example leaves a blank panel, not an empty frame.
    for ax in axes[1]:
        ax.axis('off')

    ship_rows = df[df['label_name'] == 'ship']
    ship_samples = ship_rows.sample(min(4, len(ship_rows)), random_state=42).head(2)
    for idx, (_, row) in enumerate(ship_samples.iterrows()):
        img_path = os.path.join(dataset_path, row['filename'])
        try:
            with Image.open(img_path) as img:
                axes[1, idx].imshow(img)
        except OSError as e:
            print(f"Error displaying {row['filename']}: {e}")
            continue
        axes[1, idx].set_title(f'Ship Example {idx+1}', fontsize=10, fontweight='bold')

    plt.tight_layout()
    plt.savefig('image_properties.png', dpi=300, bbox_inches='tight')
    print("\n Saved visualizations: image_properties.png")
    plt.close()

    return stats_df

def generate_summary_report(df, class_counts):
    """Generate a comprehensive summary report.

    Prints the report and writes it, preceded by a short header, to
    'dataset_summary_report.txt' in the working directory.

    Args:
        df: DataFrame with 'longitude', 'latitude' and 'scene_id' columns.
        class_counts: Mapping-like object (e.g. the Series returned by
            analyze_class_distribution) from class name to image count.

    Returns:
        None. No report is written when `df` is empty.
    """
    print("\n" + "="*80)
    print("5. DATASET SUMMARY REPORT")
    print("="*80)

    total = len(df)
    if total == 0:
        # Percentages and coordinate ranges are undefined without data.
        print("\nNo images available. Summary report not generated.")
        return

    ship_count = class_counts.get('ship', 0)
    no_ship_count = class_counts.get('no-ship', 0)
    ship_pct = ship_count / total * 100
    no_ship_pct = no_ship_count / total * 100

    # Derive the balance finding from the data (same 1.5 threshold as the
    # class distribution analysis) instead of asserting it unconditionally.
    minority = min(ship_count, no_ship_count)
    if minority == 0:
        balance_finding = "Only one of the ship / no-ship classes is present in the dataset"
    else:
        ratio = max(ship_count, no_ship_count) / minority
        if ratio <= 1.5:
            balance_finding = "The dataset is relatively balanced between ship and no-ship classes"
        else:
            balance_finding = (
                "The dataset is imbalanced between ship and no-ship classes "
                f"({ratio:.1f}:1)"
            )

    lon = df['longitude']
    lat = df['latitude']
    unique_scenes = df['scene_id'].nunique()
    images_per_scene = df.groupby('scene_id').size().mean()

    # NOTE: findings 2-5 and the recommendations are static text, not
    # computed from `df`.
    lines = [
        "",
        "Data Characteristics:",
        "---------------------",
        f"Total Images: {total}",
        "Image Resolution: 80x80 pixels (240m x 240m ground coverage)",
        "Pixel Resolution: 3 meters per pixel",
        "Color Channels: 3 (RGB)",
        "",
        "Class Distribution:",
        "-------------------",
        f"Ship Images: {ship_count} ({ship_pct:.1f}%)",
        f"No-Ship Images: {no_ship_count} ({no_ship_pct:.1f}%)",
        "",
        "Geographic Coverage:",
        "--------------------",
        f"Longitude Range: {lon.min():.4f}° to {lon.max():.4f}°",
        f"Latitude Range: {lat.min():.4f}° to {lat.max():.4f}°",
        f"Center Point: ({lon.mean():.4f}°, {lat.mean():.4f}°)",
        "",
        "Scene Information:",
        "------------------",
        f"Unique Scenes: {unique_scenes}",
        f"Average Images per Scene: {images_per_scene:.1f}",
        "",
        "Key Findings:",
        "-------------",
        f"1. {balance_finding}",
        "2. Images are concentrated around the California coastline.",
        "3. Multiple scenes provide temporal diversity",
        "4. Consistent 80x80 pixel format suitable for CNN-based approaches",
        "5. 3-meter resolution is appropriate for detecting large vessels",
        "",
        "Recommendations:",
        "----------------",
        "1. Dataset is suitable for supervised learning approaches",
        "2. Consider data augmentation to improve model robustness",
        "3. Implement train/validation/test split stratified by scene_id",
        "4. Use transfer learning with pre-trained models (e.g. ResNet, EfficientNet)",
        "5. Consider ensemble methods for production deployment",
    ]
    summary = "\n".join(lines) + "\n"

    print(summary)

    # Save report to file (UTF-8 because the report contains the degree sign)
    with open('dataset_summary_report.txt', 'w', encoding='utf-8') as f:
        f.write("SHIP DETECTION DATASET - EXPLORATORY DATA ANALYSIS\n")
        f.write("California Environmental Agency\n")
        f.write("="*80 + "\n")
        f.write(summary)

    print("\n Saved report: dataset_summary_report.txt")


  
      
  
  
  
  

    


In [133]:
# Build the metadata DataFrame from the image filenames found in INPUT_PATH
df = load_dataset_metadata(INPUT_PATH)

# Perform analyses (each one prints a report section and saves its figure/map)
class_counts = analyze_class_distribution(df)
analyze_spatial_distribution(df)
analyze_spatial_distribution_map(df)
analyze_scene_distribution(df)
# The image analysis opens files, so it also needs the dataset directory
stats_df = analyze_image_properties(INPUT_PATH, df)



SHIP DETECTION DATASET - EXPLORATORY DATA ANALYSIS

 Successfully loaded 4000 inages from a total of 4000

1. CLASS DISTRIBUTION ANALYSIS

Class Distribution:
  no-ship   :  3000  images  (75.00%)
  ship      :  1000  images  (25.00%)

Class Imbalance Ratio: 3.000000:1
Recommendation: Considerar class weighting or data augmentation

 Saved visualization: class_distribution.png

2. SPATIAL DISTRIBUTION ANALYSIS

Geographic Coverage:
  Longitude range: -122.6763 to -117.9021
  Latitude range: 33.6290 to 37.9035
  Center point: (-121.7676, 37.2023)

 Saved visualization: spatial_distribution.png

2B. SPATIAL DISTRIBUTION ON OPENSTREETMAP

Creative interactive map centered at (37.2023, -121.7676)
  Ship locations: 1000
  No-ship locations: 3000

 Saved interactive map: spatial_distribution_map.html
  Open the HTML file in a web browser to explore the map
 Features: Toggle layers, zoom, pan, click marker for details

3. SCENE DISTRIBUTION ANALYSIS

Total unique scenes: 434
Images per scene 

In [131]:
# Write one proxy YOLO label file into LABEL_OUTPUT_DIR for every '.png' in OUTPUT_PATH
generate_yolo_proxy_labels(OUTPUT_PATH, LABEL_OUTPUT_DIR, YOLO_PROXY_LABEL_CONTENT)


--- Starting YOLO proxy Label Generation ---
Found 1000: upscale images to label.

Successfully created 1000 YOLO label files.
Labels saved in: '/home/jsancheg/git_environment/CalEnvAgency/data/processed/labeled_images'
Total processing time: 0.10 seconds.


In [None]:
directory = Path(OUTPUT_PATH)

# Upscale only into an empty output directory, so an earlier run's results
# are never silently mixed with (or overwritten by) a new one. A missing
# directory counts as empty: upscale_positive_ships creates it.
if directory.exists() and not directory.is_dir():
    print(f"ERROR: {OUTPUT_PATH} exists but is not a directory.")
else:
    # Count files (not including subdirectories)
    file_count = (
        len([f for f in directory.iterdir() if f.is_file()])
        if directory.exists() else 0
    )
    if file_count > 0:
        print(f"ERROR: There are files in the directory {OUTPUT_PATH}. Remove those files from the directory first.")
    else:
        try:
            upscale_positive_ships(INPUT_PATH, OUTPUT_PATH, df)
        except Exception as e:
            # Report the real failure instead of blaming the directory contents.
            print(f"ERROR: Upscaling failed: {e}")


In [121]:
# Ad-hoc inspection cell: check the output directory and look at the label /
# scene breakdown of the metadata.
directory = Path(OUTPUT_PATH)
print(directory.exists())
print(directory.is_dir())

# Number of images per numeric label
print(df['label'].value_counts())

print(df.columns)
# Number of images per scene
print(df['scene_id'].value_counts())
# Number of images per (scene, label) pair
print(df.groupby('scene_id')['label'].value_counts())

True
True
label
0    3000
1    1000
Name: count, dtype: int64
Index(['filename', 'label', 'label_name', 'scene_id', 'longitude', 'latitude'], dtype='object')
scene_id
20180712_180429_101b      39
20180712_180755_0f2d      38
20161218_180845_0e26      37
20180708_180453_0f28      36
20170515_180653_1007      34
                          ..
20160905_193459_0c37       1
20171203_185452_1_0f2b     1
20160704_204236_0c41       1
20171213_185421_0f3b       1
20171023_181357_1044       1
Name: count, Length: 434, dtype: int64
scene_id                label
20150718_184300_090b    0        4
20150720_184302_0906    0        1
20150830_000650_0b07    0        3
20150830_000652_1_0b07  0        2
                        1        1
                                ..
20180714_180427_1029    0        4
                        1        3
20180714_182155_1051    0        4
                        1        2
20180714_182329_101d    0        4
Name: count, Length: 647, dtype: int64


In [109]:
# Ad-hoc inspection cell: DataFrame shape, class counts, columns and the
# first filename.
print(df.shape)
#print(df)

# Re-runs the class analysis (this also re-saves class_distribution.png)
class_counts = analyze_class_distribution(df)
print(class_counts)
print(df.columns)
print(df['filename'].head(1))

True
True
0
(4000, 6)

1. CLASS DISTRIBUTION ANALYSIS

Class Distribution:
  no-ship   :  3000  images  (75.00%)
  ship      :  1000  images  (25.00%)

Class Imbalance Ratio: 3.000000:1
Recommendation: Considerar class weighting or data augmentation

 Saved visualization: class_distribution.png
label_name
no-ship    3000
ship       1000
Name: count, dtype: int64
Index(['filename', 'label', 'label_name', 'scene_id', 'longitude', 'latitude'], dtype='object')
0    0__20170917_190707_0f44__-122.3946224097109_37...
Name: filename, dtype: object


In [52]:
class NearestNeighbor:
    """1-nearest-neighbour classifier using the L1 (sum of absolute differences) distance."""

    def __init__(self):
        # Training data is stored by train(); None means "not trained yet".
        self.Xtr = None
        self.ytr = None

    def train(self, X, y):
        """ X is N x D where each row is an example. y is 1-dimensional of size N """
        # the nearest neighbor classifier simply remembers all the training data
        self.Xtr = X
        self.ytr = y

    def predict(self, X):
        """ X is N x D where each row is an example we wish to predict a label for.

        Returns a 1-dimensional array of N predicted labels with the same
        dtype as the training labels.

        Raises:
            RuntimeError: if train() has not been called.
        """
        if self.Xtr is None or self.ytr is None:
            raise RuntimeError("train() must be called before predict()")

        Xtr = np.asarray(self.Xtr)
        X = np.asarray(X)
        # Unsigned data (e.g. uint8 pixels) wraps around on subtraction,
        # which corrupts the distances; widen to a signed type once, up front.
        if np.issubdtype(Xtr.dtype, np.unsignedinteger):
            Xtr = Xtr.astype(np.int64)
        if np.issubdtype(X.dtype, np.unsignedinteger):
            X = X.astype(np.int64)

        num_test = X.shape[0]
        # lets make sure that the output type matches the input type
        Ypred = np.zeros(num_test, dtype=self.ytr.dtype)

        # loop over all test rows
        for i in range(num_test):
            # find the nearest training example to the i'th test example
            # using the L1 distance (sum of absolute value differences)
            distances = np.sum(np.abs(Xtr - X[i, :]), axis=1)
            min_index = np.argmin(distances)  # get the index with smallest distance
            Ypred[i] = self.ytr[min_index]  # predict the label of the nearest example

        return Ypred

In [53]:
# https://github.com/facebookresearch/faiss - Johnson et al., "Billion-scale similarity search with GPUs"

In [48]:
pip install folium

Collecting folium
  Using cached folium-0.20.0-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting branca>=0.6.0 (from folium)
  Downloading branca-0.8.2-py3-none-any.whl.metadata (1.7 kB)
Collecting xyzservices (from folium)
  Downloading xyzservices-2025.10.0-py3-none-any.whl.metadata (4.3 kB)
Using cached folium-0.20.0-py2.py3-none-any.whl (113 kB)
Downloading branca-0.8.2-py3-none-any.whl (26 kB)
Downloading xyzservices-2025.10.0-py3-none-any.whl (92 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.7/92.7 kB[0m [31m148.8 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: xyzservices, branca, folium
Successfully installed branca-0.8.2 folium-0.20.0 xyzservices-2025.10.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/home/jsanche