# Climate Feature Extraction for Hospital Service Areas

**Purpose**: Extract comprehensive climate features at facility locations from multiple satellite/reanalysis datasets

**Study Period**: January 2019 - January 2024

**Datasets**:
- CHIRPS: Daily precipitation
- ERA5-Land: Temperature, dewpoint (humidity proxy), wind
- TerraClimate: Water balance (PET, AET, climate water deficit, runoff, soil moisture, VPD)
- SRTM: Elevation

**Output**: Per-facility climate summary statistics suitable for k-means clustering

In [None]:
#@title Install and import libraries
!pip -q install earthengine-api geemap geopandas shapely pandas numpy scikit-learn matplotlib

import ee
import pandas as pd
import numpy as np
import geopandas as gpd
from google.colab import files
from datetime import datetime
import json

print("Libraries installed successfully")

In [None]:
#@title Authenticate and initialize Earth Engine
# Try to initialize with whatever credentials are already available; only if
# that raises do we run the authentication flow and initialize again.
try:
    ee.Initialize()
    print('✓ Earth Engine initialized')
except Exception:
    # Any initialization failure lands here: authenticate, then initialize
    # against an explicit project.
    # NOTE(review): the project id below is hard-coded — other users presumably
    # need to substitute their own Earth Engine project.
    ee.Authenticate()
    ee.Initialize(project='ee-izaslavsky')
    print('✓ Earth Engine authenticated and initialized')

## Configuration

In [None]:
#@title Configuration parameters
# Notebook-wide settings. The `#@param` annotations stay on the assignment
# lines exactly as written.

# Study period (both dates are handed to filterDate() by the extraction functions)
# NOTE(review): Earth Engine documents filterDate()'s end date as exclusive, so
# daily images dated END_DATE itself are presumably not included — confirm
# whether '2024-02-01' is intended for a period running through January 2024.
START_DATE = '2019-01-01'  #@param {type:"string"}
END_DATE = '2024-01-31'    #@param {type:"string"}

# Spatial parameters
# BUFFER_METERS: buffer distance applied to each facility geometry.
# SCALE_METERS: value passed as `scale` to every reduceRegion() call.
BUFFER_METERS = 2500  #@param {type:"number"}
SCALE_METERS = 5000   #@param {type:"number"}

# Output settings: prefix shared by every exported CSV filename
OUTPUT_PREFIX = 'Hospitals_Climate_Features'  #@param {type:"string"}

# Echo the active settings so they are visible in the cell output
print(f"Study period: {START_DATE} to {END_DATE}")
print(f"Buffer radius: {BUFFER_METERS}m")
print(f"Spatial scale: {SCALE_METERS}m")

## Load Facility Data

In [None]:
#@title Upload facility file

USE_DEMO = False  #@param {type:"boolean"}

# Lower-case column names accepted as the facility identifier, in priority
# order. 'facilityname' comes first so an existing FacilityName column is
# reused instead of renaming a different column into a duplicate of it.
ID_CANDIDATES = ['facilityname', 'id', 'healthfacility', 'name', 'hospital']

# Extensions that can be opened directly; used to pick the right file when
# several are uploaded together (e.g. a Shapefile with its sidecar files).
PRIMARY_EXTENSIONS = ('csv', 'geojson', 'json', 'gpkg', 'shp')


def _first_matching_column(columns, candidates):
    """Return the first column whose lower-cased name is in *candidates*.

    *candidates* is searched in priority order; returns None when no column
    matches.
    """
    by_lower = {str(col).lower(): col for col in columns}
    return next((by_lower[name] for name in candidates if name in by_lower), None)


if USE_DEMO:
    # Demo data for Jordan (approximate locations)
    df_demo = pd.DataFrame({
        'FacilityName': ['Amman_Hospital', 'Zarqa_Hospital', 'Irbid_Hospital', 'Aqaba_Hospital', 'Mafraq_Hospital'],
        'Category': ['NCD', 'NCD', 'INF', 'NCD', 'INF'],
        'lat': [31.95, 32.07, 32.55, 29.53, 32.34],
        'lon': [35.93, 36.09, 35.85, 35.01, 36.21]
    })
    gdf = gpd.GeoDataFrame(
        df_demo,
        geometry=gpd.points_from_xy(df_demo['lon'], df_demo['lat']),
        crs='EPSG:4326'
    )
    print(f"✓ Using demo data: {len(gdf)} facilities")
else:
    print('Upload your facility file (CSV/GeoJSON/Shapefile/GPKG):')
    print('It will look like INF_facility_coordinates.csv:')
    uploaded = files.upload()
    if not uploaded:
        raise ValueError('No file was uploaded.')

    # Prefer a directly readable file; fall back to the first upload otherwise.
    fname = next(
        (name for name in uploaded
         if name.lower().rsplit('.', 1)[-1] in PRIMARY_EXTENSIONS),
        next(iter(uploaded)),
    )
    ext = fname.lower().split('.')[-1]

    if ext == 'csv':
        df = pd.read_csv(fname)

        # Find coordinate columns (case-insensitive)
        lat_col = _first_matching_column(df.columns, ['lat', 'latitude', 'y'])
        lon_col = _first_matching_column(df.columns, ['lon', 'lng', 'longitude', 'x'])
        if lat_col is None or lon_col is None:
            raise ValueError(f"Could not find lat/lon columns. Available: {df.columns.tolist()}")

        # Identifier column: a known name if present, else the first column
        id_col = _first_matching_column(df.columns, ID_CANDIDATES)
        if id_col is None:
            id_col = df.columns[0]

        # Rows without usable coordinates cannot become point geometries
        lat = pd.to_numeric(df[lat_col], errors='coerce')
        lon = pd.to_numeric(df[lon_col], errors='coerce')
        has_coords = lat.notna() & lon.notna()
        if not has_coords.all():
            print(f"⚠ Dropping {int((~has_coords).sum())} row(s) with missing or non-numeric coordinates")

        gdf = gpd.GeoDataFrame(
            df.loc[has_coords].rename(columns={id_col: 'FacilityName'}),
            geometry=gpd.points_from_xy(lon[has_coords], lat[has_coords]),
            crs='EPSG:4326'
        )
    else:
        gdf = gpd.read_file(fname)
        # Ensure FacilityName column exists (same candidates as the CSV branch)
        if 'FacilityName' not in gdf.columns:
            id_col = _first_matching_column(gdf.columns, ID_CANDIDATES)
            if id_col is not None:
                gdf = gdf.rename(columns={id_col: 'FacilityName'})
            else:
                gdf['FacilityName'] = [f'facility_{i}' for i in range(len(gdf))]

    print(f"✓ Loaded {len(gdf)} facilities")

# Ensure WGS84: assume it when no CRS is declared, otherwise reproject
if gdf.crs is None:
    gdf = gdf.set_crs('EPSG:4326')
else:
    gdf = gdf.to_crs('EPSG:4326')

print(f"\nColumns: {gdf.columns.tolist()}")
print(f"\nFirst few rows:")
print(gdf.head())

In [None]:
#@title Convert to Earth Engine FeatureCollection

def gdf_to_ee_fc(gdf_input, buffer_m=0):
    """Build an ee.FeatureCollection from the rows of a GeoDataFrame.

    Rows whose geometry is missing or empty are left out. Point geometries
    become ee.Geometry.Point; any other geometry goes through its
    __geo_interface__ mapping. When buffer_m is positive every geometry is
    buffered by that amount. Each feature carries FacilityName, lat and lon
    (None for non-point geometries) and, when the row has one, Category.
    """
    def to_feature(label, record):
        shape = record.geometry
        is_point = shape.geom_type == 'Point'

        if is_point:
            ee_shape = ee.Geometry.Point([shape.x, shape.y])
        else:
            ee_shape = ee.Geometry(shape.__geo_interface__)

        if buffer_m > 0:
            ee_shape = ee_shape.buffer(buffer_m)

        # Property values are reduced to plain str / float / None
        attributes = {
            'FacilityName': str(record.get('FacilityName', f'facility_{label}')),
            'lat': float(shape.y) if is_point else None,
            'lon': float(shape.x) if is_point else None,
        }
        if 'Category' in record:
            attributes['Category'] = str(record['Category'])

        return ee.Feature(ee_shape, attributes)

    return ee.FeatureCollection([
        to_feature(label, record)
        for label, record in gdf_input.iterrows()
        if record.geometry is not None and not record.geometry.is_empty
    ])

# Buffered facility regions used by every extraction step below
fc_facilities = gdf_to_ee_fc(gdf, buffer_m=BUFFER_METERS)
n_facilities = fc_facilities.size().getInfo()
print(f"✓ Created FeatureCollection with {n_facilities} facilities")
print(f"  Buffer: {BUFFER_METERS}m around each point")

## Define Climate Feature Extraction Functions

In [None]:
#@title CHIRPS Precipitation Features

def extract_chirps_features(fc, start, end, scale):
    """
    Attach CHIRPS daily precipitation statistics to every feature in `fc`.

    Args:
        fc: ee.FeatureCollection whose geometries are the reduction regions.
        start, end: date bounds passed to filterDate().
        scale: value passed as `scale` to reduceRegion().

    Returns:
        `fc` mapped so that each feature additionally carries the region mean
        of these bands:
          P_mean_mm            mean of the daily 'precipitation' band
          P_total_mm           sum of the daily band over the whole period
          wetday_frac          fraction of days with precipitation > 1.0
          heavy_days_per_year  days strictly above the per-pixel 95th
                               percentile, divided by the period length in years
          P95_threshold_mm     the per-pixel 95th percentile itself
    """
    precip = (
        ee.ImageCollection('UCSB-CHG/CHIRPS/DAILY')
        .filterDate(start, end)
        .select('precipitation')
    )

    # Mean and total precipitation over the period
    precip_mean = precip.mean()
    precip_total = precip.sum()

    # Wet day fraction (>1mm): the mean of a 0/1 indicator is a fraction
    wet_day_frac = precip.map(lambda img: img.gt(1.0)).mean()

    # Per-pixel 95th percentile of all daily values in the period
    p95 = precip.reduce(ee.Reducer.percentile([95]))

    # Count days exceeding p95 (expressed as days per year).
    # The comparison is strict on purpose: where the 95th percentile is 0
    # (locations with very few rainy days) a `>=` test would count every dry
    # day as a heavy-precipitation day.
    n_years = ee.Date(end).difference(ee.Date(start), 'year')
    heavy_days = precip.map(lambda img: img.gt(p95)).sum().divide(n_years)

    # Combine into single image
    combined = ee.Image.cat([
        precip_mean.rename('P_mean_mm'),
        precip_total.rename('P_total_mm'),
        wet_day_frac.rename('wetday_frac'),
        heavy_days.rename('heavy_days_per_year'),
        p95.rename('P95_threshold_mm')
    ])

    # Extract to facilities: region mean of every band, stored as properties
    def reduce_feature(feat):
        reduced = combined.reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=feat.geometry(),
            scale=scale,
            maxPixels=1e9
        )
        return feat.set(reduced)

    return fc.map(reduce_feature)

print("✓ CHIRPS extraction function defined")

In [None]:
#@title ERA5-Land Temperature & Meteorological Features

def extract_era5_features(fc, start, end, scale):
    """
    Attach ERA5-Land temperature and meteorological statistics to every
    feature in `fc`.

    Args:
        fc: ee.FeatureCollection whose geometries are the reduction regions.
        start, end: date bounds passed to filterDate().
        scale: value passed as `scale` to reduceRegion().

    Returns:
        `fc` mapped so that each feature additionally carries the region mean
        of these bands:
          T_mean_C, T_min_C, T_max_C  period means of temperature_2m,
                                      temperature_2m_min and temperature_2m_max,
                                      each shifted by -273.15 (K to °C)
          DTR_C                       mean of (daily max - daily min)
          hot_days_per_year           days with daily max above 35 °C, divided
                                      by the period length in years
          dewpoint_C                  period mean of dewpoint_temperature_2m,
                                      shifted by -273.15
          wind_speed_ms               period mean of sqrt(u² + v²) of the
                                      10 m wind components
    """
    era5 = ee.ImageCollection('ECMWF/ERA5_LAND/DAILY_AGGR').filterDate(start, end)

    def period_mean_celsius(band):
        """Period mean of a Kelvin band, shifted to degrees Celsius."""
        return era5.select(band).mean().subtract(273.15)

    # Temperature (convert from K to C)
    temp_mean = period_mean_celsius('temperature_2m')
    temp_min = period_mean_celsius('temperature_2m_min')
    temp_max = period_mean_celsius('temperature_2m_max')

    # Diurnal Temperature Range (a difference, so no Kelvin offset is needed)
    dtr = era5.map(lambda img:
        img.select('temperature_2m_max').subtract(img.select('temperature_2m_min'))
    ).mean()

    # Hot days (>35°C), expressed as days per year
    n_years = ee.Date(end).difference(ee.Date(start), 'year')
    hot_days = era5.select('temperature_2m_max').map(
        lambda img: img.subtract(273.15).gt(35)
    ).sum().divide(n_years)

    # Dewpoint temperature (proxy for humidity)
    dewpoint_mean = period_mean_celsius('dewpoint_temperature_2m')

    # Wind speed from the u and v components of the SAME daily image.
    # Both bands are in every image, so there is no need to re-filter the
    # collection once per image to find the matching v component.
    # NOTE(review): this is the magnitude of the daily aggregated wind vector,
    # which is presumably lower than a mean of sub-daily speeds — confirm this
    # is the intended metric.
    wind_speed = era5.map(lambda img:
        img.select('u_component_of_wind_10m').pow(2)
        .add(img.select('v_component_of_wind_10m').pow(2))
        .sqrt()
    ).mean()

    # Combine
    combined = ee.Image.cat([
        temp_mean.rename('T_mean_C'),
        temp_min.rename('T_min_C'),
        temp_max.rename('T_max_C'),
        dtr.rename('DTR_C'),
        hot_days.rename('hot_days_per_year'),
        dewpoint_mean.rename('dewpoint_C'),
        wind_speed.rename('wind_speed_ms')
    ])

    # Extract to facilities: region mean of every band, stored as properties
    def reduce_feature(feat):
        reduced = combined.reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=feat.geometry(),
            scale=scale,
            maxPixels=1e9
        )
        return feat.set(reduced)

    return fc.map(reduce_feature)

print("✓ ERA5-Land extraction function defined")

In [None]:
#@title TerraClimate Water Balance Features

def extract_terraclimate_features(fc, start, end, scale):
    """
    Attach TerraClimate water-balance statistics to every feature in `fc`.

    Args:
        fc: ee.FeatureCollection whose geometries are the reduction regions.
        start, end: date bounds passed to filterDate().
        scale: value passed as `scale` to reduceRegion().

    Returns:
        `fc` mapped so that each feature additionally carries the region mean
        of the period-mean monthly bands:
          PET_mm, AET_mm, deficit_mm, runoff_mm, soil_moisture_mm, VPD_kPa

    The Earth Engine asset stores several bands as scaled integers. The
    factors applied below follow the band table of the Earth Engine Data
    Catalog entry for IDAHO_EPSCOR/TERRACLIMATE (pet/aet/def/soil: 0.1,
    vpd: 0.01, ro: unscaled) — re-check them against the catalog if the asset
    is ever swapped for another version.
    """
    # TerraClimate is monthly, so every statistic below is a mean of monthly values
    terra = ee.ImageCollection('IDAHO_EPSCOR/TERRACLIMATE').filterDate(start, end)

    def period_mean(band, scale_factor=1.0):
        """Period mean of one band, converted to physical units."""
        mean_img = terra.select(band).mean()
        return mean_img if scale_factor == 1.0 else mean_img.multiply(scale_factor)

    # Potential Evapotranspiration (mm)
    pet_mean = period_mean('pet', 0.1)

    # Actual Evapotranspiration (mm)
    aet_mean = period_mean('aet', 0.1)

    # Climate Water Deficit (PET - AET), mm
    deficit_mean = period_mean('def', 0.1)

    # Runoff (mm) — stored unscaled
    runoff_mean = period_mean('ro')

    # Soil moisture (mm)
    soil_mean = period_mean('soil', 0.1)

    # Vapor Pressure Deficit (kPa): stored value x 0.01, not / 10
    vpd_mean = period_mean('vpd', 0.01)

    # Combine
    combined = ee.Image.cat([
        pet_mean.rename('PET_mm'),
        aet_mean.rename('AET_mm'),
        deficit_mean.rename('deficit_mm'),
        runoff_mean.rename('runoff_mm'),
        soil_mean.rename('soil_moisture_mm'),
        vpd_mean.rename('VPD_kPa')
    ])

    # Extract to facilities: region mean of every band, stored as properties
    def reduce_feature(feat):
        reduced = combined.reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=feat.geometry(),
            scale=scale,
            maxPixels=1e9
        )
        return feat.set(reduced)

    return fc.map(reduce_feature)

print("✓ TerraClimate extraction function defined")

In [None]:
#@title Elevation from SRTM

def extract_elevation(fc, scale):
    """
    Add an 'elevation_m' property to every feature in `fc`: the mean of the
    SRTM 'elevation' band over the feature's geometry, reduced at `scale`.
    """
    dem = ee.Image('USGS/SRTMGL1_003').select('elevation')

    def attach_elevation(facility):
        # Region mean of the single elevation band for this facility
        stats = dem.reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=facility.geometry(),
            scale=scale,
            maxPixels=1e9,
        )
        return facility.set({'elevation_m': stats.get('elevation')})

    return fc.map(attach_elevation)

print("✓ Elevation extraction function defined")

## Extract Features

In [None]:
#@title Extract all climate features (this may take several minutes)

print("Starting feature extraction...\n")

# One entry per step: (label used in status messages, text for the progress
# line, callable that takes the current FeatureCollection and returns it with
# the step's properties attached).
extraction_steps = [
    ('CHIRPS', 'CHIRPS precipitation features',
     lambda fc: extract_chirps_features(fc, START_DATE, END_DATE, SCALE_METERS)),
    ('ERA5-Land', 'ERA5-Land temperature & meteorological features',
     lambda fc: extract_era5_features(fc, START_DATE, END_DATE, SCALE_METERS)),
    ('TerraClimate', 'TerraClimate water balance features',
     lambda fc: extract_terraclimate_features(fc, START_DATE, END_DATE, SCALE_METERS)),
    ('Elevation', 'elevation',
     lambda fc: extract_elevation(fc, SCALE_METERS)),
]

# Start with facilities; every successful step layers more properties on top
fc_result = fc_facilities
failed_steps = []

for step_no, (label, description, extract) in enumerate(extraction_steps, 1):
    print(f"[{step_no}/{len(extraction_steps)}] Extracting {description}...")
    try:
        fc_result = extract(fc_result)
    except Exception as e:
        # Best effort: a failed step is skipped and the previous result is
        # kept, so the remaining datasets are still extracted.
        failed_steps.append(label)
        print(f"      ✗ {label} failed: {e}")
    else:
        print(f"      ✓ {label} complete")

# NOTE(review): these calls presumably only build a deferred Earth Engine
# request; server-side problems may not surface until the result is fetched
# with getInfo() — confirm before relying on this summary alone.
if failed_steps:
    print(f"\n⚠ Extraction finished with failed steps: {', '.join(failed_steps)}")
else:
    print("\n✓ All extractions complete!")

## Export Results

In [None]:
#@title Convert to DataFrame and export

print("Converting to DataFrame...")

# Fetch the evaluated collection; each feature's property dict becomes one row
fc_info = fc_result.getInfo()
rows = [feature['properties'] for feature in fc_info['features']]
df_result = pd.DataFrame(rows)

# Stamp every row with the settings and date of this extraction run
run_metadata = {
    'start_date': START_DATE,
    'end_date': END_DATE,
    'buffer_m': BUFFER_METERS,
    'scale_m': SCALE_METERS,
    'extraction_date': datetime.now().strftime('%Y-%m-%d'),
}
for column, value in run_metadata.items():
    df_result[column] = value

n_rows, n_cols = len(df_result), len(df_result.columns)
print(f"\n✓ Created DataFrame with {n_rows} facilities and {n_cols} columns")
print(f"\nColumns: {df_result.columns.tolist()}")
print("\nSummary statistics:")
print(df_result.describe())

# Preview of the extracted table
print("\nFirst few rows:")
print(df_result.head())

In [None]:
#@title Export to CSV

# Date stamp (YYYYMMDD) embedded in the export filename
timestamp = datetime.now().strftime('%Y%m%d')
csv_filename = "{}_{}.csv".format(OUTPUT_PREFIX, timestamp)

# Write the table without the index column, then hand it to the browser
df_result.to_csv(csv_filename, index=False)
print("✓ Saved to {}".format(csv_filename))

print("\nDownload the file:")
files.download(csv_filename)

In [None]:
#@title Optional: Split by Category (INF/NCD)

if 'Category' not in df_result.columns:
    print("No 'Category' column found - skipping split export")
else:
    # One CSV per distinct category value, in order of first appearance
    for category in df_result['Category'].unique():
        in_category = df_result['Category'] == category
        df_cat = df_result[in_category]

        cat_filename = f"{OUTPUT_PREFIX}_{category.lower()}_{timestamp}.csv"
        df_cat.to_csv(cat_filename, index=False)

        print(f"✓ Saved {category}: {cat_filename} ({len(df_cat)} facilities)")
        files.download(cat_filename)

## Feature Analysis & Visualization

In [None]:
#@title Visualize climate feature distributions

import matplotlib.pyplot as plt
import seaborn as sns

# Candidate features for the histogram grid; only those actually present in
# the extracted table are plotted
viz_features = [
    'P_mean_mm', 'T_mean_C', 'DTR_C', 'PET_mm',
    'VPD_kPa', 'elevation_m', 'wetday_frac', 'hot_days_per_year'
]
viz_features = [name for name in viz_features if name in df_result.columns]

if len(viz_features) < 4:
    print("Not enough features available for visualization")
else:
    fig, axes = plt.subplots(2, 4, figsize=(16, 8))
    axes = axes.flatten()

    # One histogram per feature, filling the 2x4 grid in order
    for ax, feat in zip(axes, viz_features):
        df_result[feat].hist(ax=ax, bins=20, edgecolor='black')
        ax.set_title(feat, fontsize=10)
        ax.set_xlabel('')

    # Panels beyond the last feature stay blank
    for ax in axes[len(viz_features):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.savefig('climate_feature_distributions.png', dpi=150, bbox_inches='tight')
    plt.show()
    files.download('climate_feature_distributions.png')

    print("✓ Feature distributions saved as 'climate_feature_distributions.png'")

In [None]:
#@title Correlation matrix of climate features

# Numeric columns minus the location/run-metadata ones are treated as climate features
metadata_cols = ['lat', 'lon', 'buffer_m', 'scale_m']
numeric_cols = df_result.select_dtypes(include=[np.number]).columns
climate_cols = [col for col in numeric_cols if col not in metadata_cols]

if len(climate_cols) <= 3:
    print("Not enough features for correlation analysis")
else:
    # Pairwise correlations between the climate features
    corr_matrix = df_result[climate_cols].corr()

    # Annotated heatmap centred on zero
    heatmap_style = dict(
        annot=True,
        fmt='.2f',
        cmap='RdBu_r',
        center=0,
        square=True,
        linewidths=0.5,
        cbar_kws={"shrink": 0.8},
    )
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, **heatmap_style)
    plt.title('Climate Feature Correlation Matrix', fontsize=14, pad=20)
    plt.tight_layout()
    plt.savefig('climate_feature_correlations.png', dpi=150, bbox_inches='tight')
    plt.show()

    print("✓ Correlation matrix saved as 'climate_feature_correlations.png'")
    files.download('climate_feature_correlations.png')

## K-Means Clustering Preview

In [None]:
#@title Preview k-means clustering (k=8)

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Select features for clustering. 'climate_k' is excluded so that re-running
# the notebook after an earlier clustering pass never feeds the old cluster
# labels back in as an input feature.
cluster_features = [
    c for c in climate_cols
    if c in df_result.columns and c != 'climate_k'
]

if len(cluster_features) >= 3 and len(df_result) >= 8:
    # Prepare data as a float matrix (rows = facilities, columns = features)
    X = df_result[cluster_features].to_numpy(dtype=float)

    # Impute missing values with the per-feature median. np.where broadcasts
    # the median row across all facilities.
    col_medians = np.nanmedian(X, axis=0)
    X = np.where(np.isnan(X), col_medians, X)
    # A feature with no valid value at all has a NaN median; make it a
    # constant (0) instead of letting NaN reach KMeans. This call also maps
    # +/-inf to large finite numbers.
    X = np.nan_to_num(X)

    # Standardize so that no feature dominates through its units
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # K-means clustering. The guard above guarantees at least 8 facilities,
    # so this currently always resolves to k=8.
    k = min(8, len(df_result))
    kmeans = KMeans(n_clusters=k, n_init=20, random_state=42)
    df_result['climate_k'] = kmeans.fit_predict(X_scaled)

    print(f"✓ K-means clustering complete (k={k})")
    print(f"\nCluster sizes:")
    print(df_result['climate_k'].value_counts().sort_index())

    # Visualize clusters in 2D (PCA)
    from sklearn.decomposition import PCA

    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)

    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(
        X_pca[:, 0],
        X_pca[:, 1],
        c=df_result['climate_k'],
        cmap='tab10',
        s=100,
        alpha=0.7,
        edgecolors='black',
        linewidth=0.5
    )
    plt.colorbar(scatter, label='Climate Cluster')
    plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
    plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
    plt.title(f'Facility Climate Clusters (k={k})', fontsize=14)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('climate_clusters_pca.png', dpi=150, bbox_inches='tight')
    plt.show()

    print("\n✓ Cluster visualization saved as 'climate_clusters_pca.png'")
    files.download('climate_clusters_pca.png')


    # Export with cluster labels
    csv_with_clusters = f"{OUTPUT_PREFIX}_with_clusters_{timestamp}.csv"
    df_result.to_csv(csv_with_clusters, index=False)
    print(f"\n✓ Exported with cluster labels: {csv_with_clusters}")
    files.download(csv_with_clusters)

else:
    print("Insufficient data for clustering preview")

## Summary Report

In [None]:
#@title Generate summary report

banner = "=" * 60
n_total = len(df_result)

# Header and run settings
print(banner)
print("CLIMATE FEATURE EXTRACTION SUMMARY")
print(banner)
print(f"\nStudy Period: {START_DATE} to {END_DATE}")
print(f"Number of facilities: {n_total}")
print(f"Buffer radius: {BUFFER_METERS}m")
print(f"Spatial scale: {SCALE_METERS}m")

# Facility counts per category, most frequent first
if 'Category' in df_result.columns:
    print("\nFacilities by category:")
    category_counts = df_result['Category'].value_counts()
    for cat, count in category_counts.items():
        print(f"  {cat}: {count}")

# Numbered list of the climate feature columns
print(f"\nClimate features extracted ({len(climate_cols)}):")
for i, col in enumerate(climate_cols, 1):
    print(f"  {i:2d}. {col}")

# Per-feature count of missing values; only incomplete features are listed
print("\nMissing data:")
missing = df_result[climate_cols].isna().sum()
incomplete = missing[missing > 0]
if incomplete.empty:
    print("  None - all features extracted successfully!")
else:
    for col, count in incomplete.items():
        print(f"  {col}: {count} missing ({count / n_total * 100:.1f}%)")

# Cluster sizes, shown only when the clustering preview has been run
if 'climate_k' in df_result.columns:
    cluster_counts = df_result['climate_k'].value_counts().sort_index()
    print(f"\nClimate regime distribution (k={df_result['climate_k'].nunique()}):")
    for cluster, count in cluster_counts.items():
        print(f"  Cluster {cluster}: {count} facilities ({count / n_total * 100:.1f}%)")

print("\n" + banner)
print("✓ Extraction complete! Files ready for HSA delineation.")
print(banner)