# Cyprus Terrace Detection: Feature Extraction Workflow

This notebook processes satellite imagery and DEM data to extract features for terrace detection across Cyprus. The workflow includes:
- Training data processing and feature extraction
- Data cleaning and NaN handling for water-dominated areas
- Merging the processed training polygons into a single dataset
- Parallel processing of polygon grids across the entire island

**Data Sources:**
- Sentinel-2 imagery
- Google Earth imagery  
- ALOS DEM
- Land cover classification

## 1. Import Libraries and Dependencies

Import required libraries for geospatial processing, parallel computing, and feature extraction.

In [None]:
# Core geospatial and data processing libraries
import os
import glob
import time
import multiprocessing
import numpy as np
import pandas as pd
import geopandas as gpd

# Raster processing libraries
import rasterio
from rasterio.warp import calculate_default_transform, reproject, Resampling
from rasterio.mask import mask
import geowombat as gw
import richdem as rd
from pysheds.grid import Grid
from rasterstats import zonal_stats

# Parallel processing and utilities
from joblib import Parallel, delayed
from tqdm import tqdm
from shapely.geometry import Polygon, mapping

# Custom modules for terrace detection
from workflow_functions import process_workflow
import fast_glcm
from edges_cyprus import edge_detection

## 2. Data Paths Configuration

Define paths to input datasets and configure data sources for the feature extraction workflow.

In [None]:
# Root folder that every input dataset below is resolved against
DATA_DIR = "E:/Cyprus_paper_data"

# Base rasters: land cover classification and the ALOS elevation model
landcover_raster = f"{DATA_DIR}/mosaic_Cyprus36n.tif"
dem_raster = f"{DATA_DIR}/ALOS_newCyprus.tif"

# Sentinel-2 bands, keyed by band name
sentinel_imagery = dict(
    blue=f"{DATA_DIR}/Sentinel2/blueSen.tif",
    red=f"{DATA_DIR}/Sentinel2/redSen.tif",
    green=f"{DATA_DIR}/Sentinel2/greenSen.tif",
    gray=f"{DATA_DIR}/Sentinel2/graySen.tif",
)

# Google Earth bands, keyed by band name
google_imagery = dict(
    red=f"{DATA_DIR}/red.tif",
    gray=f"{DATA_DIR}/grayscale.tif",
)

# Training/validation polygon grids (one GeoPackage per grid)
training_polygon_paths = glob.glob(f"{DATA_DIR}/training_new_gpkg/*.gpkg")

print(f"Found {len(training_polygon_paths)} training polygon files")

## 3. Training Data Processing

Extract features from training/validation polygon grids using parallel processing.

In [None]:
def process_single_polygon(polygon_file):
    """
    Run the feature extraction workflow on one polygon grid file.

    Serves as the per-file task for the joblib runs in this notebook. The
    imagery and raster inputs are taken from the module-level configuration;
    whatever ``process_workflow`` returns is discarded.

    Args:
        polygon_file (str): Path to polygon GPKG file
    """
    imagery_sources = (google_imagery, sentinel_imagery)
    base_rasters = (landcover_raster, dem_raster)
    process_workflow(polygon_file, *imagery_sources, *base_rasters)

# Use as many workers as multiprocessing reports CPUs
num_cores = multiprocessing.cpu_count()
print(f"Using {num_cores} CPU cores for parallel processing")

# Build one lazy task per training polygon file, then run them in parallel
print("Processing training polygon files...")
training_tasks = (
    delayed(process_single_polygon)(path) for path in training_polygon_paths
)
Parallel(n_jobs=num_cores)(training_tasks)
print("Training data processing completed!")

## 4. Data Cleaning: Handle Missing Values

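Clean the processed data by replacing NaN values in the topographic features with 0 for water-dominated polygons (majority land cover class 80), where these statistics are undefined. The cleaned files are written back in place.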

In [None]:
def clean_water_dominated_polygons(processed_dir=None, water_class=80):
    """
    Fill NaN values with 0 for topographic features in water-dominated areas.

    Water areas (land cover = 80 by default) naturally have undefined
    topographic statistics. A polygon is treated as affected when its
    ``majority_landcover`` equals ``water_class`` and its ``range_elevation``
    is missing; for those rows every NaN in the topographic columns is
    replaced with 0.

    A file is rewritten in place only when at least one of its rows was
    filled, so running the cleaning again over already-clean data performs
    no writes.

    Prints one line per file that still contains NaN values (the ``terrace``
    column is ignored when counting) followed by the overall total.

    Args:
        processed_dir (str, optional): Directory containing the processed
            GPKG files. Defaults to ``{DATA_DIR}/grid_all_polysSen``.
        water_class (int): Land cover code that marks water. Defaults to 80.
    """
    if processed_dir is None:
        processed_dir = f"{DATA_DIR}/grid_all_polysSen"

    # Sorted so the per-file report comes out in a stable order
    processed_polygon_paths = sorted(glob.glob(f"{processed_dir}/*.gpkg"))

    # Topographic columns that should be 0 in water areas
    topographic_columns = [
        'range_elevation', 'std_elevation', 'percentile_10_slope',
        'percentile_90_slope', 'mean_slope', 'std_slope', 'range_slope',
        'range_profcurv', 'std_profcurv', 'range_plancurv', 'std_plancurv'
    ]

    total_nans_remaining = 0

    print(f"Cleaning {len(processed_polygon_paths)} polygon files...")

    for polygon_file in processed_polygon_paths:
        # Load polygon data
        polygons = gpd.read_file(polygon_file)

        # Identify water-dominated polygons with missing topographic data
        water_polygons_mask = (
            (polygons['majority_landcover'] == water_class)
            & polygons['range_elevation'].isnull()
        )

        # Every selected row has at least range_elevation missing, so a
        # non-empty selection always changes the data; otherwise skip the
        # costly rewrite of an unchanged file.
        if water_polygons_mask.any():
            polygons.loc[water_polygons_mask, topographic_columns] = (
                polygons.loc[water_polygons_mask, topographic_columns].fillna(0)
            )
            polygons.to_file(polygon_file, driver='GPKG')

        # Count remaining NaN values (excluding terrace labels)
        remaining_nans = (
            polygons.drop(columns=['terrace'], errors='ignore')
            .isnull().sum().sum()
        )
        total_nans_remaining += remaining_nans

        if remaining_nans > 0:
            print(f"  {os.path.basename(polygon_file)}: {remaining_nans} NaN values remaining")

    print(f"\nCleaning completed. Total NaN values remaining: {total_nans_remaining}")

# Execute cleaning for training data: fills the water-polygon NaNs and writes
# the GPKG files under {DATA_DIR}/grid_all_polysSen back in place
clean_water_dominated_polygons()

## 5. Merge Training Data

Combine all processed training polygon files into a single dataset for model training.

In [None]:
def merge_training_polygons(processed_dir=None, output_path=None):
    """
    Merge all processed training polygon files into a single GeoPackage.

    Files are read in sorted path order so the row order of the merged
    dataset is reproducible between runs.

    Args:
        processed_dir (str, optional): Directory containing the processed
            GPKG files. Defaults to ``{DATA_DIR}/grid_all_polysSen``.
        output_path (str, optional): Destination GeoPackage. Defaults to
            ``{DATA_DIR}/TrainingCyprus_popsegs.gpkg``.

    Returns:
        str: Path to the merged training dataset

    Raises:
        ValueError: If ``processed_dir`` contains no GPKG files.
    """
    if processed_dir is None:
        processed_dir = f"{DATA_DIR}/grid_all_polysSen"

    # Get all processed polygon files
    processed_files = sorted(glob.glob(f"{processed_dir}/*.gpkg"))

    # pd.concat would fail on an empty list with an unhelpful message;
    # fail early (same exception type) and say which directory was empty.
    if not processed_files:
        raise ValueError(
            f"No processed polygon files found in {processed_dir!r}; nothing to merge"
        )

    if output_path is None:
        output_path = f"{DATA_DIR}/TrainingCyprus_popsegs.gpkg"

    print(f"Merging {len(processed_files)} polygon files...")

    # Load and combine all polygon files
    polygon_dataframes = [
        gpd.read_file(polygon_file) for polygon_file in processed_files
    ]
    merged_training_data = gpd.GeoDataFrame(
        pd.concat(polygon_dataframes, ignore_index=True)
    )

    # Save merged dataset
    merged_training_data.to_file(output_path, driver="GPKG")

    print("Training data merged successfully!")
    print(f"  Total polygons: {len(merged_training_data):,}")
    print(f"  Output file: {output_path}")

    return output_path

# Execute merge and keep the returned path of the merged GeoPackage
training_data_path = merge_training_polygons()

## 6. Full Cyprus Processing

Process the remaining polygon grids to cover the entire island of Cyprus for comprehensive feature extraction.

In [None]:
# Root folder holding the polygon grids for full Cyprus coverage
cyprus_polygon_folder = f"{DATA_DIR}/CyprusMap/polygons_all"

# Walk the folder tree and keep the full path of every GPKG file found
remaining_polygon_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(cyprus_polygon_folder)
    for name in names
    if name.endswith('.gpkg')
]

print(f"Found {len(remaining_polygon_files)} polygon files for full Cyprus processing")
print(f"Sample files: {remaining_polygon_files[:3]}")  # Show first 3 files

In [None]:
# Configure parallel processing for full Cyprus.
# Reduced to at most 8 workers to prevent system overload; also capped at the
# machine's CPU count so smaller machines are not oversubscribed.
NUM_CORES_CYPRUS = min(8, multiprocessing.cpu_count())

print(f"Processing {len(remaining_polygon_files)} polygon files using {NUM_CORES_CYPRUS} cores...")

# Process all remaining polygon files for full Cyprus coverage
Parallel(n_jobs=NUM_CORES_CYPRUS)(
    delayed(process_single_polygon)(polygon_file)
    for polygon_file in remaining_polygon_files
)

print("Full Cyprus processing completed!")

## 7. Final Data Cleaning

Apply the same cleaning procedure to the full Cyprus dataset to handle water-dominated areas.

In [None]:
# Run the water-polygon cleaning once more over the processed data
print("Applying final data cleaning to all processed Cyprus data...")
clean_water_dominated_polygons()

# Closing banner: a blank line, then the summary framed by 60-character rules
rule = "=" * 60
summary_lines = [
    "\n" + rule,
    "WORKFLOW COMPLETED SUCCESSFULLY!",
    rule,
    "Summary:",
    "- Training data processed and merged",
    "- Full Cyprus polygon grids processed",
    "- Water-dominated areas cleaned",
    "- All data ready for terrace detection modeling",
    rule,
]
print("\n".join(summary_lines))