### **Setup and Configuration for CMIP6 Data Download**
This cell sets up the required configurations for downloading CMIP6 climate model data from the NASA AWS S3 bucket.  

- **Lists available CMIP6 datasets** from the public **nex-gddp-cmip6** S3 bucket.  
- **Defines supported climate models** along with each model's variant label and grid label (e.g., `r1i1p1f1_gr`).  
- **Specifies emission scenarios and time periods**, including historical (`1950–2014`) and future projections (`2015–2100`) for SSP pathways (`ssp126`, `ssp245`, etc.).  
- **Lists climate variables** to be downloaded, such as precipitation (`pr`), maximum/minimum temperature (`tasmax`, `tasmin`), and radiation variables (`rsds`, `rlds`).  
- **Includes multiple data versions** (e.g., `_v1.2`, `_v1.1`) to ensure compatibility with different CMIP6 dataset releases.  


In [None]:
import os
import subprocess
import time  # For adding delays
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
from typing import List, Tuple

# IPython shell escape (notebook-only syntax, not plain Python): list the top
# level of the nex-gddp-cmip6 bucket without credentials (--no-sign-request).
!aws s3 ls --no-sign-request s3://nex-gddp-cmip6/


# Models with their unique patterns
# Key: model folder name in the bucket. Value: "<variant label>_<grid label>".
# The download loop keeps only the part before the underscore as the S3
# realization folder; the grid label is not read from this mapping there.
models = {
    # Values ending in "_gr"
    "EC-Earth3": "r1i1p1f1_gr",
    "EC-Earth3-Veg-LR": "r1i1p1f1_gr",
    "CNRM-CM6-1": "r1i1p1f2_gr",
    "CNRM-ESM2-1": "r1i1p1f2_gr",
    "IPSL-CM6A-LR": "r1i1p1f1_gr",
    "KACE-1-0-G": "r1i1p1f1_gr",

    # Values ending in "_gn" (note the variant label differs between models)
    "ACCESS-CM2": "r1i1p1f1_gn",
    "ACCESS-ESM1-5": "r1i1p1f1_gn",
    "BCC-CSM2-MR": "r1i1p1f1_gn",
    "CanESM5": "r1i1p1f1_gn",
    "CESM2": "r4i1p1f1_gn",
    "CESM2-WACCM": "r3i1p1f1_gn",
    "CMCC-CM2-SR5": "r1i1p1f1_gn",
    "CMCC-ESM2": "r1i1p1f1_gn",
    "MIROC-ES2L": "r1i1p1f2_gn",
    "MIROC6": "r1i1p1f1_gn",
    "MPI-ESM1-2-HR": "r1i1p1f1_gn",
    "MPI-ESM1-2-LR": "r1i1p1f1_gn",
    "MRI-ESM2-0": "r1i1p1f1_gn",
    "NESM3": "r1i1p1f1_gn",
    "NorESM2-LM": "r1i1p1f1_gn",
    "NorESM2-MM": "r1i1p1f1_gn",
    "TaiESM1": "r1i1p1f1_gn",
    "UKESM1-0-LL": "r1i1p1f2_gn",
    "FGOALS-g3": "r3i1p1f1_gn",
    "GISS-E2-1-G": "r1i1p1f2_gn",
    "HadGEM3-GC31-LL": "r1i1p1f3_gn",
    "HadGEM3-GC31-MM": "r1i1p1f3_gn",
    "IITM-ESM": "r1i1p1f1_gn",

    # Values ending in "_gr1"
    "GFDL-CM4": "r1i1p1f1_gr1",
    "GFDL-ESM4": "r1i1p1f1_gr1",
    "INM-CM4-8": "r1i1p1f1_gr1",
    "INM-CM5-0": "r1i1p1f1_gr1",
    "KIOST-ESM": "r1i1p1f1_gr1",

    # Value ending in "_gr2".
    # NOTE(review): this key is used both as the S3 folder name and inside the
    # expected file name; confirm that the bucket's file names for this entry
    # really contain "GFDL-CM4_gr2" rather than "GFDL-CM4".
    "GFDL-CM4_gr2": "r1i1p1f1_gr2"
}

# Scenario name -> years to request (the range end is exclusive).
# "historical" stops at 2014; every SSP pathway spans 2015 through 2100.
scenarios = {
    "historical": range(1950, 2015),
    **{ssp: range(2015, 2101) for ssp in ("ssp126", "ssp245", "ssp370", "ssp585")},
}

# Variable short name -> file-name suffixes that may follow the year,
# ordered from the highest version tag down to no suffix at all.
# Each variable gets its own list object.
variables = {
    name: ["_v1.2", "_v1.1", ""]
    for name in (
        "pr",
        "tasmax",
        "tasmin",
        "hurs",
        "sfcWind",
        "rsds",
        "rlds",
        "huss",
        "tas",
    )
}

### **Parallel Downloading of CMIP6 Data from AWS S3**
This section automates the bulk download of **CMIP6 climate model data** from NASA’s **AWS S3 storage** using **multi-threading** for efficiency.

- **Defines the base S3 bucket and output directory** to store downloaded NetCDF files.
- **Lists available files in the S3 directory** before attempting downloads.
- **Implements parallel downloading using Python’s `ThreadPoolExecutor`** to speed up the process.
- **Ensures the latest available file versions (`_v1.2`, `_v1.1`, etc.) are prioritized** for each variable.
- **Creates and maintains the folder structure** based on **model, scenario, and variable**.
- **Skips existing files** to avoid redundant downloads and reduce bandwidth usage.
- **Handles errors gracefully**, ensuring the script continues even if individual downloads fail.

At the end of execution, all requested **CMIP6 historical and future projection data** is downloaded and stored in the specified directory.


In [None]:
# Base AWS S3 location: bucket name followed by the key prefix under which
# the per-model folders are addressed by the download code.
s3_bucket = "nex-gddp-cmip6/NEX-GDDP-CMIP6"

# Base output directory for the downloaded files.
# Raw string: the Windows backslashes must be taken literally. Without the
# r-prefix, sequences such as "\H" and "\w" are invalid escapes, which Python
# warns about and intends to reject in a future version.
base_output_dir = r"D:\Hesham\WhiteNile\CMIP6-BiasCorrection-SWAT\workingfolder\CMIP6_GDDP-NEX"


# Parallel download helpers
# Function to list files in the S3 directory
def list_s3_files(bucket, prefix):
    """List the entry names directly under an S3 prefix via the AWS CLI.

    Runs ``aws s3 ls --no-sign-request s3://{bucket}/{prefix}/`` in a shell
    and returns the last whitespace-separated token of every non-blank
    output line.

    Args:
        bucket: Bucket name, optionally followed by a key prefix.
        prefix: Key prefix below ``bucket``, given without a trailing slash.

    Returns:
        A list of names. The list is empty when the command exits with a
        non-zero status or cannot be run; the error is printed, not raised.
    """
    command = f"aws s3 ls --no-sign-request s3://{bucket}/{prefix}/"
    try:
        result = subprocess.run(command, shell=True, capture_output=True, text=True)
    except Exception as e:
        # Best effort: a listing failure must not stop the whole run.
        print(f"Error listing files: {e}")
        return []

    if result.returncode != 0:
        print(f"Error listing S3 files: {result.stderr}")
        return []

    names = []
    for line in result.stdout.splitlines():
        fields = line.split()
        # A blank line has no fields; indexing it would raise IndexError and
        # previously caused the entire listing to be thrown away.
        if fields:
            names.append(fields[-1])
    return names

# Function to download a file from S3
def download_s3_file(bucket, s3_path, local_path):
    """Copy a single S3 object to ``local_path`` with the AWS CLI.

    Runs ``aws s3 cp --no-sign-request`` in a shell, with the destination
    wrapped in double quotes. The outcome is reported with ``print``; nothing
    is raised and nothing is returned.

    Args:
        bucket: Bucket name, optionally followed by a key prefix.
        s3_path: Object key below ``bucket``.
        local_path: Destination path on the local file system.
    """
    try:
        cli_call = f'aws s3 cp --no-sign-request s3://{bucket}/{s3_path} "{local_path}"'
        outcome = subprocess.run(cli_call, shell=True, capture_output=True, text=True)
        if outcome.returncode != 0:
            print(f"Error downloading {s3_path}: {outcome.stderr}")
            return
        print(f"Successfully downloaded: {local_path}")
    except Exception as err:
        print(f"Error downloading file: {err}")

# Function to handle downloading files for provided parameters (used by the thread pool)
def handle_download(s3_bucket, model, scenario, realization, variable, year, available_files, base_output_dir):
    """Fetch the file for one model/scenario/variable/year, if it is listed.

    Candidate file names are built as
    ``{variable}_day_{model}_{scenario}_{realization}_{grid}_{year}{suffix}.nc``
    and checked against ``available_files``. Suffixes are tried from
    ``_v1.2`` down to no suffix, and for each suffix every grid label is
    tried, so the first hit is the highest listed version.

    The local copy is stored under
    ``base_output_dir/model/scenario/variable``. An existing local file is
    left untouched.

    Args:
        s3_bucket: Bucket (plus key prefix) passed on to ``download_s3_file``.
        model: Model folder name; also used for the local folder.
        scenario: Scenario folder name.
        realization: Realization folder name (variant label).
        variable: Variable short name.
        year: Year contained in the file name.
        available_files: Container of file names present in the S3 folder;
            only membership tests are performed on it.
        base_output_dir: Local root directory for downloads.

    Returns:
        True if a matching file name was listed (regardless of whether the
        download itself succeeded or the file already existed), else False.
    """
    suffixes = ("_v1.2", "_v1.1", "")  # Latest versions first
    grids = ("gr", "gr1", "gr2", "gn")

    # Model ids to try inside the file name. When the folder name carries a
    # trailing "_<grid>" token (e.g. "GFDL-CM4_gr2"), also try it without it.
    # NOTE(review): the bucket appears to name the files in such a folder
    # with the bare model id ("GFDL-CM4") - confirm against a real listing.
    model_ids = [model]
    for grid in grids:
        tail = f"_{grid}"
        if model.endswith(tail):
            model_ids.append(model[: -len(tail)])
            break

    for suffix in suffixes:
        for grid in grids:
            for model_id in model_ids:
                file_name = f"{variable}_day_{model_id}_{scenario}_{realization}_{grid}_{year}{suffix}.nc"
                if file_name not in available_files:
                    continue

                local_dir = os.path.join(base_output_dir, model, scenario, variable)
                local_file = os.path.join(local_dir, file_name)
                os.makedirs(local_dir, exist_ok=True)

                if not os.path.exists(local_file):
                    print(f"Downloading {file_name} from S3...")
                    # The S3 key always uses the folder name, not model_id.
                    download_s3_file(s3_bucket, f"{model}/{scenario}/{realization}/{variable}/{file_name}", local_file)
                else:
                    print(f"File already exists, skipping: {local_file}")
                return True

    print(f"No available files for {variable}, {model}, {scenario}, {year}. Skipping.")
    return False

# Configure the number of concurrent threads
max_workers = 5

# Main script logic: list every model/scenario/variable folder once, then
# queue one download task per year on the thread pool.
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = []

    for model, pattern in models.items():
        # Only the part before the underscore names the S3 realization
        # folder; it does not depend on the scenario, so compute it once.
        realization = pattern.split("_")[0]
        for scenario, year_range in scenarios.items():
            for variable in variables:
                s3_prefix = f"{model}/{scenario}/{realization}/{variable}"
                # frozenset: every per-year task runs several membership
                # tests against this listing.
                available_files = frozenset(list_s3_files(s3_bucket, s3_prefix))
                if not available_files:
                    # Nothing can match, so do not queue one task per year.
                    print(f"No files listed under {s3_prefix}. Skipping.")
                    continue

                for year in year_range:
                    future = executor.submit(handle_download, s3_bucket, model, scenario, realization, variable, year, available_files, base_output_dir)
                    futures.append(future)

    failed_tasks = 0
    for future in as_completed(futures):
        try:
            future.result()  # Surfaces any exception raised inside the task
        except Exception as e:
            # One failing task must not abort the remaining bookkeeping.
            failed_tasks += 1
            print(f"Download task failed: {e}")

if failed_tasks:
    print(f"{failed_tasks} download task(s) raised an error.")
print("All downloads completed.")

### **Clipping CMIP6 NetCDF Files to the Nile Basin Shapefile**
This section processes **downloaded CMIP6 NetCDF files** by clipping them to the exact boundary of the **Nile Basin shapefile**.

- **Defines input and output directories** for raw and clipped NetCDF files.
- **Loads the Nile Basin shapefile** to extract the required spatial extent.
- **Loops through all downloaded NetCDF files**, ensuring only `.nc` files are processed.
- **Clips each NetCDF file** to match the spatial boundaries of the shapefile.
- **Maintains the original folder structure** in the output directory.
- **Saves the clipped NetCDF files** with `_clipped.nc` appended to the filename.
- **Handles errors gracefully**, ensuring failed files do not interrupt the workflow.

At the end of execution, all CMIP6 climate data is **spatially cropped** to the Nile Basin boundary and stored in the specified output directory.


In [None]:
import os
import xarray as xr
import rioxarray  # Required for spatial operations
import geopandas as gpd

# Define paths
# Root folder that is walked for raw ".nc" files.
input_base_dir = r"D:/Hesham/WhiteNile/CMIP6-BiasCorrection-SWAT/workingfolder/CMIP6_GDDP-NEX"
# Root folder that receives the clipped copies, mirroring the input layout.
output_base_dir = r"D:/Hesham/WhiteNile/CMIP6-BiasCorrection-SWAT/workingfolder/clipped_data"
# Shapefile whose geometries define the clipping boundary.
shapefile_path = r"D:/Hesham/WhiteNile/CMIP6-BiasCorrection-SWAT/workingfolder/NileBasin/NileBasin.shp"

# Load the shapefile
# Read once at module level; the clipping function uses this global.
gdf = gpd.read_file(shapefile_path)

# Function to clip NetCDF file
def clip_nc_to_shapefile(nc_path, output_path):
    """Clip one NetCDF file to the shapefile geometries and save the result.

    The dataset is tagged with EPSG:4326, clipped with the geometries of the
    module-level ``gdf`` (passed together with ``gdf.crs``), and written to
    ``output_path``. Any error is printed and swallowed so that a batch run
    can continue with the next file.

    Args:
        nc_path: Path of the NetCDF file to read.
        output_path: Path of the clipped NetCDF file to write.
    """
    try:
        # The context manager closes the source file even when clipping or
        # writing fails; otherwise one file handle leaks per processed file.
        with xr.open_dataset(nc_path) as source:
            ds = source.rio.write_crs("EPSG:4326")  # Ensure correct projection
            clipped_ds = ds.rio.clip(gdf.geometry.apply(lambda x: x.__geo_interface__), gdf.crs)
            # Write while the source is still open: the data may be loaded lazily.
            clipped_ds.to_netcdf(output_path)
        print(f"Clipped and saved: {output_path}")
    except Exception as e:
        print(f"Error processing {nc_path}: {e}")

# Loop through downloaded files and clip them
for root, _, files in os.walk(input_base_dir):
    # Ensure we're working with NetCDF files
    nc_files = [name for name in files if name.endswith(".nc")]
    if not nc_files:
        continue

    # Create a similar folder structure in the output directory
    # (once per folder instead of once per file).
    relative_path = os.path.relpath(root, input_base_dir)
    output_dir = os.path.join(output_base_dir, relative_path)
    os.makedirs(output_dir, exist_ok=True)

    for file in nc_files:
        input_nc_path = os.path.join(root, file)

        # Swap only the trailing extension; str.replace would also rewrite
        # any other ".nc" occurring earlier in the file name.
        output_name = file[: -len(".nc")] + "_clipped.nc"
        output_nc_path = os.path.join(output_dir, output_name)

        # Process the file
        clip_nc_to_shapefile(input_nc_path, output_nc_path)

print("All NetCDF files have been clipped and saved.")