# Return Period Analysis and Bootstrap Uncertainty Intervals

## Overview
This script analyzes observed and model-based annual maximum series (AMS) for streamflow and fits probability distributions to estimate return periods. A bootstrap resampling approach is applied to generate uncertainty intervals for future climate projections.

## Steps Performed:

1. **Setup & Configuration**
   - Defines file paths for observed and modeled AMS data (CSV format).
   - Specifies probability distributions to test, including GEV, Gumbel, Weibull, and Pearson-III.

2. **Fit Distributions to Observed Data**
   - Reads observed AMS data.
   - Fits multiple probability distributions and selects the best one using the Kolmogorov-Smirnov (KS) test.
   - Computes return periods for the best-fit distribution.

3. **Fit Distributions to Model Data**
   - Iterates through climate model projections.
   - Fits distributions to each dataset and selects the best fit.
   - Stores AMS values for ensemble analysis.

4. **Bootstrap Resampling for Uncertainty Estimation**
   - Resamples the pooled model AMS data with replacement (1000 bootstrap iterations).
   - Fits the distribution family selected for the observed data to each resample.
   - Computes uncertainty intervals (5th, 50th, and 95th percentiles) for return periods.

5. **Plotting & Exporting**
   - Plots return period curves for:
     - **Historical observations** (20th century).
     - **Future climate projections** (21st century, SSP scenarios).
     - **Bootstrap uncertainty** (individual bootstrap curves drawn in gray, with the 5th, 50th, and 95th percentile curves overlaid).
   - Saves the plot as an SVG file for high-quality output.

## Instructions for Use:
- Ensure that the observed and modeled AMS datasets are correctly formatted and stored in the specified directories.
- Modify `model_files` to include additional climate models if needed.
- Run the script to generate return period plots with confidence intervals.

### Output:
- A **return period curve plot** comparing historical and future projections.
- An **SVG file** saved at the specified output location.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import (genextreme, expon, genlogistic,
                         genpareto, gamma, pearson3, weibull_min, invgamma, kstest)
from scipy.stats import gumbel_r


# ------------------------------------------------------------------
# 1. SETUP: Paths, distributions
# ------------------------------------------------------------------

base_dir = r"D:/CMIP6-BiasCorrection-SWAT/workingfolder/AMS_Final"
observed_file = f"{base_dir}/observed_ams.csv"

# Climate models available for each emission scenario. Change `scenario` to
# switch projections instead of commenting list entries in and out.
# NOTE(review): the former commented-out SSP2-4.5 entry for MPI-ESM1-2-HR read
# "_ssp25.csv"; this is assumed to be a typo for "_ssp245.csv" - confirm
# against the files on disk.
scenario = "ssp585"
models_by_scenario = {
    "ssp245": ["ACCESS-ESM1-5", "CNRM-ESM2-1", "MPI-ESM1-2-HR"],
    "ssp585": ["ACCESS-ESM1-5", "GISS-E2-1-G", "CNRM-ESM2-1"],
}
model_files = [
    f"{base_dir}/{model}_{scenario}.csv"
    for model in models_by_scenario[scenario]
]

# Candidate distribution families, keyed by the short label used in the
# printed fit reports. Values are scipy.stats continuous distributions.
distributions = {
    "GEV": genextreme,
    "EXP": expon,
    "GLO": genlogistic,
    "GPA": genpareto,
    "GAM": gamma,
    "PE3": pearson3,
    "WEI": weibull_min,
    "GUM": gumbel_r,
    "PTV": invgamma,
}

# Number of bootstrap resamples.
n_boot = 1000

# Return periods from 5 to 200 years and the matching non-exceedance
# probabilities (F = 1 - 1/T) at which each fitted PPF is evaluated.
return_periods = np.linspace(5, 200, 100)
cdf_vals = 1 - 1.0 / return_periods


# ------------------------------------------------------------------
# 2. HELPER FUNCTIONS
# ------------------------------------------------------------------

def fit_all_distributions(data, dists):
    """
    Fit each distribution in 'dists' to 'data' and compute
    the KS statistic. Return a dict:
        { dist_name: (params, ks_stat), ... }

    'dists' maps a label to an object exposing scipy's 'fit' and 'cdf'
    methods. 'params' is whatever 'fit' returns (for scipy distributions:
    shape parameters followed by loc and scale) and is passed positionally
    to 'cdf' for the KS test.

    A distribution whose fit or KS test raises an ordinary exception is
    recorded as (None, np.inf) so the remaining candidates are still
    evaluated. KeyboardInterrupt and SystemExit are not swallowed.
    """
    fits = {}
    for name, dist in dists.items():
        try:
            params = dist.fit(data)
            ks_stat, _ = kstest(data, dist.cdf, args=params)
        except Exception:
            # Best effort: mark this family as failed and keep going.
            fits[name] = (None, np.inf)
        else:
            fits[name] = (params, ks_stat)
    return fits

def pick_best_distribution(fits_dict):
    """
    Select the entry of {dist_name: (params, ks_stat)} with the
    smallest KS statistic.

    Returns (dist_name, params, ks_stat). Entries whose statistic is
    not strictly below infinity (failed fits, NaN) are never chosen;
    if nothing qualifies the result is (None, None, np.inf). On ties
    the earliest entry in the dict wins.
    """
    candidates = [
        (dist_name, params, ks_stat)
        for dist_name, (params, ks_stat) in fits_dict.items()
        if ks_stat < np.inf
    ]
    if not candidates:
        return None, None, np.inf
    # min() returns the first minimal element, preserving first-wins ties.
    return min(candidates, key=lambda entry: entry[2])

def print_fit_results(label, fits_dict):
    """
    Print a short report for one dataset: the given label, one line
    per distribution with its KS statistic (or a failure notice when
    the statistic is not finite), and a final line naming the
    distribution chosen by pick_best_distribution.
    """
    print(f"\n{label}")
    winner, _, winner_ks = pick_best_distribution(fits_dict)
    for dist_name, (_, ks_stat) in fits_dict.items():
        line = (
            f"   {dist_name:3} => KS={ks_stat:.4f}"
            if np.isfinite(ks_stat)
            else f"   {dist_name:3} => Fit failed"
        )
        print(line)
    print(f"   BEST => {winner} (KS={winner_ks:.4f})")


# ------------------------------------------------------------------
# 3. FIT OBSERVED
# ------------------------------------------------------------------

observed_df = pd.read_csv(observed_file)
# Drop missing values first: scipy's fit() raises on non-finite data, which
# would make every candidate distribution fail.
obs_data = observed_df["AMS"].dropna()

obs_fits = fit_all_distributions(obs_data, distributions)
print_fit_results("Observed Data Fit Results:", obs_fits)

# Decide which distribution to use for Observed: "BEST" picks the lowest KS
# statistic; any key of `distributions` forces that family instead.
use_fit_choice_obs = "BEST"
if use_fit_choice_obs == "BEST":
    obs_dist_name, obs_dist_params, _ = pick_best_distribution(obs_fits)
else:
    obs_dist_name = use_fit_choice_obs
    obs_dist_params, _ = obs_fits[obs_dist_name]

# Fail with a clear message instead of a KeyError/TypeError further down when
# no usable fit exists for the chosen distribution.
if obs_dist_params is None:
    raise RuntimeError(
        f"No usable distribution fit for observed data "
        f"(choice={use_fit_choice_obs!r}, selected={obs_dist_name!r})."
    )

obs_dist_obj = distributions[obs_dist_name]

# Evaluate the observed return-period curve; the fitted params are
# (shape..., loc, scale) and can be passed positionally to ppf.
obs_curve = obs_dist_obj.ppf(cdf_vals, *obs_dist_params)


# ------------------------------------------------------------------
# 4. FIT MODEL DATA
# ------------------------------------------------------------------

model_datasets = []
combined_ams = []

# Which distribution to *use* for each model's final results: "BEST" picks the
# lowest KS statistic; any key of `distributions` forces that family instead.
use_fit_choice = "BEST"

for file in model_files:
    df = pd.read_csv(file)

    # Use the bare file name as the model ID. The configured paths use "/"
    # separators, so splitting on "\\" alone returned the whole path; accept
    # either separator.
    model_name = file.replace("\\", "/").rsplit("/", 1)[-1]

    # Drop missing values so they neither break the fits nor leak into the
    # pooled sample used for bootstrapping.
    ams = df["AMS"].dropna().to_numpy()

    fits_dict = fit_all_distributions(ams, distributions)
    print_fit_results(f"Model: {model_name}", fits_dict)

    if use_fit_choice == "BEST":
        chosen_name, chosen_params, _ = pick_best_distribution(fits_dict)
    else:
        chosen_name = use_fit_choice
        chosen_params, _ = fits_dict[chosen_name]

    model_datasets.append({
        "name": model_name,
        "data": ams,
        "dist_name": chosen_name,
        "dist_params": chosen_params
    })

    # For ensemble, we just gather all AMS in combined_ams
    combined_ams.append(ams)

# Pool every model's AMS into one 1-D array for the bootstrap.
combined_ams = np.concatenate(combined_ams)


# ------------------------------------------------------------------
# 5. BOOTSTRAP (using Observed's distribution family)
# ------------------------------------------------------------------

dist_obj = distributions[obs_dist_name]

# Seeded generator so the bootstrap curves and percentile bands are
# reproducible from run to run.
rng = np.random.default_rng(42)
n_pooled = len(combined_ams)

# One row per bootstrap replicate, one column per return period.
bootstrap_curves = np.empty((n_boot, len(cdf_vals)))

for i in range(n_boot):
    # Resample the pooled AMS with replacement, refit the observed family,
    # and evaluate its quantiles at the target probabilities.
    sample = rng.choice(combined_ams, size=n_pooled, replace=True)
    b_params = dist_obj.fit(sample)
    bootstrap_curves[i] = dist_obj.ppf(cdf_vals, *b_params)

# 5th / 50th / 95th percentile across replicates at each return period.
low_curve, med_curve, high_curve = np.percentile(
    bootstrap_curves, [5, 50, 95], axis=0
)


# ------------------------------------------------------------------
# 6. PLOTTING AND EXPORTING TO SVG
# ------------------------------------------------------------------

# Derive the scenario tag (e.g. "ssp585") from the model file names, which
# follow "<model>_<scenario>.csv", so the title and output file name cannot
# drift from the data actually plotted.
scenario_tags = sorted({
    f.replace("\\", "/").rsplit("/", 1)[-1].rsplit(".", 1)[0].rsplit("_", 1)[-1]
    for f in model_files
})
scenario_tag = "-".join(scenario_tags)
ssp_labels = {
    "ssp126": "SSP1-2.6",
    "ssp245": "SSP2-4.5",
    "ssp370": "SSP3-7.0",
    "ssp585": "SSP5-8.5",
}
scenario_label = ", ".join(ssp_labels.get(tag, tag.upper()) for tag in scenario_tags)

plt.figure(figsize=(9, 6))

# Plot every bootstrap curve in gray with a single call: each column of the
# transposed array is drawn as its own line.
plt.plot(return_periods, np.asarray(bootstrap_curves).T,
         color='gray', alpha=0.2, linewidth=0.8)

# Plot percentile summary lines
plt.plot(return_periods, low_curve,  linestyle='--', color='blue',
         label="21st Century Low (5th)")
plt.plot(return_periods, med_curve,  linestyle='-',  color='green',
         label="21st Century Median (50th)")
plt.plot(return_periods, high_curve, linestyle='-.', color='red',
         label="21st Century High (95th)")

# Observed
plt.plot(return_periods, obs_curve, color='purple', linewidth=2,
         label="20th Century")

# Set log scale on the x-axis, but fix ticks at 5,10,25,50,100,200
plt.xscale('log')
plt.xlim(5, 200)
plt.xticks([5, 10, 25, 50, 100, 200],
           ['5', '10', '25', '50', '100', '200'])

plt.xlabel("Return Period (Years)")
plt.ylabel("Streamflow (m³/s)")
# Report the number of curves actually drawn and the scenario actually loaded.
plt.title(
    f"Return Period Curves with {len(bootstrap_curves)} Bootstrap Samples "
    f"({scenario_label})"
)

plt.grid(True, linestyle='--', alpha=0.7)
plt.legend()
plt.tight_layout()

# Save the plot to SVG format; the file name carries the scenario tag so runs
# for different scenarios do not overwrite or mislabel each other.
output_svg_path = (
    "D:/CMIP6-BiasCorrection-SWAT/workingfolder/Results_Plots/"
    f"{scenario_tag}_return_period.svg"
)
plt.savefig(output_svg_path, format="svg", dpi=300)  # Save before showing the plot

# Show the plot
plt.show()

# Notify that the SVG has been saved
print(f"Plot saved as SVG at: {output_svg_path}")
