In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import xarray as xr
import xclim
import xdatasets as xd
from lmoments3.distr import KappaGen
from scipy import stats
from sklearn.cluster import HDBSCAN, OPTICS, AgglomerativeClustering
from sklearn.decomposition import PCA

import xhydro as xh
import xhydro.frequency_analysis as xhfa
import xhydro.gis as xhgis
from xhydro.frequency_analysis.regional import *

This notebook will demonstrate how to use the xhydro package to perform regional frequency analysis on a dataset of streamflow data. The first steps will be similar to the local frequency analysis notebook, but we will keep it simple to focus on the regional frequency analysis.

Let's start by getting the stations of region 02 that are natural and have a minimum duration of 15 years.

In [3]:
# Natural stations whose id matches "02*", with a minimum duration of
# 15 * 365 days of streamflow and a time window starting on 1970-01-01.
ds = (
    xd.Query(
        datasets={
            "deh": {
                "id": ["02*"],
                "regulated": ["Natural"],
                "variables": ["streamflow"],
            }
        },
        time={"start": "1970-01-01", "minimum_duration": (15 * 365, "d")},
    )
    .data.squeeze()
    .load()
)

# The dataset does not carry these attributes, so they are set by hand:
# the station id is tagged as the timeseries identifier and the streamflow
# variable gets its descriptive metadata.
ds["id"].attrs["cf_role"] = "timeseries_id"
ds["streamflow"].attrs = dict(
    long_name="Streamflow",
    units="m3 s-1",
    standard_name="water_volume_transport_in_river_channel",
    cell_methods="time: mean",
)

ds

Here, we hide years with more than 15% of missing data and get the yearly maximum and the spring maximum.


In [4]:
# Two periods are analysed: a spring window (bounded by month-day strings)
# and the whole year (no extra arguments).
timeargs = dict(
    spring=dict(date_bounds=["02-11", "06-19"]),
    annual=dict(),
)

# Yearly maximum for each period, with the "pct" missing-data check and a
# tolerance of 0.15.
ds_4fa = xh.indicators.get_yearly_op(
    ds,
    op="max",
    timeargs=timeargs,
    missing="pct",
    missing_options=dict(tolerance=0.15),
)

ds_4fa

Again, the following operations are similar to the ones performed in the previous notebook.

In [5]:
# Convert the streamflow to volumes (output units: hm3) and keep the result
# next to the streamflow in the same dataset.
ds["volume"] = xh.indicators.compute_volume(ds["streamflow"], out_units="hm3")

# Periods used for the volume sums: a spring window and the whole year.
timeargs_vol = dict(
    spring=dict(date_bounds=["04-30", "06-15"]),
    annual=dict(),
)

# Yearly volume sums, merged into the dataset that already holds the
# yearly streamflow maxima.
ds_4fa = xr.merge(
    [
        ds_4fa,
        xh.indicators.get_yearly_op(
            ds,
            op="sum",
            input_var="volume",
            timeargs=timeargs_vol,
            missing="pct",
            missing_options=dict(tolerance=0.15),
            interpolate_na=True,
        ),
    ]
)
ds_4fa

In [6]:
# Same station selection as the streamflow query, but against the
# "deh_polygons" dataset; the index is reset to plain columns.
gdf = xd.Query(
    datasets={
        "deh_polygons": {
            "id": ["02*"],
            "regulated": ["Natural"],
            "variables": ["streamflow"],
        }
    },
    time={"start": "1970-01-01", "minimum_duration": (15 * 365, "d")},
).data.reset_index()
gdf

Unnamed: 0,Station,Superficie,geometry
0,20302,1071.505249,"POLYGON ((-65.54653 48.91282, -65.54639 48.912..."
1,20404,664.096924,"POLYGON ((-65.1477 49.05904, -65.14748 49.0589..."
2,20502,57.292057,"POLYGON ((-64.45703 48.9948, -64.45692 48.9946..."
3,20602,626.996155,"POLYGON ((-64.97292 49.17614, -64.97283 49.176..."
4,20802,1184.306641,"POLYGON ((-65.26495 49.21157, -65.26473 49.211..."
5,21407,763.051636,"POLYGON ((-66.01417 49.1084, -66.0141 49.1083,..."
6,21502,718.665344,"POLYGON ((-66.58245 49.05639, -66.58243 49.056..."
7,21915,484.327454,"POLYGON ((-68.17734 48.54264, -68.17717 48.542..."
8,21916,93.784286,"POLYGON ((-68.33648 48.42673, -68.33656 48.426..."
9,22507,515.224609,"POLYGON ((-69.6483 47.61478, -69.64672 47.6137..."


To do a regional analysis, we'll need some explanatory variables.
With those catchments, we can now calculate some of the catchment properties.
We could also get meteorological values and land use data. Refer to the GIS example for more details.

In [7]:
# Compute the watershed properties of every station polygon, returned as an
# xarray object indexed by "Station".
dswp = xhgis.watershed_properties(
    gdf[["Station", "geometry"]], unique_id="Station", output_format="xarray"
)

# Each centroid entry is indexable: element 0 is stored as `lon` and
# element 1 as `lat`, so they become two plain numeric variables.
cent = dswp["centroid"].to_numpy()
lon = [ele[0] for ele in cent]
lat = [ele[1] for ele in cent]

# `drop_vars` replaces the deprecated `Dataset.drop` for removing variables.
dswp = dswp.assign(lon=("Station", lon), lat=("Station", lat)).drop_vars("centroid")
dswp

To do our regional frequency analysis, we'll first process the data with a principal component analysis (PCA).

In [8]:
# Run the PCA on the watershed properties, keeping 3 components.
# `data` holds the transformed values used for clustering below;
# `pca` is the second object returned by fit_pca (presumably the fitted model — TODO confirm).
data, pca = xhfa.regional.fit_pca(dswp, n_components=3)

We can see that the correlation between the components is close to 0.

In [9]:
# Correlation matrix between the principal components: one row per station,
# one column per component, then pairwise correlation of the columns.
(
    data.to_dataframe(name="value")
    .reset_index()
    .pivot(index="Station", columns="components")
    .corr()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,value,value,value
Unnamed: 0_level_1,components,0,1,2
Unnamed: 0_level_2,components,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
value,0,1.0,-2.9650210000000004e-17,-1.579885e-16
value,1,-2.9650210000000004e-17,1.0,-7.699615000000001e-17
value,2,-1.579885e-16,-7.699615000000001e-17,1.0


In [10]:
# Display the PCA output that is fed to the clustering step.
data

Different clustering methods can be used. Their parameters can be passed as a dict.
HDBSCAN and OPTICS are called below to show the interface, but only the AgglomerativeClustering result is kept. Later, we'll show an example using a combination of different methods.

In [11]:
# These two calls only illustrate other clustering classes; their return
# values are not stored.
xhfa.regional.get_group_from_fit(HDBSCAN, dict(min_cluster_size=2), data)
xhfa.regional.get_group_from_fit(OPTICS, dict(min_samples=2), data)

# The grouping actually used afterwards: agglomerative clustering with 3 clusters.
groups = xhfa.regional.get_group_from_fit(
    AgglomerativeClustering,
    dict(n_clusters=3),
    data,
)
groups

[array(['022507', '022601', '022704', '023002', '023004', '023303',
        '023401', '023422', '023428', '023432', '023701', '023702',
        '024003', '024010', '024013', '024015'], dtype=object),
 array(['020302', '020404', '020502', '020602', '020802', '021407',
        '021502', '021915', '021916'], dtype=object),
 array(['024007', '024014'], dtype=object)]

We calculate the L-moments for each station


In [12]:
# Apply moment_l_vector along "time" for every variable and station; the
# results are placed on a new "lmom" dimension.
ds_moment = xr.apply_ufunc(
    moment_l_vector,
    ds_4fa,
    input_core_dims=[["time"]],
    output_core_dims=[["lmom"]],
)
# Name the six entries of the "lmom" dimension.
ds_moment = ds_moment.assign_coords(lmom=["l1", "l2", "l3", "tau", "tau3", "tau4"])
ds_moment

We then create groups of values and moments for each group.

In [35]:
# Stack the yearly values of each cluster along a new "group_id" dimension;
# the group id is the position of the cluster in `groups`.
ds_groups = xr.concat(
    [
        ds_4fa.sel(id=members).assign_coords(group_id=gid).expand_dims("group_id")
        for gid, members in enumerate(groups)
    ],
    dim="group_id",
)
# Same stacking for the per-station L-moments.
ds_moments_groups = xr.concat(
    [
        ds_moment.sel(id=members).assign_coords(group_id=gid).expand_dims("group_id")
        for gid, members in enumerate(groups)
    ],
    dim="group_id",
)

For each group, calculate the H and Z values

In [39]:
# Kappa distribution object from lmoments3, passed to calc_h_z.
kap = KappaGen()
# H and Z values for every group, computed from the grouped values and the
# grouped L-moments.
# NOTE(review): presumably the Hosking-Wallis heterogeneity (H) and
# goodness-of-fit (Z) statistics — confirm against the xhydro documentation.
ds_H_Z = calc_h_z(ds_groups, ds_moments_groups, kap)
ds_H_Z

We filter the data to only include the data that has H and Z below the thresholds

In [None]:
# Build the H/Z mask once and apply it to both grouped datasets; entries
# where the mask is false are replaced by NaN by `.where`.
mask = mask_h_z(ds_H_Z)
ds_groups_H1, ds_moments_groups_H1 = (
    grouped.where(mask).load() for grouped in (ds_groups, ds_moments_groups)
)

In [None]:
# Centiles: 0.00, 0.01, ..., 1.00 (101 values).
centiles = [pct / 100.0 for pct in range(0, 101)]

# Return periods: fractional values below 2, followed by the integer ones
# up to 10000.
return_periods = [1.010101, 1.052632, 1.111111, 1.25, 1.5] + [
    2,
    3,
    5,
    10,
    20,
    50,
    100,
    200,
    500,
    1000,
    2000,
    5000,
    10000,
]

We can now calculate the values for each group and return period, and we remove the regions with fewer stations than a threshold.

In [None]:
# Quantiles per group and return period from the masked groups, with the
# small regions removed by remove_small_regions (default settings).
Q_T = remove_small_regions(
    calculate_rp_from_afr(ds_groups_H1, ds_moments_groups_H1, return_periods)
)

To plot, let's see what it looks like for station 023401.

In [None]:
# Regional quantiles of station 023401, dropping the groups in which every
# value is NaN.
Q_reg = Q_T.sel(id="023401").dropna(dim="group_id", how="all")
# Annual-maximum streamflow only, with length-1 dimensions squeezed out.
reg = Q_reg["streamflow_max_annual"].squeeze()

Let's compare local and regional

In [None]:
# Local frequency analysis on all stations, with fit() defaults.
params_loc = xhfa.local.fit(ds_4fa)
# Quantiles of the fitted distributions at the requested return periods.
Q_loc = xhfa.local.parametric_quantiles(params_loc, return_periods)
# Station 023401, "genextreme" distribution, annual-maximum streamflow.
loc = Q_loc.sel({"id": "023401", "scipy_dist": "genextreme"})[
    "streamflow_max_annual"
]

In [None]:
import matplotlib.pyplot as plt

# Regional (blue) versus local "genextreme" (red) quantiles for station 023401.
# Both curves are labelled and a legend is drawn so that the figure can be
# read on its own.
plt.plot(reg.rp.values, reg.values, "blue", label="Regional")
plt.plot(loc.return_period.values, loc.values, "red", label="Local (genextreme)")
plt.xlabel("Return period")
plt.ylabel("Annual maximum streamflow (m3 s-1)")
plt.legend()

Now let's add some uncertainties.
We will work with only one catchment and two distributions, as the uncertainty computations can be intensive.
We select station 023401 and the distributions 'genextreme' and 'pearson3'.

In [None]:
# Yearly values of station 023401 only.
ds_4fa_one_station = ds_4fa.sel({"id": "023401"})
# Local fit parameters of the same station, restricted to two distributions.
params_loc_one_station = params_loc.sel(
    {"id": "023401", "scipy_dist": ["genextreme", "pearson3"]}
)

We bootstrap the observations 200 times to get the uncertainty

In [None]:
# Resample the observations of the selected station (200 is the number of
# bootstrap samples mentioned in the text above).
# NOTE(review): no seed is passed here, so the samples are presumably
# different on every run — confirm whether boostrap_obs accepts a seed.
ds_4fa_iter = xhfa.uncertainities.boostrap_obs(ds_4fa_one_station, 200)
# Fit the two selected distributions on the resampled data.
params_boot_obs = xhfa.local.fit(ds_4fa_iter, distributions=["genextreme", "pearson3"])

In [None]:
# Quantiles of the distributions fitted on the resampled observations,
# squeezed and reduced to the annual-maximum streamflow.
Q_boot_obs = (
    xhfa.local.parametric_quantiles(params_boot_obs.load(), return_periods)
    .squeeze()
    .streamflow_max_annual
)

Here, instead of resampling the observations, we resample the fitted distributions 200 times to get the uncertainty.

In [None]:
# Draw 200 samples using the single-station data and its locally fitted
# parameters, then fit the bootstrapped values.
values = xhfa.uncertainities.boostrap_dist(
    ds_4fa_one_station, params_loc_one_station, 200
)
params_boot_dist = xhfa.uncertainities.fit_boot_dist(values)

In [None]:
# Quantiles of the distributions fitted on the resampled distributions,
# squeezed and reduced to the annual-maximum streamflow.
Q_boot_dist = (
    xhfa.local.parametric_quantiles(params_boot_dist.load(), return_periods)
    .squeeze()
    .streamflow_max_annual
)

In [None]:
# Keep only the "genextreme" results of both bootstrap approaches.
loc_dist, loc_obs = (
    Q_boot_dist.sel(scipy_dist="genextreme"),
    Q_boot_obs.sel(scipy_dist="genextreme"),
)

In [None]:
import matplotlib.pyplot as plt

# Regional estimate (single curve).
plt.plot(reg.rp.values, reg.values, "blue", label="Regional")

# Local bootstrap results, drawn in this order:
# (bootstrap samples, median colour, bound colour, legend name).
for boot, median_color, bound_color, name in [
    (loc_obs, "red", "pink", "bootstrap obs"),
    (loc_dist, "green", "green", "bootstrap dist"),
]:
    rp = boot.return_period.values
    plt.plot(rp, boot.quantile(0.5, "samples"), median_color, label=name)
    # The 5th and 95th percentiles bound a 90% interval, so the band is
    # labelled "90% CI" (it was previously labelled "95% CI").
    plt.plot(
        rp, boot.quantile(0.05, "samples"), bound_color, label=f"90% CI ({name})"
    )
    plt.plot(rp, boot.quantile(0.95, "samples"), bound_color)

# Without this call the labels above are never rendered.
plt.legend()

For the regional analysis, we need to resample all stations, but this time it's much faster as no fit is involved.

In [None]:
# Resample the observations of all stations (200, as in the local bootstrap above).
# NOTE(review): no seed is passed, so results presumably vary between runs — confirm.
ds_reg_samples = xhfa.uncertainities.boostrap_obs(ds_4fa, 200)
# Moments computed on the resampled data, loaded into memory.
ds_moments_iter = xhfa.uncertainities.calc_moments_iter(ds_reg_samples).load()

In [None]:
# Regional quantiles for station 023401 and the annual-maximum streamflow,
# from the masked groups and the bootstrapped moments, at the requested
# return periods.
Q_reg_boot = xhfa.uncertainities.calc_q_iter(
    "023401", "streamflow_max_annual", ds_groups_H1, ds_moments_iter, return_periods
)

In [None]:
# Annual-maximum streamflow of station 023401 from the regional bootstrap.
reg_boot = Q_reg_boot["streamflow_max_annual"].sel({"id": "023401"})

In [None]:
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(15, 4))
plt.xscale("log")
plt.grid(visible=True)

# Curves drawn in this order:
# (samples, return-period coordinate, median colour, bound colour, legend name).
for boot, x_name, median_color, bound_color, name in [
    (reg_boot, "rp", "blue", "cyan", "Regional"),
    (loc_obs, "return_period", "red", "pink", "bootstrap obs"),
    (loc_dist, "return_period", "green", "green", "bootstrap dist"),
]:
    x = boot[x_name].values
    plt.plot(x, boot.quantile(0.5, "samples"), median_color, label=name)
    # The 5th and 95th percentiles bound a 90% interval, so the band is
    # labelled "90% CI" (it was previously labelled "95% CI"); the curve
    # name is included to keep the legend entries distinguishable.
    plt.plot(
        x, boot.quantile(0.05, "samples"), bound_color, label=f"90% CI ({name})"
    )
    plt.plot(x, boot.quantile(0.95, "samples"), bound_color)

plt.legend()

Second, we try different clustering methods.
We don't do too many tests here since it can take quite a while to run and the goal is just to illustrate the possibilities.

In [None]:
# For each clustering class: the name of the argument that is varied and
# the range of values it takes.
PARAM = {
    AgglomerativeClustering: dict(arg_name="n_clusters", range=range(2, 12)),
    HDBSCAN: dict(arg_name="min_cluster_size", range=range(6, 7)),
    OPTICS: dict(arg_name="min_samples", range=range(4, 5)),
}

So our station, instead of being in one region, will be in many of the regions.

In [None]:
# Station combinations built from the PCA output; each one is later used to
# select a subset of stations before clustering.
# NOTE(review): the meaning of the second argument (2) is not visible here —
# confirm against generate_combinations' documentation.
combinations_list = xhfa.uncertainities.generate_combinations(data, 2)

In [None]:
# Collect the groups produced by every (clustering class, parameter value,
# station combination) triple.
groups = []

for model in [AgglomerativeClustering, HDBSCAN, OPTICS]:
    arg_name = PARAM[model]["arg_name"]
    for p in PARAM[model]["range"]:
        d_param = {arg_name: p}
        for combination in combinations_list:
            # Extract data for the current combination
            data_com = data.sel(Station=list(combination))
            # Extend in place: rebuilding the list with `+` on every
            # iteration copies all groups collected so far each time.
            groups.extend(get_group_from_fit(model, d_param, data_com))

# Remove duplicated groups while keeping first-seen order. A set of string
# tuples iterates in a hash-seed-dependent order, which would make the
# group_id numbering differ from one kernel session to the next.
unique_groups = [list(members) for members in dict.fromkeys(tuple(g) for g in groups)]

The following steps are similar to the previous ones, just with more regions.

In [None]:
# Stack the yearly values of every unique group along "group_id"; the group
# id is the position of the group in `unique_groups`.
ds_groups = xr.concat(
    [
        ds_4fa.sel(id=members).assign_coords(group_id=gid).expand_dims("group_id")
        for gid, members in enumerate(unique_groups)
    ],
    dim="group_id",
)
# Same stacking for the per-station L-moments.
ds_moments_groups = xr.concat(
    [
        ds_moment.sel(id=members).assign_coords(group_id=gid).expand_dims("group_id")
        for gid, members in enumerate(unique_groups)
    ],
    dim="group_id",
)

In [None]:
# Kappa distribution object from lmoments3, passed to calc_h_z.
kap = KappaGen()
# H and Z values recomputed for the new, larger set of groups.
ds_H_Z = calc_h_z(ds_groups, ds_moments_groups, kap)

In [None]:
# Mask both grouped datasets with the H/Z criteria and load them.
mask = mask_h_z(ds_H_Z)
ds_groups_H1, ds_moments_groups_H1 = (
    grouped.where(mask).load() for grouped in (ds_groups, ds_moments_groups)
)

# Quantiles per group and return period, without the small regions.
Q_T = remove_small_regions(
    calculate_rp_from_afr(ds_groups_H1, ds_moments_groups_H1, return_periods)
)

# Station 023401, dropping the groups in which every value is NaN.
Q = Q_T.sel({"id": "023401"}).dropna(dim="group_id", how="all")

In [None]:
# Annual-maximum streamflow quantiles of the station, one series per group.
regional_multiple_region = Q["streamflow_max_annual"]

In [None]:
# Recompute the per-station L-moments with the same call used earlier in
# this notebook. ds_4fa is not reassigned in between, so the result should
# be the same as the existing ds_moment.
# NOTE(review): this cell looks redundant — confirm and consider removing it.
ds_moment = xr.apply_ufunc(
    moment_l_vector, ds_4fa, input_core_dims=[["time"]], output_core_dims=[["lmom"]]
).assign_coords(lmom=["l1", "l2", "l3", "tau", "tau3", "tau4"])
ds_moment

In [None]:
fig = plt.figure(figsize=(15, 4))

# Curves drawn in this order:
# (data, return-period coordinate, dimension the quantiles are taken over,
#  median colour, bound colour, legend name).
for boot, x_name, spread_dim, median_color, bound_color, name in [
    (regional_multiple_region, "rp", "group_id", "blue", "cyan", "regional_multiple_region"),
    (loc_obs, "return_period", "samples", "red", "pink", "bootstrap obs"),
    (loc_dist, "return_period", "samples", "green", "green", "bootstrap dist"),
]:
    x = boot[x_name].values
    plt.plot(x, boot.quantile(0.5, spread_dim), median_color, label=name)
    # The 5th and 95th percentiles bound a 90% interval, so the band is
    # labelled "90% CI" (it was previously labelled "95% CI"); the curve
    # name is included to keep the legend entries distinguishable.
    plt.plot(
        x, boot.quantile(0.05, spread_dim), bound_color, label=f"90% CI ({name})"
    )
    plt.plot(x, boot.quantile(0.95, spread_dim), bound_color)

plt.xscale("log")
plt.grid(visible=True)
plt.legend()

We could also combine multiple regions and resampling.
calc_q_iter will check in how many group_id the station is present, and stack them with the samples.
In this case, it will be stacked with 200 samples, and the station is in 533 groups, so 200 × 533 = 106600 samples are generated.


In [None]:
# Same call as for the single-region bootstrap, but ds_groups_H1 now holds
# the groups built from the multiple clustering runs.
Q_reg_boot = xhfa.uncertainities.calc_q_iter(
    "023401", "streamflow_max_annual", ds_groups_H1, ds_moments_iter, return_periods
)
Q_reg_boot

In [None]:
# Annual-maximum streamflow of station 023401 from the combined
# multi-region and bootstrap results.
regional_multiple_region_boot = Q_reg_boot.sel({"id": "023401"})[
    "streamflow_max_annual"
]

In [None]:
fig = plt.figure(figsize=(15, 4))

# Curves drawn in this order:
# (data, return-period coordinate, dimension the quantiles are taken over,
#  median colour, bound colour, legend name).
for boot, x_name, spread_dim, median_color, bound_color, name in [
    (loc_dist, "return_period", "samples", "green", "green", "bootstrap dist"),
    (loc_obs, "return_period", "samples", "red", "pink", "bootstrap obs"),
    (
        regional_multiple_region_boot,
        "rp",
        "samples",
        "black",
        "grey",
        "regional multiple regions and boot",
    ),
    (
        regional_multiple_region,
        "rp",
        "group_id",
        "blue",
        "cyan",
        "regional multiple regions",
    ),
]:
    x = boot[x_name].values
    plt.plot(x, boot.quantile(0.5, spread_dim), median_color, label=name)
    # The 5th and 95th percentiles bound a 90% interval, so the band is
    # labelled "90% CI" (it was previously labelled "95% CI"); the curve
    # name is included to keep the legend entries distinguishable.
    plt.plot(
        x, boot.quantile(0.05, spread_dim), bound_color, label=f"90% CI ({name})"
    )
    plt.plot(x, boot.quantile(0.95, spread_dim), bound_color)

plt.xscale("log")
plt.grid(visible=True)
plt.legend()