Commit

Merge 3f4259f into 320f299
TC-FF committed Feb 14, 2024
2 parents 320f299 + 3f4259f commit 97e2a99
Showing 4 changed files with 319 additions and 4 deletions.
1 change: 1 addition & 0 deletions CHANGES.rst
@@ -13,6 +13,7 @@ New features and enhancements
* Added a new set of functions to support creating and updating `pooch` registries, caching testing datasets from `hydrologie/xhydro-testdata`, and ensuring that testing datasets can be loaded into temporary directories. (:pull:`62`).
* `xhydro` is now configured to use `pooch` to download and cache testing datasets from `hydrologie/xhydro-testdata`. (:pull:`62`).
* `xhydro` is now `Semantic Versioning v2.0.0 <https://semver.org/spec/v2.0.0.html>`_ compliant. (:pull:`70`).
* Added new functions to `xhydro.frequency_analysis.local` to calculate plotting positions and to prepare plots. (:pull:`87`).

Breaking changes
^^^^^^^^^^^^^^^^
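The changelog entries above refer to `pooch`-based caching of testing datasets from `hydrologie/xhydro-testdata`. As a rough, generic sketch of that pattern only — the base URL, file name, hash, and cache name below are placeholders, not xhydro's actual helpers or data layout:

import pooch

# Placeholder registry: real projects pin each file to a known hash so downloads
# can be verified once and then served from the local cache.
odie = pooch.create(
    path=pooch.os_cache("xhydro-testdata"),  # local cache directory (name is illustrative)
    base_url="https://example.org/xhydro-testdata/",  # placeholder URL
    registry={
        "example_dataset.nc": "sha256:0000000000000000000000000000000000000000000000000000000000000000",
    },
)

# fetch() downloads on first use and returns the cached path on subsequent calls.
local_path = odie.fetch("example_dataset.nc")
print(local_path)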
96 changes: 92 additions & 4 deletions docs/notebooks/local_frequency_analysis.ipynb
@@ -47,8 +47,7 @@
" **{\n",
" \"datasets\":{\n",
" \"deh\":{\n",
" \"id\" :[\"020*\"],\n",
" \"regulated\":[\"Natural\"],\n",
" \"id\" :[\"0510*\"],\n",
" \"variables\":[\"streamflow\"],\n",
" }\n",
" }, \"time\":{\"start\": \"1970-01-01\", \n",
@@ -141,6 +140,13 @@
"ds_4fa"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
@@ -314,8 +320,90 @@
"source": [
"rp = xhfa.local.parametric_quantiles(params, t=[20, 100])\n",
"\n",
"rp"
"rp.load()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In a future release, plotting will be handled by a proper function. For now, we'll show an example in this Notebook using preliminary utilities.\n",
"\n",
"`xhfa.local._prepare_plots` generates datapoints required to plot the results of the frequency analysis. If `log=True`,will it return log-spaced x values between `xmin` and `xmax`.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = xhfa.local._prepare_plots(params, xmin=1, xmax = 1000, npoints=50, log=True)\n",
"data.load()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`xhfa.local._get_plotting_positions` allows you to get plotting positions for all variables in the dataset. It accepts `alpha` `beta` arguments. See the [SciPy documentation](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.plotting_positions.html) for typical values. By default, (0.4, 0.4) will be used, which corresponds to approximately quantile unbiased (Cunnane).\n",
"\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pp = xhfa.local._get_plotting_positions(ds_4fa[['streamflow_max_spring']])\n",
"pp"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Lets plot the observations\n",
"p1 = data.streamflow_max_spring.hvplot(x='return_period', by='scipy_dist', grid=True, groupby=['id'], logx=True) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Lets now plot the distributions\n",
"p2 = pp.hvplot.scatter(x='streamflow_max_spring_pp',y='streamflow_max_spring', grid=True, groupby=['id'], logx=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#And now combining the plots\n",
"p1 * p2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@@ -334,7 +422,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.11.7"
},
"vscode": {
"interpreter": {
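The markdown cells in the notebook above point to SciPy's plotting positions with the default `(alpha, beta) = (0.4, 0.4)` (Cunnane). A minimal sketch, with illustrative values, of the math that `_get_plotting_positions` (added in `local.py` below) applies to each variable:

import numpy as np
from scipy.stats.mstats import plotting_positions

streamflow = np.array([50.0, 65.0, 80.0, 95.0])  # illustrative annual maxima, already sorted
alpha = beta = 0.4  # Cunnane: approximately quantile unbiased

# SciPy's empirical plotting positions: p_k = (k - alpha) / (n + 1 - alpha - beta)
p = np.asarray(plotting_positions(streamflow, alpha, beta))

n = streamflow.size
k = np.arange(1, n + 1)
assert np.allclose(p, (k - alpha) / (n + 1 - alpha - beta))

# With return_period=True, the probabilities are converted to return periods
# T = 1 / (1 - p), which is the same as the -1 / (p - 1) expression used in local.py.
print(1.0 / (1.0 - p))  # [1.1667, 1.6154, 2.625, 7.0]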
116 changes: 116 additions & 0 deletions tests/test_local.py
@@ -1,8 +1,10 @@
import numpy as np
import pytest
import xarray as xr
from xclim.testing.helpers import test_timeseries as timeseries

import xhydro.frequency_analysis as xhfa
from xhydro.frequency_analysis.local import _get_plotting_positions, _prepare_plots


class TestFit:
@@ -153,3 +155,117 @@ def test_criteria():
[118.12140939, 118.51930466, 118.56585383],
],
)


class TestGetPlottingPositions:
def test_default(self):
data = timeseries(
np.array([50, 65, 80, 95]),
variable="streamflow",
start="2001-01-01",
freq="YS",
as_dataset=True,
)
expected = [1.16666667, 1.61538462, 2.625, 7.0]
result = _get_plotting_positions(data)
np.testing.assert_array_almost_equal(result.streamflow_pp, expected)
np.testing.assert_array_almost_equal(result.streamflow, data.streamflow)

data_2d = xr.concat([data, data], dim="id")
result = _get_plotting_positions(data_2d)
np.testing.assert_array_almost_equal(result.streamflow_pp, [expected, expected])
np.testing.assert_array_equal(result.streamflow, data_2d.streamflow)

def test_nan(self):
data = timeseries(
np.array([50, np.nan, 80, 95]),
variable="streamflow",
start="2001-01-01",
freq="YS",
as_dataset=True,
)
expected = [1.23076923, np.nan, 2.0, 5.33333333]
result = _get_plotting_positions(data)
np.testing.assert_array_almost_equal(result.streamflow_pp, expected)
np.testing.assert_array_almost_equal(result.streamflow, data.streamflow)

data_2d = xr.concat([data, data], dim="id")
result = _get_plotting_positions(data_2d)
np.testing.assert_array_almost_equal(result.streamflow_pp, [expected, expected])
np.testing.assert_array_equal(result.streamflow, data_2d.streamflow)

def test_return_period(self):
data = timeseries(
np.array([50, 65, 80, 95]),
variable="streamflow",
start="2001-01-01",
freq="YS",
as_dataset=True,
)
expected = [0.14285714, 0.38095238, 0.61904762, 0.85714286]
result = _get_plotting_positions(data, return_period=False)
np.testing.assert_array_almost_equal(result.streamflow_pp, expected)
np.testing.assert_array_almost_equal(result.streamflow, data.streamflow)

data_2d = xr.concat([data, data], dim="id")
result = _get_plotting_positions(data_2d, return_period=False)
np.testing.assert_array_almost_equal(result.streamflow_pp, [expected, expected])
np.testing.assert_array_equal(result.streamflow, data_2d.streamflow)

def test_alpha_beta(self):
data = timeseries(
np.array([50, 65, 80, 95]),
variable="streamflow",
start="2001-01-01",
freq="YS",
as_dataset=True,
)
expected = [1.25, 1.66666667, 2.5, 5.0]
result = _get_plotting_positions(data, alpha=0, beta=0)
np.testing.assert_array_almost_equal(result.streamflow_pp, expected)
np.testing.assert_array_almost_equal(result.streamflow, data.streamflow)

data_2d = xr.concat([data, data], dim="id")
result = _get_plotting_positions(data_2d, alpha=0, beta=0)
np.testing.assert_array_almost_equal(result.streamflow_pp, [expected, expected])
np.testing.assert_array_equal(result.streamflow, data_2d.streamflow)


class TestPreparePlots:
ds = timeseries(
np.array([50, 65, 80, 95, 110, 125, 140, 155, 170, 185, 200]),
variable="streamflow",
start="2001-01-01",
freq="YS",
as_dataset=True,
)
params = xhfa.local.fit(ds, distributions=["gamma", "pearson3"])

def test_prepare_plots_default(self):
result = _prepare_plots(self.params)
assert result.streamflow.shape == (2, 100)
assert result.return_period.min() == 1
assert result.return_period.max() == 10000
expected = [
[-30.78466504, 63.64266447, 78.19229358, 88.62699148, 97.15369501],
[-np.inf, 61.08708903, 79.72126025, 92.05411647, 101.59650405],
]
np.testing.assert_array_almost_equal(result.streamflow.head(), expected)

def test_prepare_plots_linear(self):
result = _prepare_plots(self.params, log=False)

expected = [
[-30.78466504, 262.72577254, 281.56238078, 292.27851004, 299.75980757],
[-np.inf, 235.6878466, 247.41104463, 253.8825982, 258.32114457],
]
np.testing.assert_array_almost_equal(result.streamflow.head(), expected)

def test_prepare_plots_range(self):
result = _prepare_plots(self.params, xmin=5, xmax=500)
assert result.return_period.min() == pytest.approx(5)
assert result.return_period.max() == pytest.approx(500)

def test_prepare_plots_npoints(self):
result = _prepare_plots(self.params, npoints=50).load()
assert result.streamflow.shape == (2, 50)
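As a quick illustration of what `_get_plotting_positions` adds to a dataset, here is a small interactive sketch built on the same kind of fixture the tests above create with xclim's `test_timeseries` helper; the printed values mirror `test_default`:

import numpy as np
from xclim.testing.helpers import test_timeseries as timeseries

from xhydro.frequency_analysis.local import _get_plotting_positions

ds = timeseries(
    np.array([50, 65, 80, 95]),
    variable="streamflow",
    start="2001-01-01",
    freq="YS",
    as_dataset=True,
)

pp = _get_plotting_positions(ds)
print(sorted(pp.data_vars))  # ['streamflow', 'streamflow_pp']
print(pp.streamflow_pp.values)  # approx. [1.1667, 1.6154, 2.625, 7.0] (return periods, Cunnane defaults)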
110 changes: 110 additions & 0 deletions xhydro/frequency_analysis/local.py
@@ -7,6 +7,7 @@
import statsmodels
import xarray as xr
import xclim.indices.stats
from scipy.stats.mstats import plotting_positions
from statsmodels.tools import eval_measures

__all__ = [
@@ -265,3 +266,112 @@ def _get_criteria_1d(da, params, dist):
out.attrs = ds.attrs

return out


def _get_plotting_positions(data_array, alpha=0.4, beta=0.4, return_period=True):
"""Calculate plotting positions for data.
Parameters
----------
data_array : xarray DataArray
Input data.
alpha : float, optional
Plotting position parameter, by default 0.4.
beta : float, optional
Plotting position parameter, by default 0.4.
return_period : bool, optional
If True, return periods instead of probabilities, by default True.
Returns
-------
xarray DataArray
The data with plotting positions assigned.
Notes
-----
See scipy.stats.mstats.plotting_positions for typical values for alpha and beta. (0.4, 0.4) : approximately quantile unbiased (Cunnane).
"""
data_copy = data_array.copy(deep=True)

def vec_plotting_positions(vec_data, alpha=0.4, beta=0.4):
"""Calculate plotting positions for vectorized data.
Parameters
----------
vec_data : ndarray
Input data, with time dimension first.
alpha : float, optional
Plotting position parameter.
beta : float, optional
Plotting position parameter.
Returns
-------
ndarray
Array with plotting positions assigned to valid data points,
and NaNs assigned to invalid data points.
"""
out = []
if vec_data.ndim == 1:
valid = ~np.isnan(vec_data)
pp = plotting_positions(vec_data[valid], alpha, beta)

out = np.full_like(vec_data.astype(float), np.nan)
out[valid] = pp

else:
for data in vec_data:
valid = ~np.isnan(data)
pp = plotting_positions(data[valid], alpha, beta)

pp_full = np.full_like(data.astype(float), np.nan)
pp_full[valid] = pp

out.append(pp_full)
return out

pp = xr.apply_ufunc(
vec_plotting_positions,
data_array,
alpha,
beta,
input_core_dims=[["time"], [], []],
output_core_dims=[["time"]],
)

if return_period:
pp = -1 / (pp - 1)

for name in pp.data_vars:
pp = pp.rename({name: name + "_pp"})
pp[name] = data_copy[name]

return pp


def _prepare_plots(params, xmin=1, xmax=10000, npoints=100, log=True):
"""Prepare x-values for plotting frequency analysis results.
Parameters
----------
params : xarray DataArray
Input data.
xmin : float, optional
Minimum x value, by default 1.
xmax : float, optional
Maximum x value, by default 10000.
npoints : int, optional
Number of x values, by default 100.
log : bool, optional
If True, return log-spaced x values, by default True.
Returns
-------
xarray DataArray
The data with plotting positions assigned.
"""
if log:
x = np.logspace(np.log10(xmin), np.log10(xmax), npoints, endpoint=True)
else:
x = np.linspace(xmin, xmax, npoints)
return parametric_quantiles(params, x)
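Putting the two helpers together: the notebook above overlays the curves from `_prepare_plots` and the points from `_get_plotting_positions` with hvplot. The sketch below does the same with matplotlib on a synthetic series, so the data values and plotting calls are assumptions for illustration rather than part of this changeset:

import matplotlib.pyplot as plt
import numpy as np
from xclim.testing.helpers import test_timeseries as timeseries

import xhydro.frequency_analysis as xhfa
from xhydro.frequency_analysis.local import _get_plotting_positions, _prepare_plots

# Synthetic annual maxima (illustrative values only).
ds = timeseries(
    np.array([50, 65, 80, 95, 110, 125, 140, 155, 170, 185, 200]),
    variable="streamflow",
    start="2001-01-01",
    freq="YS",
    as_dataset=True,
)

params = xhfa.local.fit(ds, distributions=["gamma", "pearson3"])
curves = _prepare_plots(params, xmin=1, xmax=1000, npoints=50, log=True)
pp = _get_plotting_positions(ds)

fig, ax = plt.subplots()
for dist in curves.scipy_dist.values:
    # Fitted quantile curve for each distribution.
    ax.plot(curves.return_period, curves.streamflow.sel(scipy_dist=dist), label=str(dist))
# Observations at their empirical plotting positions.
ax.scatter(pp.streamflow_pp, pp.streamflow, color="k", label="observations")
ax.set_xscale("log")
ax.set_xlabel("Return period (years)")
ax.set_ylabel("Streamflow")
ax.legend()
plt.show()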
