# Data Sampling
`climatrix` supports drawing samples from a dataset using several strategies, such as uniform sampling (`sample_uniform`) and normal sampling (`sample_normal`). This notebook demonstrates both.










In [2]:
import climatrix as cm
import xarray as xr
import numpy as np
from climatrix.dataset.domain import SamplingNaNPolicy #
from datetime import datetime

## Creating a simple dataset

In [7]:
# Build a small synthetic "temperature" field on a regular 2-degree grid:
# 1 time step x 91 latitudes x 181 longitudes (91 * 181 = 16471 grid points).
# The values come from a seeded generator so that the field is identical on
# every "Restart Kernel -> Run All".
xarray_da = xr.DataArray(
    data=np.random.default_rng(seed=42).random((1, 91, 181)),
    dims=("time", "latitude", "longitude"),
    coords={
        # Single time stamp: 2000-01-01.
        "time": np.array(
            [
                datetime(2000, 1, 1),
            ],
            dtype="datetime64",
        ),
        # -90 ... 90 in steps of 2 (the stop value 92 is exclusive).
        "latitude": (
            ("latitude",),
            np.arange(-90, 92, 2, dtype=np.float32),
        ),
        # -180 ... 180 in steps of 2 (the stop value 182 is exclusive).
        "longitude": (
            ("longitude",),
            np.arange(-180, 182, 2, dtype=np.float32),
        ),
    },
)
# Wrap the array in a Dataset and take its `.cm` accessor, which is the object
# the sampling methods are called on in the rest of this notebook.
cm_dense = xarray_da.to_dataset(name="temperature").cm
print(f"Domain size: {cm_dense.domain.size}")

Domain size: 16471


## Uniform Sampling (`sample_uniform`)

In [9]:
# --- Fraction-based draw: keep 1% of the data ---
cm_sampled_uniform_portion = cm_dense.sample_uniform(portion=0.01)
print(
    "Sampled dataset (uniformly, 1% of data):",
    cm_sampled_uniform_portion.da,
    f"Number of points in the sampled dataset: {cm_sampled_uniform_portion.domain.size}",
    sep="\n",
)

# --- Count-based draw: request 15 points ---
cm_sampled_uniform_number = cm_dense.sample_uniform(number=15)
print(
    "\nSampled dataset (uniformly, 15 points):",
    cm_sampled_uniform_number.da,
    f"Number of points: {cm_sampled_uniform_number.domain.size}",
    sep="\n",
)


Sampled dataset (uniformly, 1% of data):
<xarray.DataArray 'temperature' (point: 164)> Size: 1kB
array([0.52712286, 0.63928844, 0.03646083, 0.8241649 , 0.57609229,
       0.01474859, 0.6195384 , 0.14689573, 0.51154453, 0.94101298,
       0.67256902, 0.54556217, 0.2944918 , 0.02490532, 0.19157309,
       0.95268051, 0.78059175, 0.02017985, 0.96598026, 0.33529364,
       0.33838272, 0.76792389, 0.89029098, 0.09715077, 0.73804814,
       0.62272207, 0.42154448, 0.2208285 , 0.83581114, 0.46591751,
       0.55609206, 0.63954741, 0.52492922, 0.92202429, 0.12271198,
       0.83099301, 0.36464272, 0.92975865, 0.89890718, 0.53546936,
       0.82406661, 0.41654791, 0.69182799, 0.14956867, 0.26567454,
       0.05442339, 0.22792661, 0.54056607, 0.97275205, 0.21003985,
       0.26660876, 0.99562391, 0.43779213, 0.18616756, 0.87016829,
       0.17665773, 0.90638697, 0.9556379 , 0.10803328, 0.35395568,
       0.55222438, 0.93422497, 0.70865649, 0.56365971, 0.08950165,
       0.13886274, 0.83514772, 0

## Normal Sampling (`sample_normal`)

In [10]:
# Centre of the draw, passed below as a (longitude, latitude) pair, and the
# value handed to the `sigma` argument.
center_lon, center_lat = -95.0, 35.0
sigma_val = 5.0

cm_sampled_normal = cm_dense.sample_normal(
    number=20,
    center_point=(center_lon, center_lat),
    sigma=sigma_val,
)

# Show the sampled values with their coordinates, then the sample size.
print(
    "Sampled dataset (normally, 20 points):",
    cm_sampled_normal.da,
    f"Number of points: {cm_sampled_normal.domain.size}",
    sep="\n",
)


Sampled dataset (normally, 20 points):
<xarray.DataArray 'temperature' (point: 20)> Size: 160B
array([0.02035756, 0.56597235, 0.19078197, 0.67679772, 0.29520872,
       0.38199055, 0.37565171, 0.19078197, 0.29520872, 0.06971324,
       0.66248246, 0.16371587, 0.98684646, 0.22591755, 0.94101298,
       0.19078197, 0.39942364, 0.51499686, 0.05414148, 0.5965287 ])
Coordinates:
    latitude   (point) float32 80B 28.0 36.0 30.0 28.0 ... 30.0 40.0 34.0 22.0
    longitude  (point) float32 80B -90.0 -108.0 -92.0 ... -100.0 -96.0 -96.0
Dimensions without coordinates: point
Number of points: 20


## Handling NaN Values During Sampling
Sampling methods include a `nan` parameter to control behavior when encountering NaN values.
Available strategies (`SamplingNaNPolicy`):
- `IGNORE`: Ignores NaNs (default); the sample may contain NaN values.
- `RESAMPLE`: If a sampled point contains NaN, it resamples.
- `RAISE`: Raises a `ValueError` if any sampled point contains a NaN.

In [19]:
# Synthetic field on a 2-degree grid (1 time step x 91 latitudes x 181
# longitudes), drawn from a seeded generator so the values are reproducible.
data_with_nans = np.random.default_rng(seed=42).random((1, 91, 181))
# Axes are (time, latitude, longitude). Blank out the first 30 latitude rows
# (-90 ... -32) at every longitude: 30 * 181 = 5430 NaN cells. The index is
# spelled out per axis so it is clear which dimensions are being sliced.
data_with_nans[:, 0:30, :] = np.nan

# NOTE: this rebinds `xarray_da`; from here on it refers to the field that
# contains NaNs.
xarray_da = xr.DataArray(
    data=data_with_nans,
    dims=("time", "latitude", "longitude"),
    coords={
        # Single time stamp: 2000-01-01.
        "time": np.array(
            [
                datetime(2000, 1, 1),
            ],
            dtype="datetime64",
        ),
        # -90 ... 90 in steps of 2 (the stop value 92 is exclusive).
        "latitude": (
            ("latitude",),
            np.arange(-90, 92, 2, dtype=np.float32),
        ),
        # -180 ... 180 in steps of 2 (the stop value 182 is exclusive).
        "longitude": (
            ("longitude",),
            np.arange(-180, 182, 2, dtype=np.float32),
        ),
    },
)
cm_nans = xarray_da.to_dataset(name="temperature").cm

print(f"Dataset size: {cm_nans.domain.size}")
print(f"Number of NaNs in the original dataset: {np.isnan(cm_nans.da.values).sum()}")

# IGNORE strategy: NaN points are allowed in the sample, so count how many
# were drawn.
sampled_ignore_nan = cm_nans.sample_uniform(number=10000, nan=SamplingNaNPolicy.IGNORE)
print(f"\nSampling with IGNORE: number of NaNs in the sample = {np.isnan(sampled_ignore_nan.da.values).sum()}")

# RESAMPLE strategy: the printed NaN count shows whether any NaN survived.
sampled_resample_nan = cm_nans.sample_uniform(number=10000, nan=SamplingNaNPolicy.RESAMPLE)
print(f"Sampling with RESAMPLE: number of NaNs in the sample = {np.isnan(sampled_resample_nan.da.values).sum()}")

# RAISE strategy: the ValueError is the demonstration here, so it is caught
# and its message displayed rather than left to abort the notebook.
try:
    cm_nans.sample_uniform(number=12000, nan=SamplingNaNPolicy.RAISE)
except ValueError as error:
    print(f"Expected error caught for RAISE: {error}")

Dataset size: 16471
Number of NaNs in the original dataset: 5430

Sampling with IGNORE: number of NaNs in the sample = 3271
Sampling with RESAMPLE: number of NaNs in the sample = 0
Expected error caught for RAISE: Not all points have data
