In [1]:
import xarray as xr
import numpy as np
import pathlib
import datetime

In [2]:
SATELLITE_ZARR_PATH = "gs://public-datasets-eumetsat-solar-forecasting/satellite/EUMETSAT/SEVIRI_RSS/v3/eumetsat_seviri_hrv_uk.zarr"

dataset = xr.open_dataset(
    SATELLITE_ZARR_PATH, 
    engine="zarr",
    chunks="auto",  # Load the data as a Dask array
)

print(dataset)


<xarray.Dataset>
Dimensions:  (time: 173624, y: 891, x: 1843)
Coordinates:
  * time     (time) datetime64[ns] 2020-01-01T00:05:00 ... 2021-11-07T15:50:00
  * x        (x) float32 2.8e+04 2.7e+04 2.6e+04 ... -1.813e+06 -1.814e+06
    x_osgb   (y, x) float32 dask.array<chunksize=(891, 1843), meta=np.ndarray>
  * y        (y) float32 4.198e+06 4.199e+06 4.2e+06 ... 5.087e+06 5.088e+06
    y_osgb   (y, x) float32 dask.array<chunksize=(891, 1843), meta=np.ndarray>
Data variables:
    data     (time, y, x) int16 dask.array<chunksize=(22, 891, 1843), meta=np.ndarray>


In [3]:
def get_random_day_slice(skip):
    date = None
    while date in skip or date is None:
        year = int(np.random.choice([2020, 2021], size=1, replace=False)[0])
        doy = int(np.random.choice(range(1, 365), size=1, replace=False)[0])
        date = datetime.datetime(year, 1, 1) + datetime.timedelta(doy - 1)
    
    skip.add(date)
    
    # get data from 7:00 to 17:00 on this day
    data_slice = dataset.loc[
        {
            "time": slice(
                date + datetime.timedelta(hours=7),
                date + datetime.timedelta(hours=17),
            )
        }
    ]
    return data_slice

In [4]:
DAYS_TO_DOWNLOAD = 50

In [5]:
np.random.seed(7)

slices = []
skip = set()
for _ in range(DAYS_TO_DOWNLOAD):
    slc = get_random_day_slice(skip)
    slices.append(slc)

In [6]:
combined = xr.concat(slices, dim='time')

In [7]:
# takes a minute
times = combined['time'].to_numpy()
x = combined['x'].to_numpy()
x_osgb = combined['x_osgb'].to_numpy()
y = combined['y'].to_numpy()
y_osgb = combined['y_osgb'].to_numpy()
data = combined['data'].to_numpy()

In [8]:
times.shape, data.shape

((5323,), (5323, 891, 1843))

In [9]:
# save to data folder
p = pathlib.Path(f'data')
p.mkdir(parents=True, exist_ok=True)
p = p / f'data_random_{DAYS_TO_DOWNLOAD}.npz'
if p.exists():
    raise ValueError(f'Path {p} already exists!')

np.savez(
    p,
    times=times,
    x=x,
    x_osgb=x_osgb,
    y=y,
    y_osgb=y_osgb,
    data=data,
)