In [None]:
import os
import re
from glob import glob
from pathlib import Path
import zipfile
import tempfile

import pandas as pd
from requests import HTTPError

import cdsapi
import xarray as xr


In [None]:
# Absolute path of the directory the notebook is run from, and the data folder below it.
PATH_ROOT = Path.cwd().resolve()
PATH_DATA = PATH_ROOT.joinpath("data")

# Dataset name handed to the CDS client, and the product requested from that dataset.
COPERNICUS_DATASET = "sis-european-wind-storm-indicators"
COPERNICUS_PRODUCT = "windstorm_footprints"

In [None]:
def pad(num: int) -> str:
    """Render ``num`` as a decimal string, left-padded with zeros to at least two characters."""
    return format(num, "02d")

# Zero-padded day strings "01" .. "31", with day 9 left out.
# NOTE(review): the reason for skipping day 9 is not visible here — confirm it is intentional.
days = list(map(pad, (day for day in range(1, 32) if day != 9)))

# Zero-padded month strings for January-March and October-December.
winter_months = list(map(pad, (1, 2, 3, 10, 11, 12)))

# Years from 1979 through 2021 as strings, without the entries listed in `missing_yrs`.
missing_yrs = [2003, 2004, 2010, 2018, 2019]
years = [str(yr) for yr in sorted(set(range(1979, 2022)).difference(missing_yrs))]

In [None]:
def download_data_files(years, months, days, save_dir):
    """Retrieve one zip archive per (year, month) pair via the CDS API client.

    Every archive is stored as ``<save_dir>/<year>_<month>.zip``. When a
    retrieval raises ``HTTPError`` a message is printed and the loop moves on
    to the next pair, so one failed request does not stop the others.

    Parameters
    ----------
    years : iterable
        Values used one at a time for the request's ``"year"`` field.
    months : iterable
        Values used one at a time for the request's ``"month"`` field.
    days : list
        Passed unchanged as the request's ``"day"`` field.
    save_dir : str or path-like
        Directory the zip files are written to (it is not created here).
    """
    cds = cdsapi.Client()

    for yr in years:
        for mon in months:
            zip_path = os.path.join(save_dir, f"{yr}_{mon}.zip")
            query = dict(
                product=[COPERNICUS_PRODUCT],
                variable="all",
                year=[yr],
                month=[mon],
                day=days,
            )

            try:
                cds.retrieve(COPERNICUS_DATASET, query, zip_path)
            except HTTPError:
                # Best effort: report the failed archive and continue.
                print(f"Download failed for {zip_path}")


In [None]:
def _extract_timestamp(string):
    time_str = re.search(r'(\d{8})', string).group(1)
    return pd.to_datetime(time_str)

In [None]:
def preprocess(ds):
    """Reshape a single-file dataset so it can be combined along ``event``.

    The source file path is read from the encoding of the ``FX`` variable and
    turned into a timestamp. The dataset then has ``z`` dropped, length-1
    dimensions squeezed away, a new ``event`` dimension added whose only label
    is that timestamp, and ``FX``/``Longitude``/``Latitude`` renamed to
    ``wind_footprint``/``lon``/``lat``.
    """
    source_path = ds["FX"].encoding['source']
    event_time = _extract_timestamp(source_path)

    new_names = {"FX": "wind_footprint", "Longitude": "lon", "Latitude": "lat"}
    trimmed = ds.drop_vars("z").squeeze()
    labelled = trimmed.expand_dims(event=[event_time])
    return labelled.rename(new_names)

In [None]:
# Download, unpack and combine the footprint files inside a temporary
# directory, then persist the combined dataset as a zarr store under PATH_DATA.
with tempfile.TemporaryDirectory() as tmpdir:

    # Load zip files from Copernicus data store.
    # NOTE(review): only the first five entries of `years` are requested —
    # confirm this limit is intended.
    download_data_files(years[:5], winter_months, days, save_dir=tmpdir)

    # Unpack all zipped netcdf files.
    zipfiles = glob(os.path.join(tmpdir, "*.zip"))
    for zfile in zipfiles:
        with zipfile.ZipFile(zfile, "r") as file_ref:
            file_ref.extractall(tmpdir)

    # Combine all netcdf files into a single dataset.
    # NOTE(review): `chunks` is applied when each file is opened, i.e. before
    # `preprocess` renames anything — confirm "lat"/"lon" match the dimension
    # names at that point.
    ncfiles = glob(os.path.join(tmpdir, "*.nc"))
    with xr.open_mfdataset(
        ncfiles,
        preprocess=preprocess,
        chunks={"lat": 32, "lon": 34},
    ) as ds_files:
        # Keep the opened dataset in a context manager so the source files are
        # closed before the temporary directory is removed.
        ds = ds_files.chunk({"event": -1})

        # Write to a zarr file while the source files are still available.
        ds.to_zarr(PATH_DATA / "wind_footprints.zarr", compute=True, mode="w")

In [None]:
# Re-open the store from the same PATH_DATA location it was written to and
# display the footprint variable.
xr.open_zarr(PATH_DATA / "wind_footprints.zarr")["wind_footprint"]