---
title: AEMET - Precipitation
subject: Tutorials
short_title: Precipitation
authors:
  - name: J. Emmanuel Johnson
    affiliations:
      - CSIC
      - UCM
      - IGEO
    orcid: 0000-0002-6739-0053
    email: juanjohn@ucm.es
license: CC-BY-4.0
keywords: notation
---

* Load Raw Data File (`.csv`)
* Create `xarray.Dataset`

In [None]:
import autoroot
from dotenv import load_dotenv
load_dotenv()

from tqdm.auto import tqdm
import xarray as xr
import numpy as np
from st_evt import validate_longitude, validate_latitude

import pint_xarray
from loguru import logger
import pandas as pd

xr.set_options(display_width=40)

%matplotlib inline
%load_ext autoreload
%autoreload 2

## Paths

In [2]:
import os
from pathlib import Path

logger.info("Initializaing paths...")


def _env_dir(var_name: str) -> Path:
    """Return the directory named by the environment variable ``var_name``.

    Args:
        var_name: name of the environment variable holding a directory path.

    Returns:
        The variable's value wrapped in a ``Path``.

    Raises:
        EnvironmentError: if the variable is not defined. Without this check
            ``Path(None)`` fails with a ``TypeError`` that does not say which
            variable is missing.
    """
    value = os.getenv(var_name)
    if value is None:
        raise EnvironmentError(
            f"Environment variable {var_name!r} is not set; "
            "define it in the environment or in the .env file."
        )
    return Path(value)


# Input and output directories are configured through environment variables.
raw_data_dir = _env_dir("RAW_DATA_SAVEDIR")
clean_data_dir = _env_dir("CLEAN_DATA_SAVEDIR")

[32m2024-12-04 13:00:08.667[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mInitializaing paths...[0m


In [3]:
logger.info("Opening dataset...")
logger.debug(f"Raw Data: {raw_data_dir}")

# Reference NetCDF file; its `time` coordinate is reused when the station
# dataset is assembled further down.
ds_ = xr.open_dataset(raw_data_dir / "pr.nc")

# Sanity check on the first two dimension sizes of `pr`.
assert ds_["pr"].shape[:2] == (2_407, 22_645)
logger.debug(f"Shape: {ds_['pr'].shape}")

[32m2024-12-04 13:00:08.711[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mOpening dataset...[0m
[32m2024-12-04 13:00:08.712[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [34m[1mRaw Data: /home/juanjohn/pool_data/dynev4eo/data/raw[0m


[32m2024-12-04 13:00:09.152[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [34m[1mShape: (2407, 22645)[0m


## Precipitation

### Load Dataframes

#### Coordinates

In [4]:
logger.info("Loading stations...")

# Station metadata table: ';'-separated, ',' as decimal mark, first column
# used as the index.
df_coords = pd.read_csv(
    raw_data_dir / "ubicacion_estaciones_spain.csv",
    sep=";",
    decimal=",",
    index_col=0,
)

logger.info("Checking Coords shapes...")
assert df_coords.shape == (5238, 5)
logger.debug(f"Shape: {df_coords.shape}")

# Every column read by the later cells must be present.
logger.info("Checking Column names...")
columns = ["name", "alt", "lon", "lat", "prov"]
assert set(df_coords.columns).issuperset(columns)

[32m2024-12-04 13:00:09.197[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mLoading stations...[0m


[32m2024-12-04 13:00:09.207[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mChecking Coords shapes...[0m
[32m2024-12-04 13:00:09.208[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [34m[1mShape: (5238, 5)[0m
[32m2024-12-04 13:00:09.208[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mChecking Column names...[0m


#### Precipitation

In [5]:
logger.info("Loading precipitation...")
# Precipitation table; the first CSV column becomes the index.
df_precip = pd.read_csv(raw_data_dir / "pr.csv", index_col=0)

[32m2024-12-04 13:00:09.247[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mLoading precipitation...[0m


In [6]:
# NOTE(review): the log text says "Coords", but the assertion below checks the
# precipitation table `df_precip` — the message looks copied from the station
# cell; confirm and reword.
logger.info("Checking Coords shapes...")
# Expected table size: 22_645 rows by 2_448 columns (the columns are iterated
# as station ids when the dataset is built further down).
assert df_precip.shape == (22_645, 2_448)

[32m2024-12-04 13:00:12.277[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mChecking Coords shapes...[0m


### Create XArray Dataset

In [7]:
logger.info("Creating empty dataset...")

# One empty accumulator list per per-station field; filled station by station
# in the following cell.
coordinates = {
    "station_id": [],
    "station_name": [],
    "lat": [],
    "lon": [],
    "alt": [],
    "values": [],
}

[32m2024-12-04 13:00:12.319[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mCreating empty dataset...[0m


In [8]:
# Build the station dataset from the precipitation table and the station table.
#
# The accumulators are (re)created here so that re-running this cell starts
# from empty lists instead of appending a second copy of every station.
coordinates = {
    "station_id": [],
    "station_name": [],
    "lat": [],
    "lon": [],
    "alt": [],
    "values": [],
}
skipped_stations = []

pbar = tqdm(df_precip.columns, leave=True)
for iname in pbar:
    station_key = str(iname)

    # Only the two lookups are allowed to fail: a precipitation column whose
    # id cannot be looked up is skipped (best effort) and counted.
    try:
        ids = df_precip[station_key]
        icoords = df_coords.loc[station_key]
    except KeyError:
        skipped_stations.append(station_key)
        continue

    # The appends happen outside the `try`, all together, so the six lists
    # always keep the same length.
    coordinates["station_id"].append(icoords.name)  # index label of the selected row
    coordinates["station_name"].append(icoords["name"].lower())
    coordinates["lat"].append(np.float32(icoords["lat"]))
    coordinates["lon"].append(np.float32(icoords["lon"]))
    coordinates["alt"].append(np.float32(icoords["alt"]))
    coordinates["values"].append(np.float32(ids.values))

logger.debug(f"Skipped columns (lookup failed): {len(skipped_stations)}")

# `pr` is laid out as (station_id, time); the time axis is taken from the
# reference dataset `ds_`.
ds_precip = xr.Dataset(
    {
        "pr": (("station_id", "time"), coordinates["values"]),
        "lon": ("station_id", coordinates["lon"]),
        "lat": ("station_id", coordinates["lat"]),
        "alt": ("station_id", coordinates["alt"]),
        "station_name": ("station_id", coordinates["station_name"]),
    },
    coords={
        "station_id": coordinates["station_id"],
        "time": ds_.time,
    },
)

logger.info("Cleaning metadata and coordinates...")

# Per-station metadata becomes coordinates, leaving `pr` as the only data variable.
ds_precip = ds_precip.set_coords(["lon", "lat", "alt", "station_name"])
ds_precip


  0%|          | 0/2448 [00:00<?, ?it/s]

[32m2024-12-04 13:00:12.530[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m32[0m - [1mCleaning metadata and coordinates...[0m


In [9]:
logger.info("Checking xarray.dataset shape...")
# `pr` was built with dims (station_id, time): expect 1_200 stations and
# 22_645 time steps.
assert ds_precip.pr.shape == (1_200, 22_645)

[32m2024-12-04 13:00:12.584[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mChecking xarray.dataset shape...[0m


In [10]:
# Drop repeated labels along `station_id`.
ds_precip = ds_precip.drop_duplicates(dim="station_id")

logger.info("Checking xarray.dataset shape...")
# Same expected shape as before the call, i.e. this requires that no station
# was actually removed.
assert ds_precip.pr.shape == (1_200, 22_645)

[32m2024-12-04 13:00:12.630[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mChecking xarray.dataset shape...[0m


### Correct Coordinates and Units

In [11]:
logger.info("Validating Coordinates...")

# Validate longitude, then latitude (helpers imported from st_evt), and
# finally order the dataset along time.
ds_precip = validate_latitude(validate_longitude(ds_precip)).sortby("time")

# Descriptive attributes for the data variable and the altitude coordinate.
ds_precip["pr"].attrs.update(
    {
        "standard_name": "daily_cumulative_precipitation",
        "long_name": "Daily Cumulative Precipitation",
    }
)
ds_precip["alt"].attrs.update(
    {
        "standard_name": "altitude",
        "long_name": "Altitude",
    }
)

# Quantify with the listed units, then immediately dequantify again.
# NOTE(review): presumably this leaves the units recorded as attributes on
# plain arrays — confirm against the pint-xarray documentation.
ds_precip = ds_precip.pint.quantify(
    {
        "pr": "mm / day",
        "lon": "degree",
        "lat": "degree",
        "alt": "meters",
    }
).pint.dequantify()

ds_precip

[32m2024-12-04 13:00:12.663[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mValidating Coordinates...[0m


### Good Stations

In [12]:
# Load the GOOD Stations
# Table of "good" stations, read from the raw-data directory.
red_feten_stations = pd.read_csv(raw_data_dir / "red_feten.csv")
red_feten_stations.shape

(1178, 1)

In [13]:
# Keep only the listed ids that are also present in the station dataset.
# NOTE(review): this rebinds `red_feten_stations` from the loaded table to an
# array of ids, so the cell cannot be re-run without re-running the load cell
# first — consider a separate name for the id array.
red_feten_stations = np.intersect1d(
    red_feten_stations["id"],
    ds_precip["station_id"],
)

logger.info(f"# Red Feten Stations: {len(red_feten_stations)}...")

[32m2024-12-04 13:00:12.833[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1m# Red Feten Stations: 146...[0m


In [14]:
# int8 flag per station: 1 where the station id is in `red_feten_stations`,
# 0 otherwise.
red_feten_mask = (
    ds_precip["station_id"]
    .isin(red_feten_stations)
    .rename("red_feten")
    .astype(np.int8)
)

# Attach the flag to the dataset as the coordinate `red_feten_mask`.
ds_precip = ds_precip.assign_coords(red_feten_mask=red_feten_mask)

ds_precip

In [15]:
# Drop repeated `station_id` labels before saving.
# NOTE(review): the same call already ran right after the dataset was built —
# this looks redundant; confirm and remove one of the two.
ds_precip = ds_precip.drop_duplicates(dim="station_id")
ds_precip

## Save

In [16]:
logger.info("Saving data to disk...")

# Target store inside the clean-data directory.
save_name = "pr_stations_spain.zarr"
full_save_path = clean_data_dir / save_name
logger.debug(f"Save Path: {full_save_path}")

# Written with mode="w"; the returned store object is the cell's displayed output.
ds_precip.to_zarr(full_save_path, mode="w")

[32m2024-12-04 13:00:13.069[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mSaving data to disk...[0m
[32m2024-12-04 13:00:13.070[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [34m[1mSave Path: /home/juanjohn/pool_data/dynev4eo/data/clean/pr_stations_spain.zarr[0m


<xarray.backends.zarr.ZarrStore at 0x7f1985262bc0>