In [1]:
import datetime
import hashlib
import itertools
import json
import os
import pickle
import re
import warnings

from collections import namedtuple
from copy import copy
from pathlib import Path

In [2]:
import netCDF4
from netCDF4 import Dataset

import numpy as np

import pandas as pd

In [3]:
# The two trajectory records share one field layout: the release time plus
# one slot per NetCDF file (out, aer, ant, bio, met). `TrajectoryPaths` holds
# the file locations, `TrajectoryDatasets` the corresponding opened datasets.
_TRAJECTORY_FIELDS = ("date", "out", "aer", "ant", "bio", "met")

TrajectoryPaths = namedtuple("TrajectoryPaths", _TRAJECTORY_FIELDS)
TrajectoryDatasets = namedtuple("TrajectoryDatasets", _TRAJECTORY_FIELDS)

# Container for one machine-learning dataset; fields are grouped per row below.
MLDataset = namedtuple(
    "MLDataset",
    (
        "date", "paths",
        "X_raw", "Y_raw",
        "X_train", "X_valid", "X_test",
        "Y_train", "Y_valid", "Y_test",
        "X_scaler", "Y_scaler",
    ),
)

In [4]:
# Recognises output directory names that begin with `YYYYMMDD_T<HH>`.
# Capture groups, in order: year, month, day, hour (all as digit strings).
# The pattern has no end anchor, so any suffix after the hour is accepted
# when it is used with `.match()`.
OUTDIR_PATTERN = re.compile(r"(\d{4})(\d{2})(\d{2})_T(\d{2})")

In [5]:
# Index every baseline trajectory run by its release time, pairing the run's
# `output.nc` with the emission (AER/ANT/BIO) and meteorology input files that
# carry the same date and release tag.
traj_datetimes = dict()

base = Path.cwd().parent / "trajectories"

for child in (base / "outputs" / "baseline").iterdir():
    if not child.is_dir():
        continue

    match = OUTDIR_PATTERN.match(child.name)

    if match is None:
        # Not a `YYYYMMDD_T<HH>` run directory.
        continue

    # The four capture groups are year, month, day and hour, in that order.
    date = datetime.datetime(*(int(group) for group in match.groups()))

    # Pieces shared by all input file names for this release.
    day_stamp = date.strftime("%Y%m%d")
    # Input files are numbered `24 - hour`, zero-padded to two digits.
    # NOTE(review): a release at hour 0 yields tag "24" - confirm such files exist.
    release_tag = f"{24 - date.hour:02}"
    input_dir = (
        base / "inputs" / "baseline" / "HYDE_BASE_Y2018" / f"OUTPUT_bwd_{day_stamp}"
    )
    emissions_dir = input_dir / "EMISSIONS_0422"

    out_path = child / "output.nc"
    aer_path = emissions_dir / f"{day_stamp}_7daybwd_Hyde_traj_AER_{release_tag}_L3.nc"
    ant_path = emissions_dir / f"{day_stamp}_7daybwd_Hyde_traj_ANT_{release_tag}_L3.nc"
    bio_path = emissions_dir / f"{day_stamp}_7daybwd_Hyde_traj_BIO_{release_tag}_L3.nc"
    met_path = input_dir / "METEO" / f"METEO_{day_stamp}_R{release_tag}.nc"

    # Fail loudly, naming exactly the files that are absent, rather than
    # silently indexing an incomplete trajectory.
    missing = [
        path
        for path in (out_path, aer_path, ant_path, bio_path, met_path)
        if not path.exists()
    ]
    if missing:
        raise FileNotFoundError(
            f"trajectory {date.isoformat()} is missing file(s): "
            + ", ".join(str(path) for path in missing)
        )

    traj_datetimes[date] = TrajectoryPaths(
        date=date, out=out_path, aer=aer_path, ant=ant_path, bio=bio_path, met=met_path,
    )

# Distinct calendar days that have at least one trajectory, in ascending order.
traj_dates = sorted({d.date() for d in traj_datetimes})

In [6]:
def load_trajectory_dataset(paths: TrajectoryPaths) -> TrajectoryDatasets:
    """Open the five NetCDF files of one trajectory in read-only mode.

    Args:
        paths: File locations for a single trajectory; the `out`, `aer`,
            `ant`, `bio` and `met` fields are each opened as a `Dataset`.

    Returns:
        A `TrajectoryDatasets` carrying `paths.date` and the opened datasets
        under the same field names. The datasets are returned open; closing
        them is left to the caller.

    Raises:
        Whatever `Dataset(...)` raises when a file cannot be opened. Any
        dataset already opened by this call is closed before the error
        propagates, so a partial failure does not leak file handles.
    """
    opened = {}

    try:
        # Same opening order as the field order: out, aer, ant, bio, met.
        for kind in ("out", "aer", "ant", "bio", "met"):
            opened[kind] = Dataset(getattr(paths, kind), "r", format="NETCDF4")
    except Exception:
        for dataset in opened.values():
            dataset.close()
        raise

    return TrajectoryDatasets(date=paths.date, **opened)

In [7]:
# Sample trajectory used by the inspection cells below: the release at
# 2018-05-15 19:00. Raises KeyError if that release was not indexed.
dt = datetime.datetime(2018, 5, 15, 19)
ds = load_trajectory_dataset(traj_datetimes[dt])

In [8]:
def _met_axes(var):
    """Describe which coordinate(s) of `ds.met` the variable is laid out along.

    The candidates are tried in order (time, level, time x level) and the
    coordinate shapes are only looked up when a comparison needs them.
    Raises `Exception("unexpected shape")` if none of them matches.
    """
    shape = var.shape
    if shape == ds.met["time"].shape:
        return "indexed by time"
    if shape == ds.met["lev"].shape:
        return "indexed by height level"
    if shape == (*ds.met["time"].shape, *ds.met["lev"].shape):
        return "indexed by time and height level"
    raise Exception("unexpected shape")


# Summarise the selected meteorology variables: name, long name, units, layout.
for name, var in ds.met.variables.items():
    if name in ("time", "lev", "t", "q", "ssr", "lsm", "blh"):
        dim = _met_axes(var)
        print(name, getattr(var, "long_name", None), getattr(var, "units", None), dim)

time time Seconds since release at: 2018-05-15 19:00:00 indexed by time
lev None None indexed by height level
t Temperature K indexed by time and height level
q Specific humidity kg kg**-1 indexed by time and height level
ssr Surface net solar radiation W m**-2 indexed by time
lsm Land-sea mask (0 - 1) indexed by time
blh atmospheric_boundary_layer_height m indexed by time


In [9]:
def _aer_axes(var):
    """Describe which coordinate(s) of `ds.aer` the variable is laid out along.

    The candidates are tried in order (time, layer, layer x time) and the
    coordinate shapes are only looked up when a comparison needs them.
    Raises `Exception("unexpected shape")` if none of them matches.
    """
    shape = var.shape
    if shape == ds.aer["time"].shape:
        return "indexed by time"
    if shape == ds.aer["layer"].shape:
        return "indexed by height level"
    if shape == (*ds.aer["layer"].shape, *ds.aer["time"].shape):
        return "indexed by height level and time"
    raise Exception("unexpected shape")


# Summarise every aerosol-emission variable except the listed bookkeeping ones.
for name, var in ds.aer.variables.items():
    if name not in ("lat", "lon", "bottom_layer_height", "mid_layer_height", "top_layer_height", "SRRsum"):
        dim = _aer_axes(var)
        print(name, getattr(var, "units", None), dim)

layer Meters from ground indexed by height level
time Seconds since release at: 2018-05-15 19:00:00 indexed by time
3-10nm kg m-2 s-1 indexed by height level and time
10-20nm kg m-2 s-1 indexed by height level and time
20-30nm kg m-2 s-1 indexed by height level and time
30-50nm kg m-2 s-1 indexed by height level and time
50-70nm kg m-2 s-1 indexed by height level and time
70-100nm kg m-2 s-1 indexed by height level and time
100-200nm kg m-2 s-1 indexed by height level and time
200-400nm kg m-2 s-1 indexed by height level and time
400-1000nm kg m-2 s-1 indexed by height level and time


In [10]:
def _ant_axes(var):
    """Describe which coordinate(s) of `ds.ant` the variable is laid out along.

    The candidates are tried in order (time, layer, layer x time) and the
    coordinate shapes are only looked up when a comparison needs them.
    Raises `Exception("unexpected shape")` if none of them matches.
    """
    shape = var.shape
    if shape == ds.ant["time"].shape:
        return "indexed by time"
    if shape == ds.ant["layer"].shape:
        return "indexed by height level"
    if shape == (*ds.ant["layer"].shape, *ds.ant["time"].shape):
        return "indexed by height level and time"
    raise Exception("unexpected shape")


# Summarise every variable of the ANT emission file except the listed bookkeeping ones.
for name, var in ds.ant.variables.items():
    if name not in ("lat", "lon", "bottom_layer_height", "mid_layer_height", "top_layer_height", "SRRsum"):
        dim = _ant_axes(var)
        print(name, getattr(var, "units", None), dim)

layer Meters from ground indexed by height level
time Seconds since release at: 2018-05-15 19:00:00 indexed by time
co kg m-2 s-1 indexed by height level and time
nox kg m-2 s-1 indexed by height level and time
co2 kg m-2 s-1 indexed by height level and time
nh3 kg m-2 s-1 indexed by height level and time
ch4 kg m-2 s-1 indexed by height level and time
so2 kg m-2 s-1 indexed by height level and time
nmvoc kg m-2 s-1 indexed by height level and time
alcohols kg m-2 s-1 indexed by height level and time
ethane kg m-2 s-1 indexed by height level and time
propane kg m-2 s-1 indexed by height level and time
butanes kg m-2 s-1 indexed by height level and time
pentanes kg m-2 s-1 indexed by height level and time
hexanes kg m-2 s-1 indexed by height level and time
ethene kg m-2 s-1 indexed by height level and time
propene kg m-2 s-1 indexed by height level and time
acetylene kg m-2 s-1 indexed by height level and time
isoprene kg m-2 s-1 indexed by height level and time
monoterpenes kg m-2 s-1 

In [11]:
def _bio_axes(var):
    """Describe which coordinate(s) of `ds.bio` the variable is laid out along.

    The candidates are tried in order (time, layer, layer x time). The
    `layer` coordinate is deliberately looked up only after the time
    comparison has failed, so time-indexed variables never touch it.
    Raises `Exception("unexpected shape")` if none of them matches.
    """
    shape = var.shape
    if shape == ds.bio["time"].shape:
        return "indexed by time"
    if shape == ds.bio["layer"].shape:
        return "indexed by height level"
    if shape == (*ds.bio["layer"].shape, *ds.bio["time"].shape):
        return "indexed by height level and time"
    raise Exception("unexpected shape")


# Summarise every variable of the BIO emission file except the listed bookkeeping ones.
for name, var in ds.bio.variables.items():
    if name not in ("lat", "lon", "bottom_layer_height", "mid_layer_height", "top_layer_height", "SRRsum"):
        dim = _bio_axes(var)
        print(name, getattr(var, "units", None), dim)

time Seconds since release at: 2018-05-15 19:00:00 indexed by time
acetaldehyde kg m-2 s-1 indexed by time
acetone kg m-2 s-1 indexed by time
butanes-and-higher-alkanes kg m-2 s-1 indexed by time
butenes-and-higher-alkenes kg m-2 s-1 indexed by time
CH4 kg m-2 s-1 indexed by time
CO kg m-2 s-1 indexed by time
ethane kg m-2 s-1 indexed by time
ethanol kg m-2 s-1 indexed by time
ethene kg m-2 s-1 indexed by time
formaldehyde kg m-2 s-1 indexed by time
hydrogen-cyanide kg m-2 s-1 indexed by time
isoprene kg m-2 s-1 indexed by time
MBO kg m-2 s-1 indexed by time
methanol kg m-2 s-1 indexed by time
methyl-bromide kg m-2 s-1 indexed by time
methyl-chloride kg m-2 s-1 indexed by time
methyl-iodide kg m-2 s-1 indexed by time
other-aldehydes kg m-2 s-1 indexed by time
other-ketones kg m-2 s-1 indexed by time
other-monoterpenes kg m-2 s-1 indexed by time
pinene-a kg m-2 s-1 indexed by time
pinene-b kg m-2 s-1 indexed by time
propane kg m-2 s-1 indexed by time
propene kg m-2 s-1 indexed by time
s

In [12]:
# Summarise the selected output-file variables: name, long name and unit.
# This file stores the unit under the attribute `unit` (singular).
for name, var in ds.out.variables.items():
    if name in ("time", "lev", "dp_dry_fs", "nconc_par"):
        long_name = getattr(var, "long_name", None)
        unit = getattr(var, "unit", None)
        print(name, long_name, unit)

time time since the beginning of month s
lev height above the ground m
dp_dry_fs dry radius of aerosol particles in each size bin m
nconc_par particle number concentration # m-3
