In [1]:
import datetime
from pathlib import Path

import colormaps
import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import xarray as xr
from jetutils.anyspell import get_persistent_jet_spells, mask_from_spells_pl, subset_around_onset
from jetutils.clustering import Experiment
from jetutils.data import DataHandler, open_da, smooth, standardize, compute_all_dailymeans, compute_all_smoothed_anomalies
from jetutils.definitions import (
    DATADIR,
    KAPPA,
    YEARS,
    PRETTIER_VARNAME,
    compute,
    get_region,
    infer_direction,
    polars_to_xarray,
    xarray_to_polars,
)
from jetutils.jet_finding import JetFindingExperiment, gather_normal_da_jets, iterate_over_year_maybe_member, average_jet_categories
from jetutils.plots import COLORS, Clusterplot, gather_normal_da_jets_wrapper, interp_jets_to_zero_one
from matplotlib.cm import ScalarMappable
from matplotlib.colors import BoundaryNorm
from matplotlib.ticker import MaxNLocator
from tqdm import tqdm, trange

%load_ext autoreload
%autoreload 2
%matplotlib inline

Found config override file at  /storage/homefs/hb22g102/.jetutils.ini
Guessed N_WORKERS :  10
Guessed MEMORY_LIMIT :  225280


## temperature comparison

In [2]:
# Reduce ERA5 pressure-level "t300" with np.mean via jetutils' daily-mean helper.
# NOTE(review): presumably writes one "dailymean" file per year next to the source data — confirm in jetutils.data.
compute_all_dailymeans("ERA5", "plev", "t300", np.mean)

 98%|█████████▊| 63/64 [06:16<00:05,  5.98s/it]


In [3]:
# Anomalies of the ERA5 "t300" daily means relative to a "dayofyear" climatology,
# with the smoothing spec {"dayofyear": ("win", 15)} passed through to jetutils.
# NOTE(review): presumably ("win", 15) is a 15-day window smoother — confirm in jetutils.data.smooth.
compute_all_smoothed_anomalies("ERA5", "plev", "t300", "dailymean", "dayofyear", {"dayofyear": ("win", 15)})

[########################################] | 100% Completed | 86.39 s


100%|██████████| 64/64 [04:21<00:00,  4.08s/it]


# CESM clims

In [6]:
# Day-of-year climatology of the CESM2 PRECL "past" store.
da_tp = xr.open_zarr("/storage/workspaces/giub_meteo_impacts/ci01/CESM2/PRECL/past.zarr")

# Average every calendar day over all years, smooth along dayofyear, then
# materialise the result before writing it next to the source store.
clim = compute(
    smooth(
        da_tp.groupby("time.dayofyear").mean(),
        {"dayofyear": ("win", 15)},
    ),
    progress_flag=True,
)
clim.to_zarr("/storage/workspaces/giub_meteo_impacts/ci01/CESM2/PRECL/past_clim.zarr")

[########################################] | 100% Completed | 126.37 s


<xarray.backends.zarr.ZarrStore at 0x7ff12afe3d00>

In [None]:
# Subtract the day-of-year climatology (`clim`, from the cell above) from every
# time step of `da_tp`, evaluate, and store the anomalies.
anom = compute(
    da_tp.groupby("time.dayofyear") - clim,
    progress_flag=True,
)
anom.to_zarr("/storage/workspaces/giub_meteo_impacts/ci01/CESM2/PRECL/past_anom.zarr")

In [None]:
# Day-of-year climatology of the CESM2 TS "past" store.
da_T = xr.open_zarr("/storage/workspaces/giub_meteo_impacts/ci01/CESM2/TS/past.zarr")

# Fix: the climatology must be built from `da_T` (the store opened above), not
# from the PRECL dataset `da_tp`, and must be written under TS/ rather than on
# top of the PRECL climatology. A dedicated name keeps the PRECL `clim` intact
# for the anomaly cell.
clim_T = da_T.groupby("time.dayofyear").mean()
clim_T = smooth(clim_T, {'dayofyear': ('win', 15)})
clim_T = compute(clim_T, progress_flag=True)
clim_T.to_zarr("/storage/workspaces/giub_meteo_impacts/ci01/CESM2/TS/past_clim.zarr")

In [7]:
# da_tp = xr.open_zarr("/storage/workspaces/giub_meteo_impacts/ci01/CESM2/PRECL/future.zarr")

# clim = da_tp.groupby("time.dayofyear").mean()
# clim = smooth(clim, {'dayofyear': ('win', 15)})
# clim = compute(clim, progress_flag=True)
# clim.to_zarr("/storage/workspaces/giub_meteo_impacts/ci01/CESM2/PRECL/future_clim.zarr")

# create jet relative climatologies

In [2]:
# ERA5 6-hourly "high_wind" on pressure levels; the trailing numbers are passed
# straight to DataHandler.from_specs (presumably the lon/lat bounds — confirm).
dh = DataHandler.from_specs("ERA5", "plev", "high_wind", "6H", "all", None, -80, 40, 15, 80)
exp = JetFindingExperiment(dh)

# find_jets is called first; its return value is replaced right away by the
# categorised table, whose "time" column is cast to millisecond precision.
all_jets_one_df = exp.find_jets(
    force=False,
    alignment_thresh=0.6,
    base_s_thresh=0.55,
    int_thresh_factor=0.35,
    hole_size=10,
)
all_jets_one_df = exp.categorize_jets(
    None,
    ["s", "theta"],
    force=False,
    n_init=5,
    init_params="k-means++",
    mode="week",
).cast({"time": pl.Datetime("ms")})

# Keep every jet with is_polar < 0.5, and jets with is_polar > 0.5 only when
# their "int" exceeds 5e8.
phat_jets = all_jets_one_df.filter(
    (pl.col("is_polar") < 0.5)
    | ((pl.col("is_polar") > 0.5) & (pl.col("int") > 5e8))
)
# Same jets, with "jet ID" overwritten by the binary category (1 when is_polar > 0.5, else 0).
phat_jets_catd = phat_jets.with_columns(
    (pl.col("is_polar") > 0.5).cast(pl.UInt32()).alias("jet ID")
)

In [3]:
def create_jet_relative_clim(jets, path, da, suffix=""):
    """
    Build a day-of-year climatology of ``da`` sampled around jets and save it as netCDF.

    For every chunk yielded by ``iterate_over_year_maybe_member`` the field is
    gathered around the jets (``gather_normal_da_jets``), rescaled with
    ``interp_jets_to_zero_one`` and averaged per
    ``(time, is_polar > 0.5, norm_index, n)``. All chunks are then averaged per
    ``(dayofyear, is_polar, norm_index, n)`` and written to
    ``path / f"{da.name}{suffix}_relative_clim.nc"``.

    Parameters
    ----------
    jets : pl.DataFrame
        Jet table with at least the columns "time", "jet ID" and "is_polar".
    path : pathlib.Path
        Directory in which the netCDF file is written.
    da : xr.DataArray
        Named field to sample; ``da.name`` is used for the interpolated column
        and for the output file name.
    suffix : str, optional
        Inserted between ``da.name`` and ``"_relative_clim.nc"``.

    Raises
    ------
    ValueError
        If no chunk could be processed, so there is nothing to average.
    """
    import warnings

    # The interpolated column name depends only on the field, so bind it before
    # the loop: it stays defined even when the loop ends early.
    varname = da.name + "_interp"

    # NOTE(review): dt.round("1d") rounds to the nearest day, so afternoon
    # timestamps move to the following day — confirm this is intended over truncation.
    jets = jets.with_columns(pl.col("time").dt.round("1d"))
    # Renumber jets per (rounded) timestamp. The window form gives the same ids
    # as the former group_by/agg/explode when rows of a timestamp are contiguous,
    # and stays row-aligned when they are not.
    jets = jets.with_columns(pl.col("jet ID").rle_id().over("time"))

    # Each item is (polars filter expressions for `jets`, selection kwargs for `da`).
    indexer = iterate_over_year_maybe_member(jets, da)
    to_average = []
    for jets_filter, da_selection in tqdm(indexer, total=len(YEARS)):
        jets_ = jets.filter(*jets_filter)
        da_ = da.sel(**da_selection)
        try:
            jets_with_interp = gather_normal_da_jets(jets_, da_, half_length=20)
        except (KeyError, ValueError) as err:
            # Same best-effort behaviour as before (stop at the first chunk that
            # cannot be processed), but no longer silent.
            warnings.warn(
                f"create_jet_relative_clim: stopping after {len(to_average)} chunk(s) "
                f"for {da.name!r}: {err!r}"
            )
            break
        jets_with_interp = interp_jets_to_zero_one(jets_with_interp, [varname, "is_polar"])
        jets_with_interp = jets_with_interp.group_by(
            "time", pl.col("is_polar") > 0.5, "norm_index", "n", maintain_order=True
        ).agg(pl.col(varname).mean())
        to_average.append(jets_with_interp)

    if not to_average:
        # pl.concat([]) would raise a ValueError anyway; say why instead.
        raise ValueError(
            f"No jet-relative data could be gathered for {da.name!r}; nothing to average."
        )

    to_average = pl.concat(to_average)
    clim = (
        to_average
        .group_by(pl.col("time").dt.ordinal_day().alias("dayofyear"), "is_polar", "norm_index", "n")
        .agg(pl.col(varname).mean())
        .sort("dayofyear", "is_polar", "norm_index", "n")
    )
    clim_ds = polars_to_xarray(clim, ["dayofyear", "is_polar", "n", "norm_index"])
    clim_ds.to_netcdf(path.joinpath(f"{da.name}{suffix}_relative_clim.nc"))

In [4]:
# Jet-relative climatologies (phat jets) for several ERA5 daily fields.
# Every field goes through the same steps: open lazily, load, build the
# climatology, free the memory before the next one.
args = ["all", None, *get_region(exp.ds), "all"]
phat_clim_fields = [
    # ("plev", "t300", "dailymean"),  # disabled
    ("surf", "t2m", "dailymean"),
    ("surf", "tp", "dailysum"),
    ("thetalev", "apvs", "dailyany"),
    ("thetalev", "cpvs", "dailyany"),
]
for level_type, short_name, resolution in phat_clim_fields:
    field = compute(open_da("ERA5", level_type, short_name, resolution, *args))
    create_jet_relative_clim(phat_jets, exp.path, field, suffix="_phat")
    del field

100%|██████████| 64/64 [07:10<00:00,  6.72s/it]
100%|██████████| 64/64 [06:56<00:00,  6.50s/it]
100%|██████████| 64/64 [07:11<00:00,  6.74s/it]
100%|██████████| 64/64 [07:07<00:00,  6.68s/it]


In [5]:
# Jet-relative climatology of anomalies: the two extra trailing arguments ask
# open_da for the "dayofyear" anomalies with the {"dayofyear": ("win", 15)} smoothing spec.
args = ["all", None, *get_region(exp.ds), "all", "dayofyear", {"dayofyear": ("win", 15)}]
da_T = open_da("ERA5", "surf", "t2m", "dailymean", *args)
da_T = compute(da_T)
# Fix: create_jet_relative_clim takes (jets, path, da, suffix). The previous
# call passed (exp, da, suffix), which no longer matches the definition in this notebook.
# All jets are used so that the "_anom" file keeps its meaning; pass
# `phat_jets` with suffix="_phat_anom" for the phat-jet variant instead.
create_jet_relative_clim(all_jets_one_df, exp.path, da_T, suffix="_anom")
del da_T
# da_T = open_da("ERA5", "plev", "t300", "dailymean", *args)
# da_T = compute(da_T)
# create_jet_relative_clim(all_jets_one_df, exp.path, da_T, suffix="_anom")
# del da_T
# da_tp = open_da("ERA5", "surf", "tp", "dailysum", *args)
# da_tp = compute(da_tp)
# create_jet_relative_clim(all_jets_one_df, exp.path, da_tp, suffix="_anom")
# del da_tp
# da_apvs = open_da("ERA5", "thetalev", "apvs", "dailyany", *args)
# da_apvs = compute(da_apvs)
# create_jet_relative_clim(all_jets_one_df, exp.path, da_apvs, suffix="_anom")
# del da_apvs
# da_cpvs = open_da("ERA5", "thetalev", "cpvs", "dailyany", *args)
# da_cpvs = compute(da_cpvs)
# create_jet_relative_clim(all_jets_one_df, exp.path, da_cpvs, suffix="_anom")
# del da_cpvs

100%|██████████| 64/64 [05:34<00:00,  5.23s/it]


# arco-era5 tests

In [3]:
from pathlib import Path

# Pull 200 hPa temperature from the public ARCO-ERA5 store and write it per
# year, both 6-hourly and as daily means.
ds = xr.open_zarr(
    "gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3",
    chunks=None,
    storage_options=dict(token="anon"),
)
# Restrict to the time span the store itself declares as valid.
ar_full_37_1h = ds.sel(
    time=slice(ds.attrs["valid_time_start"], ds.attrs["valid_time_stop"])
)

# Every 6th hour, latitude >= 0, level 200, and every second grid point in
# both horizontal directions.
temp_full = (
    ar_full_37_1h["temperature"]
    .sel(
        time=ar_full_37_1h.time.dt.hour % 6 == 0,
        latitude=ar_full_37_1h.latitude >= 0,
        level=200,
    )
    .isel(longitude=slice(None, None, 2), latitude=slice(None, None, 2))
)

temp_full = standardize(temp_full).chunk("auto")

base_path_1 = Path(f"{DATADIR}/ERA5/plev/t200/6H")
base_path_2 = Path(f"{DATADIR}/ERA5/plev/t200/dailymean")
# Idempotent directory creation: works on a fresh tree and on re-runs, so it no
# longer has to be commented in and out by hand.
base_path_1.mkdir(parents=True, exist_ok=True)
base_path_2.mkdir(parents=True, exist_ok=True)
for year in YEARS:
    print(year)
    opath_1 = base_path_1.joinpath(f"{year}.nc")
    opath_2 = base_path_2.joinpath(f"{year}.nc")

    # The daily-mean file is written last, so its presence marks the year as done.
    if opath_2.is_file():
        continue
    this_temp = temp_full.sel(time=temp_full.time.dt.year == year)
    # Drop the leftover scalar "lev" coordinate of the single selected level.
    this_temp = this_temp.reset_coords("lev", drop=True)
    this_temp = compute(this_temp, progress_flag=True)
    this_temp.to_netcdf(opath_1)

    this_temp = this_temp.resample(time="1d").mean()
    this_temp.to_netcdf(opath_2)

1959
[                                        ] | 0% Completed | 97.06 sms


KeyboardInterrupt: 

In [5]:
# Display the repr of `temp_full` (built in the cell above) to inspect its dimensions and chunking.
temp_full

In [None]:
# Add a "theta" variable to the monthly "flat_wind" daily-mean files, using
# ARCO-ERA5 temperature taken at the level stored in each file's "lev" variable.
ds = xr.open_zarr(
    "gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3",
    chunks=None,
    storage_options=dict(token="anon"),
)
# Restrict to the time span the store itself declares as valid.
ar_full_37_1h = ds.sel(
    time=slice(ds.attrs["valid_time_start"], ds.attrs["valid_time_stop"])
)

# Every 6th hour, latitude >= 0, six upper-tropospheric levels, and every
# second grid point in both horizontal directions.
temp_full = (
    ar_full_37_1h["temperature"]
    .sel(
        time=ar_full_37_1h.time.dt.hour % 6 == 0,
        latitude=ar_full_37_1h.latitude >= 0,
        level=[175, 200, 225, 250, 300, 350],
    )
    .isel(longitude=slice(None, None, 2), latitude=slice(None, None, 2))
)

temp_full = standardize(temp_full)

orig_path = Path(f"{DATADIR}/ERA5/plev/flat_wind/dailymean")
base_path = Path(f"{DATADIR}/ERA5/plev/flat_wind/dailymean_2")
base_path.mkdir(parents=True, exist_ok=True)
for year in tqdm(YEARS):
    for month in trange(1, 13, leave=False):
        month_str = str(month).zfill(2)
        opath = base_path.joinpath(f"{year}{month_str}.nc")
        # Existing outputs are kept, which makes the loop resumable.
        if opath.is_file():
            continue
        ipath = orig_path.joinpath(f"{year}{month_str}.nc")
        # A context manager closes the monthly file after writing, and a
        # dedicated name keeps `ds` bound to the ARCO store opened above.
        with xr.open_dataset(ipath) as ds_month:
            # NOTE(review): the daily-mean timestamps select single instants of the
            # 6-hourly temperature, not a daily mean — confirm this is intended.
            this_temp = temp_full.sel(time=ds_month.time.values, lev=ds_month["lev"])
            # Potential temperature: T * (1000 / p) ** KAPPA (assumes "lev" is in hPa).
            this_temp = this_temp * (1000 / this_temp.lev) ** KAPPA
            this_temp = this_temp.reset_coords("lev", drop=True)
            ds_month["theta"] = compute(this_temp, progress_flag=True)
            ds_month.to_netcdf(opath)

# New PV streamer DataArrays: any() over levels

In [2]:
# Load the ERA5 PV-streamer table (350K, 1959-2022 per the file name) and display it as the cell output.
pl.read_parquet("/storage/workspaces/giub_meteo_impacts/ci01/ERA5/RWB_index/era5_pv_streamers_350K_1959-2022.parquet")

date,level,com,mean_var,event_area,intensity,geometry
datetime[ns],f64,list[f64],f64,f64,f64,binary
1959-01-01 00:00:00,-2.0,"[-147.0, -18.0]",-2.09,375772.89,23.35,"b""\x01\x03\x00\x00\x00\x01\x00\x00\x00\x14\x00\x00\x00\x00\x00\x00\x00\x00`b\xc0\x00\x00\x00\x00\x00\x004\xc0\x00\x00\x00\x00\x00`b\xc0\x00\x00\x00\x00\x00\x003\xc0\x00\x00\x00\x00\x00`b\xc0\x00\x00\x00\x00\x00\x002""…"
1959-01-01 00:00:00,-2.0,"[-132.0, -13.0]",-2.16,529737.55,-0.45,"b""\x01\x03\x00\x00\x00\x01\x00\x00\x00\x12\x00\x00\x00\x00\x00\x00\x00\x00\xe0`\xc0\x00\x00\x00\x00\x00\x000\xc0\x00\x00\x00\x00\x00\xc0`\xc0\x00\x00\x00\x00\x00\x00.\xc0\x00\x00\x00\x00\x00\xc0`\xc0\x00\x00\x00\x00\x00\x00,""…"
1959-01-01 00:00:00,-2.0,"[-20.0, -18.0]",-2.36,1.1971e6,88.36,"b""\x01\x03\x00\x00\x00\x01\x00\x00\x00\x20\x00\x00\x00\x00\x00\x00\x00\x00\x008\xc0\x00\x00\x00\x00\x00\x007\xc0\x00\x00\x00\x00\x00\x008\xc0\x00\x00\x00\x00\x00\x006\xc0\x00\x00\x00\x00\x00\x008\xc0\x00\x00\x00\x00\x00\x005""…"
1959-01-01 00:00:00,-2.0,"[77.0, -30.0]",-1.82,481343.94,-2.95,"b""\x01\x03\x00\x00\x00\x01\x00\x00\x00\x13\x00\x00\x00\x00\x00\x00\x00\x00@R@\x00\x00\x00\x00\x00\x00@\xc0\x00\x00\x00\x00\x00\x80R@\x00\x00\x00\x00\x00\x00@\xc0\x00\x00\x00\x00\x00\xc0R@\x00\x00\x00\x00\x00\x00@""…"
1959-01-01 00:00:00,-2.0,"[82.0, -27.0]",-2.17,462190.13,-5.23,"b""\x01\x03\x00\x00\x00\x01\x00\x00\x00\x13\x00\x00\x00\x00\x00\x00\x00\x00\xc0T@\x00\x00\x00\x00\x00\x00>\xc0\x00\x00\x00\x00\x00\x80T@\x00\x00\x00\x00\x00\x00>\xc0\x00\x00\x00\x00\x00@T@\x00\x00\x00\x00\x00\x00=""…"
…,…,…,…,…,…,…
2022-12-31 18:00:00,-2.0,"[58.0, -29.0]",-2.69,1.2129e6,42.57,"b""\x01\x03\x00\x00\x00\x01\x00\x00\x00#\x00\x00\x00\x00\x00\x00\x00\x00\x00L@\x00\x00\x00\x00\x00\x80A\xc0\x00\x00\x00\x00\x00\x80L@\x00\x00\x00\x00\x00\x80A\xc0\x00\x00\x00\x00\x00\x80L@\x00\x00\x00\x00\x00\x00A""…"
2022-12-31 18:00:00,-2.0,"[169.0, -29.0]",-2.19,714050.86,78.11,"b""\x01\x03\x00\x00\x00\x01\x00\x00\x00\x16\x00\x00\x00\x00\x00\x00\x00\x00\x80d@\x00\x00\x00\x00\x00\x00;\xc0\x00\x00\x00\x00\x00\xa0d@\x00\x00\x00\x00\x00\x00;\xc0\x00\x00\x00\x00\x00\xc0d@\x00\x00\x00\x00\x00\x00;""…"
2022-12-31 18:00:00,-2.0,"[171.0, -35.0]",-1.85,415423.96,22.4,"b""\x01\x03\x00\x00\x00\x01\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00\x00\xa0e@\x00\x00\x00\x00\x00\x00?\xc0\x00\x00\x00\x00\x00\x80e@\x00\x00\x00\x00\x00\x00@\xc0\x00\x00\x00\x00\x00`e@\x00\x00\x00\x00\x00\x00@""…"
2022-12-31 18:00:00,2.0,"[-107.0, 6.0]",2.26,3.2906e6,30.49,"b""\x01\x03\x00\x00\x00\x01\x00\x00\x00S\x00\x00\x00\x00\x00\x00\x00\x00\xc0Z\xc0\x00\x00\x00\x00\x00\x002@\x00\x00\x00\x00\x00\xc0Z\xc0\x00\x00\x00\x00\x00\x001@\x00\x00\x00\x00\x00\xc0Z\xc0\x00\x00\x00\x00\x00\x000""…"


In [13]:
# Yearly "dailyany" files for apvs: True where the 6-hourly field is non-zero
# on any level at any time step of the day.
out_dir = Path("/storage/workspaces/giub_meteo_impacts/ci01/ERA5/thetalev/apvs/dailyany")
for year in tqdm(YEARS):
    opath = out_dir.joinpath(f"{year}.nc")
    # Years already written are skipped, so the loop can be resumed.
    if opath.is_file():
        continue
    da = (
        open_da("ERA5", "thetalev", "apvs", "6H", [year], None, None, None, None, None, "all")
        .astype(np.int8)
        .any("lev")
        .resample(time="1D")
        .any()
    )
    da = compute(da)
    # Fix: a bare `to_netcdf` is not imported anywhere in this notebook; use
    # the xarray method, as every other cell does.
    da.to_netcdf(opath)

  0%|          | 0/64 [00:09<?, ?it/s]


KeyboardInterrupt: 

# CESM

### new download with urls

## New merger script: download, then postprocess

In [45]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from jetutils.definitions import DATADIR, KAPPA, compute
from jetutils.data import standardize, flatten_by, extract
import intake
import numpy as np
import xarray as xr
from pathlib import Path
from dask.diagnostics import ProgressBar

# --- configuration -----------------------------------------------------------
varname = "PRECL"
# Catalog component: "atm" for atmospheric variables like wind, "ocn" for ocean
# variables; land variables like RAIN use the land component
# (presumably keyed "lnd" in the catalog — verify).
component = "atm"
forcing_variant = "cmip6" # other option is "smbb", which stands for "SMoothed Biomass Burning"
out_path = Path(DATADIR, "CESM2", varname)
minlon, maxlon, minlat, maxlat = None, None, 0, 90
levels = None
years = {
    "past": np.arange(1970, 2010),
    "future": np.arange(2060, 2100),
}


def _subset_period(ds, period_years):
    """Standardize `ds`, keep only `period_years` and the configured lon/lat box (and levels, if set)."""
    subset = (
        standardize(ds)
        .reset_coords("time_bnds", drop=True)
        .squeeze()
        .isel(time=np.isin(ds.time.dt.year, period_years))
        .sel(lon=slice(minlon, maxlon))
        .sel(lat=slice(minlat, maxlat))
    )
    if levels is not None and "lev" in subset.dims:
        subset = subset.isel(lev=levels)
    return subset


def _write_zarr(ds, directory):
    """Write `ds` to `directory / "ds.zarr"` (overwriting), showing a dask progress bar."""
    directory.mkdir(parents=True, exist_ok=True)
    # Drop the encoding of every variable before writing. A dedicated loop name
    # is used so that the `varname` configuration value is not overwritten.
    for data_var in list(ds.data_vars):
        ds[data_var] = ds[data_var].drop_encoding()
    delayed_store = ds.to_zarr(directory.joinpath("ds.zarr"), compute=False, mode="w")
    with ProgressBar():
        delayed_store.compute()


# --- open the catalog ----------------------------------------------------------
col_url = (
    "https://ncar-cesm2-lens.s3-us-west-2.amazonaws.com/catalogs/aws-cesm2-le.json"
)
catalog = intake.open_esm_datastore(col_url)

catalog_subset = catalog.search(variable=varname, frequency='daily', forcing_variant=forcing_variant)
dsets = catalog_subset.to_dataset_dict(storage_options={'anon':True})

ds_past = dsets[f"{component}.historical.daily.{forcing_variant}"]
ds_future = dsets[f"{component}.ssp370.daily.{forcing_variant}"]

# --- subset (lazy), then download and write -----------------------------------
ds_past_ns = _subset_period(ds_past, years["past"])
ds_future_ns = _subset_period(ds_future, years["future"])

_write_zarr(ds_past_ns, out_path.joinpath("historical"))
_write_zarr(ds_future_ns, out_path.joinpath("ssp370"))
#     ds_past_ns = ds_past_ns.load()
# ds_past_ns.to_netcdf(out_path.joinpath(out_name_past))
# del ds_past_ns # free up memory

# with ProgressBar():
#     ds_future_ns = ds_future_ns.load()
# ds_future_ns.to_netcdf(out_path.joinpath(out_name_future))


--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.frequency.forcing_variant'


[##################                      ] | 45% Completed | 10m 57ss


KeyboardInterrupt: 

### new cesm zarrification

In [3]:
# Work in progress: merge the per-member, per-year netCDF files into one zarr
# store with a "member" dimension. File stems are "<member>-<year>".
basepath = Path("/storage/workspaces/giub_meteo_impacts/ci01/CESM2/high_wind/ssp370")
paths = list(basepath.glob("*.nc"))
names = [nc_file.stem.split("-") for nc_file in paths]
members = [parts[0] for parts in names]
years = [parts[1] for parts in names]
for i, member in enumerate(tqdm(np.unique(members))):
    # The first member creates the store; later ones would be appended along "member".
    if i == 0:
        kwargs = {"mode": "w"}
    else:
        kwargs = {"mode": "a", "append_dim": "member"}
    member_glob = basepath.joinpath(f"{member}-*.nc").as_posix()
    da = xr.open_mfdataset(member_glob)
    # Fixed-width unicode for the member label.
    da["member"] = da["member"].astype("<U15")
    da = da.expand_dims("member").copy(deep=True)
    # Debugging stop: only the first member is prepared and nothing is written yet.
    break
    # da.to_zarr(basepath.joinpath("ds.zarr"), **kwargs)

  0%|          | 0/50 [00:03<?, ?it/s]
