# Generate name conventions template for CoCliCo fast-track datasets 

In [None]:
# Use the black code formatter
%load_ext lab_black

### Configure OS-independent paths

In [None]:
import os
import pathlib
import sys

# Expose the parent of the working directory to the import system so that
# modules located one level above this notebook can be imported.
cwd = pathlib.Path().resolve()
sys.path.append(str(cwd.parent))


# Anchor points: the user's home directory and the filesystem root it sits on
home = pathlib.Path.home()
root = home.root

# Local data/temp directories (under home) and the remote "p" drive (under root)
local_data_dir = home / "ddata"
local_temp_dir = local_data_dir / "temp"
p_dir = pathlib.Path(root) / "p"
coclico_data_dir = p_dir / "11205479-coclico" / "data"

In [None]:
# Import libraries
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr

In [None]:
import re

# List every entry in the data directory; `fnames` holds the suffix-stripped
# entry names.
fpaths = list(coclico_data_dir.iterdir())
fnames = [fp.stem for fp in fpaths]

# Keep only sub-directories whose name starts with a two-digit prefix 00-08.
# NOTE(review): the previous comment described the range as 01-08, but the
# pattern also accepts "00" - confirm which is intended.
# The full name (not the stem) is used so that a directory with a dot in its
# name still resolves when joined back onto the data directory, and plain
# files are skipped because these names are later treated as directories.
r = re.compile(r"^0[0-8]")
dirnames = [fp.name for fp in fpaths if fp.is_dir() and r.match(fp.name)]

In [None]:
def get_names(ds) -> set:
    """Collect the names found in a dataset's dims, coords and variables.

    Parameters
    ----------
    ds
        Object exposing iterable ``dims``, ``coords`` and ``variables``
        attributes (this notebook passes datasets opened with xarray).

    Returns
    -------
    set
        Union of everything yielded by iterating those three attributes.
    """
    return {name for group in (ds.dims, ds.coords, ds.variables) for name in group}


# Gather every dimension, coordinate and variable name used across the NetCDF
# files of the selected dataset directories. `fps_` records the files visited.
variables = set()
fps_ = []
for dn in dirnames:
    # Skip files whose stem contains the "_CF" extension or the "new" flag
    fps = [
        fp
        for fp in coclico_data_dir.joinpath(dn).glob("*.nc")
        if not any(substr in fp.stem for substr in ("_CF", "new"))
    ]
    fps_.extend(fps)
    for fp in fps:
        # Open inside a context manager so each file handle is released as
        # soon as its names are read, instead of staying open for the whole
        # session.
        with xr.open_dataset(fp) as ds:
            variables.update(get_names(ds))
variables = sorted(variables)

### Create template table

The template has already been saved to the data directory. The following cell is kept
for reference, but can be ignored.

In [None]:
# One template row per source name; every other column is initialised to an
# empty string.
cv_table = [
    {
        "src_name": v,
        "dst_name": "",
        "long_name": "",
        "standard_name": "",
        "cf_type": "",
        "dtype": "",
    }
    for v in variables
]

### Load name convention table from data directory

In [None]:
# Load the vocabulary table and order its rows by CF role, then by source name.
df = pd.read_csv(coclico_data_dir.joinpath("common_vocabulary.csv"))
# Fix the category order explicitly so that sorting follows
# dim -> dim or coord -> coord -> var rather than alphabetical order.
# The result is assigned back because the `inplace` argument of
# `set_categories` was deprecated in pandas 1.3 and removed in pandas 2.0.
df["cf_type"] = (
    df["cf_type"]
    .astype("category")
    .cat.set_categories(["dim", "dim or coord", "coord", "var"])
)
df = df.sort_values(["cf_type", "src_name"]).reset_index(drop=True)

In [None]:
# df.to_csv(coclico_data_dir.joinpath("cv_table.csv"), index=False)