# Aggregating and combining metrics

The plot metrics we previously created can be 0D (a single value), 1D (a vertical profile), 2D (single values across an xy grid) or 3D (vertical profiles across an xy grid, i.e. voxels).

At the moment, to compare plots we just want 0D metrics.

For the non-gridded metrics, we've already calculated some summary stats over the vertical profiles, so we can just drop the profile metrics (the data_vars with a z dimension). Similarly, we can drop the vertical profiles in the gridded metrics to leave just 2D metrics. We can then summarise those across the x and y dimensions using something like the standard deviation (sd) or coefficient of variation (cv) to capture variability.

We are not using `open_mfdataset` this time because it seems to be very slow.

In [1]:
from pathlib import Path

import geopandas as gpd
import pandas as pd
import numpy as np
import xarray as xr

In [2]:
# Directory layout, expressed relative to the notebook's working directory.
data_dir = Path("..") / "data"
plots_dir = data_dir.joinpath("outputs", "plots")
# Sub-directories of the plots output folder. lidar_dir is defined here but
# is not referenced by the cells shown in this notebook.
metrics_dir = plots_dir.joinpath("metrics")
lidar_dir = plots_dir.joinpath("lidar")

In [3]:
def drop_z_data_vars(ds):
    """Return ``ds`` restricted to the data variables that have no ``z`` dimension.

    Args:
        ds: Dataset-like object exposing ``data_vars`` and supporting
            selection by a list of variable names.

    Returns:
        The result of indexing ``ds`` with the names of every data variable
        whose ``dims`` do not include ``"z"``.
    """
    names_without_z = [
        name for name, var in ds.data_vars.items() if "z" not in var.dims
    ]
    return ds[names_without_z]


def read_metrics(plot: str, subdir="no_grid_z_1m"):
    """Read the metrics file of one plot and load it fully into memory.

    Args:
        plot: Plot identifier; the file read is ``<plot>.nc``.
        subdir: Sub-directory of ``metrics_dir`` that holds the file.

    Returns:
        The dataset returned by ``load()``, called while the file is open.
    """
    nc_path = metrics_dir.joinpath(subdir, f"{plot}.nc")
    # Load eagerly inside the context manager so the data is in memory
    # before the file is closed on exit.
    with xr.open_dataset(nc_path) as opened:
        loaded = opened.load()
    return loaded


def ds_to_df(ds, grid_size=0):
    """Flatten a metrics dataset into a DataFrame indexed by ``plot``.

    Data variables with a ``z`` dimension are dropped first. When
    ``grid_size`` is greater than 0 the remaining variables are reduced over
    the ``x`` and ``y`` dimensions three ways (mean, standard deviation, and
    their ratio std / mean) and the three results are concatenated.
    Otherwise the dataset is flattened as-is.

    Args:
        ds: Dataset with ``plot`` and ``variant`` dimensions (plus ``x`` and
            ``y`` when gridded).
        grid_size: Grid cell size the metrics were computed on; 0 means the
            metrics are not gridded.

    Returns:
        DataFrame indexed by ``plot`` with added ``grid_agg`` ("mean", "std",
        "cv" or "none") and ``grid_size`` columns.
    """

    def _flatten(dataset, grid_agg):
        # One row per (plot, variant) combination, tagged with the aggregation.
        frame = dataset.stack(plot_variant=("plot", "variant")).to_dataframe()
        frame["grid_agg"] = grid_agg
        return frame

    ds = drop_z_data_vars(ds)

    if not grid_size > 0:
        df = _flatten(ds, 'none')
    else:
        xy_mean = ds.mean(dim=["x", "y"])
        xy_std = ds.std(dim=["x", "y"])
        # Insertion order fixes the row order of the result: mean, std, cv.
        aggregated = {"mean": xy_mean, "std": xy_std, "cv": xy_std / xy_mean}
        df = pd.concat(
            [_flatten(agg_ds, label) for label, agg_ds in aggregated.items()]
        )

    df["grid_size"] = grid_size

    # Discard the stacked plot_variant multi-index and index by the "plot"
    # column instead (this relies on "plot" being present as a column).
    return df.reset_index(drop=True).set_index("plot")


def load_df_for_grid_size(plot: str, grid_size=0):
    """Load one plot's metrics for a given grid size as a DataFrame.

    Args:
        plot: Plot identifier passed through to ``read_metrics``.
        grid_size: One of 0 (not gridded), 10, 5 or 1.

    Returns:
        The DataFrame produced by ``ds_to_df`` for the matching metrics file.

    Raises:
        ValueError: If ``grid_size`` is not one of the supported values.
    """
    # Supported grid sizes and the metrics sub-directory each one maps to.
    subdirs_by_size = (
        (0, "no_grid_z_1m"),
        (10, "grid_10m_z_1m"),
        (5, "grid_5m_z_1m"),
        (1, "grid_1m_z_1m"),
    )
    for size, subdir in subdirs_by_size:
        if grid_size == size:
            return ds_to_df(read_metrics(plot, subdir), grid_size)

    raise ValueError(f"Grid size {grid_size} not supported")


def load_plot_metrics(plot: str):
    """Load every metrics variant of one plot into a single DataFrame.

    Args:
        plot: Plot identifier.

    Returns:
        The concatenation of the frames for the non-gridded metrics
        (grid size 0) followed by the 10, 5 and 1 grid sizes, in that order.
    """
    grid_sizes = (0, 10, 5, 1)
    frames = [load_df_for_grid_size(plot, size) for size in grid_sizes]
    return pd.concat(frames)

In [4]:
# Read the plots file and take its "id" column as the list of plot ids
# to process.
plots = gpd.read_file(plots_dir / "plots.geojson")
plot_ids = plots['id'].to_list()

In [None]:
from dask.distributed import Client

# Client() is created with no arguments; the cell output below reports a
# distributed.LocalCluster.
client = Client()  # Start a Dask client
client  # bare expression: displays the client summary in the notebook

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 8,Total memory: 16.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:49949,Workers: 0
Dashboard: http://127.0.0.1:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B

0,1
Comm: tcp://127.0.0.1:49961,Total threads: 2
Dashboard: http://127.0.0.1:49965/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:49952,
Local directory: /var/folders/37/j4yld2bd7pz4_0p7b249nvv40000gn/T/dask-scratch-space/worker-o8y8irds,Local directory: /var/folders/37/j4yld2bd7pz4_0p7b249nvv40000gn/T/dask-scratch-space/worker-o8y8irds

0,1
Comm: tcp://127.0.0.1:49960,Total threads: 2
Dashboard: http://127.0.0.1:49963/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:49954,
Local directory: /var/folders/37/j4yld2bd7pz4_0p7b249nvv40000gn/T/dask-scratch-space/worker-b5yajhcn,Local directory: /var/folders/37/j4yld2bd7pz4_0p7b249nvv40000gn/T/dask-scratch-space/worker-b5yajhcn

0,1
Comm: tcp://127.0.0.1:49962,Total threads: 2
Dashboard: http://127.0.0.1:49967/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:49956,
Local directory: /var/folders/37/j4yld2bd7pz4_0p7b249nvv40000gn/T/dask-scratch-space/worker-rg_z9x82,Local directory: /var/folders/37/j4yld2bd7pz4_0p7b249nvv40000gn/T/dask-scratch-space/worker-rg_z9x82

0,1
Comm: tcp://127.0.0.1:49969,Total threads: 2
Dashboard: http://127.0.0.1:49970/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:49958,
Local directory: /var/folders/37/j4yld2bd7pz4_0p7b249nvv40000gn/T/dask-scratch-space/worker-i6xkchzn,Local directory: /var/folders/37/j4yld2bd7pz4_0p7b249nvv40000gn/T/dask-scratch-space/worker-i6xkchzn




In [6]:
# Submit one load_plot_metrics task per plot id. The plot ids are also passed
# as the task keys, and each task is allowed up to 10 retries.
futures = client.map(load_plot_metrics, plot_ids, key=plot_ids, retries=10)

In [7]:
# Wait for the submitted tasks and collect what they returned
# (load_plot_metrics returns one DataFrame per plot).
results = client.gather(futures)

In [9]:
# Combine the per-plot frames into a single table.
df = pd.concat(results)
df  # bare expression: displays the table in the notebook

Unnamed: 0_level_0,max,min,range,mean,median,sd,var,cv,crr,skew,...,norm_fhd,cv_inside,cv_inside_p,cv_ppi,cv_vad,site,site_type,variant,grid_agg,grid_size
plot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AGG_O_01_P1,16.275999,0.000000,16.275999,4.428016,4.555000,4.093416,16.756056,0.924436,0.272058,0.245465,...,0.824705,1.051162,1.051162,0.111321,1.511392,AGG_O_01,AGG,default,none,0
AGG_O_01_P1,16.275999,0.000000,16.275999,6.716327,7.568000,3.647536,13.304515,0.543085,0.412652,-0.601755,...,0.860111,0.848656,0.848656,0.129149,0.997475,AGG_O_01,AGG,rnw,none,0
AGG_O_01_P1,16.275999,0.000000,16.275999,4.428016,4.555000,4.093416,16.756056,0.924436,0.272058,0.245465,...,0.821678,1.066437,1.066437,0.110900,1.522600,AGG_O_01,AGG,iw,none,0
AGG_O_01_P1,16.275999,0.000000,16.275999,4.428016,4.555000,4.093416,16.756056,0.924436,0.272058,0.245465,...,0.811571,1.125259,1.125259,0.109713,1.573446,AGG_O_01,AGG,fr,none,0
AGG_O_01_P1,16.275999,0.001000,16.275000,5.646877,6.400000,3.805994,14.485587,0.674000,0.346905,-0.170667,...,0.838153,0.972110,0.972110,0.070207,0.761257,AGG_O_01,AGG,v0,none,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ULY_Y_96_P5,1.277456,6.454628,1.294826,1.519432,2.150591,1.458456,3.106510,0.608595,0.484396,1.262774,...,0.240840,0.448943,0.448943,0.698109,0.622618,ULY_Y_96,ULY,rnw,cv,1
ULY_Y_96_P5,1.273690,8.157261,1.284342,1.545650,2.531208,1.467207,3.153993,0.567536,0.486007,1.016162,...,0.247022,0.454578,0.454578,0.714566,0.606891,ULY_Y_96,ULY,iw,cv,1
ULY_Y_96_P5,1.273690,8.157261,1.284342,1.545650,2.531208,1.467207,3.153993,0.567536,0.486007,1.016162,...,0.276168,0.468649,0.468649,0.728377,0.601008,ULY_Y_96,ULY,fr,cv,1
ULY_Y_96_P5,1.266414,3.705982,1.307232,1.437641,2.025826,1.486969,3.204246,0.529988,0.371274,1.538131,...,0.311155,0.381923,0.381923,0.865285,0.634225,ULY_Y_96,ULY,v0,cv,1


In [10]:
# Persist the combined metrics table in three formats.
# to_csv and to_parquet write the "plot" index by default.
df.to_csv(plots_dir / "plot_metrics.csv")
df.to_parquet(plots_dir / "plot_metrics.parquet")
# orient="records" omits the index, and the plot id lives only in the index,
# so move it into a regular "plot" column first; otherwise the JSON records
# cannot be attributed to a plot.
df.reset_index().to_json(plots_dir / "plot_metrics.json", orient="records")

In [9]:
# Close the Dask client now that the results have been gathered and written.
client.close()