In [None]:
import xarray as xr
import warnings
warnings.filterwarnings("ignore", category=xr.SerializationWarning)
import os
import sys
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# Put the directory one level above the current working directory on the
# import path (presumably so the project-local `config` / `utils` imports
# in the following cells resolve -- confirm against the repo layout).
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)

In [None]:
from config import FLUX_DATA_PATH, FLUX_METADATA, MICASA_PREPROCESSED_DATA, MERRA_DATA_PATH

In [None]:
from utils.functions import import_flux_metadata, import_site_RMSE_data, polyfit1d_and_plot

## Open virtual datasets

In [None]:
# Virtualized dataset path.
# Uses MERRA_DATA_PATH, the name imported from `config`; MERRA_DATA_FILEPATH
# is not defined in this notebook and raises NameError on a fresh kernel.
ref_path = os.path.join(MERRA_DATA_PATH, "virtual_store")

In [None]:
# URLs for the two parquet reference stores, prefixed with `reference::`
# (presumably the fsspec reference-filesystem protocol -- confirm).
# Built by concatenation instead of an f-string that nests double quotes
# inside double quotes, which is a SyntaxError on Python < 3.12.
ref_url1 = "reference::" + os.path.join(ref_path, "vstore1.parquet")
ref_url2 = "reference::" + os.path.join(ref_path, "vstore2.parquet")

In [None]:
# Open the first reference store via the zarr engine, without relying on
# consolidated metadata.
ds1 = xr.open_dataset(
    ref_url1,
    engine="zarr",
    consolidated=False,
)

In [None]:
# Open the second reference store with the same settings as the first.
ds2 = xr.open_dataset(
    ref_url2,
    engine="zarr",
    consolidated=False,
)

# Open question: should I just average across the whole time period for now?

In [None]:
# Join the T2M variable from both stores end-to-end along "time".
# Only T2M is carried forward; every other variable is left behind.
ds_combined = xr.concat(
    [store["T2M"] for store in (ds1, ds2)],
    dim="time",
)

In [None]:
# Mean T2M per calendar year (grouped on the year of the time coordinate).
ds_annavgs = (
    ds_combined
    .groupby("time.year")
    .mean("time")
)

In [None]:
# Mean T2M over the entire concatenated time span.
ds_totavg = ds_combined.mean(dim="time")

In [None]:
# Show the time-averaged T2M field as this cell's output.
ds_totavg

# Import RMSE values

In [None]:
# Load the RMSE results for the ANN run from CSV via the project helper
# (presumably combined with the flux-site metadata -- confirm in
# utils.functions), then display the resulting table.
df_ANN = import_site_RMSE_data(
    FLUX_METADATA,
    '../analysis/RMSE_results_ANN.csv',
)
df_ANN

# Extract values from xarray dataset

In [None]:
# I should move this function to utils (this could speed up preprocessing?)
def extract_nearest_xr(df, da):
    """
    Sample ``da`` at the grid cell nearest to each row of ``df``.

    The 'lat' and 'lon' columns of ``df`` are wrapped in DataArrays that share
    a single 'points' dimension, so the lookup is done pairwise (one value per
    row) in a single vectorized ``.sel`` call rather than in a Python loop.

    Parameters
    ----------
    df : table with 'lat' and 'lon' columns exposing ``.values``
    da : object indexed by ``lat`` / ``lon`` that supports
         ``.sel(..., method='nearest')``

    Returns
    -------
    The ``.values`` of the selection, one entry per row of ``df``.
    """
    def _as_points(column):
        # Sharing the 'points' dim is what makes the selection pointwise.
        return xr.DataArray(df[column].values, dims='points')

    nearest = da.sel(
        lat=_as_points('lat'),
        lon=_as_points('lon'),
        method='nearest',
    )
    return nearest.values

In [None]:
# Add the long-term mean T2M at the grid cell nearest each site as a new
# column on the RMSE table.
df_ANN['T2M_avg'] = extract_nearest_xr(
    df_ANN,
    ds_totavg,
)

In [None]:
# Show the table again, now including the 'T2M_avg' column.
df_ANN

In [None]:
def polyfit1d_and_plot(df, var1, var2, title):
    """
    Fit a straight line ``var2 ~ var1`` and draw it over a scatter of the data.

    Parameters
    ----------
    df : mapping of column name -> 1-D numeric values (e.g. a DataFrame)
    var1 : name of the predictor column (x axis)
    var2 : name of the response column (y axis)
    title : title placed on the axes

    Returns
    -------
    fig : the figure created for the plot; the fitted line's legend entry
          carries the R^2 of the fit.

    Notes
    -----
    NOTE(review): this definition shadows the ``polyfit1d_and_plot`` imported
    from ``utils.functions`` earlier in the notebook -- keep only one of them.
    """
    x = df[var1]
    y = df[var2]

    # Degree-1 least-squares fit of y on x.
    coef = np.polyfit(x, y, 1)
    poly1d_fn = np.poly1d(coef)
    y_fit = poly1d_fn(x)

    # r2_score takes (y_true, y_pred): the observed response versus the line
    # evaluated at the observed x. Passing x as the "truth" and the line
    # evaluated at y (as before) yields a meaningless score.
    r2 = r2_score(y, y_fit)

    fig, ax = plt.subplots(1, 1)
    ax.scatter(x, y)
    ax.plot(x, y_fit, color="red", label=f"$R^2$: {r2:.2f}")
    ax.set_xlabel(f"{var1}")
    ax.set_ylabel("RMSE (FluxNet vs MiCASA)")
    ax.set_title(title)
    ax.legend(loc="upper left")
    return fig

In [None]:
# Plot per-site NEE RMSE against the long-term mean T2M with a linear fit.
# The trailing semicolon keeps the returned figure from being echoed as the
# cell's output value. Title typo fixed ("RSME" -> "RMSE").
polyfit1d_and_plot(df_ANN, "T2M_avg", "NEE_RMSE", "NEE RMSE vs MERRA2 T2M 30-year (1991-2021) average");