## Define Variables / Import MetaData

In [None]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import xarray as xr
from cartopy import crs as ccrs 
import datetime
import glob
import os

In [None]:
# Define misc variables
# Directory that holds the AmeriFlux site-metadata tsv and the per-site
# FLUXNET SUBSET folders (relative to the notebook's working directory).
amer_filepath = '../ameriflux-data/'
# Root directory under which the MiCASA daily files are globbed.
mic_filepath = '../../micasa-data/'
# Code inserted after '_FLUXNET_SUBSET_' when building the FLUXNET csv
# filename pattern (presumably 'DD' selects the daily product - confirm
# against the AmeriFlux/FLUXNET file naming convention).
timedelta = 'DD'

In [None]:
# Load the tab-separated AmeriFlux site-search export
meta_file = amer_filepath + 'AmeriFlux-site-search-results-202410071335.tsv'
ameriflux_meta = pd.read_csv(meta_file, sep='\t')

# Keep only the rows whose 'AmeriFlux FLUXNET Data' column says 'Yes'
has_fluxnet = ameriflux_meta['AmeriFlux FLUXNET Data'] == 'Yes'
fluxnet_meta = ameriflux_meta[has_fluxnet]

In [None]:
# Site IDs of every row kept in fluxnet_meta, followed by how many there are
site_id_column = fluxnet_meta['Site ID']
total_sites = site_id_column.to_list()
len(total_sites)

In [None]:
# set map proj
# cartopy PlateCarree CRS instance; no use of it is visible in this section
# (presumably passed to map axes / transforms further down - confirm).
proj=ccrs.PlateCarree()

In [None]:
def get_single_match(pattern):
    """Return the one filesystem path that matches a glob pattern.

    Parameters
    ----------
    pattern : str
        Glob pattern handed to ``glob.glob``.

    Returns
    -------
    str
        The single path matched by ``pattern``.

    Raises
    ------
    ValueError
        If ``pattern`` matches nothing, or matches more than one path.
        Both messages name the pattern so a failing lookup can be traced
        back to the site/file that caused it.
    """
    matches = glob.glob(pattern)
    if len(matches) == 1:
        return matches[0]
    if not matches:
        raise ValueError(f"No matches found for pattern: {pattern}")
    # glob order is arbitrary; sort so the message is reproducible
    raise ValueError(
        f"Multiple matches found for pattern {pattern}: {sorted(matches)}"
    )

In [None]:
def get_amer_match(amer_filepath, site_ID):
    """Find the FLUXNET SUBSET csv belonging to one AmeriFlux site.

    The glob pattern looks under ``amer_filepath`` for a folder named
    ``AMF_<site_ID>_FLUXNET_SUBSET_*`` and, inside it, a csv named
    ``AMF_<site_ID>_FLUXNET_SUBSET_<timedelta>*.csv``, where ``timedelta``
    is the module-level variable of that name.

    Parameters
    ----------
    amer_filepath : str
        Directory prefix (the pattern is built by plain concatenation, so
        it must already end with a path separator).
    site_ID : str
        AmeriFlux site identifier.

    Returns
    -------
    str
        The single matching path, as returned by ``get_single_match``
        (which raises if there is not exactly one match).
    """
    pattern_parts = (
        amer_filepath,
        'AMF_', site_ID, '_FLUXNET_SUBSET_*/AMF_',
        site_ID, '_FLUXNET_SUBSET_', timedelta, '*.csv',
    )
    return get_single_match(''.join(pattern_parts))

## B. Develop script to dump all variables requested into one preprocessed file

In [None]:
# Site to process and the MiCASA variables to extract for it
site_ID = 'AR-TF1'
micasa_var_list = ['NEE', 'NPP']

# Look the site up once in the full metadata table, then read its
# coordinates (kept as arrays via .values)
site_rows = ameriflux_meta['Site ID'] == site_ID
site_lat = ameriflux_meta.loc[site_rows, 'Latitude (degrees)'].values
site_lon = ameriflux_meta.loc[site_rows, 'Longitude (degrees)'].values

In [None]:
# Resolve the selected site's FLUXNET csv, then load it
fluxnet_csv_path = get_amer_match(amer_filepath, site_ID)
fluxnet_sel = pd.read_csv(fluxnet_csv_path)

In [None]:
# Build a frame indexed by the parsed TIMESTAMP column (YYYYMMDD values)
fluxnet_sel_dates = (
    fluxnet_sel.loc[:, ['TIMESTAMP']]
    .copy()
    .assign(TIMESTAMP=lambda frame: pd.to_datetime(frame['TIMESTAMP'], format='%Y%m%d'))
    .set_index('TIMESTAMP')
)
time = fluxnet_sel_dates.index

# Distinct calendar dates in ascending order; preview the first five
dates_unique = sorted({stamp.date() for stamp in time})
dates_unique[:5]

In [None]:
# Import micasa
# Collect test file lists at three scopes: one month, one year, everything
path_test_mon = glob.glob(
    mic_filepath + 'daily-0.1deg-final/holding/daily/2016/01/MiCASA_v1_flux_x3600_y1800_daily_201601*'
)
path_test_yr = glob.glob(
    mic_filepath + 'daily-0.1deg-final/holding/daily/2016/??/MiCASA_v1_flux_x3600_y1800_daily_20??????.nc4'
)
path_test_all = glob.glob(
    mic_filepath + 'daily-0.1deg-final/holding/daily/20??/??/MiCASA_v1_flux_x3600_y1800_daily_20??????.nc4'
)

# glob returns paths in arbitrary order, so keep a sorted copy of each list
path_sorted_mon, path_sorted_yr, path_sorted_all = [
    sorted(found) for found in (path_test_mon, path_test_yr, path_test_all)
]

In [None]:
# One value per line: first monthly file, then the month/year/all file counts
summary_values = [
    path_sorted_mon[0],
    len(path_sorted_mon),
    len(path_sorted_yr),
    len(path_sorted_all),
]
print(*summary_values, sep='\n')

In [None]:
%%time
# Time how long it takes to open one month of MiCASA files and keep only the
# variables listed in micasa_var_list.
# Alternatives tried so far:
# ds = xr.open_mfdataset(path_test_mon)
# ds = xr.open_mfdataset(path_sorted_mon)[micasa_var_list]
# Current choice: concatenate the (sorted) files along 'time' in list order
ds = xr.open_mfdataset(path_sorted_mon, combine='nested', concat_dim='time')[micasa_var_list]
# ds = xr.open_mfdataset(path_sorted_mon, parallel=True)[micasa_var_list] # Test this once the dashboard works
ds

# Notes:
# - It is not clear how much combine='nested', concat_dim='time' speeds things
#   up compared with the default combine='by_coords'.
# - Selecting only the relevant variables seems to shorten the time by several
#   seconds for 1 month.

In [None]:
# IMO this is all slow and I should virtualize the data, because then I can reuse the virtualized store across all sites, since there are year crossovers?
# Or perhaps I should load all the MiCASA data at once and subset from there? That would avoid opening some files multiple times.
# How long does it take to open the MiCASA data? ~10 seconds for 1 month, ~1:30 for 1 year; too long to run for all of it.

In [None]:
# %%time
# Pick the grid cell nearest to the site's coordinates
nearest_cell = {'lon': site_lon, 'lat': site_lat}
ds_subset = ds.sel(nearest_cell, method='nearest')
ds_subset

In [None]:
# ds_out = pd.DataFrame()
# with xr.open_mfdataset(path_test_mon)[micasa_var_list] as ds:
#     # Select grid closest to selected site
#     ds_subset = ds.sel(lon=site_lon, lat=site_lat, method='nearest')
#     # print(ds_subset)
    
#     # Prep data for writing to csv
#     ds_subset = ds_subset.squeeze(dim=['lat','lon'],drop=True)
#     # print(ds_subset)
    
#     # Output a single file for each site with all variables
#     for micasa_var in micasa_var_list:
#         ds_out[micasa_var] = ds_subset[micasa_var].to_dataframe()
#         ds_out.rename(columns={micasa_var: f'MiCASA {micasa_var} ({ds_subset[micasa_var].units})'}, inplace=True)