## Check Jupyter notebook is using pixi kernel

In [None]:
# Cell 1: Check Python executable
import sys
print("Python executable:", sys.executable)

In [None]:
# Cell 2: Check Python path
import sys
print("Python path:")
for path in sys.path:
    print(f"  {path}")

## Define Variables / Import MetaData

In [None]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from cartopy import crs as ccrs 
import cartopy.feature as cfeature
import pandas as pd
import xarray as xr
# import hvplot.pandas
import holoviews as hv
import hvplot.xarray
# plt.rcParams["animation.html"] = "jshtml" #for jupyterlab display, do I need this???
import datetime
import glob
import os
import pprint


In [None]:
# Double check packages are coming from the right locations
print("NumPy location:", np.__file__)
print("NumPy version:", np.__version__)

In [None]:
# Define misc variables
# Directory holding the AmeriFlux downloads: the site-metadata TSV and the
# per-site AMF_*_FLUXNET_SUBSET_* folders are both looked up under it.
amer_filepath = '../ameriflux-data/'
# NOTE(review): presumably the directory for intermediate outputs; it is not
# referenced by any other visible cell -- confirm it is still needed.
mic_filepath = '../intermediates/'
# Resolution token inserted into the FLUXNET file-name glob
# ('_FLUXNET_SUBSET_' + timedelta + '*.csv').
timedelta = 'DD'

In [None]:
# Load the tab-separated AmeriFlux site-search export, then keep only the
# sites whose 'AmeriFlux FLUXNET Data' column says 'Yes'.
meta_file = f"{amer_filepath}AmeriFlux-site-search-results-202410071335.tsv"
ameriflux_meta = pd.read_csv(meta_file, sep='\t')
has_fluxnet = ameriflux_meta['AmeriFlux FLUXNET Data'] == 'Yes'
fluxnet_meta = ameriflux_meta.loc[has_fluxnet]

In [None]:
total_sites = fluxnet_meta['Site ID'].to_list()
len(total_sites)

In [None]:
# set map proj
proj=ccrs.PlateCarree()

In [None]:
def get_single_match(pattern):
    """Return the one filesystem path that matches a glob pattern.

    Parameters
    ----------
    pattern : str
        Glob pattern handed to ``glob.glob``.

    Returns
    -------
    str
        The single path matching ``pattern``.

    Raises
    ------
    ValueError
        If ``pattern`` matches nothing, or matches more than one path.
    """
    matches = glob.glob(pattern)
    if not matches:
        # Name the pattern so a failure inside a loop over sites or dates can
        # be traced back to the file that was being looked for.
        raise ValueError(f"No matches found for: {pattern}")
    if len(matches) > 1:
        raise ValueError(f"Multiple matches found: {matches}")
    return matches[0]

In [None]:
def get_amer_match(amer_filepath, site_ID):
    """Return the path of the FLUXNET SUBSET csv for one AmeriFlux site.

    The glob pattern is built from ``amer_filepath``, ``site_ID`` and the
    module-level ``timedelta`` token, and resolved with ``get_single_match``,
    whose ``ValueError`` propagates when there is not exactly one match.
    """
    # The same 'AMF_<site>_FLUXNET_SUBSET_' stem names both the folder and the file
    stem = 'AMF_' + site_ID + '_FLUXNET_SUBSET_'
    pattern = amer_filepath + stem + '*/' + stem + timedelta + '*.csv'
    return get_single_match(pattern)

## Preprocessing

### 1. Check sites that aren't preprocessed or plotted

In [None]:
# Get list of files and create dataframe with truncated filenames
plots_list = !ls plots/
plotted_sites = list([filename.split('_')[0] for filename in plots_list])

In [None]:
len(plotted_sites)

In [None]:
# Find missing sites that are not plotted.
# Membership is tested against a set built once, so each lookup is O(1)
# instead of a scan of the whole plotted_sites list per site.
plotted_lookup = set(plotted_sites)
missing_plots = [site for site in total_sites if site not in plotted_lookup]

# One-column table of the site IDs that have no plot yet
missing_df = pd.DataFrame(missing_plots, columns=['Missing from Plots'])
missing_df

In [None]:
# What about intermediates?
intermediates_list = !ls intermediates/
processed_sites = list([filename.split('_')[0] for filename in intermediates_list])

In [None]:
# Sites with no intermediate file.
# Membership is tested against a set built once, so each lookup is O(1)
# instead of a scan of the whole processed_sites list per site.
processed_lookup = set(processed_sites)
missing_processed = [site for site in total_sites if site not in processed_lookup]

# One-column table of the site IDs that have no intermediate file yet
missing_df2 = pd.DataFrame(missing_processed, columns=['Missing from Intermediates'])
missing_df2

##### No longer missing! Bugs fixed

### 2. Debugging pre-2001 FLUXNET data

In [None]:
def get_single_match(pattern):
    """Resolve a glob pattern that is expected to hit exactly one path.

    Returns the matching path. Raises ``ValueError`` when the pattern matches
    nothing (the message names the pattern) or matches several paths (the
    message lists them).
    """
    found = glob.glob(pattern)
    n_found = len(found)
    if n_found == 0:
        raise ValueError(f"No matches found for: {pattern}")
    if n_found > 1:
        raise ValueError(f"Multiple matches found: {found}")
    return found[0]

# Resolution token used in the FLUXNET file-name glob, and the MiCASA
# variable names of interest.
timedelta = 'DD'
micasa_var_list = ['NEE', 'NPP']

# Reload the site metadata and list the IDs of the FLUXNET-flagged sites
meta_file = f"{amer_filepath}AmeriFlux-site-search-results-202410071335.tsv"
ameriflux_meta = pd.read_csv(meta_file, sep='\t')
is_fluxnet_site = ameriflux_meta['AmeriFlux FLUXNET Data'] == 'Yes'  # use FLUXNET only
fluxnet_meta = ameriflux_meta.loc[is_fluxnet_site]
fluxnet_list = fluxnet_meta['Site ID'].to_list()

In [None]:
# See the dates for the erroring sites: for every FLUXNET site, print the
# site's first and last date as soon as one of its dates does not resolve to
# exactly one MiCASA daily file.

# MiCASA daily-file location; it does not change per site, so build it once.
path = '../micasa-data/daily-0.1deg-final/holding/'
data_path = path + 'daily/'

for site_ID in fluxnet_list:
    # Open site data and access time indices
    site_file = get_single_match(amer_filepath + 'AMF_' + site_ID +
                              '_FLUXNET_SUBSET_*/AMF_' + site_ID +
                              '_FLUXNET_SUBSET_' + timedelta + '*.csv')
    # Only the timestamps are used below, so skip parsing every other column
    fluxnet_sel = pd.read_csv(site_file, usecols=['TIMESTAMP'])

    # Convert to datetime objects and use them as the index
    fluxnet_sel_dates = fluxnet_sel.copy()
    fluxnet_sel_dates['TIMESTAMP'] = pd.to_datetime(fluxnet_sel_dates['TIMESTAMP'],format='%Y%m%d')
    fluxnet_sel_dates = fluxnet_sel_dates.set_index('TIMESTAMP')

    # Sorted list of the unique calendar dates from the site
    time = fluxnet_sel_dates.index
    dates_unique = sorted({dt.date() for dt in time})

    # Look for the MiCASA file of each date: <data_path>/<YYYY>/<MM>/<file>
    for date in dates_unique:
        f_year = str(date.year)
        f_month = f"{date.month:02}"
        filename = 'MiCASA_v1_flux_*' + date.strftime('%Y%m%d') + '.nc4'
        try:
            get_single_match(os.path.join(data_path,f_year,f_month,filename))
        except ValueError:
            # One report per site is enough, so stop at the first failing date
            print(f"{site_ID} has Fluxnet data for: {dates_unique[0]} to {dates_unique[-1]}")
            break

In [None]:
# try to skip the error for one site
site_ID = "CA-Ca1"

# Open site data and access time indices
site_file = get_single_match(amer_filepath + 'AMF_' + site_ID +
                          '_FLUXNET_SUBSET_*/AMF_' + site_ID +
                          '_FLUXNET_SUBSET_' + timedelta + '*.csv')
fluxnet_sel = pd.read_csv(site_file)

# select subset of columns + convert to datetime objects
fluxnet_sel_dates = fluxnet_sel.loc[:,['TIMESTAMP']].copy()
fluxnet_sel_dates['TIMESTAMP'] = pd.to_datetime(fluxnet_sel_dates['TIMESTAMP'],format='%Y%m%d')
fluxnet_sel_dates = fluxnet_sel_dates.set_index('TIMESTAMP')

# Sorted list of the unique calendar dates from the site
time = fluxnet_sel_dates.index
dates_unique = sorted({dt.date() for dt in time})


# Extract micasa data
data_path = '/discover/nobackup/hzafar/ghgc/micasa/micasa-data/daily'
path_list = []      # MiCASA file for every date that resolved to one match
missing_dates = []  # dates skipped because get_single_match raised
for date in dates_unique:
    f_year = str(date.year)
    f_month = f"{date.month:02}"
    filename = 'MiCASA_v1_flux_*' + date.strftime('%Y%m%d') + '.nc4'

    try:
        filepath = get_single_match(os.path.join(data_path,f_year,f_month,filename))
    except ValueError:
        # Skip dates without exactly one MiCASA file, but keep a record of
        # them instead of dropping them silently.
        missing_dates.append(date)
        continue
    path_list.append(filepath)

if missing_dates:
    print(f"{site_ID}: skipped {len(missing_dates)} of {len(dates_unique)} dates "
          f"without a single MiCASA match "
          f"({missing_dates[0]} to {missing_dates[-1]})")

In [None]:
print(dates_unique[0])
path_list[0]

## Land Mask

In [None]:
import seaborn as sns

### FLUXNET Land Type / Tables of Site lat/lon for Brad

In [None]:
# Per-site summary: ID, coordinates, IGBP vegetation class and Koeppen
# climate class.
summary_columns = [
    'Site ID',
    'Latitude (degrees)',
    'Longitude (degrees)',
    'Vegetation Abbreviation (IGBP)',
    'Vegetation Description (IGBP)',
    'Climate Class Abbreviation (Koeppen)',
    'Climate Class Description (Koeppen)',
]
summary_table = fluxnet_meta[summary_columns]
summary_table

In [None]:
# make csv summary
# summary_table.to_csv('ameriflux-data/Ameriflux_summary_table.csv')

### Plot AmeriFlux sites

In [None]:
# Scatter the sites over a coastline map, coloured by IGBP vegetation class,
# with the legend placed just outside the right edge of the axes.
fig, ax = plt.subplots(figsize=(10, 6), subplot_kw={'projection': proj})
ax.add_feature(cfeature.COASTLINE, zorder=0)
sns.scatterplot(
    data=summary_table,
    x='Longitude (degrees)',
    y='Latitude (degrees)',
    hue='Vegetation Abbreviation (IGBP)',
    ax=ax,
)
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

In [None]:
# Plot vegetation abbreviation?
# Map each IGBP abbreviation to its description. The pairs are taken row by
# row: zipping two independent .unique() arrays silently misaligns keys and
# values as soon as two abbreviations share a description or a description
# is missing or spelled differently for one site.
veg_pairs = summary_table[['Vegetation Abbreviation (IGBP)',
                           'Vegetation Description (IGBP)']]
# Keep the first description seen for each abbreviation
veg_pairs = veg_pairs.drop_duplicates(subset='Vegetation Abbreviation (IGBP)')
veg_dict = dict(zip(veg_pairs['Vegetation Abbreviation (IGBP)'],
                    veg_pairs['Vegetation Description (IGBP)']))

In [None]:
pprint.pprint(veg_dict)

### MiCASA Land Mask

In [None]:
!ls ../../landmask

In [None]:
# Open the yearly MiCASA cover file for 2001 from the landmask directory
landmask_filepath = '../../landmask/'
year = str(2001)
landmask_name = f'MiCASA_v1_cover_x3600_y1800_yearly_{year}.nc4'
ds = xr.open_dataset(landmask_filepath + landmask_name)

In [None]:
ds.ftype

In [None]:
# Select the entry labelled 17 along the `type` coordinate of `ftype`.
# NOTE(review): the variable name suggests type 17 is the water class --
# confirm against the MiCASA cover documentation.
ds_water = ds.ftype.sel(type=17)

In [None]:
ds_water.plot()

In [None]:
# ds.ftype.hvplot(x='lat',y='lon', 
#                 crs=proj,
#                size=150)

## Coastal Site Issue

In [None]:
# Example site that is showing up weird
site_ID = 'US-KS3'
# Look the site's metadata row up once, then read both coordinates from it
site_row = ameriflux_meta.loc[ameriflux_meta['Site ID'] == site_ID]
site_lat = site_row['Latitude (degrees)'].values[0]
site_lon = site_row['Longitude (degrees)'].values[0]
print(site_lat, site_lon)

In [None]:
# Approx location of site: a continent-scale locator map with a star on the site
ax = plt.subplot(projection=proj,frameon=False)
if site_lat >= 20:
    # North America extents
    min_lon, max_lon = -170, -57
    min_lat, max_lat = 25, 74

else:
    # South America extents
    min_lon, max_lon = -90, -30
    min_lat, max_lat = -60, 12

# The two regional boxes do not cover every possible site (for example
# latitudes between 12 and 25), so a marker could land outside the map.
# Widen the box only when the site is outside it; sites already inside keep
# exactly the regional extents above.
margin = 5  # degrees of padding added beyond a site that forces the box to grow
if site_lon < min_lon:
    min_lon = max(site_lon - margin, -180)
elif site_lon > max_lon:
    max_lon = min(site_lon + margin, 180)
if site_lat < min_lat:
    min_lat = max(site_lat - margin, -90)
elif site_lat > max_lat:
    max_lat = min(site_lat + margin, 90)

ax.axis('off')
ax.set_extent([min_lon, max_lon, min_lat, max_lat], crs=ccrs.PlateCarree())
ax.coastlines()

# Large yellow star drawn above the coastlines (zorder=3)
ax.scatter(site_lon,site_lat,
       marker='*',
       s=500,
       color='yellow',
       edgecolor='black',
               zorder=3)

In [None]:
# Bounding box used to subset the gridded data around the site:
# +/-5 degrees of longitude and +/-2 degrees of latitude.
min_lon = site_lon - 5
max_lon = site_lon + 5
min_lat = site_lat - 2
max_lat = site_lat + 2

#### Single file

In [None]:
# MiCASA daily flux files for January 2016. glob.glob returns paths in
# arbitrary order, so sort them to make mult_path_test[0] (used by the
# single-file cell) and the multi-file open order reproducible.
mult_path_test = sorted(glob.glob('/discover/nobackup/hzafar/ghgc/micasa/micasa-data/daily/2016/01/MiCASA_v1_flux*.nc4'))

In [None]:
# Open the first file, keep only NEE, then cut out the site's bounding box
# and take the first time step.
ds = xr.open_dataset(mult_path_test[0])['NEE']
site_window = {'lat': slice(min_lat, max_lat), 'lon': slice(min_lon, max_lon)}
ds_subset = ds.sel(site_window).isel(time=0)
ds_subset

#### Multifile

In [None]:
import h5netcdf
import dask
dask.config.set({'array.slicing.split_large_chunks': True})

In [None]:
# Open all the listed files as one dataset and keep the NEE variable.
# parallel=True enables parallel reading; chunks='auto' lets dask choose
# the chunk sizes.
open_options = {'engine': 'h5netcdf', 'parallel': True, 'chunks': 'auto'}
mult_ds = xr.open_mfdataset(mult_path_test, **open_options)['NEE']
mult_ds

In [None]:
# Cut the site's bounding box out of the multi-file NEE array and show the
# minimum and maximum values over that subset.
ds_subset = mult_ds.sel(lat=slice(min_lat, max_lat), lon=slice(min_lon, max_lon))
subset_min = ds_subset.min().load()
subset_max = ds_subset.max().load()
subset_min, subset_max

In [None]:
# Interactive lon/lat map of the NEE subset with a fixed colour range.
# Options tried and left disabled: crs=proj, rasterize=True,
# widget_location='bottom'.
mesh_plot = ds_subset.hvplot(
    x="lon",
    y="lat",
    cmap='RdBu_r',
    clim=(-2e-9, 3e-8),
    frame_width=500,
)
mesh_plot

In [None]:
ds_sel = ds_subset.sel(lon=[site_lon], lat=[site_lat], method='nearest')

In [None]:
# Yellow marker for the selected grid cell, then check what kind of object
# hvplot returned.
# Options tried and left disabled: crs=proj, geo=True, widget_location='bottom'.
point = ds_sel.hvplot.points(
    'lon',
    'lat',
    color='yellow',
    size=150,
)
type(point)

In [None]:
type(mesh_plot), type(point)

In [None]:
mesh_plot * point

In [None]:
# Let's look at some of the other sites that plot zero, and where they are
ID_list = ['US-EDN', 'US-HB1', 'US-KS3']
in_id_list = fluxnet_meta['Site ID'].isin(ID_list)
filtered_df = fluxnet_meta[in_id_list]
location_columns = ['Site ID', 'Latitude (degrees)', 'Longitude (degrees)']
filtered_df[location_columns]