In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
from scipy.stats import zscore, linregress
from pandas.plotting import scatter_matrix
from scipy import stats
import numpy as np
import seaborn as sns
import xarray as xr


def remove_outliers(df, threshold=3):
    # Initialize a boolean mask to keep track of rows to drop
    outlier_rows_mask = np.zeros(len(df), dtype=bool)

    # Iterate over each column
    for col in df.columns:
        # Skip the "t1" and "t2" columns
        if col == "t1" or col == "t2":
            continue

        # Calculate the mean and standard deviation of the column
        mean = df[col].mean()
        std = df[col].std()

        # Find outliers in this column
        outliers = (df[col] - mean).abs() > threshold * std

        # Mark rows with outliers in this column
        outlier_rows_mask = np.logical_or(outlier_rows_mask, outliers)

    # Drop rows with outliers
    cleaned_df = df[~outlier_rows_mask]
    return cleaned_df

#### Load the data and make them consistent


In [2]:
fluxnet_info = pd.read_csv("../data/EC/fluxnet/sites_info.csv")
ameriflux_info = pd.read_csv("../data/EC/Ameriflux/sites_info.tsv", delimiter="\t")
Icos_info = pd.read_csv("../data/EC/ICOS/sites_info.csv")

fluxnet_names = fluxnet_info["ID"].to_list()
fluxnet_types = fluxnet_info["type"].to_list()

ameriflux_names = ameriflux_info["Site ID"].to_list()
ameriflux_types = ameriflux_info["Vegetation Abbreviation (IGBP)"].to_list()

icos_names = Icos_info["ID"].to_list()
icos_types = Icos_info["type"].to_list()
icos_id = Icos_info["id_number"].to_list()
icos_dict = dict(zip(icos_id, icos_names))

In [3]:
combined_names = list(set(ameriflux_names + fluxnet_names + icos_names))
combined_types = []
for name in combined_names:
    if name in ameriflux_names and name in fluxnet_names:
        # Choose a type from either fluxnet_types or ameriflux_types
        combined_types.append(fluxnet_types[fluxnet_names.index(name)])
    elif name in ameriflux_names:
        combined_types.append(ameriflux_types[ameriflux_names.index(name)])
    elif name in icos_names:
        combined_types.append(icos_types[icos_names.index(name)])
    else:
        combined_types.append(fluxnet_types[fluxnet_names.index(name)])

In [4]:
combined_ec = []

for i in range(len(combined_names)):
    site_name = combined_names[i]
    site_type = combined_types[i]

    if site_name in ameriflux_names:
        file = glob.glob("../data/EC/Ameriflux/AMF_" + site_name + "*DD*")
    elif site_name in icos_names:
        file = glob.glob(
            "../data/EC/ICOS/Data/FLX_" + site_name + "*" + "/*FULLSET_DD*"
        )
    else:
        file = glob.glob("../data/EC/fluxnet/FLX_" + site_name + "*DD*")

    ec = pd.read_csv(file[0])
    ec.loc[:, "type"] = site_type
    ec.loc[:, "name"] = site_name
    ec.index = pd.to_datetime(ec["TIMESTAMP"], format="%Y%m%d")
    combined_ec.append(ec)

combined_ec = pd.concat(combined_ec)

In [5]:
# Code for five day fpar
import glob
import pandas as pd

MCD43_fluxnet = []
MCD43_ameriflux = []
MCD43_icos = []

MCD15_fluxnet = []
MCD15_ameriflux = []
MCD15_icos = []

# Loop over batches (#5) of downloaded data
for i in range(1, 6):
    refl_fluxnet = glob.glob(
        "../data/EC/fluxnet/sat_data/*batch" + str(i) + "*MCD43A4-061-results.csv"
    )
    sat_refl_fluxnet = pd.read_csv(refl_fluxnet[0])
    sat_refl_fluxnet.loc[:, "time"] = pd.to_datetime(sat_refl_fluxnet["Date"])
    sat_refl_fluxnet.set_index(sat_refl_fluxnet["Date"], inplace=True)
    MCD43_fluxnet.append(sat_refl_fluxnet)

    fpar_fluxnet = glob.glob(
        "../data/EC/fluxnet/sat_data/*batch" + str(i) + "*MCD15A3H-061-results.csv"
    )
    sat_fpar_fluxnet = pd.read_csv(fpar_fluxnet[0])
    sat_fpar_fluxnet.loc[:, "time"] = pd.to_datetime(sat_fpar_fluxnet["Date"])
    sat_fpar_fluxnet.set_index(sat_fpar_fluxnet["Date"], inplace=True)
    MCD15_fluxnet.append(sat_fpar_fluxnet)

    # Note all the ameriflux sites are in 4 batches
    if i < 5:
        refl_ameriflux = glob.glob(
            "../data/EC/Ameriflux/sat_data/*batch" + str(i) + "*MCD43A4-061-results.csv"
        )

        sat_refl_ameriflux = pd.read_csv(refl_ameriflux[0])
        sat_refl_ameriflux.loc[:, "time"] = pd.to_datetime(sat_refl_ameriflux["Date"])
        sat_refl_ameriflux.set_index(sat_refl_ameriflux["Date"], inplace=True)
        MCD43_ameriflux.append(sat_refl_ameriflux)

        fpar_ameriflux = glob.glob(
            "../data/EC/Ameriflux/sat_data/*batch"
            + str(i)
            + "*MCD15A3H-061-results.csv"
        )
        sat_fpar_ameriflux = pd.read_csv(fpar_ameriflux[0])  # Changed variable name

        sat_fpar_ameriflux.loc[:, "time"] = pd.to_datetime(sat_fpar_ameriflux["Date"])
        sat_fpar_ameriflux.set_index(sat_fpar_ameriflux["Date"], inplace=True)
        MCD15_ameriflux.append(sat_fpar_ameriflux)
    if i < 3:
        refl_icos = glob.glob(
            "../data/EC/ICOS/sat_data/*batch" + str(i) + "*MCD43A4-061-results.csv"
        )
        sat_refl_icos = pd.read_csv(refl_icos[0])
        refl_icos = glob.glob(
            "../data/EC/ICOS/sat_data/*batch" + str(i) + "*MCD43A4-061-results.csv"
        )
        sat_refl_icos = pd.read_csv(refl_icos[0])
        sat_refl_icos["ID"] = sat_refl_icos["ID"].map(icos_dict)
        sat_refl_icos.dropna(subset=["ID"], inplace=True)
        sat_refl_icos.loc[:, "time"] = pd.to_datetime(sat_refl_icos["Date"])
        sat_refl_icos.set_index(sat_refl_icos["Date"], inplace=True)
        MCD43_icos.append(sat_refl_icos)

        fpar_icos = glob.glob(
            "../data/EC/ICOS/sat_data/*batch" + str(i) + "*MCD15A3H-061-results.csv"
        )
        sat_fpar_icos = pd.read_csv(fpar_icos[0])
        sat_fpar_icos["ID"] = sat_fpar_icos["ID"].map(icos_dict)
        sat_fpar_icos.dropna(subset=["ID"], inplace=True)

        sat_fpar_icos.loc[:, "time"] = pd.to_datetime(sat_fpar_icos["Date"])
        sat_fpar_icos.set_index(sat_fpar_icos["Date"], inplace=True)
        MCD15_icos.append(sat_fpar_icos)


refl_fluxnet = pd.concat(MCD43_fluxnet)
refl_fluxnet = refl_fluxnet.rename(columns={"ID": "name"})
fpar_fluxnet = pd.concat(MCD15_fluxnet)
fpar_fluxnet = fpar_fluxnet.rename(columns={"ID": "name"})

refl_ameriflux = pd.concat(MCD43_ameriflux)
refl_ameriflux = refl_ameriflux.rename(columns={"ID": "name"})
fpar_ameriflux = pd.concat(MCD15_ameriflux)
fpar_ameriflux = fpar_ameriflux.rename(columns={"ID": "name"})

refl_icos = pd.concat(MCD43_icos)
refl_icos = refl_icos.rename(columns={"ID": "name"})
fpar_icos = pd.concat(MCD15_icos)
fpar_icos = fpar_icos.rename(columns={"ID": "name"})


combined_refl = []
combined_fpar = []

for name in combined_names:
    if name in ameriflux_names:
        selected_refl = refl_ameriflux[refl_ameriflux["name"] == name]
        selected_fpar = fpar_ameriflux[fpar_ameriflux["name"] == name]
    elif name in icos_names:
        selected_refl = refl_icos[refl_icos["name"] == name]
        selected_fpar = fpar_icos[fpar_icos["name"] == name]
    else:
        selected_refl = refl_fluxnet[refl_fluxnet["name"] == name]
        selected_fpar = fpar_fluxnet[fpar_fluxnet["name"] == name]

    combined_refl.append(selected_refl)
    combined_fpar.append(selected_fpar)

combined_refl = pd.concat(combined_refl)
combined_fpar = pd.concat(combined_fpar)

Flags come from [This paper](https://pdf.sciencedirectassets.com/271723/1-s2.0-S0168192320X00062/1-s2.0-S0168192320301945/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEDsaCXVzLWVhc3QtMSJGMEQCIH4OB25%2B%2BrWEsBRZZ4CMPUgYETlwWwu0kcC2JF5q5VHKAiAysHZntlDvw8izOPtHcQa1YDDXXdQ%2ByZrXPCaxn2NdLiqzBQg0EAUaDDA1OTAwMzU0Njg2NSIMhywMz6eScMTpU2DpKpAFhCNp4JUVE3sxgm2Ly4FDKc%2BSr7igdIpyk9TsBrE%2F0lC51AZ4N3waj2MpXB%2BA2dorxigw%2BZqxNIgin%2BQFdFvFUN0k6mgrcLTBkS9FhRXOMtTKHvqFgR6FAHKZynNnFuxkIM6eV3dZLRHS0R2yeyRpHxGUk%2FYdd6MCozWZKdmaO00mNuMaCQNgwifIIBvwKStqkc9WTys%2F0PXrBO48pSfm90AcEbBzjGiJRgmIpoKoW%2BUkwvBmd%2ByoyK9%2FIQ8nHz9nN7g%2FgopA5nBLXDRvT29mo0D3nFjO6of%2Fm0aPVA0cX2OCmbYDrdb4s%2BTl%2Fb3Cx0HBquT78mhkYTCbOd6YxFHpmb1s6QDcD%2B%2Bl003yqSDYyJdzZmoq0D8wI8NQWJcycDEPI5fgCYxaS0WubAL2QPYO4u%2BOopqWqBfe1l4gf7nkEe8Pp5UAe2vjYGxV5mN9dkOOhlrKXpWa697KXIATqSoYGQwrkumPECnPQWs3FFVrwDiwWDJNvUqUikDqIFG1zFGD7xwzLQh%2BWfrKyxhi%2BfiE8rY4YlFVQ2e6M4DwrmwPtkagsRtzufygm1nJ8bF9z%2F3aiyioMBwCeQ9y3zWT42L5V%2BRtf95N0aGjjjlfzKMB04euz6YWqkhoVuaKGo2WFlt82S4B9PPxa8d8v3bTwxZgfY%2FAXJaAsbET7i9t89h9XF9aZmjR06YbPLi4F0%2FMAbq0njAWnW6oSW9xYXl7Rugx1p8DpKKtzyBr%2FpzNj9djuFAfKUnfLOuiu8rZpsPctz%2FDKwchAqjiYLWtPsTo%2FuyIhZoDBcO3KujlTP7F4VbI87xj5JFXwdsw5C1ZrP0b8N9JZZsZO0V9yhUS8n07bIc3grqIS%2Fn7wzsBGvHu5jiqpMIwz8SIrwY6sgHXwF3s5SpzUmT%2Bd%2FpaOxFaSWK69qbq1fh%2Bv92O3hmad2L%2F82iL5dKCVYcfn5cEbl1mCCJGLbj%2B1JACCSn%2Bb0IDzbqm9PlhBh09lYsoQFVr%2FMaOCh2jlc6UJX%2BWuWuVQ1DercdbawxGlkLwfrDweiqdswRdEHNP5yU6LkedJVvBuG%2Ffm4Kl05DMaCUpKB5zH0b04M16ASPcw5wiPUpNMuyudj1Ko4wwx2sOZt%2BDxbVJHEdP&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20240301T194332Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTYY7GY5ACN%2F20240301%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=746656b69aa01f343be150d203fa4d604f6da3aad6570344a69b5eda22f1ac03&hash=da2637ff2882758ae7d5d5d82b08ed3897533c9ea8fae448ed1b18c618c8b49a&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S0168192320301945&tid=spdf-f3eb7708-9f66-472e-82f4-ba59d2040f74&sid=54e759056934f749ce7937c-05135ec4bd54gxrqa&type=client&tsoh=d3d3LnNjaWVuY2VkaXJlY3QuY29t&ua=0f1557550402015c085500&rr=85dba2f5b9a06a41&cc=us)


In [10]:
par_counter = 0
combined_data = []
for i in range(len(combined_names)):
    name = combined_names[i]
    type = combined_types[i]
    ec_data = combined_ec[combined_ec["name"] == name]
    # # -------------Filter GPP------------------------
    # # The gap filled data is less than 20%
    ec_data = ec_data[ec_data["NEE_VUT_REF_QC"] > 0.8]

    # Uncertaninity in NEE is less than 3  C m−2 d−1
    ec_data = ec_data[ec_data["NEE_VUT_REF_RANDUNC"] < 3]

    # Difference between GPP day and night is less than 3 C m−2 d−1
    # idx_good_gpp = abs(ec_data["GPP_DT_VUT_REF"] - ec_data["GPP_NT_VUT_REF"]) < 3
    # gpp_day = ec_data["GPP_DT_VUT_REF"][idx_good_gpp]
    # gpp_night = ec_data["GPP_NT_VUT_REF"][idx_good_gpp]
    gpp_night = ec_data["GPP_NT_VUT_REF"]
    gpp_day = ec_data["GPP_DT_VUT_REF"]

    # # Take the mean of the day and night GPP
    gpp = (gpp_day + gpp_night) / 2

    # Make sure there is no negative GPP
    gpp = gpp[gpp > 0]

    # ----------------Filter PAR-----------------------
    par_qc = ec_data.loc[gpp.index, "PPFD_IN_QC"]
    sw_qc = ec_data.loc[gpp.index, "SW_IN_F_QC"]

    idx_good_par = par_qc[par_qc > 0.8]
    idx_good_sw = sw_qc[sw_qc > 0.8]

    par = ec_data.loc[idx_good_par.index, "PPFD_IN"]
    sw = ec_data.loc[idx_good_sw.index, "SW_IN_F"]

    # # Alternative way to calculate PAR
    if par.empty:
        # convert SW from w m-2 to umol m-2 s-1 then multiply by 0.45 to get PAR
        par = sw * 4.57 * 0.45
        par_counter += 1

    gpp = gpp[par.index]
    gpp = gpp.to_frame("gpp")
    par = par.to_frame("par")

    if par.empty or gpp.empty:
        print(i, name, type, "Par or GPP is empty")
        continue

    site_ec = pd.concat([gpp, par], axis=1)

    # ----------------Filter Reflectance and FPAR-----------------------
    site_refl = combined_refl[combined_refl["name"] == name]

    site_refl.index = pd.to_datetime(site_refl.index, format="%Y-%m-%d")
    site_refl = site_refl[site_refl.index.isin(site_ec.index)]
    filtered_refl = site_refl[
        (
            site_refl["MCD43A4_061_BRDF_Albedo_Band_Mandatory_Quality_Band1_MODLAND"]
            == "0b000"
        )
        & (
            site_refl["MCD43A4_061_BRDF_Albedo_Band_Mandatory_Quality_Band2_MODLAND"]
            == "0b000"
        )
    ].copy()

    site_red = filtered_refl[["MCD43A4_061_Nadir_Reflectance_Band1"]].rename(
        columns={"MCD43A4_061_Nadir_Reflectance_Band1": "red"}
    )
    site_nir = filtered_refl[["MCD43A4_061_Nadir_Reflectance_Band2"]].rename(
        columns={"MCD43A4_061_Nadir_Reflectance_Band2": "nir"}
    )

    site_fpar = combined_fpar[combined_fpar["name"] == name]
    site_fpar.index = pd.to_datetime(site_fpar.index, format="%Y-%m-%d")

    filtered_fpar = site_fpar[
        (site_fpar["MCD15A3H_061_FparLai_QC_MODLAND"] == "0b0")
        & (site_fpar["MCD15A3H_061_FparLai_QC_DeadDetector"] == "0b0")
        & (site_fpar["MCD15A3H_061_FparLai_QC_CloudState"] == "0b00")
        & (site_fpar["MCD15A3H_061_FparLai_QC_SCF_QC"].isin(["0b000", "0b001"]))
    ].copy()
    if filtered_fpar.empty:
        print(i, name, type, " fpar")
        continue
    fpar_tmp = filtered_fpar["MCD15A3H_061_Fpar_500m"]
    site_fpar = fpar_tmp.resample("D").interpolate("linear")
    site_fpar = site_fpar.to_frame("fpar")
    site_fpar = site_fpar[site_fpar.index.isin(site_ec.index)]

    lai_tmp = filtered_fpar["MCD15A3H_061_Lai_500m"]
    site_lai = lai_tmp.resample("D").interpolate("linear")
    site_lai = site_lai.to_frame("lai")
    site_lai = site_lai[site_lai.index.isin(site_ec.index)]
    # Merge the dataframes
    site_df = (
        site_ec.merge(site_red, left_index=True, right_index=True)
        .merge(site_nir, left_index=True, right_index=True)
        .merge(site_fpar, left_index=True, right_index=True)
        .merge(site_lai, left_index=True, right_index=True)
    )
    if site_df.empty:
        print(i, name, type, " site_df")
        continue
    # Calculate the NDVI, NIRv, NIRvp, Fesc, and LUE
    site_df.loc[:, "ndvi"] = (site_df["nir"] - site_df["red"]) / (
        site_df["nir"] + site_df["red"]
    )
    site_df.loc[:, "nirv"] = site_df["ndvi"] * site_df["nir"]
    site_df.loc[:, "nirvp"] = site_df["nirv"] * site_df["par"]
    site_df.loc[:, "fesc"] = site_df["nirv"] / site_df["fpar"]
    site_df.loc[:, "fesc_p"] = site_df["fesc"] * site_df["par"]
    site_df.loc[:, "apar"] = site_df["fpar"] * site_df["par"]
    site_df.loc[:, "fesc_n"] = site_df["nirv"] / site_df["apar"]

    site_df.loc[:, "lue"] = site_df["gpp"] / (site_df["par"] * site_df["fpar"])
    site_df = site_df.replace([np.inf, -np.inf], np.nan).dropna()
    cleaned_site_df = remove_outliers(site_df, 3).copy()
    cleaned_site_df.loc[:, "name"] = name
    cleaned_site_df.loc[:, "type"] = type
    combined_data.append(cleaned_site_df)

36 US-Pnp WAT  fpar
48 US-KS3 WET  fpar
70 DE-RuS CRO  fpar
111 US-Snf GRA  fpar
147 IT-La2 ENF  site_df
155 US-WPT WET  fpar
167 US-LWW GRA  site_df
215 CD-Ygb MF  site_df
218 CG-Tch SAV  site_df
248 DE-Akm WET  fpar
270 US-Me4 ENF  site_df
279 US-ORv WET  fpar
282 FR-Tou GRA  fpar
297 GH-Ank EBF  site_df
316 US-HB1 WET  fpar
320 US-UM3 WAT  fpar
370 IT-Ro1 DBF  fpar


In [11]:
"number of sites with par being approximated:", par_counter

('number of sites with par being approximated:', 64)

In [12]:
df = pd.concat(combined_data)

### Add the GLASS LAI


In [13]:
glass_dir = "/home/hamid/mnt/nas/GLASS/csv_files/"
names = df["name"].unique()

# Initialize an empty DataFrame to store the results
result_df = pd.DataFrame()

for name in names:
    glass_lai = pd.read_csv(glass_dir + name + ".csv")
    if glass_lai.empty:
        print(name, ": No Glass LAI")
        continue
    glass_lai.set_index("Unnamed: 0", inplace=True)
    glass_lai.index = pd.to_datetime(glass_lai.index, format="%Y-%m-%d")
    glass_lai_daily = glass_lai.resample("D").interpolate("linear")
    glass_lai_daily.rename(columns={"LAI": "glass_lai"}, inplace=True)
    glass_lai_daily.index.name = ""
    tmp_df = df[df["name"] == name]
    tmp_df = tmp_df.merge(glass_lai_daily, left_index=True, right_index=True)

    # Append the tmp_df to the result_df
    result_df = pd.concat([result_df, tmp_df])

In [14]:
result_df.to_csv("../outputs/data_clean_glass_lai_icos2.csv")

In [15]:
df

Unnamed: 0,gpp,par,red,nir,fpar,lai,ndvi,nirv,nirvp,fesc,fesc_p,apar,fesc_n,lue,name,type
2002-07-04,0.019643,381.342964,0.2345,0.3295,0.1300,0.200,0.168440,0.055501,21.164873,0.426930,162.806712,49.574585,0.001120,0.000396,US-Cop,GRA
2002-07-16,0.030749,601.182815,0.2305,0.3253,0.1100,0.200,0.170565,0.055485,33.356495,0.504407,303.240867,66.130110,0.000839,0.000465,US-Cop,GRA
2002-07-18,0.000467,534.330112,0.2243,0.3158,0.1150,0.200,0.169413,0.053501,28.587007,0.465223,248.582672,61.447963,0.000871,0.000008,US-Cop,GRA
2002-07-19,0.022495,301.396527,0.2235,0.3147,0.1175,0.200,0.169454,0.053327,16.072600,0.453848,136.788084,35.414092,0.001506,0.000635,US-Cop,GRA
2002-07-22,0.034960,495.530127,0.2229,0.3142,0.1250,0.200,0.169987,0.053410,26.466217,0.427279,211.729736,61.941266,0.000862,0.000564,US-Cop,GRA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2006-12-21,3.927745,254.209917,0.0432,0.2466,0.8350,3.150,0.701863,0.173080,43.998526,0.207281,52.692846,212.265280,0.000815,0.018504,US-KS2,CSH
2006-12-22,3.360560,194.859792,0.0431,0.2466,0.8275,3.025,0.702451,0.173224,33.754465,0.209335,40.790894,161.246478,0.001074,0.020841,US-KS2,CSH
2006-12-23,4.393695,249.624937,0.0431,0.2466,0.8200,2.900,0.702451,0.173224,43.241123,0.211249,52.733076,204.692449,0.000846,0.021465,US-KS2,CSH
2006-12-24,2.999990,180.187104,0.0432,0.2469,0.8125,2.775,0.702172,0.173366,31.238351,0.213374,38.447201,146.402022,0.001184,0.020491,US-KS2,CSH


### Adding clumping index values (if asked by reviewer later)


In [None]:
# ci = xr.open_dataset(
#     "../data/EC/Global_Clumping_Index_1531/data/global_clumping_index_geographic.nc"
# )
# ci = ci.__xarray_dataarray_variable__
# ameriflux_coords = pd.read_csv("../data/Ameriflux_coords.csv")
# fluxnet_coords = pd.read_csv("../data/Fluxnet_coords.csv")
# merged_coords = pd.concat([ameriflux_coords, fluxnet_coords], ignore_index=True)
# merged_coords.drop_duplicates(subset=merged_coords.columns[0], inplace=True)
# merged_coords.reset_index(drop=True, inplace=True)
# merged_coords.rename({"Name": "name"}, axis=1, inplace=True)
# pd_all = pd.merge(df, merged_coords[["name", "Lat", "Lon"]], on="name", how="left")
# pd_all.set_index(df.index, inplace=True)
# names = pd_all["name"].unique()
# for name in names:
#     lat = pd_all[pd_all["name"] == name]["Lat"].values[0]
#     lon = pd_all[pd_all["name"] == name]["Lon"].values[0]
#     ci_point = ci.sel(x=lon, y=lat, method="nearest").values
#     pd_all.loc[pd_all["name"] == name, "ci"] = ci_point
# pd_all.to_csv("../outputs/data_clean_fpar_lai_ci2.csv")