# Notebook Setup

In [1]:
# TODO: change this to the absolute path of your local checkout of the repository.
# The notebook changes its working directory to this path and derives every
# data, temp and credential location from it.
MY_HOME_ABS_PATH = "/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling"
#MY_HOME_ABS_PATH = "/root/co2-flux-hourly-gpp-modeling"

## Import Modules

In [2]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import pandas as pd
import numpy as np
import sys
import joblib
from io import BytesIO
from sklearn.model_selection import StratifiedKFold

# Load local custom modules
# chdir first so the relative tools path below resolves against the repository root
os.chdir(MY_HOME_ABS_PATH)
sys.path.append(os.path.abspath("./code/src/tools"))
from CloudIO.AzStorageClient import AzStorageClient
# NOTE(review): the star import hides which names come from data_pipeline_lib;
# consider importing the needed names explicitly
from data_pipeline_lib import *

# Display every column and format floats with five decimal places
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

# Define Constants

In [3]:
# Directory layout, all anchored at the repository root
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.sep.join([root_dir, '.tmp'])
raw_data_dir = tmp_dir  # raw site files live in the temp directory
data_dir = os.sep.join([root_dir, 'data'])
cred_dir = os.sep.join([root_dir, '.cred'])
az_cred_file = os.sep.join([cred_dir, 'azblobcred.json'])
preproc_objects_dir = os.sep.join(
    [root_dir, 'code', 'src', 'preprocessing', 'preproc_objects']
)

# Input files
site_metadata_filename = os.sep.join([data_dir, 'site-metadata.csv'])
monthly_data_filename = os.sep.join([data_dir, "monthly-mvp.csv"])

# Blob storage container and blob names
# NOTE(review): blob_name_base uses "_v_" while the split names use "-v-" — confirm this is intended
container = "all-sites-data"
ext = "parquet"
ver = "mvp"
model = "rfr"
blob_name_base = f"{model}-full_2010_2015_v_{ver}"
train_blob_name = f"{model}-full_2010_2015-train-v-{ver}.{ext}"
val_blob_name = f"{model}-full_2010_2015-val-v-{ver}.{ext}"
test_blob_name = f"{model}-full_2010_2015-test-v-{ver}.{ext}"

In [4]:
# Load the monthly feature table and list the columns it provides
month_df = pd.read_csv(monthly_data_filename)
month_df.columns

Index(['year', 'month', 'SITE_ID', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN',
       'CSIF-SIFdaily', 'PET', 'Ts', 'ESACCI-sm', 'MODIS_LC', 'NDWI',
       'Percent_Snow', 'Fpar', 'Lai', 'LST_Day', 'LST_Night', 'MODIS_IGBP',
       'MODIS_PFT', 'gap_flag_month'],
      dtype='object')

## View Available Features

In [5]:
# Define features and target variables of the data pipelines
target_variable = 'GPP_NT_VUT_REF'
#target_variable_qc = 'NEE_VUT_REF_QC'

# Columns read from each site's raw file (besides the target)
hourly_features = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
                     'datetime', 'year', 'month', 'day', 'hour', 'date',
                     'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7']

# Per-site columns taken from the site metadata table
metadata_features = ['site_id', 'filename', 'lat', 'long', 'koppen_sub', 'koppen_main', 'IGBP',
                     'c3c4', 'c4_percent', 'monthly_data_available']

# Features used by the KNN imputer: only real-valued columns, because the
# time/identifier/categorical columns listed below are excluded on purpose
# (categorical values are the same within a site).
imp_exclude_cols = ['date', 'datetime', 'year', 'month', 'hour', 'day', 'minute', 'site_id', 'IGBP']
# Reuse target_variable rather than repeating its literal so the imputer
# columns follow the target if it is ever changed.
imp_cols = [x for x in hourly_features + [target_variable] if x not in imp_exclude_cols]

In [None]:
# Running list of candidate features to add later. These are free-text notes,
# not column names; no visible cell reads this list.
features_to_add = ['prcp-lag3', 'Tmean?', 'surface soil moisture', 'MSC features', 'Amplitude of MSC features']

# Stage 1: Trim and Merge Site Metadata

In [6]:
# Imputation and resampling switches
impute = True
impute_method = 'knn'
impute_global = True
resample = False  # <----------- set to false for RF run
time_col = 'datetime'
duration = 'H'

# Date window for each site's sequence, plus the tolerated share of gaps
missing_thresh = 0.2
start_date = '2010-01-01'
end_date = '2015-12-31'

# Imputer settings (only relevant when imputation is used)
k = 5
weights = 'uniform'
n_fit = 20000
c = -1

In [28]:
# Load one site's raw half-hourly file, reading only the target and the hourly feature columns
site_id = 'AU-Dry'
local_filename = tmp_dir + os.sep + f'data_full_half_hourly_raw_v0_1_{site_id}.csv'
site_df = pd.read_csv(local_filename, usecols=[target_variable] + hourly_features)

# Format columns: parse the two time columns and cast any QC flag columns to int
qc_flags_features = [s for s in hourly_features if "_QC" in s]
site_df['datetime'] = pd.to_datetime(site_df['datetime'])
site_df['date'] = pd.to_datetime(site_df['date'])
if len(qc_flags_features) != 0:
    site_df[qc_flags_features] = site_df[qc_flags_features].astype('int')
site_df['site_id'] = site_id

# Move to H level: keep only the records stamped exactly on the hour.
# The minute is read straight from the datetime column, so no temporary
# 'minute' column has to be created and dropped again.
site_df = site_df.loc[site_df['datetime'].dt.minute == 0].copy()

site_df.head()

Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,GPP_NT_VUT_REF,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,site_id
0,26.642,0.0,409.11,15.452,0.0,99.202,-0.14266,2008-09-01 00:00:00,2008,9,1,0,2008-09-01,0.24575,0.38268,0.11545,0.1347,0.3017,0.0548,0.0931,0.4006,0.3549,0.2124,AU-Dry
2,25.846,0.0,402.987,13.377,0.0,99.209,-0.1995,2008-09-01 01:00:00,2008,9,1,1,2008-09-01,0.24575,0.38268,0.11545,0.1347,0.3017,0.0548,0.0931,0.4006,0.3549,0.2124,AU-Dry
4,25.19,0.0,402.987,12.211,0.0,99.197,-0.03327,2008-09-01 02:00:00,2008,9,1,2,2008-09-01,0.24575,0.38268,0.11545,0.1347,0.3017,0.0548,0.0931,0.4006,0.3549,0.2124,AU-Dry
6,24.534,0.0,402.987,11.044,0.0,99.185,0.31388,2008-09-01 03:00:00,2008,9,1,3,2008-09-01,0.24575,0.38268,0.11545,0.1347,0.3017,0.0548,0.0931,0.4006,0.3549,0.2124,AU-Dry
8,23.986,0.0,400.795,10.208,0.0,99.193,0.26865,2008-09-01 04:00:00,2008,9,1,4,2008-09-01,0.24575,0.38268,0.11545,0.1347,0.3017,0.0548,0.0931,0.4006,0.3549,0.2124,AU-Dry


In [29]:
# Show the earliest and latest timestamps present for this site
print(site_df['datetime'].min())
print(site_df['datetime'].max())

2008-09-01 00:00:00
2014-12-31 23:00:00


In [30]:
def filter_date_range(df, start_date, end_date, time_col, missing_thresh=0.2):
    """Restrict a site's records to a date window and screen it for gaps.

    The input frame is left untouched: indexing by ``time_col`` happens on a
    new frame, so the caller still has ``time_col`` as a regular column and
    the function can be called repeatedly on the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Records of one site, containing the column named by ``time_col``.
        NOTE(review): label slicing below expects datetime values in time
        order — confirm inputs are sorted.
    start_date, end_date : str
        Inclusive bounds of the window, used as ``.loc`` slice labels.
    time_col : str
        Name of the timestamp column.
    missing_thresh : float, default 0.2
        Largest tolerated fraction of missing hourly records between the
        first and last timestamp inside the window.

    Returns
    -------
    None
        If fewer than ``365*24`` records fall inside the window.
    float
        The missing fraction, if it exceeds ``missing_thresh`` (returned
        instead of ``None`` for this local investigation).
    pandas.DataFrame
        The windowed records with ``time_col`` restored as the first column.
    """
    # set_index without inplace returns a new frame, so `df` is not modified
    windowed = df.set_index(time_col).loc[start_date:end_date]

    # Remove sites without at least one year of records
    if len(windowed) < 365*24:
        return None

    # Number of hourly steps from the first to the last record, inclusive.
    # Same count as an hourly date_range between the two, without building it.
    first_date = windowed.index.min()
    last_date = windowed.index.max()
    total_expected_count = (last_date - first_date) // pd.Timedelta(hours=1) + 1
    missing_percentage = (total_expected_count - len(windowed)) / total_expected_count

    # Remove sites whose share of gaps in the sequence exceeds the threshold
    if missing_percentage > missing_thresh:
        return missing_percentage # <---- changed for this local investigation from NONE

    return windowed.reset_index()
        
# Apply the date-window / gap filter to the loaded site, using the configured
# missing_thresh instead of repeating its value as a literal
res = filter_date_range(site_df, start_date, end_date, time_col, missing_thresh=missing_thresh)
print(res)

0.2732749178532311


### Check if all files have the same columns

In [32]:
# Use one site's raw file as the reference for the expected column order
site_id = 'AU-Dry'
site_df = pd.read_csv(os.sep.join([tmp_dir, f'data_full_half_hourly_raw_v0_1_{site_id}.csv']))
expected_cols = site_df.columns
expected_cols

Index(['TIMESTAMP_START', 'TIMESTAMP_END', 'TA_F', 'TA_F_QC', 'TA_ERA',
       'SW_IN_POT', 'SW_IN_F', 'SW_IN_F_QC', 'SW_IN_ERA', 'LW_IN_F',
       'LW_IN_F_QC', 'LW_IN_ERA', 'VPD_F', 'VPD_F_QC', 'VPD_ERA', 'P_F',
       'P_F_QC', 'P_ERA', 'PA_F', 'PA_F_QC', 'PA_ERA', 'NETRAD', 'PPFD_IN',
       'G_F_MDS', 'G_F_MDS_QC', 'LE_F_MDS', 'LE_F_MDS_QC', 'LE_CORR',
       'H_F_MDS', 'H_F_MDS_QC', 'H_CORR', 'NEE_VUT_REF', 'NEE_VUT_REF_QC',
       'NEE_CUT_REF', 'NEE_CUT_REF_QC', 'GPP_NT_VUT_REF', 'GPP_DT_VUT_REF',
       'GPP_NT_CUT_REF', 'GPP_DT_CUT_REF', 'RECO_NT_VUT_REF',
       'RECO_DT_VUT_REF', 'RECO_NT_CUT_REF', 'RECO_DT_CUT_REF', 'datetime',
       'year', 'month', 'day', 'hour', 'SITE_ID', 'date', 'NEE_VUT_REF_qa',
       'SW_DIF', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6',
       'b7', 'IGBP', 'koppen'],
      dtype='object')

In [38]:
# Every raw site file in the temp directory
site_files = [x for x in os.listdir(tmp_dir) if 'data_full_half_hourly_raw_v0_1_' in x]

# Compare as plain lists: an element-wise Index == list comparison raises
# when the lengths differ, whereas list equality simply yields False.
expected_col_list = list(expected_cols)
match = 0
non_match = 0

for csv_file in site_files:
    # Read only the header line of the CSV file to get the actual column order
    with open(os.path.join(tmp_dir, csv_file), 'r') as f:
        first_line = f.readline().strip()
        actual_cols = first_line.split(',')

    # A file matches only if it has the same column names in the same order
    if expected_col_list == actual_cols:
        match += 1
    else:
        non_match += 1

print(f"Matches: {match}")
print(f"non-matches: {non_match}")


Matches: 276
non-matches: 0
