# Notebook Setup

In [16]:
import os

# True when this kernel is a Google Colab runtime (detected from the IPython shell repr).
IN_COLLAB = 'google.colab' in str(get_ipython())

# Absolute path of the project root.
# Override per machine by setting the CO2_FLUX_HOME environment variable instead of
# editing this cell. Without it, the defaults are the two paths this notebook has
# always used: the Drive location on Colab, the original author's checkout otherwise.
# NOTE(review): the Colab default was previously a commented-out alternative - confirm
# it is still where the repo lives on Drive.
if IN_COLLAB:
  _default_home = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"
else:
  _default_home = "/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling"
MY_HOME_ABS_PATH = os.environ.get("CO2_FLUX_HOME", _default_home)

# Google Drive must be mounted before any path below /content/drive is readable.
if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

## Import Modules

In [17]:
# install required modules quietly
required_packages = ['geopandas', 'pyspark', 'azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
os.chdir(MY_HOME_ABS_PATH)
import math
import json

import pandas as pd
import numpy as np
from calendar import monthrange
from datetime import datetime
from io import BytesIO
from sklearn.impute import KNNImputer
from tqdm import tqdm

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Make the project's local tool modules importable, then load them.
import sys
if not IN_COLLAB:
  sys.path.append(os.path.abspath("./code/src/tools"))
else:
  # On Colab the working directory is (re)set first and the tools dir takes priority.
  os.chdir(MY_HOME_ABS_PATH)
  sys.path.insert(0, os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

# Notebook-wide pandas display settings: up to 500 rows, all columns, 5 decimals.
for _option, _value in (
    ('display.max_rows', 500),
    ('display.max_columns', None),
    ('display.float_format', lambda x: '%.5f' % x),
):
  pd.set_option(_option, _value)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Define Constants

In [18]:
# Directory layout, all rooted at the project home.
root_dir = MY_HOME_ABS_PATH
tmp_dir = f"{root_dir}{os.sep}.tmp"
data_dir = f"{root_dir}{os.sep}data"
cred_dir = f"{root_dir}{os.sep}.cred"
az_cred_file = f"{cred_dir}{os.sep}azblobcred.json"

# Half-hourly site files: the shared Drive folder on Colab, the local .tmp dir otherwise.
raw_data_dir = (
  "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data/half_hourly_data"
  if IN_COLLAB
  else tmp_dir
)

site_metadata_filename = f"{data_dir}{os.sep}site-metadata.csv"
monthly_data_filename = f"{data_dir}{os.sep}monthly-interpolated-v3.csv"

# Azure blob container and blob naming
container = "baseline-data"
ext = "parquet"
ver = "1-i"
blob_name_base = "baseline_all_v_" + ver
train_blob_name_base = "baseline-train-v-" + ver
test_blob_name_base = "baseline-test-v-" + ver

In [19]:
# Columns read from each site's hourly CSV (in addition to the target and its QC flag).
hourly_features = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
                     'datetime', 'year', 'month', 'day', 'hour', 'date',
                     'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 
                     'IGBP', 'koppen']
# QC column for the target: rows where it equals 3 have their target blanked before imputation.
target_variable_qc = 'NEE_VUT_REF_QC'
# Target column.
target_variable = 'GPP_NT_VUT_REF'
# Columns read from the site metadata CSV.
metadata_features = ['site_id', 'filename', 'lat', 'long', 'koppen_sub', 'koppen_main',
                     'koppen_name', 'c3c4', 'c4_percent', 'monthly_data_available']

# Columns fed to the KNN imputer: only real-valued columns, since the categorical
# ones are the same within a site. The target is included so it gets imputed too;
# it is taken from `target_variable` so the two cannot drift apart.
knn_exclude_cols = ['date', 'datetime', 'year', 'month', 'hour', 'day', 'minute', 'site_id', 'IGBP', 'koppen']
knn_imp_cols = [x for x in hourly_features + [target_variable] if x not in knn_exclude_cols]

In [20]:
# "Golden" sites used for training.
# Parked tier-1 candidates: "FR-Pue", "CH-Lae", "US-Var", "US-Ne2", "ES-LJu", "US-Ton"
tier1_sites = [
  "IT-Lav",
  "US-NR1",
  "US-Vcp",
]
# Parked tier-2 list:
# tier2_sites = ["US-UMB", "US-Me2", "FI-Hyy", "US-NR1", "IT-Lav", "US-Wkg", "US-ARM", "US-SRM"]

# Training currently uses tier 1 only (same list object, not a copy).
train_sites = tier1_sites  # + tier2_sites

# Selected test sites.
test_sites = [
  'IT-Lsn',
]
# Parked wider test set (IGBP class, climate):
#   "US-GLE"  ENF, Cold
#   "US-AR1"  GRA, Temperate
#   "US-Seg"  GRA, Arid
#   "US-FR2"  WSA, Temperate
#   "ES-LM2"  WSA, Arid
#   "CA-Cbo"  DBF, Cold
#   "FR-Lam"  CRO, Temperate
#   "IT-Cpz"  EBF, Temperate
#   "CN-Cha"  MF, Cold
#   "IT-Lsn"  OSH, Temperate

In [24]:
# Imputation switches: whether to impute per site, with which method, and whether
# to run the global KNN pass afterwards (on for now).
impute, impute_method, impute_global = True, 'ffill', True

# Resampling: whether to insert rows for missing timesteps, on which column, at what frequency.
resample, time_col, duration = True, 'datetime', 'H'

# KNNImputer settings (only used by the KNN code paths): neighbours, weighting,
# and how many complete rows to fit on.
k, weights, n_fit = 5, 'uniform', 20000

Imputation / resampling configurations:
- [ ] no impute, no resample
- [ ] no impute, resample
- [ ] ffill
- [ ] ffill + resample
- [ ] ffill + resample + global
- [ ] knn + resample
- [ ] knn + resample + global

# Stage 1: Trim and Merge Site Metadata

In [25]:
class PrepareAllSitesHourly:
    """Assemble one hourly table from per-site CSVs, site metadata and monthly data.

    The pipeline (see `all_sites_all_sources`) is:
      1. `prep_metadata`       - load and filter the site metadata CSV,
      2. `site_data_cleanup`   - load, resample and impute each site, then concatenate,
      3. `merge_site_metadata` - left-join the per-site metadata,
      4. `merge_monthly_data`  - left-join the monthly data on (site, year, month).
    """

    def __init__(self, site_metadata_filename, monthly_data_filename, train_sites, test_sites, 
                hourly_features, metadata_features, target_variable_qc, target_variable, data_dir):
        """Store the configuration; nothing is read from disk here.

        Args:
            site_metadata_filename: path of the site metadata CSV.
            monthly_data_filename: path of the monthly data CSV.
            train_sites, test_sites: lists of site ids; their union is processed.
            hourly_features: columns to read from each hourly site CSV.
            metadata_features: columns to read from the site metadata CSV.
            target_variable_qc: QC column; rows where it equals 3 get their target blanked.
            target_variable: name of the target column.
            data_dir: directory containing the per-site hourly CSV files.
        """
        self.site_metadata_filename = site_metadata_filename
        self.monthly_data_filename = monthly_data_filename
        self.train_sites = train_sites
        self.test_sites = test_sites
        self.hourly_features = hourly_features
        self.metadata_features = metadata_features
        self.target_variable_qc = target_variable_qc
        self.target_variable = target_variable
        self.data_dir = data_dir

    def add_time_index(self, df, time_col, duration):
        """Resample `df` onto a regular time grid and number the timesteps.

        Rows are inserted for missing timesteps; inserted rows get `gap_flag` 1
        (observed rows 0) and NaN in every data column. A `timestep_idx` column
        (0..n-1 in time order) is added and `year`/`month`/`day`/`hour`/`date`
        are recomputed from `time_col` so they are filled on inserted rows too.

        Args:
            df: one site's frame with a datetime-typed `time_col`. Not modified.
            time_col: name of the timestamp column.
            duration: pandas resampling frequency string.

        Returns:
            A new DataFrame; the input frame is left untouched.
        """
        # Work on a copy so the caller's frame is not re-indexed or given new columns.
        df = df.copy()
        df['gap_flag'] = 0
        df = (
            df.sort_values(time_col)
              .set_index(time_col)
              .resample(duration)
              .first()
              .reset_index()
        )
        df.index.name = 'timestep_idx'
        df = df.reset_index()
        # Rows created by the resample have NaN here. Assign back instead of a chained
        # in-place fillna, which is a silent no-op under pandas copy-on-write.
        df['gap_flag'] = df['gap_flag'].fillna(1)

        # Time parts are NaN on inserted rows; recompute them from the timestamp.
        stamp = df[time_col].dt
        df['year'] = stamp.year.astype(int)
        df['month'] = stamp.month.astype(int)
        df['day'] = stamp.day.astype(int)
        df['hour'] = stamp.hour.astype(int)
        df['date'] = stamp.date

        return df


    def knn_impute_site(self, site_df, knn_imp_cols, k, weights, n_fit):
        """Fill NaNs in `knn_imp_cols` of one site with a KNNImputer fit on that site.

        Columns that are entirely NaN for the site are skipped (left for the global
        impute). The imputer is fit on at most `n_fit` randomly sampled complete rows.

        Args:
            site_df: one site's frame; filled in place and also returned.
            knn_imp_cols: numeric columns to impute.
            k: number of neighbours.
            weights: KNNImputer `weights` argument.
            n_fit: maximum number of complete rows used to fit the imputer.

        Returns:
            `site_df` with the imputable NaNs filled.
        """
        group_knn_df = site_df[knn_imp_cols].copy()
        group_knn_df = group_knn_df.dropna(axis=1, how='all') # drop col if all NA, need to globally impute later

        na_mask = group_knn_df.isna().any(axis=1)
        if not na_mask.any():
            # Nothing to impute; the imputer cannot transform zero rows.
            return site_df

        complete_rows = group_knn_df.dropna()
        if complete_rows.empty:
            print("No complete rows for site-level KNN impute; leaving NA values for global impute")
            return site_df

        # Fit on a subset of rows to speed up imputation; cap at what is available
        # so small sites do not raise in sample().
        not_na_rows = complete_rows.sample(n=min(n_fit, len(complete_rows)))

        imputer = KNNImputer(n_neighbors=k, weights=weights)
        imputer.fit(not_na_rows)

        # Assign the raw array: it is already in na_mask row order, whereas wrapping it
        # in a fresh RangeIndex DataFrame would be label-aligned onto the wrong rows.
        group_knn_df.loc[na_mask] = imputer.transform(group_knn_df[na_mask])

        # Fill NA in initial site/group df
        site_df.fillna(group_knn_df, inplace=True)

        return site_df


    def knn_impute_global(self, df, knn_imp_cols, k, weights, n_fit):
        """Fill remaining NaNs in `knn_imp_cols` with a KNNImputer fit across all sites.

        Intended for features that are 100% missing at some site and therefore could
        not be imputed at site level.

        Args:
            df: concatenated frame of all sites; filled in place and also returned.
            knn_imp_cols: numeric columns to impute.
            k: number of neighbours.
            weights: KNNImputer `weights` argument.
            n_fit: maximum number of complete rows used to fit the imputer.

        Returns:
            `df` with NaNs in `knn_imp_cols` filled.

        Raises:
            ValueError: if rows need imputing but no row is complete in `knn_imp_cols`.
        """
        print("Begin global imputing for fully missing features at site-level")
        print(f"NA values remaining before global impute: {df.isna().sum().sum()}")
        
        # Work on a positionally indexed copy: the concatenated frame's own index
        # repeats per site, so labels cannot address single rows.
        df_inds = df.index
        data_df_copy = df[knn_imp_cols].copy()
        data_df_copy.reset_index(drop=True, inplace=True)

        na_mask = data_df_copy.isna().any(axis=1)
        if not na_mask.any():
            # No NaN in the imputable columns; the imputer cannot transform zero rows.
            print(f"Final NA Count: {df.isna().sum().sum()}")
            return df

        complete_rows = data_df_copy.dropna()
        if complete_rows.empty:
            raise ValueError("Global KNN impute needs at least one row with no NA in the imputed columns")
        not_na_rows = complete_rows.sample(n=min(n_fit, len(complete_rows)))

        # Execute imputation
        imputer = KNNImputer(n_neighbors=k, weights=weights)
        imputer.fit(not_na_rows)
        data_df_copy.loc[na_mask] = imputer.transform(data_df_copy[na_mask])

        # Restore the original index so fillna lines up with df.
        data_df_copy.set_index(df_inds, inplace=True)

        # Fill NA in initial site/group df
        df.fillna(data_df_copy, inplace=True)
        print(f"Final NA Count: {df.isna().sum().sum()}")

        return df
    

    def site_data_cleanup(self, site_metadata_df, knn_imp_cols, resample, impute, impute_method,
                         impute_global, k, weights, n_fit, time_col, duration):
        """Load, clean and impute every site's hourly data, then concatenate.

        Per site: read the CSV, keep on-the-hour rows, blank targets whose QC flag
        is 3, optionally resample onto a regular grid, optionally impute. After
        concatenation any remaining NaNs are handled globally.

        Args:
            site_metadata_df: frame with at least `site_id` and `filename` columns.
            knn_imp_cols: numeric columns used by the KNN imputers.
            resample: insert rows for missing timesteps when True.
            impute: impute at site level when True.
            impute_method: 'ffill' or 'knn' (site level).
            impute_global: run the global KNN impute on leftover NaNs when True.
            k, weights, n_fit: KNNImputer settings.
            time_col: timestamp column name.
            duration: resampling frequency string.

        Returns:
            The concatenated DataFrame of all sites.

        Raises:
            ValueError: if no site had a usable hourly file.
        """
        qc_flags_features = [s for s in self.hourly_features if "_QC" in s]

        ## PRINT THE PLAN
        if impute:
            print(f"Filling missing values with {impute_method} at site-level, then at global-level at end")
        else:
            print("Filling all NA values with -1")

        ## SITE-LEVEL CLEANING -> CONCATENATE
        site_frames = []
        site_files = site_metadata_df[['site_id','filename']]
        for _, r in tqdm(site_files.iterrows(), total=len(site_files)):
            if not isinstance(r.filename, str) or not r.filename:
                print(f'ERROR: {r.site_id} is missing hourly data.')
                continue

            # Prepare hourly site df
            local_filename = self.data_dir + os.sep + r.filename
            site_df = pd.read_csv(local_filename, usecols = [self.target_variable, self.target_variable_qc] + self.hourly_features)

            # Format columns
            site_df['datetime'] = pd.to_datetime(site_df['datetime'])
            site_df['date'] = pd.to_datetime(site_df['date'])
            site_df['minute'] = site_df['datetime'].dt.minute
            if qc_flags_features:
                site_df[qc_flags_features] = site_df[qc_flags_features].astype('int')


            # ----------------- #
            # LATER: FILTER SITE-DF TO THE BEST SEQUENCE OF X YEARS (e.g., if we only use 1.5 years per site)
            # ----------------- #


            # Move from HH to H level
            site_df = site_df.loc[site_df['datetime'].dt.minute == 0].copy()

            # For records with bad target QC, make NAN and impute
            site_df.loc[site_df[self.target_variable_qc] == 3, self.target_variable] = np.nan
            site_df = site_df.drop(columns=[self.target_variable_qc])

            # Resample to add rows for missing timesteps, assign timestep_idx and "gap_flag"
            if resample:
                site_df = self.add_time_index(site_df, time_col, duration)
            else:
                # drop=True: do not leak the pre-sort row numbers as a stray 'index'
                # column, which the resample branch never produces.
                site_df = site_df.sort_values(time_col).reset_index(drop=True)
                site_df.index.name = 'timestep_idx'
                site_df = site_df.reset_index()

            # Set the site id after resampling so inserted gap rows carry it as well;
            # otherwise it is NaN there unless ffill happens to run, and the later
            # merges on site_id would not match those rows.
            # NOTE(review): other per-site categoricals (e.g. IGBP, koppen) are still
            # NaN on gap rows when impute_method is 'knn' - confirm desired handling.
            site_df['site_id'] = r.site_id

            # Impute missing values at site-level, otherwise fillna w/ -1 at very end
            if impute:
                if impute_method == 'ffill': # select most recent record
                    site_df = site_df.sort_values(time_col, ascending=True).ffill()
                elif impute_method == 'knn': # use KNNImputer
                    site_df = self.knn_impute_site(site_df, knn_imp_cols, k, weights, n_fit)

            site_frames.append(site_df)

        if not site_frames:
            raise ValueError("No hourly site data could be loaded; check site ids and filenames in the site metadata")

        # Concatenate once instead of growing the frame inside the loop.
        data_df = pd.concat(site_frames)

        ## Handle Global Missing Data (if 100% of feature missing for one site)
        if data_df.isna().sum().sum() != 0:
            if impute_global:
                print("Filling global NA with KNNImpute")
                data_df = self.knn_impute_global(data_df, knn_imp_cols, k, weights, n_fit)
            elif impute_method is not None:
                data_df = data_df.fillna(-1)
                print("Filling global NA w/ -1")
            else:
                print("Not filling global NA values")
        print(f"NA values remaining at end of cleanup: {data_df.isna().sum().sum()}")

        return data_df


    def prep_metadata(self):
        """Load site metadata for the train/test sites that have monthly data.

        Returns:
            DataFrame with the `metadata_features` columns and a fresh RangeIndex.
        """
        site_metadata_df = pd.read_csv(self.site_metadata_filename, usecols = self.metadata_features)
        wanted_sites = site_metadata_df['site_id'].isin(self.train_sites + self.test_sites)
        # Sites with zero monthly data are excluded (open question for the team).
        has_monthly = site_metadata_df['monthly_data_available'] == 'Yes'
        return site_metadata_df.loc[wanted_sites & has_monthly].reset_index(drop=True)


    def merge_site_metadata(self, data_df, site_metadata_df):
        """Left-join the site metadata onto the hourly data by `site_id`.

        The `filename`, `koppen_main`, `koppen_name` and `monthly_data_available`
        columns are dropped from the metadata before merging.
        """
        site_metadata_df = site_metadata_df.drop(['filename', 'koppen_main', 'koppen_name', 'monthly_data_available'], axis=1)
        data_df = data_df.merge(site_metadata_df, how='left', left_on='site_id', right_on='site_id')
        print(f"Data size after after merged with site metadata: {data_df.shape}")
        return data_df


    def merge_monthly_data(self, data_df):
        """Left-join the monthly data onto the hourly data by (site, year, month)."""
        # Prep monthly
        monthly_df = pd.read_csv(self.monthly_data_filename)
        monthly_df = monthly_df.loc[monthly_df['SITE_ID'].isin(self.train_sites + self.test_sites)]
        monthly_df = monthly_df.reset_index(drop=True)
        monthly_df[['year','month', 'MODIS_LC']] = monthly_df[['year','month', 'MODIS_LC']].astype('int')
        print(f"# NAs in monthly: {monthly_df.isna().sum().sum()}")
        print(f"Num site-year-months (monthly): {len(monthly_df[['SITE_ID', 'year', 'month']].drop_duplicates())}")

        # Merge
        data_df = data_df.merge(monthly_df, how='left',
                        left_on =['site_id', 'year', 'month'],
                        right_on=['SITE_ID', 'year', 'month'])
        data_df = data_df.drop('SITE_ID', axis=1)
        print(f"Data size after after merged with monthly data: {data_df.shape}")
        return data_df


    def all_sites_all_sources(self, knn_imp_cols, resample, impute, impute_method, impute_global, k, weights, n_fit, time_col, duration):
        """Run the full pipeline and return the merged table, target column first.

        Args are passed straight through to `site_data_cleanup`.

        Returns:
            DataFrame of hourly data joined with site metadata and monthly data.
        """
        site_metadata_df = self.prep_metadata()
        data_df = self.site_data_cleanup(site_metadata_df, knn_imp_cols, resample, impute, impute_method, 
                                        impute_global, k, weights, n_fit, time_col, duration)

        # Merge with site metadata and monthly data
        print(f"NA values remaining at beginning of merges: {data_df.isna().sum().sum()}")
        data_df = self.merge_site_metadata(data_df, site_metadata_df)
        data_df = self.merge_monthly_data(data_df)
        print(f"NA values remaining at end of merges: {data_df.isna().sum().sum()}")

        # Reorder columns so the target comes first. Uses the instance's configured
        # target rather than a notebook-level global of the same name.
        features = data_df.columns.to_list()
        features.remove(self.target_variable)
        data_df = data_df[[self.target_variable] + features]
        print(f"Num site-year-months (data_df): {len(data_df[['site_id', 'year', 'month']].drop_duplicates())}")

        return data_df


In [26]:
prep_hourly = PrepareAllSitesHourly(site_metadata_filename, monthly_data_filename, train_sites, test_sites, 
                                    hourly_features, metadata_features, target_variable_qc, target_variable, raw_data_dir)

data_df = prep_hourly.all_sites_all_sources(knn_imp_cols, resample, impute, impute_method, impute_global,
                                            k, weights, n_fit, time_col, duration)

Filling missing values with ffill at site-level, then at global-level at end


4it [00:02,  1.50it/s]


Filling global NA with KNNImpute
Begin global imputing for fully missing features at site-level
NA values remaining before global impute: 146400
Final NA Count: 0
NA values remaining at end of cleanup: 0
NA values remaining at beginning of merges: 0
Data size after after merged with site metadata: (435432, 34)
# NAs in monthly: 0
Num site-year-months (monthly): 596
Data size after after merged with monthly data: (435432, 50)
NA values remaining at end of merges: 0
Num site-year-months (data_df): 596


In [None]:
# Weird print statements!

# Checkpoint: Upload Data to Azure Storage Blob as Parquet

In [None]:
# Optional checkpoint: serialise the cleaned table to Parquet in memory and upload it.
# ref: https://stackoverflow.com/a/54666079
data_cleanup_checkpoint = False
tag = "raw"
blob_name = "{}_{}.{}".format(blob_name_base, tag, ext)

if data_cleanup_checkpoint:
  # Write into an in-memory buffer, then rewind so the upload reads from the start.
  parquet_file = BytesIO()
  data_df.to_parquet(parquet_file, engine='pyarrow')
  parquet_file.seek(0)

  print("Uploading raw data checkpoint to Azure")
  azStorageClient = AzStorageClient(az_cred_file)
  azStorageClient.uploadBlob(container, blob_name, parquet_file, overwrite=True)

In [None]:
# Optionally split the untransformed table into train/test and upload both.
get_non_transform_train_test = False
if get_non_transform_train_test:
  data_transformer = TFTDataTransformer(train_sites, test_sites, None, data_df)
  train_df, test_df = data_transformer.get_test_train_raw()

  # Preview the first rows of each split.
  for _heading, _split in (("Train data peak:", train_df), ("Test data peak:", test_df)):
    print(_heading)
    display(_split.head(5))

  # Blob names carry the `tag` set in the raw-checkpoint cell.
  train_blob_name = "{}-{}.{}".format(train_blob_name_base, tag, ext)
  test_blob_name = "{}-{}.{}".format(test_blob_name_base, tag, ext)
  data_transformer.upload_train_test_to_azure(
    az_cred_file, container, train_blob_name, test_blob_name)

# Stage 2: Data Transform - Convert to Model Ready Data
- Encode data
- Split into train and test
- Assemble data through VectorAssembler
- Normalize data with min-max scaling

In [None]:
# Switches: reuse the uploaded raw checkpoint instead of the in-memory table,
# and pick the Spark or the TFT (pandas) transformer.
load_data_from_previous_checkpoint = False
useSpark = False

raw_data_file_path = None
if load_data_from_previous_checkpoint:
  # The in-memory table is discarded; the transformer is given the file path.
  data_df = None
  raw_data_file_path = os.sep.join([tmp_dir, blob_name])
  print("loading {}...".format(raw_data_file_path))
  if not os.path.exists(raw_data_file_path):
      # No local copy yet: download the blob, keep it in memory and cache it to disk.
      if not os.path.exists(tmp_dir):
          os.mkdir(tmp_dir)
      azStorageClient = AzStorageClient(az_cred_file)
      file_stream = azStorageClient.downloadBlob2Stream(container, blob_name)
      data_df = pd.read_parquet(file_stream, engine='pyarrow')
      data_df.to_parquet(raw_data_file_path)

if not useSpark:
  data_transformer = TFTDataTransformer(train_sites, test_sites,
                                        raw_data_file_path, data_df)
else:
  # NOTE(review): `spark` is not created in any visible cell - confirm where it comes from.
  data_transformer = PySparkMLDataTransformer(spark, train_sites, test_sites,
                                              raw_data_file_path, data_df)

In [None]:
# Column roles handed to the transformer.
timestamp_col = ['datetime']
target_col = 'GPP_NT_VUT_REF'

if not useSpark: # TFT Data Transformer
  categorical_cols = [
    'IGBP', 'c3c4', 'koppen_sub', 'koppen', 'site_id',
    'year', 'month', 'day', 'hour', 'minute',
    'MODIS_PFT', 'MODIS_LC',
  ]
  realNum_cols = [
    'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
    'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
    'lat', 'long', 'c4_percent',
    'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily',
    'PET', 'Ts', 'ESACCI-sm', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai',
    'LST_Day', 'LST_Night',
  ]
  backup_cols = ['IGBP', 'koppen', 'site_id']
  data_transformer.data_transform(
    categorical_cols, realNum_cols, backup_cols, timestamp_col, target_col)

  # Preview both splits with rich DataFrame display.
  print("Train data peak:")
  display(data_transformer.train_df.head(5))
  print("Test data peak:")
  display(data_transformer.test_df.head(5))

  # The pandas path names its blobs with the file extension.
  train_blob_name = train_blob_name_base + "." + ext
  test_blob_name = test_blob_name_base + "." + ext

else: # Spark ML Data Transformer
  categorical_cols = ['IGBP', 'c3c4', 'koppen_sub', 'koppen', 'MODIS_PFT', 'MODIS_LC']
  data_transformer.data_transform(categorical_cols, timestamp_col, target_col)

  # Preview both splits without truncating column values.
  print("Train data peak:")
  data_transformer.train_df.show(5, False)
  print("Test data peak:")
  data_transformer.test_df.show(5, False)

  # The Spark path names its blobs without an extension.
  train_blob_name = train_blob_name_base
  test_blob_name = test_blob_name_base

# Checkpoint: Upload train and test to Azure Blob Storage

In [None]:
# Upload the transformed train/test sets; set the flag to False to skip the upload.
final_checkpoint = True

if final_checkpoint:
  data_transformer.upload_train_test_to_azure(
    az_cred_file, container, train_blob_name, test_blob_name)

# Fix Class

In [None]:
def add_time_index(df, time_col, freq=None):
    """Resample `df` onto a regular time grid and number the timesteps.

    Rows are inserted for missing timesteps; inserted rows get `gap_flag` 1
    (observed rows 0) and NaN in every data column. A `timestep_idx` column
    (0..n-1 in time order) is added and `year`/`month`/`day`/`hour`/`date`
    are recomputed from `time_col`.

    Args:
        df: frame with a datetime-typed `time_col`. Not modified.
        time_col: name of the timestamp column.
        freq: pandas resampling frequency string. When omitted, the
            notebook-level `duration` setting is used, as before.

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    if freq is None:
        freq = duration  # notebook-level default

    # Work on a copy so the caller's frame is not re-indexed or given new columns.
    df = df.copy()
    df['gap_flag'] = 0
    df = (
        df.sort_values(time_col)
          .set_index(time_col)
          .resample(freq)
          .first()
          .reset_index()
    )
    df.index.name = 'timestep_idx'
    df = df.reset_index()
    # Rows created by the resample have NaN here. Assign back instead of a chained
    # in-place fillna, which is a silent no-op under pandas copy-on-write.
    df['gap_flag'] = df['gap_flag'].fillna(1)

    # Time parts are NaN on inserted rows; recompute them from the timestamp column
    # (the column named by time_col, not a hard-coded 'datetime').
    stamp = df[time_col].dt
    df['year'] = stamp.year.astype(int)
    df['month'] = stamp.month.astype(int)
    df['day'] = stamp.day.astype(int)
    df['hour'] = stamp.hour.astype(int)
    df['date'] = stamp.date

    return df

In [None]:
def knn_impute_site(site_df, imp_cols, k, weights, n):
    """Fill NaNs in `imp_cols` of one site with a KNNImputer fit on that site.

    Columns that are entirely NaN for the site are skipped (left for the global
    impute). The imputer is fit on at most `n` randomly sampled complete rows.

    Args:
        site_df: one site's frame; filled in place and also returned.
        imp_cols: numeric columns to impute.
        k: number of neighbours.
        weights: KNNImputer `weights` argument.
        n: maximum number of complete rows used to fit the imputer.

    Returns:
        `site_df` with the imputable NaNs filled.
    """
    group_knn_df = site_df[imp_cols].copy()
    group_knn_df = group_knn_df.dropna(axis=1, how='all') # drop col if all NA, need to globally impute later

    na_mask = group_knn_df.isna().any(axis=1)
    if not na_mask.any():
        # Nothing to impute; the imputer cannot transform zero rows.
        return site_df

    complete_rows = group_knn_df.dropna()
    if complete_rows.empty:
        print("No complete rows for site-level KNN impute; leaving NA values for global impute")
        return site_df

    # Fit on a subset of rows to speed up imputation; cap at what is available
    # so small sites do not raise in sample().
    not_na_rows = complete_rows.sample(n=min(n, len(complete_rows)))

    imputer = KNNImputer(n_neighbors=k, weights=weights)
    imputer.fit(not_na_rows)

    # Assign the raw array: it is already in na_mask row order, whereas wrapping it
    # in a fresh RangeIndex DataFrame would be label-aligned onto the wrong rows.
    group_knn_df.loc[na_mask] = imputer.transform(group_knn_df[na_mask])

    # Fill NA in initial site/group df
    site_df.fillna(group_knn_df, inplace=True)

    return site_df

In [None]:
def knn_impute_global(self, df):
    """Fill remaining NaNs in `self.imp_cols` with a KNNImputer fit across all rows.

    Reads `self.imp_cols`, `self.k`, `self.weights` and `self.n` (the maximum number
    of complete rows used to fit the imputer).

    Args:
        self: object carrying the settings above.
        df: frame to impute; filled in place and also returned.

    Returns:
        `df` with NaNs in the imputed columns filled.

    Raises:
        ValueError: if rows need imputing but no row is complete in the imputed columns.
    """
    print("Begin global imputing for fully missing features at site-level")
    print(f"NA values remaining before global impute: {df.isna().sum().sum()}")
    
    # Work on a positionally indexed copy so rows can be addressed unambiguously
    # even if df's own index has repeated labels.
    df_inds = df.index
    data_df_copy = df[self.imp_cols].copy()
    data_df_copy.reset_index(drop=True, inplace=True)

    # Use Global Imputing for Sites that have 100% of one feature missing (couldn't impute at site-level)
    na_mask = data_df_copy.isna().any(axis=1)
    if not na_mask.any():
        # No NaN in the imputable columns; the imputer cannot transform zero rows.
        print(f"Final NA Count: {df.isna().sum().sum()}")
        return df

    complete_rows = data_df_copy.dropna()
    if complete_rows.empty:
        raise ValueError("Global KNN impute needs at least one row with no NA in the imputed columns")
    # `self.n` replaces a bare `n` that was never defined in this function;
    # the sample is capped at the number of complete rows available.
    not_na_rows = complete_rows.sample(n=min(self.n, len(complete_rows)))

    # Execute imputation
    imputer = KNNImputer(n_neighbors=self.k, weights=self.weights)
    imputer.fit(not_na_rows)
    data_df_copy.loc[na_mask] = imputer.transform(data_df_copy[na_mask])

    # Restore the original index so fillna lines up with df.
    data_df_copy.set_index(df_inds, inplace=True)

    # Fill NA in initial site/group df
    df.fillna(data_df_copy, inplace=True)
    print(f"Final NA Count: {df.isna().sum().sum()}")

    return df

In [None]:
def merge_site_metadata(self, data_df, site_metadata):
    """Left-join `site_metadata` onto `data_df` by `site_id`.

    Args:
        self: unused.
        data_df: hourly data with a `site_id` column.
        site_metadata: per-site metadata with a `site_id` column.

    Returns:
        The merged DataFrame; every row of `data_df` is kept.
    """
    # Merge the frame that was passed in, not a notebook-level `site_metadata_df`.
    return data_df.merge(site_metadata, how='left', on='site_id')


def all_sites_all_sources(self):
    """Clean all sites, merge site metadata and monthly data, and put the target first.

    Reads `self.site_metadata_df` and `self.target_variable`, and calls
    `self.site_data_cleanup()` and `self.merge_site_metadata(...)`.

    Returns:
        The merged DataFrame with the target column first.
    """
    data_df = self.site_data_cleanup()

    # Merge with site metadata
    data_df = self.merge_site_metadata(data_df, self.site_metadata_df.drop(['filename', 'koppen_main', 'koppen_name'], axis=1))
    print(f"Data size after after merged with site metadata: {data_df.shape}")

    # Merge with monthly data
    # NOTE(review): `monthly_df` is not a parameter or attribute here; it must exist
    # as a notebook-level variable for this call to work - confirm where it is set.
    data_df = data_df.merge(monthly_df.drop('date', axis=1), how='left',
                            left_on =['site_id', 'year', 'month'],
                            right_on=['SITE_ID', 'year', 'month'])
    data_df = data_df.drop('SITE_ID', axis=1)
    print(f"Data size after after merged with monthly data: {data_df.shape}")

    # Reorder columns so the target comes first, using the target configured on
    # `self` rather than a notebook-level global of the same name.
    features = data_df.columns.to_list()
    features.remove(self.target_variable)
    data_df = data_df[[self.target_variable] + features]

    return data_df


In [None]:
def site_data_cleanup(time_col, imp_cols, k, weights, n):
    """Draft module-level variant of the per-site cleanup loop.

    For every row of `self.site_metadata_df` it reads the site's CSV, keeps the
    on-the-hour rows, optionally resamples and imputes, concatenates the sites,
    and finally handles any remaining NaNs.

    NOTE(review): the body reads `self` throughout but the signature has no `self`
    parameter, so a call raises NameError unless a global named `self` exists.
    NOTE(review): it also calls `self.add_time_index` with two arguments and
    `self.knn_impute_global` with five, and reads `self.time_col`, `self.imp_cols`,
    `self.k`, `self.weights`, `self.n` while also taking `time_col`, `imp_cols`,
    `k`, `weights`, `n` as parameters - confirm the intended interface before use.

    Args:
        time_col: timestamp column name, passed to `self.add_time_index`.
        imp_cols, k, weights, n: passed to the module-level `knn_impute_site`.

    Returns:
        The concatenated DataFrame of all sites.
    """
    data_df = None
    qc_flags_features = [s for s in self.hourly_features if "_QC" in s]

    ## PRINT THE PLAN
    if self.impute:
        print(f"Filling missing values with {self.impute_method} at site-level, then at global-level at end")
    else:
        print("Filling all NA values with -1")

    ## SITE-LEVEL CLEANING -> CONCATENATE
    for i, r in tqdm(self.site_metadata_df[['site_id','filename']].iterrows()):        
        # Skip sites whose filename is empty or not a string (e.g. NaN from the CSV).
        if not r.filename or type(r.filename) != type(""):
            print(f'ERROR: {r.site_id} is mssing hourly data.')
            continue

        # Prepare hourly site df
        local_filename = self.data_dir + os.sep + r.filename
        site_df = pd.read_csv(local_filename, usecols = [self.target_variable, self.target_variable_qc] + self.hourly_features)

        # Format columns
        site_df['datetime'] = pd.to_datetime(site_df['datetime'])
        site_df['date'] = pd.to_datetime(site_df['date'])
        site_df['minute'] = site_df['datetime'].dt.minute
        if len(qc_flags_features) != 0:
            site_df[qc_flags_features] = site_df[qc_flags_features].astype('int')
        site_df['site_id'] = r.site_id


        # ----------------- #
        # LATER: FILTER SITE-DF TO THE BEST SEQUENCE OF X YEARS (e.g., if we only use 1.5 years per site)
        # ----------------- #


        # Move from HH to H level
        site_df = site_df.loc[site_df['datetime'].dt.minute == 0, ].copy()

        # Drop rows with NAs, or bad NEE_VUT_REF_QC for Target Variable <-------------------- MAKE BAD_QC_FLAGS==3 -> NAN to be imputed
        # NOTE(review): all QC handling below is commented out, so in this draft the QC
        # column is kept and bad-QC targets are neither dropped nor blanked.
        #site_df.dropna(subset=[self.target_variable], axis=0, inplace=True)
        #site_df.drop(site_df[site_df[self.target_variable_qc] == 3].index, inplace = True)
        #site_df.drop([self.target_variable_qc], axis=1, inplace=True)

        # Resample to add rows for missing timesteps, assign timestep_idx and "gap_flag"
        if self.resample:
            site_df = self.add_time_index(site_df, time_col)
        else:
            # reset_index() without drop=True keeps the previous index as an 'index' column.
            site_df.sort_values(self.time_col, inplace=True)
            site_df = site_df.reset_index()
            site_df.index.name='timestep_idx'
            site_df = site_df.reset_index()

        # Impute missing values at site-level, otherwise fillna w/ -1 at very end
        if self.impute:
            if self.impute_method=='ffill': # select most recent record
                site_df.sort_values(self.time_col, ascending=True, inplace=True)
                site_df.fillna(method="ffill", inplace=True)
                
            elif self.impute_method=='knn': # use KNNImputer
                site_df = knn_impute_site(site_df, imp_cols, k, weights, n)

        # When done cleaning site -> concatenate site_dfs together into global data_df
        if type(data_df) == type(None):
            data_df = site_df
        else:
            data_df = pd.concat([data_df, site_df])

    ## Handle Global Data
    # If we imputed at site-level already, there may be some features 100% missing for site...
    # ... -> thus, we need to impute using global data to fill these
    # NOTE(review): if every site was skipped, data_df is still None here and
    # `.isna()` raises AttributeError.
    if data_df.isna().sum().sum() != 0:
        if self.impute_global:
            print("Filling global NA with KNNImpute")
            data_df = self.knn_impute_global(data_df, self.imp_cols, self.k, self.weights, self.n)
        elif type(self.impute_method) != type(None):
            data_df.fillna(-1, inplace=True)
            print("Filling global NA w/ -1")
        elif type(self.impute_method) == type(None):
            print("Not filling global NA values")
    print(f"NA values remaining at end of cleanup: {data_df.isna().sum().sum()}")

    return data_df