### Imputation and Gap-Filling Logic (Dev)
Goal: To build a set of functions to impute and gap-fill site-level data in the pipeline process

Imputation Logic: Use KNNImputer to find the k most similar records (nearest neighbors) to each record with a missing value, and fill the gap with the average of those neighbors' values
Gap-Filling Logic: TBD

## Imports and Paths

In [1]:
# Imports (the quiet install check for missing packages runs further down in this cell)
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import math
import json

from pyspark.sql.functions import col
import pyspark.pandas as pd
from calendar import monthrange
from datetime import datetime
from io import BytesIO

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load local custom modules
import sys
sys.path.append(os.path.abspath("../tools"))

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

# Notebook display settings: up to 500 rows, no column limit, floats shown with 5 decimals
for _option, _value in (
    ('display.max_rows', 500),
    ('display.max_columns', None),
    ('display.float_format', lambda x: '%.5f' % x),
):
    pd.set_option(_option, _value)

required_packages = ['geopandas', 'pyspark', 'azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

# Project root. Defaults to the original author's checkout; set the MY_HOME_ABS_PATH
# environment variable to run the notebook from another checkout without editing this cell.
MY_HOME_ABS_PATH = os.environ.get(
    "MY_HOME_ABS_PATH",
    "/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling",
)
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.path.join(root_dir, '.tmp')
raw_data_dir = tmp_dir  # raw site files are read from the temp directory
data_dir = os.path.join(root_dir, 'data')
cred_dir = os.path.join(root_dir, '.cred')
az_cred_file = os.path.join(cred_dir, 'azblobcred.json')


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## Prepare One Site Data

In [2]:
# Columns read from each raw site file, listed group by group:
# the *_ERA columns, the timestamp parts, EVI/NDVI/NIRv plus b1..b7, and IGBP/koppen.
included_features = (
    ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA']
    + ['datetime', 'year', 'month', 'day', 'hour', 'date']
    + ['EVI', 'NDVI', 'NIRv']
    + [f'b{band}' for band in range(1, 8)]
    + ['IGBP', 'koppen']
)

# Target column and the QC-flag column used to filter it
target_variable = 'GPP_NT_VUT_REF'
target_variable_qc = 'NEE_VUT_REF_QC'

In [3]:
# Single site used for this development notebook
site = 'CN-HaM' # <--- reduced to one site by John

# Metadata columns to read for each site
included_site_features = [
    'site_id', 'filename', 'elevation', 'lat', 'long',
    'koppen_sub', 'koppen_main', 'koppen_name',
    'c3c4', 'c4_percent',
]

# Read the metadata table, restricted to the columns above
site_metadata_filename = data_dir + os.sep + 'site-metadata.csv'
site_metadata_df = pd.read_csv(site_metadata_filename, usecols=included_site_features)

# Keep only the row(s) for the chosen site
is_target_site = site_metadata_df['site_id'].isin([site])
site_metadata_df = site_metadata_df.loc[is_target_site]
print(f"size:{site_metadata_df.shape}")

# Renumber the remaining rows from 0, then display them
site_metadata_df = site_metadata_df.reset_index(drop=True)
site_metadata_df

size:(1, 10)


Unnamed: 0,site_id,elevation,lat,long,koppen_sub,koppen_main,koppen_name,c3c4,c4_percent,filename
0,CN-HaM,,37.37,101.18,29,5,ET,C3,1.07,data_full_half_hourly_raw_v0_1_CN-HaM.csv


#### Load Site w/ cleanup function

In [5]:
# Load site data
def data_cleanup(data_dir, site_id_file_df, target, target_qc, features):
  """Load each site's CSV, drop unusable rows, and stack all sites into one frame.

  For every site only the rows meeting all of these conditions are kept:
    * SW_IN_ERA is not <= 0 (rows with a missing SW_IN_ERA are kept),
    * `target` is not NA,
    * `target_qc` is not 3,
    * the timestamp is on the hour (minute == 0), which moves the data from
      half-hourly to hourly level.
  Rows with NA in any other column are kept (the blanket dropna was removed).

  Args:
    data_dir: Directory that contains the per-site CSV files.
    site_id_file_df: Frame with one row per site and the columns `site_id` and
      `filename`. Sites whose `filename` is empty or not a string are skipped
      with a printed error.
    target: Name of the target column.
    target_qc: Name of the QC-flag column for the target; it is used for
      filtering and then removed from the output.
    features: Other columns to read. Must include 'datetime', 'date' and
      'SW_IN_ERA'. Any name containing "_QC" is cast to int.

  Returns:
    A DataFrame with the columns read from file (minus `target_qc`) plus
    'minute' and 'site_id'. Row labels are the per-file labels and are not
    reset, so they repeat across sites. Returns None if no site was loaded.
  """
  qc_flags_features = [s for s in features if "_QC" in s]

  # Collect the per-site frames and concatenate once at the end; concatenating
  # inside the loop re-copies the accumulated data on every iteration.
  site_frames = []

  for _, r in site_id_file_df.iterrows():
    if not r.filename or not isinstance(r.filename, str):
      print(f'\nERROR: {r.site_id} is mssing hourly data.')
      continue

    # Get only `features` from file
    local_filename = data_dir + os.sep + r.filename
    site_df = pd.read_csv(local_filename, usecols = [target, target_qc] + features)
    site_df['datetime'] = pd.to_datetime(site_df['datetime'])
    site_df['date'] = pd.to_datetime(site_df['date'])
    site_df['minute'] = site_df['datetime'].dt.minute
    if qc_flags_features:
      site_df[qc_flags_features] = site_df[qc_flags_features].astype('int')
    site_df['site_id'] = r.site_id

    # One combined row filter. The comparisons are written so that NaN in
    # SW_IN_ERA or in the QC flag does NOT drop the row.
    keep = (
        ~(site_df['SW_IN_ERA'] <= 0)        # remove zero or negative SW
        & site_df[target].notna()           # remove rows without a target value
        & (site_df[target_qc] != 3)         # remove bad-QC target records
        & (site_df['minute'] == 0)          # move from HH to H level
    )
    site_df = site_df.loc[keep].drop(columns=[target_qc])

    print(f"{r.site_id}: {site_df.shape}")
    site_frames.append(site_df)

  if not site_frames:
    return None
  return pd.concat(site_frames)

# Run the cleanup on the selected site(s), reading only the configured columns
data_df = data_cleanup(
    raw_data_dir,
    site_metadata_df,
    target_variable,
    target_variable_qc,
    included_features,
)
print(f"Data size after cleanup: {data_df.shape}")

# # Merge with site metadata
# data_df = merge_site_metadata(data_df, site_metadata_df.drop(['filename', 'koppen_main', 'koppen_name'], axis=1))
# print(f"Data size after after merged with site metadata: {data_df.shape}")

# Drop rows with NA
# check_and_drop_na(data_df) <---------------- REMOVED BY JOHN
#print(f"Data size after after final drop: {data_df.shape}")

# Put the target column first, followed by the other columns in their existing order
features = data_df.columns.to_list()
features.remove(target_variable)
data_df = data_df[[target_variable, *features]]

# Renumber rows from 0  <---------------- ADDED BY JOHN
data_df = data_df.reset_index(drop=True)

display(data_df.head(3))

CN-HaM: (12726, 27)
Data size after cleanup: (12726, 27)


Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id
0,-3.00263,-13.853,53.56,224.623,0.515,0.0,68.683,2002-01-15 09:00:00,2002,1,15,9,2002-01-15,-0.03115,-0.00702,-0.0064,0.9252,0.9123,0.8571,0.9113,0.5601,0.1609,0.0708,GRA,Polar,0,CN-HaM
1,-2.05477,-13.545,121.268,224.623,0.583,0.0,68.734,2002-01-15 10:00:00,2002,1,15,10,2002-01-15,-0.03115,-0.00702,-0.0064,0.9252,0.9123,0.8571,0.9113,0.5601,0.1609,0.0708,GRA,Polar,0,CN-HaM
2,0.54304,-13.237,301.45,223.175,0.651,0.0,68.784,2002-01-15 11:00:00,2002,1,15,11,2002-01-15,-0.03115,-0.00702,-0.0064,0.9252,0.9123,0.8571,0.9113,0.5601,0.1609,0.0708,GRA,Polar,0,CN-HaM


### Encode categoricals for KNN only
Decision to make: Do we treat month, hour as numerical, or use Season and TOD groupings? The question is how will KNN use the time information to identify similar records?

In [6]:
## Add season and TOD groupings
def define_season_tod(df):
    """Label each row with a season (from 'month') and a 6-hour time block (from 'hour').

    The 'season' and 'time_block' columns are written onto `df` itself, and the same
    frame is returned. Rows whose month/hour matches none of the groups are not assigned.
    """
    # Season label -> months it covers
    season_months = (
        ('WINTER', [12, 1, 2]),
        ('SPRING', [3, 4, 5]),
        ('SUMMER', [6, 7, 8]),
        ('FALL', [9, 10, 11]),
    )
    for season_label, months in season_months:
        df.loc[df['month'].isin(months), 'season'] = season_label

    # Time-block label -> first hour of its 6-hour window
    block_starts = (('1', 0), ('2', 6), ('3', 12), ('4', 18))
    for block_label, first_hour in block_starts:
        hours = list(range(first_hour, first_hour + 6))
        df.loc[df['hour'].isin(hours), 'time_block'] = block_label

    return df



In [10]:
from sklearn.impute import KNNImputer

# Encode categorical features for KNN Imputer
def knn_impute(df, knn_imp_cat, knn_imp_real, k=5):
    """Fill missing values in the real-valued feature columns using sklearn's KNNImputer.

    The imputer is fitted on `knn_imp_real` plus the one-hot encoding of `knn_imp_cat`.
    Only columns that also exist in `df` are written back; the one-hot columns get new
    names from `get_dummies`, so `DataFrame.update` ignores them.

    Args:
        df: Input frame. Must contain 'month' and 'hour' (read by define_season_tod) and
            every column in `knn_imp_real`. NOTE: define_season_tod writes the 'season'
            and 'time_block' columns onto this frame in place.
        knn_imp_cat: Categorical columns to one-hot encode for the neighbor search.
        knn_imp_real: Numeric columns used for the neighbor search and imputed.
        k: Number of neighbors passed to KNNImputer (uniform weights).

    Returns:
        A copy of `df` in which the imputed values replace the contents of the
        overlapping columns. A column with no observed value at all cannot be imputed
        and is left as-is.

    NOTE(review): features are not scaled before the neighbor search, so columns with
    large magnitudes will presumably dominate the distance. Confirm that is intended.
    """
    # Add Season, TOD features <------ only use if we decide to use these as time features
    df = define_season_tod(df)

    # One-hot encoding
    knn_df = pd.get_dummies(df[knn_imp_cat + knn_imp_real].copy(),
                            columns=knn_imp_cat,
                            dummy_na=False)

    # Initialize KNNImputer
    imputer = KNNImputer(n_neighbors=k, weights='uniform') #<-- may want to try weighted by distance later

    # Fit and transform the data using KNNImputer
    imputed_data = imputer.fit_transform(knn_df)

    # By default KNNImputer drops columns that had no observed value at fit time, so the
    # output can be narrower than the input. Label the output with the surviving columns
    # instead of assuming a 1:1 match (which raises a shape error).
    if imputed_data.shape[1] == knn_df.shape[1]:
        imputed_cols = knn_df.columns
    else:
        imputed_cols = knn_df.columns[knn_df.notna().any(axis=0).to_numpy()]

    # Reuse the input's row labels: update() aligns on the index, so a fresh 0..n-1
    # index would write values to the wrong rows whenever `df` is not indexed 0..n-1.
    imputed_df = pd.DataFrame(imputed_data, columns=imputed_cols, index=knn_df.index)

    # update initial df with imputed values
    data_df_imp = df.copy()
    data_df_imp.update(imputed_df, overwrite=True)

    return data_df_imp


# Categorical features (one-hot encoded inside knn_impute)
knn_imp_cat = ['season', 'time_block']

# Real-valued features: the *_ERA columns, EVI/NDVI/NIRv, and b1..b7
knn_imp_real = (
    ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA']
    + ['EVI', 'NDVI', 'NIRv']
    + [f'b{band}' for band in range(1, 8)]
)

# Impute with the function's default number of neighbors
data_df_imp = knn_impute(data_df, knn_imp_cat, knn_imp_real)


#### CHECK IMPUTATION
Double-check that the imputation only filled in missing values and left every originally non-missing value unchanged

In [39]:
## Compare data_df init to data_df_copy with filled NA values
# confirm no NAs remain in new df
na_rows_post = data_df_imp.isna().any(axis=1).sum()
print(f"Number of NA rows post imputation: {na_rows_post}")

# Rows that were already complete must be identical after imputation.
# Select them by index LABEL (.loc): drop_na.index holds labels, which only coincide
# with positions (.iloc) while the index happens to be a fresh 0..n-1 range.
drop_na = data_df.dropna(how='any')
drop_imp = data_df_imp.loc[drop_na.index]
print(f"Are rows with no NAs the same as before? {drop_na.equals(drop_imp)}")

# For rows that had at least one NA, check that the cells that were NOT missing
# are unchanged. `errors` counts the rows where a difference was found.
na_inds = data_df.index[data_df.isna().any(axis=1)]
errors = 0
for ind in na_inds:
    # Put the before/after versions of the row side by side and keep only the
    # columns that have a value in both
    check_ind = pd.concat([data_df.loc[ind], data_df_imp.loc[ind]], axis=1).dropna()
    check_ind.columns = ['initial', 'post_imp']
    if not check_ind['initial'].equals(check_ind['post_imp']):
        errors += 1
        print(ind)

print(f"Number of non-NA values changed by error: {errors}")

Number of NA rows post imputation: 0
Are rows with no NAs the same as before? True
Number of non-NA values changed: 0


### Calculate Missing Vals (more later, plus move up)

In [None]:
# # Calculate how many rows have any NA <---- Added
# cols_with_all_na = data_df.columns[data_df.isna().all()]
# print(f"Dropping {cols_with_all_na} due to 100% missing")
# data_df.drop(cols_with_all_na, axis=1, inplace=True)
# num_rows_with_na = data_df.isna().any(axis=1).sum()
# print(f"Rows with any NA values: Count = {num_rows_with_na}, % = {100*round(num_rows_with_na/len(data_df), 2):.1f}")
