### Imputation and Gap-Filling Logic (Dev)
Goal: Quantify gaps for each site to understand extent of the problem

## Imports and Paths

In [None]:
# install required modules quietly
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import math
import json

from pyspark.sql.functions import col
import pyspark.pandas as pd
from calendar import monthrange
from datetime import datetime
from io import BytesIO

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load local custom modules
import sys
sys.path.append(os.path.abspath("../tools"))

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

# Notebook display settings: show up to 500 rows, do not cap the number of
# columns, and render floats with five decimal places.
def _format_float(value):
    return '%.5f' % value


for _option, _setting in (
    ('display.max_rows', 500),
    ('display.max_columns', None),
    ('display.float_format', _format_float),
):
    pd.set_option(_option, _setting)

required_packages = ['geopandas', 'pyspark', 'azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

# Project directory layout; every path below hangs off the repository root.
MY_HOME_ABS_PATH = "/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling"
root_dir = MY_HOME_ABS_PATH

# Working directories
tmp_dir = os.sep.join([root_dir, '.tmp'])
raw_data_dir = tmp_dir  # raw site files are read from the temp directory
data_dir = os.sep.join([root_dir, 'data'])

# Azure blob credential file
cred_dir = os.sep.join([root_dir, '.cred'])
az_cred_file = os.sep.join([cred_dir, 'azblobcred.json'])

## Prepare One Site Data

In [None]:
# Feature and target configuration for the data pipeline.
target_variable = 'GPP_NT_VUT_REF'
target_variable_qc = 'NEE_VUT_REF_QC'

# Feature columns, grouped by naming pattern and concatenated in the order
# they are requested from the site files.
_era_cols = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA']
_time_cols = ['datetime', 'year', 'month', 'day', 'hour', 'date']
_index_and_band_cols = ['EVI', 'NDVI', 'NIRv',
                        'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7']
_class_cols = ['IGBP', 'koppen']

included_features = _era_cols + _time_cols + _index_and_band_cols + _class_cols

In [None]:
# Choose the single site to analyse and load its metadata
site = 'CN-HaM' # <--- reduced to one site by John

# Columns to read from the site metadata CSV
included_site_features = [
    'site_id', 'filename', 'elevation', 'lat', 'long',
    'koppen_sub', 'koppen_main', 'koppen_name',
    'c3c4', 'c4_percent',
]

site_metadata_filename = os.sep.join([data_dir, 'site-metadata.csv'])
all_sites_df = pd.read_csv(site_metadata_filename, usecols=included_site_features)

# Keep only the row(s) belonging to the selected site
is_target_site = all_sites_df['site_id'].isin([site])
site_metadata_df = all_sites_df.loc[is_target_site]
print(f"size:{site_metadata_df.shape}")

# Renumber rows from 0 and show the result as the cell output
site_metadata_df = site_metadata_df.reset_index(drop=True)
site_metadata_df

#### Load Site w/ cleanup function

In [None]:
# Load site data
def data_cleanup(data_dir, site_id_file_df, target, target_qc, features):
    """Load and clean the records of every site listed in ``site_id_file_df``.

    For each row of ``site_id_file_df`` the CSV named in its ``filename``
    column is read from ``data_dir`` (only the ``target``, ``target_qc`` and
    ``features`` columns), and then:

    * ``datetime`` and ``date`` are parsed as datetimes and a ``minute``
      column is derived from ``datetime``;
    * every feature whose name contains ``"_QC"`` is cast to ``int``;
    * a ``site_id`` column is added;
    * rows with a missing ``target`` value are dropped;
    * rows whose ``target_qc`` equals 3 are dropped, after which the
      ``target_qc`` column itself is removed;
    * only rows with ``minute == 0`` are kept (half-hourly -> hourly).

    Rows with missing *feature* values are kept on purpose so that gaps can
    be analysed downstream.

    Args:
        data_dir: Directory containing the per-site CSV files.
        site_id_file_df: Frame with at least ``site_id`` and ``filename``
            columns; sites without a usable filename are reported and skipped.
        target: Name of the target column.
        target_qc: Name of the quality-flag column for the target.
        features: Names of the feature columns to load; must include
            ``datetime`` and ``date``.

    Returns:
        The cleaned rows of all loaded sites stacked into one frame, or
        ``None`` when no site could be loaded.
    """
    qc_flags_features = [s for s in features if "_QC" in s]
    usecols = [target, target_qc] + features
    site_frames = []

    for _, r in site_id_file_df.iterrows():
        # A missing filename is either empty or not a string at all (e.g. NaN).
        if not isinstance(r.filename, str) or not r.filename:
            print(f'\nERROR: {r.site_id} is mssing hourly data.')
            continue

        # Read only the requested columns from the site file
        local_filename = data_dir + os.sep + r.filename
        site_df = pd.read_csv(local_filename, usecols=usecols)
        site_df['datetime'] = pd.to_datetime(site_df['datetime'])
        site_df['date'] = pd.to_datetime(site_df['date'])
        site_df['minute'] = site_df['datetime'].dt.minute
        if qc_flags_features:
            site_df[qc_flags_features] = site_df[qc_flags_features].astype('int')
        site_df['site_id'] = r.site_id

        # NOTE: rows with zero or negative SW_IN_ERA are deliberately NOT
        # removed here (filter disabled by John, pending discussion).
        # Open question: when gap-filling a completely blank day, how do we
        # know where the filled timesteps should begin and end?
        #   1. If other research does not drop these rows, we should not
        #      either, so that metrics stay comparable.
        #   2. If they are kept, errors can still be analysed per hour.
        #   3. The trade-off is feeding less meaningful features to the model.

        # Drop rows without a target value
        site_df = site_df.dropna(subset=[target], axis=0)

        # Drop rows flagged with QC == 3 (bad GPP records), then the flag itself
        site_df = site_df[site_df[target_qc] != 3]
        site_df = site_df.drop([target_qc], axis=1)

        # Move from half-hourly to hourly resolution
        site_df = site_df[site_df['minute'] == 0].copy()

        print(f"{r.site_id}: {site_df.shape}")
        site_frames.append(site_df)

    # Concatenate once at the end rather than growing a frame inside the loop
    return pd.concat(site_frames) if site_frames else None

# Run the initial clean-up and feature selection on the raw site data
data_df = data_cleanup(
    raw_data_dir,
    site_metadata_df,
    target_variable,
    target_variable_qc,
    included_features,
)
print(f"Data size after cleanup: {data_df.shape}")

# Steps below are currently disabled:
# # Merge with site metadata
# data_df = merge_site_metadata(data_df, site_metadata_df.drop(['filename', 'koppen_main', 'koppen_name'], axis=1))
# print(f"Data size after after merged with site metadata: {data_df.shape}")

# Drop rows with NA
# check_and_drop_na(data_df) <---------------- REMOVED BY JOHN
#print(f"Data size after after final drop: {data_df.shape}")

# Put the target first, followed by all other columns in their existing order
features = data_df.columns.to_list()
features.remove(target_variable)
data_df = data_df[[target_variable, *features]]

# Renumber rows from 0  <---------------- ADDED BY JOHN
data_df = data_df.reset_index(drop=True)

display(data_df.head(3))

### Hourly Gaps

In [None]:
# Index the cleaned records by timestamp. set_index already returns a new
# frame, so no explicit copy of data_df is needed first.
data_df_hr = data_df.set_index('datetime')

# Re-index onto a regular hourly grid; hours without a record become rows
# in which every column is null.
data_df_imp_gf = data_df_hr.resample('H').asfreq()

# Fully-null rows are the hours missing from the data
missing_df = data_df_imp_gf[data_df_imp_gf.isnull().all(axis=1)]

# Collect metrics - hourly (pct_hrs is a fraction in [0, 1], not a percentage)
count_hrs = len(missing_df)
pct_hrs = round(count_hrs/len(data_df_imp_gf), 3)
print(f"Hours missing: Count = {count_hrs}, % = {100*count_hrs/len(data_df_imp_gf):.1f}")

# Determine missing streaks: runs of consecutive missing hours.
# Each streak is keyed by the timestamp of its LAST missing hour and maps to
# the streak length in hours.
streaks = {}
current_streak = 0
one_hour = pd.Timedelta(hours=1)
missing_hours = missing_df.index

for ts in missing_hours:
    current_streak += 1
    # The streak ends when the following hour is not missing
    if ts + one_hour not in missing_hours:
        streaks[ts] = current_streak
        current_streak = 0

# Show the resulting dictionary of missing streaks
display(streaks)

### Daily Gaps

In [None]:
# Count the hourly records present on each calendar day. A day is "fully
# missing" only when it holds no records at all. Using resample('D').asfreq()
# here would only look at the record stamped exactly at midnight, so a day
# whose 00:00 row was dropped would be misreported as entirely missing.
daily_df = data_df_hr.resample('D').size().to_frame('n_records')
missing_days = daily_df[daily_df['n_records'] == 0]

# Determine missing day streaks: runs of consecutive fully-missing days.
# Each streak is keyed by its LAST missing day and maps to its length in days.
streaks_day = {}
current_streak = 0
one_day = pd.Timedelta(days=1)
missing_day_index = missing_days.index

for day in missing_day_index:
    current_streak += 1
    # The streak ends when the following day has at least one record
    if day + one_day not in missing_day_index:
        streaks_day[day] = current_streak
        current_streak = 0

# Show the resulting dictionary of missing day streaks
display(streaks_day)

# Collect metrics
total_days_missing = len(missing_days)
count_all_missing_streaks = len(streaks_day)
big_streak = 5  # a "big" streak is strictly longer than this many days
count_big_missing_streaks = sum(1 for n in streaks_day.values() if n > big_streak)
print(total_days_missing)
print(count_all_missing_streaks)
print(count_big_missing_streaks)

### Save to DF

In [None]:
# Summary table with one row of gap metrics per site
columns = ["site_id", "total_hours",  "count_hrs_missing", "pct_hrs_missing", "total_full_days_missing", 
"count_all_missing_streaks", "count_big_missing_streaks", "streaks_hr_dict", "streaks_day_dict"]
site_missing_df = pd.DataFrame(columns=columns)

# Append this site's metrics. The values come from the hourly and daily gap
# cells, which name them count_hrs / pct_hrs / streaks; total hours is the
# length of the regular hourly grid (data_df_imp_gf).
site_missing_df.loc[len(site_missing_df)] = [
    site,
    len(data_df_imp_gf),        # total_hours
    count_hrs,                  # count_hrs_missing
    pct_hrs,                    # pct_hrs_missing
    total_days_missing,         # total_full_days_missing
    count_all_missing_streaks,
    count_big_missing_streaks,
    streaks,                    # streaks_hr_dict
    streaks_day,                # streaks_day_dict
]

site_missing_df