# Notebook Setup

In [1]:
# Detect whether the notebook is executing inside Google Colab
IN_COLLAB = 'google.colab' in str(get_ipython())

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
MY_HOME_ABS_PATH = "/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling"

# Mount Google Drive when running on Colab
if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

## Import Modules

In [2]:
# install required modules quietly
required_packages = ['geopandas', 'pyspark', 'azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

import os
# Set before pyspark is imported further down.
# NOTE(review): presumably silences pyspark.pandas' PyArrow timezone warning - confirm.
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
os.chdir(MY_HOME_ABS_PATH) # work from the project root so the relative "./code/src/tools" path below resolves
import math
import json
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from pyspark.sql.functions import col
# NOTE: `pd` is the pandas-on-Spark API (pyspark.pandas) in this notebook, not plain pandas
import pyspark.pandas as pd
from calendar import monthrange
from datetime import datetime
from io import BytesIO
from tqdm import tqdm
from sklearn.impute import KNNImputer

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load local custom modules: put the project's tools directory on sys.path
# (inserted first on Colab, appended last otherwise)
import sys
if IN_COLLAB:
  os.chdir(MY_HOME_ABS_PATH)
  sys.path.insert(0,os.path.abspath("./code/src/tools"))
else:
  sys.path.append(os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

# Display options: show up to 500 rows, every column, and floats with 5 decimals
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda value: '%.5f' % value)


# Define paths (all rooted at the project home directory)
root_dir = MY_HOME_ABS_PATH
tmp_dir = f"{root_dir}{os.sep}.tmp"
data_dir = f"{root_dir}{os.sep}data/datasets"
cred_dir = f"{root_dir}{os.sep}.cred"
az_cred_file = f"{cred_dir}{os.sep}azblobcred.json"
# On Colab the raw data is read from the mounted Google Drive instead of the repo
raw_data_dir = (
  "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data"
  if IN_COLLAB
  else f"{root_dir}{os.sep}data"
)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Constant Definitions

In [3]:
# Monthly columns to load: identifier/time columns first, then the features
included_features = [
    'SITE_ID', 'year', 'month', 'TIMESTAMP',
    'ESACCI-sm',      # ESACCI Soil Moisture (%)
    'Percent_Snow',   # Percentage of snow cover (%)
    'NDWI',           # Normalized Difference Water Index (NDWI)
    'PET',            # Potential ET (m)
    'MODIS_PFT',      # Plant Function Type
    'MODIS_LC',       # MODIS Land Cover
    'Ts',             # Skin temperature (K) ??
    'LST_Day',        # Daytime land surface temperature (K)
    'LST_Night',      # Nighttime land surface temperature (K)
    'Lai',            # Leaf Area Index (LAI)
    'Fpar',           # Fraction of photosynthetically active radiation (fPAR)
    'CSIF-SIFdaily',  # All-sky daily average SIF
    'BESS-PAR',       # Photosynthetic Active Radiation (PAR) (W/m^2)
    'BESS-PARdiff',   # Diffuse PAR (W/m^2)
    'BESS-RSDN',      # Shortwave downwelling radiation (W/m^2)
]

# Input and output files for the monthly data
monthly_data_input_fname = f"{data_dir}{os.sep}data_monthly_v1_0.csv"
monthly_data_output_fname = f"{raw_data_dir}{os.sep}monthly-interpolated-v3.csv"

# How to handle missing values and missing months
impute = True                  # False leaves NA values untouched
impute_method = 'interpolate'  # one of 'interpolate', 'knn', 'constant' or None
resample_monthly = True        # add rows for months missing from the monthly table

# Columns handed to the KNN imputer: every included column except the
# identifier, the raw timestamp and the plant function type
knn_imp_cols = [
    feature for feature in included_features
    if feature not in ('SITE_ID', 'TIMESTAMP', 'MODIS_PFT')
]
k = 5                # number of neighbours for the KNN imputer
weights = 'uniform'  # neighbour weighting for the KNN imputer
c = -1               # fill value used when impute_method = 'constant'

In [4]:
class PrepareMonthlyData:
    """Load the monthly feature table and optionally resample / gap-fill it per site.

    Attributes:
        included_features: Column names read from the monthly CSV.
        data_dir: Directory holding the per-site half-hourly files, named
            ``data_full_half_hourly_raw_v0_1_<SITE_ID>.csv``.
        monthly_data_input_fname: Path of the monthly CSV that was loaded.
        month_df: The monthly table, with a ``date`` column parsed from ``TIMESTAMP``.
    """

    def __init__(self, included_features, monthly_data_input_fname, data_dir):
        """Read ``included_features`` from the monthly CSV and parse ``TIMESTAMP`` (``%Y%m``) into ``date``."""
        self.included_features = included_features
        self.data_dir = data_dir
        self.monthly_data_input_fname = monthly_data_input_fname
        self.month_df = pd.read_csv(self.monthly_data_input_fname, usecols=self.included_features)
        self.month_df['date'] = pd.to_datetime(self.month_df['TIMESTAMP'],  format="%Y%m")


    def to_datetime(self, row):
        """Return the timestamp built from a row's ``year`` and ``month`` fields (parsed as ``%Y%m``)."""
        return pd.to_datetime(f'{row.year}{row.month:02}', format='%Y%m')


    def knn_impute_site(self, df, knn_imp_cols, k, weights):
        """Fill NA values of ``df`` with a KNNImputer fitted on ``df[knn_imp_cols]``.

        Columns of ``knn_imp_cols`` that are entirely NA are left out of the fit and
        therefore stay NA. ``df`` is modified in place and also returned.

        Args:
            df: Frame to impute.
            knn_imp_cols: Columns passed to the imputer.
            k: Number of neighbours (``n_neighbors``).
            weights: Neighbour weighting passed to ``KNNImputer``.
        """
        # Fit and transform the data using KNNImputer, format as DF
        inds = df.index.copy()
        df_subcols = df[knn_imp_cols].copy()
        df_subcols = df_subcols.dropna(axis=1, how='all') # drop col if all NA, need to globally impute later

        # Execute imputation
        imputer = KNNImputer(n_neighbors=k, weights=weights)
        imputed_group = imputer.fit_transform(df_subcols)
        imputed_group = pd.DataFrame(imputed_group, columns=df_subcols.columns, index=inds)

        # Fill NA in initial site/group df
        df.fillna(imputed_group, inplace=True)

        return df


    def _keep_available_sites(self):
        """Restrict ``month_df`` to sites with a file in ``data_dir``; return how many sites were dropped."""
        # The site id is the 6 characters in front of the 4-character file extension
        available_sites = [x[-10:-4] for x in os.listdir(self.data_dir)]
        init_sites = len(self.month_df['SITE_ID'].unique())
        self.month_df = self.month_df.loc[self.month_df['SITE_ID'].isin(available_sites)]
        return init_sites - len(self.month_df['SITE_ID'].unique())


    def _resample_site(self, site_month, s):
        """Left-join one site's monthly rows onto the month grid spanned by its half-hourly file.

        Months that exist only in the grid get ``gap_flag_month = 1`` and NA features.
        """
        # Only the datetime range of the half-hourly file is needed to build the grid
        site_file = f'data_full_half_hourly_raw_v0_1_{s}.csv'
        site_hr_df = pd.read_csv(f"{self.data_dir}/{site_file}", usecols=['datetime'])
        # NOTE(review): freq='M' yields month-end stamps, so a final month whose records
        # stop before its last day appears to be left out - confirm this is intended.
        dates = pd.date_range(start=site_hr_df['datetime'].min(), end=site_hr_df['datetime'].max(), freq='M')

        # Create the month grid
        month_grid = pd.DataFrame({'datetime': dates})
        month_grid['year'] = month_grid['datetime'].dt.year
        month_grid['month'] = month_grid['datetime'].dt.month
        month_grid['SITE_ID'] = s

        # Resample monthly data to get the months required in hourly data
        pft = site_month['MODIS_PFT'][0] # retain PFT to fill new rows
        site_month = pd.merge(month_grid, site_month, how='left', on=['SITE_ID', 'year', 'month'])
        site_month['MODIS_PFT'] = pft
        site_month['SITE_ID'] = s
        # Rows that came only from the grid have no flag yet: mark them as gap months
        site_month['gap_flag_month'] = site_month['gap_flag_month'].fillna(1)
        return site_month


    def _fill_missing(self, df, impute_method, knn_imp_cols, k, weights, c):
        """Return ``df`` with NA values filled by ``impute_method``; any other method leaves it untouched."""
        if impute_method == 'interpolate':
            return df.interpolate(method='linear', limit_direction='both')
        if impute_method == 'knn':
            return self.knn_impute_site(df, knn_imp_cols, k, weights)
        if impute_method == 'constant':
            return df.fillna(c)
        return df


    def impute(self, impute_method, resample_monthly, knn_imp_cols=None, k=None, weights=None, c=-1):
        """Build the gap-filled monthly table, one site at a time.

        Sites without a file in ``data_dir`` are dropped from ``month_df`` first. Each
        remaining site is optionally resampled to its half-hourly month range, indexed by
        ``datetime``, and filled with ``impute_method``. Values still missing afterwards
        (a site with a feature entirely NA) are filled once more on the combined table.

        Args:
            impute_method: 'interpolate', 'knn', 'constant' or None (no filling).
            resample_monthly: Whether to add rows for months missing from the monthly table.
            knn_imp_cols, k, weights: Settings used when ``impute_method == 'knn'``.
            c: Fill value used when ``impute_method == 'constant'``.

        Returns:
            The concatenated per-site frames, indexed by ``datetime``.

        Raises:
            ValueError: If no site of the monthly table has a file in ``data_dir``.
        """
        # Subset month_df to only sites with hourly records available
        n_dropped = self._keep_available_sites()
        print(f"# sites dropped bc not available in data_dir: {n_dropped}")

        # Collect the per-site frames and concatenate once at the end
        site_frames = []
        for s in tqdm(self.month_df['SITE_ID'].unique()):
            # Get monthly data for site
            site_month = self.month_df[self.month_df['SITE_ID'] == s].copy()
            site_month.reset_index(drop=True, inplace=True)
            site_month['gap_flag_month'] = 0

            if resample_monthly:
                site_month = self._resample_site(site_month, s)

            # Index by month start and drop the now-redundant raw time columns
            site_month['datetime'] = site_month.apply(self.to_datetime, axis=1)
            site_month.set_index('datetime', inplace=True)
            site_month.drop(columns=['TIMESTAMP', 'date'], inplace=True)

            # Fill gaps at site level
            if site_month.isna().sum().sum() != 0:
                site_month = self._fill_missing(site_month, impute_method, knn_imp_cols, k, weights, c)

            site_frames.append(site_month)

        if not site_frames:
            raise ValueError("No site in the monthly data has a matching file in data_dir")
        monthly_df = pd.concat(site_frames)

        # if any site had 100% missing for a feature, impute these using global data
        if monthly_df.isna().sum().sum() != 0:
            print("Imputing values where site has 100 percent of feature missing")
            print(f"# of NA features before global impute: {monthly_df.isna().sum().sum()}")
            monthly_df = self._fill_missing(monthly_df, impute_method, knn_imp_cols, k, weights, c)
            print(f"# of NA features after global impute: {monthly_df.isna().sum().sum()}")

        return monthly_df


    def run(self, impute=False, impute_method=None, resample_monthly=False, knn_imp_cols=None, k=None, weights=None, c=-1):
        """Prepare the monthly table and report whether any NA values remain.

        With ``impute=True`` the table is built by :meth:`impute`; otherwise the monthly
        table is only restricted to sites with a file in ``data_dir`` and returned as a copy.
        """
        # Handle missing values
        if impute:
            print(f"Impute method: {impute_method}")
            print(f"Resampling and gap filling missing months: {resample_monthly}")
            monthly_df = self.impute(impute_method, resample_monthly, knn_imp_cols, k, weights, c)
        else:
            print("Not gap filling or filling NAs, leave be")
            self._keep_available_sites()
            monthly_df = self.month_df.copy()

        # Confirm no NAs; when a method was requested, show the columns that still have some
        na_per_column = monthly_df.isna().sum()
        if na_per_column.sum() == 0:
            print("Confirmed: No NA values remain")
        elif impute_method is not None:
            print("ISSUE: SOME NA VALUES REMAIN - INVESTIGATE")
            print(na_per_column[na_per_column > 0])

        return monthly_df
        

### Execute and Save Out

In [5]:
# Execute Monthly Preparation (the per-site half-hourly files are looked up in tmp_dir)
PrepMonthly = PrepareMonthlyData(
    included_features=included_features,
    monthly_data_input_fname=monthly_data_input_fname,
    data_dir=tmp_dir,
)
monthly_df_out = PrepMonthly.run(
    impute=impute,
    impute_method=impute_method,
    resample_monthly=resample_monthly,
    knn_imp_cols=knn_imp_cols,
    k=k,
    weights=weights,
    c=c,
)

Impute method: interpolate
Resampling and gap filling missing months: True
# sites dropped bc not available in data_dir: 9


234it [00:54,  4.29it/s]

Imputing values where site has 100 percent of feature missing
# of NA features before global impute: 1839
# of NA features after global impute: 0
Confirmed: No NA values remain





In [6]:
# Save out the prepared monthly table
# NOTE(review): when built by PrepareMonthlyData.impute the frame is indexed by 'datetime';
# with index=False that index is presumably not written, leaving 'year'/'month' as the
# only time columns in the file - confirm this is intended.
monthly_df_out.to_csv(monthly_data_output_fname, index=False)

## Issue with US-NR1 Months

In [None]:
# Rebuild the month grid for one site to inspect which months it produces
s = 'US-NR1'
site_file = f'data_full_half_hourly_raw_v0_1_{s}.csv'
site_hr_df = pd.read_csv(f"{tmp_dir}/{site_file}", usecols=['SITE_ID', 'datetime', 'year', 'month'])
dates = list(
    pd.date_range(
        start=site_hr_df['datetime'].min(),
        end=site_hr_df['datetime'].max(),
        freq='M',
    )
)

# Create dataframe with one row per generated date plus its year and month
site_hr_df = pd.DataFrame({'datetime': dates})
site_hr_df = site_hr_df.assign(
    year=site_hr_df['datetime'].dt.year,
    month=site_hr_df['datetime'].dt.month,
)

display(site_hr_df.head())

In [None]:
# Pull the prepared rows of site US-NR1 and preview them
is_usnr1 = monthly_df_out['SITE_ID'] == 'US-NR1'
usnr1 = monthly_df_out.loc[is_usnr1].copy()
usnr1.head()

### Check Monthly Data for errors/overwriting of non-NA values

In [None]:
# Create pre and post copies to compare
# df_imputed: prepared table with its index moved back into a column
# df_init: the monthly CSV re-read as it was before any preparation
df_imputed = monthly_df_out.copy().reset_index()
df_init = pd.read_csv(monthly_data_input_fname, usecols=included_features)
df_init['date'] = pd.to_datetime(df_init['TIMESTAMP'],  format="%Y%m")

# confirm no NAs remain in new df
na_rows_post = df_imputed.isna().any(axis=1).sum()
print(f"Number of NA rows post imputation: {na_rows_post}")

# Drop NA rows from both (using indices) confirm they are same df now
# NOTE(review): rows are matched by index label/position, which assumes both frames hold
# the same rows in the same order. PrepareMonthlyData.impute drops sites without hourly
# files and, when resampling, adds month rows - so this alignment may not hold; confirm.
drop_na = df_init.dropna(how='any')
drop_imp = df_imputed.loc[drop_na.index]
drop_na.reset_index(inplace=True, drop=True)
drop_imp.reset_index(inplace=True, drop=True)
# Compare only the columns present in both frames
shared_cols = list(set(drop_imp.columns).intersection(drop_na.columns))
print(f"Are all rows with no NAs the same as before? {drop_na[shared_cols].equals(drop_imp[shared_cols])}")

# Check that rows that initially had NA (up to the first 1000) are the same in non-NA cols
na_inds = df_init.loc[df_init.isna().any(axis=1), ].index
errors = 0
for ind in na_inds[:1000]:
    # Put the before/after versions of the row side by side and keep only the fields
    # that are non-NA in both
    check_ind = pd.concat([df_init.iloc[ind], df_imputed.iloc[ind]], axis=1).dropna()
    check_ind.columns = ['initial', 'post_imp']
    if not check_ind['initial'].equals(check_ind['post_imp']):
        errors += 1
        print(ind)
print(f"Number of non-NA values changed by error: {errors}")

# DF length is the same 
print(f"DF is same length as before: {len(df_init) == len(df_imputed)}")

## Check Interpolation

In [None]:
# Re-run the resample + interpolation steps for a single site so the result can be inspected.
# NOTE: this cell overwrites the notebook-level `s` and `resample_monthly` settings.
s = 'AR-SLu'
resample_monthly = True
month_df = pd.read_csv(monthly_data_input_fname, usecols=included_features)
month_df['date'] = pd.to_datetime(month_df['TIMESTAMP'],  format="%Y%m")
site_month = month_df[month_df['SITE_ID'] == s].copy()
site_month = site_month.reset_index(drop=True)

def to_datetime(row):
    """Return the timestamp built from a row's `year` and `month` fields (parsed as %Y%m)."""
    return pd.to_datetime(f'{row.year}{row.month:02}', format='%Y%m')

if resample_monthly:
    # Get hourly data for site to find months to fill
    site_file = f'data_full_half_hourly_raw_v0_1_{s}.csv'
    try:
        site_hr_df = pd.read_csv(f"{tmp_dir}/{site_file}", usecols=['SITE_ID', 'year', 'month'])
    except Exception:
        # Report which file failed, then stop: continuing would either hit a NameError
        # or silently reuse a `site_hr_df` left over from an earlier cell.
        print(f"{site_file} not available")
        raise

    # Get set of year-months represented in site-hourly dataset
    site_hr_df = site_hr_df.drop_duplicates()
    site_hr_df['datetime'] = site_hr_df.apply(to_datetime, axis=1)

    # Resample monthly data to get the months required in hourly data
    pft = site_month['MODIS_PFT'][0] # retain PFT to fill new rows
    site_month = pd.merge(site_hr_df, site_month, how='left', on=['SITE_ID', 'year', 'month'])
    site_month['MODIS_PFT'] = pft
    site_month['SITE_ID'] = s

# Index by month start and drop the now-redundant raw time columns
site_month['datetime'] = site_month.apply(to_datetime, axis=1)
site_month = site_month.set_index('datetime')
site_month = site_month.drop(columns=['TIMESTAMP', 'date'])

# Show the first two timestamps that have at least one missing value
na_inds = site_month[site_month.isna().any(axis=1)].index
print(na_inds[:2])

# If any new months added by resample, interpolate gap values at site-level
if site_month.isna().sum().sum() != 0:
    site_month = site_month.interpolate(method='linear', limit_direction='both')

In [None]:
# Preview the first rows of the single-site frame after interpolation
site_month.head(5)