# Notebook Setup

CHANGES FROM V1 TO V2:
- Fixed some issues with the date columns in gap-filled records
- Use a linear interpolator instead of a quadratic one
- For features that were 100% missing at the site level, impute them using a linear interpolator at the global level
RESULT: 0 NA values in the monthly df

In [2]:
# True when the notebook kernel reports that it is running inside Google Colab
IN_COLLAB = 'google.colab' in str(get_ipython())

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
MY_HOME_ABS_PATH = "/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling"

# On Colab, mount Google Drive under /content/drive/
if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

## Import Modules

In [4]:
# install required modules quietly
required_packages = ['geopandas', 'pyspark', 'azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
os.chdir(MY_HOME_ABS_PATH) # <------------------ ADDED
import math
import json
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from pyspark.sql.functions import col
import pyspark.pandas as pd
from calendar import monthrange
from datetime import datetime
from io import BytesIO
from tqdm import tqdm

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load local custom modules
import sys
if not IN_COLLAB:
  # Local run: add the tools directory at the end of the module search path
  sys.path.append(os.path.abspath("./code/src/tools"))
else:
  # Colab run: switch to the project root first, then put the tools directory
  # at the front of the module search path
  os.chdir(MY_HOME_ABS_PATH)
  sys.path.insert(0,os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

# Display settings: up to 500 rows, no column limit, floats with 5 decimals
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', None),
    ('display.float_format', lambda x: '%.5f' % x),
):
  pd.set_option(option_name, option_value)

# Constant Definitions

In [37]:
# Project directory layout, all rooted at the configured home path
root_dir = MY_HOME_ABS_PATH
tmp_dir = f"{root_dir}{os.sep}.tmp"
raw_data_dir = f"{root_dir}{os.sep}data"
data_dir = f"{root_dir}{os.sep}data/datasets"
cred_dir = f"{root_dir}{os.sep}.cred"
az_cred_file = f"{cred_dir}{os.sep}azblobcred.json"

# On Colab the raw data directory points at the mounted Google Drive instead
if IN_COLLAB:
  raw_data_dir = "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data"

# Input and output files of the monthly data preparation
monthly_data_input_fname = f"{data_dir}{os.sep}data_monthly_v1_0.csv"
monthly_data_output_fname = f"{raw_data_dir}{os.sep}monthly-interpolated-v3.csv"

# Processing switches: add rows for missing months / interpolate missing values
gap_fill_monthly = False
interpolate_na = False

# Columns read from the monthly input file
included_features = [
    # Record identifiers
    'SITE_ID',
    'year',
    'month',
    'TIMESTAMP',
    # Features
    'ESACCI-sm',      # ESACCI soil moisture (%)
    'Percent_Snow',   # Percentage of snow cover (%)
    'NDWI',           # Normalized Difference Water Index (NDWI)
    'PET',            # Potential ET (m)
    'MODIS_PFT',      # Plant Function Type
    'MODIS_LC',       # MODIS Land Cover
    'Ts',             # Skin temperature (K) ??
    'LST_Day',        # Daytime land surface temperature (K)
    'LST_Night',      # Nighttime land surface temperature (K)
    'Lai',            # Leaf Area Index (LAI)
    'Fpar',           # Fraction of photosynthetically active radiation (fPAR)
    'CSIF-SIFdaily',  # All-sky daily average SIF
    'BESS-PAR',       # Photosynthetic Active Radiation (PAR) (W/m^2)
    'BESS-PARdiff',   # Diffuse PAR (W/m^2)
    'BESS-RSDN',      # Shortwave downwelling radiation (W/m^2)
]

In [42]:
class PrepareMonthlyData:
    """Load the monthly feature table and optionally gap-fill / impute it.

    The constructor reads `monthly_data_input_fname` (restricted to
    `included_features`) and adds a `date` column parsed from `TIMESTAMP`.
    `run()` then returns a frame in which missing values are either linearly
    interpolated (`interpolate_na=True`) or replaced by -1.

    Args:
        included_features: Column names to read from the monthly CSV.
        monthly_data_input_fname: Path of the monthly CSV file.
        interpolate_na: If True, fill missing values by linear interpolation;
            otherwise `run()` replaces them with -1.
        gap_fill_monthly: If True, `run()` rebuilds the table through
            `gap_fill()` so that it has one row per site-month found in the
            per-site hourly files.
    """

    def __init__(self, included_features, monthly_data_input_fname, interpolate_na=False, gap_fill_monthly=False):
        self.interpolate_na = interpolate_na
        self.gap_fill_monthly = gap_fill_monthly
        self.included_features = included_features
        self.monthly_data_input_fname = monthly_data_input_fname
        self.month_df = pd.read_csv(self.monthly_data_input_fname, usecols=self.included_features)
        # TIMESTAMP is parsed as a year-month value (format "%Y%m")
        self.month_df['date'] = pd.to_datetime(self.month_df['TIMESTAMP'], format="%Y%m")

    def to_datetime(self, row):
        """Return the datetime built from `row.year` and the zero-padded `row.month`."""
        return pd.to_datetime(f'{row.year}{row.month:02}', format='%Y%m')

    def _gap_fill_site(self, site_id):
        """Return the monthly rows of one site, aligned to its hourly year-months.

        Reads `data_full_half_hourly_raw_v0_1_<site_id>.csv` from the
        module-level `tmp_dir`; any exception raised while reading or merging
        propagates to the caller.
        """
        site_file = f'data_full_half_hourly_raw_v0_1_{site_id}.csv'
        site_hr_df = pd.read_csv(f"{tmp_dir}/{site_file}", usecols=['SITE_ID', 'year', 'month'])

        # Get set of year-months represented in site-hourly dataset
        site_hr_df.drop_duplicates(inplace=True)
        site_hr_df['datetime'] = site_hr_df.apply(self.to_datetime, axis=1)

        # Get monthly data for site
        site_month = self.month_df[self.month_df['SITE_ID'] == site_id].reset_index(drop=True)

        # Resample monthly data to get the months required in hourly data
        pft = site_month['MODIS_PFT'][0]  # retain PFT to fill new rows
        site_month = pd.merge(site_hr_df, site_month, how='left', on=['SITE_ID', 'year', 'month'])

        # Fill in known values for new/resampled month-level rows
        site_month = site_month.set_index('datetime')
        site_month['MODIS_PFT'] = pft
        site_month['SITE_ID'] = site_id
        site_month = site_month.drop(columns=['TIMESTAMP', 'date'])

        # If any new months were added by the merge, interpolate gap values at
        # site level. The instance flag is used (not the module-level variable
        # of the same name), and the result is assigned rather than relying on
        # `inplace=True`.
        # NOTE(review): `pd` is imported as pyspark.pandas in this notebook,
        # whose `interpolate` presumably has no `inplace` argument - confirm.
        if self.interpolate_na and site_month.isna().sum().sum() != 0:
            site_month = site_month.interpolate(method='linear', limit_direction='both')

        return site_month

    def gap_fill(self):
        """Build the monthly table restricted to site-months present in hourly data.

        Sites whose processing raises an exception (for example because the
        hourly file is missing) are skipped, and a message naming the site is
        written next to the progress bar.

        Returns:
            The concatenation of all per-site frames, indexed by `datetime`,
            or None when no site could be processed.
        """
        site_frames = []

        # Loop through hourly site data to determine which months are present
        for site_id in tqdm(self.month_df['SITE_ID'].unique()):
            try:
                site_frames.append(self._gap_fill_site(site_id))
            except Exception as err:
                # Best effort: keep going, but report which site was dropped
                tqdm.write(f"Skipping site {site_id}: {err!r}")

        if not site_frames:
            return None

        # Concatenate once instead of growing the frame inside the loop
        return pd.concat(site_frames)

    def run(self):
        """Return the prepared monthly frame without missing values.

        Raises:
            RuntimeError: If gap filling was requested but no site produced data.
        """
        # fill in missing months or leave it
        if self.gap_fill_monthly:
            monthly_df = self.gap_fill()
            if monthly_df is None:
                raise RuntimeError("gap_fill() produced no data: every site was skipped")
        else:
            monthly_df = self.month_df.copy()

        # interpolate missing vals or fill w/ -1
        if monthly_df.isna().sum().sum() != 0:
            if self.interpolate_na:
                monthly_df = monthly_df.interpolate(method='linear', limit_direction='both')
            else:
                monthly_df = monthly_df.fillna(-1)

        return monthly_df
        

In [43]:
# Build the preparer from the switches set in the constants cell, so changing
# `interpolate_na` / `gap_fill_monthly` there actually takes effect here
PrepMonthly = PrepareMonthlyData(
    included_features,
    monthly_data_input_fname,
    interpolate_na=interpolate_na,
    gap_fill_monthly=gap_fill_monthly,
)
monthly_df_out = PrepMonthly.run()

In [44]:
# Confirm No NAS
na_counts = monthly_df_out.isna().sum()
if na_counts.sum() == 0:
    print("No NAS")
else:
    # A bare expression in the middle of a cell is never displayed, so print
    # the per-column NA counts explicitly
    print(na_counts)

# Save out
monthly_df_out.to_csv(monthly_data_output_fname)

No NAS


# Load Monthly Data

In [21]:
# Load monthly data
# `monthly_data_input_fname` is the input path defined in the constants cell;
# the previously referenced `monthly_data_filename` is not defined in this notebook
month_df = pd.read_csv(monthly_data_input_fname, usecols = included_features)
# TIMESTAMP is parsed as a year-month value (format "%Y%m")
month_df['date'] = pd.to_datetime(month_df['TIMESTAMP'],  format="%Y%m")

size:(19015, 20)


Unnamed: 0,SITE_ID,year,month,TIMESTAMP,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT,date
0,AR-SLu,2010,1,201001,154,40,336,0.20432,-0.01339,302.46967,0.15152,7,0.03542,0.0,0.49,1.2,313.84,293.58,SH,2010-01-01
1,AR-SLu,2010,2,201002,120,46,258,0.14553,-0.00894,298.78864,0.16656,7,0.0004,0.0,0.43,0.9,309.86,292.96,SH,2010-02-01


## Get Site-Months from Hourly Data
Goal: Determine which months of monthly data are needed. The monthly data is ultimately left-merged into the hourly data, so only the months present in the hourly data are required.

In [22]:
def to_datetime(row):
    """Return the datetime built from `row.year` and the zero-padded `row.month`."""
    year_month = f'{row.year}{row.month:02}'
    return pd.to_datetime(year_month, format='%Y%m')

In [33]:
def gap_fill():
    """Build the monthly table restricted to site-months present in hourly data.

    Uses the module-level `month_df`, `tmp_dir` and `interpolate_na`. For every
    site in `month_df`, the per-site hourly file
    `data_full_half_hourly_raw_v0_1_<site>.csv` is read from `tmp_dir`, its
    distinct year-months are left-merged with the site's monthly rows, and, when
    `interpolate_na` is set, missing values are linearly interpolated per site.

    Sites whose processing raises an exception (for example because the hourly
    file is missing) are skipped, and a message naming the site is written next
    to the progress bar.

    Returns:
        The concatenation of all per-site frames, indexed by `datetime`, or
        None when no site could be processed.
    """
    site_frames = []

    # Loop through hourly site data to determine which months are present
    for s in tqdm(month_df['SITE_ID'].unique()):
        try:
            site_file = f'data_full_half_hourly_raw_v0_1_{s}.csv'
            site_hr_df = pd.read_csv(f"{tmp_dir}/{site_file}", usecols=['SITE_ID', 'year', 'month'])

            # Get set of year-months represented in site-hourly dataset
            site_hr_df.drop_duplicates(inplace=True)
            site_hr_df['datetime'] = site_hr_df.apply(to_datetime, axis=1)

            # Get monthly data for site
            site_month = month_df[month_df['SITE_ID'] == s].reset_index(drop=True)

            # Resample monthly data to get the months required in hourly data
            pft = site_month['MODIS_PFT'][0]  # retain PFT to fill new rows
            site_month = pd.merge(site_hr_df, site_month, how='left', on=['SITE_ID', 'year', 'month'])

            # Fill in known values for new/resampled month-level rows
            site_month = site_month.set_index('datetime')
            site_month['MODIS_PFT'] = pft
            site_month['SITE_ID'] = s
            site_month = site_month.drop(columns=['TIMESTAMP', 'date'])

            # If any new months added by the merge, interpolate gap values at
            # site level (assigned rather than relying on `inplace=True`)
            if interpolate_na and site_month.isna().sum().sum() != 0:
                site_month = site_month.interpolate(method='linear', limit_direction='both')

            site_frames.append(site_month)

        except Exception as err:
            # Best effort: keep going, but report which site was dropped
            tqdm.write(f"Skipping site {s}: {err!r}")
            continue

    if not site_frames:
        return None

    # Concatenate once instead of growing the frame inside the loop
    return pd.concat(site_frames)



# fill in missing months or leave it
if gap_fill_monthly:
    monthly_df = gap_fill()
    # gap_fill() returns None when no site could be processed
    if monthly_df is None:
        raise RuntimeError("gap_fill() produced no data: every site was skipped")
else:
    monthly_df = month_df.copy()

# interpolate missing vals or fill w/ -1
if monthly_df.isna().sum().sum() != 0:
    if interpolate_na:
        monthly_df = monthly_df.interpolate(method='linear', limit_direction='both')
    else:
        # Fill the frame selected above; filling `month_df` here would throw
        # away the gap-filled rows whenever gap_fill_monthly is True
        monthly_df = monthly_df.fillna(-1)

243it [00:43,  5.63it/s]


In [34]:
# Confirm there are no missing values left in the monthly frame
na_counts = monthly_df.isna().sum()
if na_counts.sum() == 0:
    print("No NAS")
else:
    # A bare expression inside a branch is never displayed, so print the
    # per-column NA counts explicitly
    print(na_counts)

No NAS


In [11]:
# Save interpolated monthly data checkpoint
#monthly_df.to_csv(interpolated_monthly_data_filename)