# Notebook Setup

In [1]:
# Detect whether this kernel is running inside Google Colab.
IN_COLLAB = 'google.colab' in str(get_ipython())

# TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
# Project root: later cells chdir into it and build every data path from it.
MY_HOME_ABS_PATH = "/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling"

# On Colab, mount Google Drive (the raw data directory is read from it later).
if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

## Import Modules

In [2]:
# install required modules quietly
required_packages = ['geopandas', 'pyspark', 'azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
os.chdir(MY_HOME_ABS_PATH) # <------------------ ADDED
import math
import json
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from pyspark.sql.functions import col
import pyspark.pandas as pd
from calendar import monthrange
from datetime import datetime
from io import BytesIO
from tqdm import tqdm

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load local custom modules: make ./code/src/tools importable
import sys
if IN_COLLAB:
  os.chdir(MY_HOME_ABS_PATH)
tools_dir = os.path.abspath("./code/src/tools")
if IN_COLLAB:
  # On Colab the tools directory goes first on the import path
  sys.path.insert(0, tools_dir)
else:
  sys.path.append(tools_dir)

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

# Display settings: up to 500 rows, no column limit, floats with 5 decimals
def _format_float(value):
  return '%.5f' % value

for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', None),
    ('display.float_format', _format_float),
):
  pd.set_option(option_name, option_value)


# Define paths (everything hangs off the project home directory)
root_dir = MY_HOME_ABS_PATH
tmp_dir, raw_data_dir, data_dir, cred_dir = (
    os.sep.join((root_dir, sub_path))
    for sub_path in ('.tmp', 'data', 'data/datasets', '.cred')
)
az_cred_file = os.sep.join((cred_dir, 'azblobcred.json'))
if IN_COLLAB:
  # On Colab the raw data directory points at the mounted Google Drive instead
  raw_data_dir = "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data"


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Constant Definitions

In [18]:
# Identifier / time columns read alongside the monthly features
key_columns = ['SITE_ID', 'year', 'month', 'TIMESTAMP']

# Monthly feature columns to use
feature_columns = [
    'ESACCI-sm',      # ESACCI soil moisture (%)
    'Percent_Snow',   # Percentage of snow cover (%)
    'NDWI',           # Normalized Difference Water Index (NDWI)
    'PET',            # Potential ET (m)
    'MODIS_PFT',      # Plant Function Type
    'MODIS_LC',       # MODIS Land Cover
    'Ts',             # Skin temperature (K) -- TODO confirm
    'LST_Day',        # Daytime land surface temperature (K)
    'LST_Night',      # Nighttime land surface temperature (K)
    'Lai',            # Leaf Area Index (LAI)
    'Fpar',           # Fraction of photosynthetically active radiation (fPAR)
    'CSIF-SIFdaily',  # All-sky daily average SIF
    'BESS-PAR',       # Photosynthetic Active Radiation (PAR) (W/m^2)
    'BESS-PARdiff',   # Diffuse PAR (W/m^2)
    'BESS-RSDN',      # Shortwave downwelling radiation (W/m^2)
]
included_features = key_columns + feature_columns

# Input and output files for the monthly data
monthly_data_input_fname = os.sep.join((data_dir, 'data_monthly_v1_0.csv'))
monthly_data_output_fname = os.sep.join((raw_data_dir, "monthly-interpolated-v3.csv"))

# How to treat missing data:
#   impute_method    -> '-1' (fill NA with -1), 'interpolate', or None (leave NA as-is)
#   resample_monthly -> whether missing months are gap-filled before interpolating
impute_method = '-1'
resample_monthly = False

In [21]:
class PrepareMonthlyData:
    """Load the monthly site-level dataset and fill its missing values.

    Parameters
    ----------
    included_features : list of str
        Columns to read from the monthly CSV. 'TIMESTAMP' is required; the
        'interpolate' method also uses 'SITE_ID', 'year' and 'month' (and
        'MODIS_PFT' when resampling).
    monthly_data_input_fname : str
        Path of the monthly CSV file.
    impute_method : {'-1', 'interpolate', None}, default '-1'
        '-1' replaces every NA with -1; 'interpolate' linearly interpolates
        site by site; any other value leaves the NAs untouched.
    resample_monthly : bool, default False
        Only used by 'interpolate'. When True, each site's monthly rows are
        re-built from the year/months found in that site's hourly file before
        interpolating.
    hourly_data_dir : str, optional
        Directory containing the per-site hourly files
        (``data_full_half_hourly_raw_v0_1_<SITE_ID>.csv``). When omitted, the
        notebook-level ``tmp_dir`` variable is used.
    """

    def __init__(self, included_features, monthly_data_input_fname, impute_method='-1',
                 resample_monthly=False, hourly_data_dir=None):
        self.impute_method = impute_method
        self.resample_monthly = resample_monthly
        self.included_features = included_features
        self.monthly_data_input_fname = monthly_data_input_fname
        self.hourly_data_dir = hourly_data_dir
        self.month_df = pd.read_csv(self.monthly_data_input_fname, usecols=self.included_features)
        self.month_df['date'] = pd.to_datetime(self.month_df['TIMESTAMP'], format="%Y%m")


    def to_datetime(self, row):
        """Return the timestamp parsed from a row's ``year`` and ``month`` ('YYYYMM')."""
        return pd.to_datetime(f'{row.year}{row.month:02}', format='%Y%m')


    def interpolate(self):
        """Interpolate missing values site by site and return the combined frame.

        For every site the monthly rows are (optionally) resampled to the
        year/months present in the site's hourly file, indexed by month, and
        linearly interpolated in both directions. Sites whose hourly file
        cannot be read are skipped. Any NA left after combining the sites
        (e.g. a feature that is entirely missing for a site) is interpolated
        once more over the combined frame.

        Returns
        -------
        DataFrame indexed by 'datetime', without the 'TIMESTAMP'/'date' columns.

        Raises
        ------
        ValueError
            If no site contributed any data.
        """
        site_frames = []

        for s in tqdm(self.month_df['SITE_ID'].unique()):
            # Monthly data for this site (copied so self.month_df is never modified)
            site_month = self.month_df[self.month_df['SITE_ID'] == s].copy()
            site_month = site_month.reset_index(drop=True)

            if self.resample_monthly:
                # Hourly data for the site tells us which months must be present
                site_file = f'data_full_half_hourly_raw_v0_1_{s}.csv'
                hourly_dir = tmp_dir if self.hourly_data_dir is None else self.hourly_data_dir
                try:
                    site_hr_df = pd.read_csv(f"{hourly_dir}/{site_file}", usecols=['SITE_ID', 'year', 'month'])
                except Exception as err:
                    # Best effort: skip sites without a readable hourly file,
                    # but say why instead of hiding the error.
                    print(f"{site_file} not available")
                    print(f"  reason: {type(err).__name__}: {err}")
                    continue

                # Set of year-months represented in the site's hourly dataset
                site_hr_df = site_hr_df.drop_duplicates()
                site_hr_df['datetime'] = site_hr_df.apply(self.to_datetime, axis=1)

                # Resample monthly data to the months required by the hourly data
                pft = site_month['MODIS_PFT'].iloc[0]  # retain PFT to fill new rows
                site_month = pd.merge(site_hr_df, site_month, how='left', on=['SITE_ID', 'year', 'month'])
                site_month['MODIS_PFT'] = pft
                site_month['SITE_ID'] = s

            # Index by month and drop the now-redundant date columns
            site_month['datetime'] = site_month.apply(self.to_datetime, axis=1)
            site_month = site_month.set_index('datetime').drop(columns=['TIMESTAMP', 'date'])

            # Interpolate gap values at site level when anything is missing
            if site_month.isna().sum().sum() != 0:
                site_month = site_month.interpolate(method='linear', limit_direction='both')

            site_frames.append(site_month)

        if not site_frames:
            raise ValueError("No monthly site data available to interpolate")

        # Combine all sites in one pass instead of re-concatenating inside the loop
        monthly_df = pd.concat(site_frames)

        # If any site had 100% missing for a feature, impute these using global data
        if monthly_df.isna().sum().sum() != 0:
            print("Interpolating Values for 100% Missing Records")
            monthly_df = monthly_df.interpolate(method='linear', limit_direction='both')

        return monthly_df


    def run(self):
        """Apply the configured imputation and return the resulting frame.

        Prints which strategy was applied and whether any NA values remain.
        When NAs remain although an imputation method was requested, the
        per-column NA counts are printed as well.
        """
        # Fill in missing values or leave them
        if self.impute_method == 'interpolate':
            print("Interpolating Values for Missing Values")
            if self.resample_monthly:
                print("Gap-Filling Missing Months")
            monthly_df = self.interpolate()
        elif self.impute_method == '-1':
            print("No Gap Filling of Monthly Data, Filling all NA with -1")
            monthly_df = self.month_df.fillna(-1)
        else:
            monthly_df = self.month_df.copy()
            print("Not filling NAs, leave be")

        # Confirm no NAs remain
        na_counts = monthly_df.isna().sum()
        if na_counts.sum() == 0:
            print("Confirmed: No NA values remain")
        elif self.impute_method is not None:
            print("ISSUE: SOME NA VALUES REMAIN - INVESTIGATE")
            # Show which columns still contain NAs (previously computed but never shown)
            print(na_counts[na_counts > 0])

        return monthly_df
        

### Execute and Save Out

In [23]:
# Execute Monthly Preparation with the settings from the constants cell
PrepMonthly = PrepareMonthlyData(
    included_features=included_features,
    monthly_data_input_fname=monthly_data_input_fname,
    impute_method=impute_method,
    resample_monthly=resample_monthly,
)
monthly_df_out = PrepMonthly.run()

No Gap Filling of Monthly Data, Filling all NA with -1
Confirmed: No NA values remain


In [24]:
# Save out: write the prepared monthly frame to the output CSV path.
# index=False leaves the frame's index out of the file.
monthly_df_out.to_csv(monthly_data_output_fname, index=False)

### Check Monthly Data for errors/overwriting of non-NA values

In [14]:
# Create pre and post copies to compare
df_imputed = monthly_df_out.copy()
df_init = pd.read_csv(monthly_data_input_fname, usecols=included_features)
df_init['date'] = pd.to_datetime(df_init['TIMESTAMP'],  format="%Y%m")

# Confirm no NAs remain in new df
na_rows_post = df_imputed.isna().any(axis=1).sum()
print(f"Number of NA rows post imputation: {na_rows_post}")

# Keep only the rows that had no NA initially (matched by index label) and
# confirm both frames agree on them
drop_na = df_init.dropna(how='any')
drop_imp = df_imputed.loc[drop_na.index]
drop_na = drop_na.reset_index(drop=True)
drop_imp = drop_imp.reset_index(drop=True)
print(f"Are all rows with no NAs the same as before? {drop_na.equals(drop_imp)}")

# Check that the first 50 rows that initially had NA are the same in non-NA cols.
# na_inds holds index labels, so rows are looked up by label (.loc), consistent
# with the comparison above, rather than by position (.iloc).
na_inds = df_init.loc[df_init.isna().any(axis=1)].index
errors = 0
for ind in na_inds[:50]:
    check_ind = pd.concat([df_init.loc[ind], df_imputed.loc[ind]], axis=1).dropna()
    check_ind.columns = ['initial', 'post_imp']
    if not check_ind['initial'].equals(check_ind['post_imp']):
        errors += 1
        print(ind)
print(f"Number of non-NA values changed by error: {errors}")

# DF length is the same
print(f"DF is same length as before: {len(df_init) == len(df_imputed)}")

Number of NA rows post imputation: 0
Are all rows with no NAs the same as before? True
Number of non-NA values changed by error: 0
DF is same length as before: True


## Check Interpolation

In [12]:
# Re-run the site-level resample + interpolation steps for one site to inspect them
s = 'AR-SLu'
# NOTE: this rebinds the notebook-level resample_monthly setting to True
resample_monthly = True
month_df = pd.read_csv(monthly_data_input_fname, usecols=included_features)
month_df['date'] = pd.to_datetime(month_df['TIMESTAMP'],  format="%Y%m")
site_month = month_df[month_df['SITE_ID'] == s].copy()
site_month = site_month.reset_index(drop=True)

def to_datetime(row):
    """Return the timestamp parsed from a row's ``year`` and ``month`` ('YYYYMM')."""
    return pd.to_datetime(f'{row.year}{row.month:02}', format='%Y%m')

if resample_monthly:
    # Get hourly data for site to find months to fill
    site_file = f'data_full_half_hourly_raw_v0_1_{s}.csv'
    try:
        site_hr_df = pd.read_csv(f"{tmp_dir}/{site_file}", usecols=['SITE_ID', 'year', 'month'])
    except Exception:
        # Without the hourly file there is nothing to resample against: stop here
        # rather than continuing with an undefined (or stale) site_hr_df.
        print(f"{site_file} not available")
        raise

    # Get set of year-months represented in site-hourly dataset
    site_hr_df = site_hr_df.drop_duplicates()
    site_hr_df['datetime'] = site_hr_df.apply(to_datetime, axis=1)

    # Resample monthly data to get the months required in hourly data
    pft = site_month['MODIS_PFT'].iloc[0] # retain PFT to fill new rows
    site_month = pd.merge(site_hr_df, site_month, how='left', on =['SITE_ID', 'year', 'month'])
    site_month['MODIS_PFT'] = pft
    site_month['SITE_ID'] = s

# Index by month and drop the now-redundant date columns
site_month['datetime'] = site_month.apply(to_datetime, axis=1)
site_month = site_month.set_index('datetime').drop(columns=['TIMESTAMP', 'date'])

# Months that contain at least one NA before interpolation
na_inds = site_month[site_month.isna().any(axis=1)].index
print(na_inds[:2])

# If any new months added by resample, interpolate gap values at site-level
if site_month.isna().sum().sum() != 0:
    site_month = site_month.interpolate(method='linear', limit_direction='both')

DatetimeIndex(['2009-12-01'], dtype='datetime64[ns]', name='datetime', freq=None)


In [10]:
# Preview the first rows of the site-level frame built in the previous cell
site_month.head()

Unnamed: 0_level_0,year,month,SITE_ID,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2009-12-01,2009,12,AR-SLu,154.0,40.0,336.0,0.20432,-0.01339,302.46967,0.15152,7.0,0.03542,0.0,0.49,1.2,313.84,293.58,SH
2010-01-01,2010,1,AR-SLu,154.0,40.0,336.0,0.20432,-0.01339,302.46967,0.15152,7.0,0.03542,0.0,0.49,1.2,313.84,293.58,SH
2010-02-01,2010,2,AR-SLu,120.0,46.0,258.0,0.14553,-0.00894,298.78864,0.16656,7.0,0.0004,0.0,0.43,0.9,309.86,292.96,SH
2010-03-01,2010,3,AR-SLu,107.0,31.0,231.0,0.1098,-0.00813,297.54816,0.16408,7.0,-0.02286,0.0,0.41,0.8,309.18,290.52,SH
2010-04-01,2010,4,AR-SLu,81.0,27.0,175.0,0.07673,-0.00676,291.69604,0.12402,7.0,-0.04202,0.0,0.36,0.5,303.24,286.34,SH
