# Notebook Setup

**Changes from V1 to V2:**
- Fixed some issues with date columns in gap-filled records
- Use a linear interpolator instead of a quadratic one
- For features that were 100% missing at the site level, impute them using a linear interpolator at the global level

**Result:** 0 NA values in the monthly df

In [1]:
# True when the notebook kernel is a Google Colab runtime, False otherwise.
IN_COLLAB = 'google.colab' in str(get_ipython())

# TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
MY_HOME_ABS_PATH = "/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling"

# Colab only: mount Google Drive so files under /content/drive/ are reachable.
if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

## Import Modules

In [2]:
# install required modules quietly
required_packages = ['geopandas', 'pyspark', 'azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Imports and session-wide configuration for the notebook.
import os
# Set before pyspark.pandas is imported below.
# NOTE(review): presumably silences the pandas-on-Spark PyArrow timezone warning — confirm.
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
# Work from the project root so the relative "./code/src/tools" path below resolves.
os.chdir(MY_HOME_ABS_PATH)
import math
import json
import warnings
# Suppress every FutureWarning for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

from pyspark.sql.functions import col
# NOTE(review): `pd` is bound to pyspark.pandas, not plain pandas. Later cells use
# `pd.options.mode.chained_assignment` and `interpolate(..., inplace=True)`;
# confirm those are supported by whichever library is actually intended here.
import pyspark.pandas as pd
from calendar import monthrange
from datetime import datetime
from io import BytesIO
from tqdm import tqdm

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load local custom modules
import sys
# Make the project's tools directory importable: on Colab it is placed first on
# sys.path, elsewhere it is appended at the end.
if IN_COLLAB:
  os.chdir(MY_HOME_ABS_PATH)
  sys.path.insert(0,os.path.abspath("./code/src/tools"))
else:
  sys.path.append(os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
# NOTE(review): wildcard import — the names it brings in are not visible from this notebook.
from data_pipeline_lib import *

# Display settings: up to 500 rows, all columns, floats with 5 decimal places.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

# Constant Definitions

In [39]:
# Directory layout, everything rooted at the project home.
root_dir = MY_HOME_ABS_PATH
tmp_dir = f"{root_dir}{os.sep}.tmp"
raw_data_dir = f"{root_dir}{os.sep}data"
data_dir = f"{root_dir}{os.sep}data/datasets"
cred_dir = f"{root_dir}{os.sep}.cred"
az_cred_file = f"{cred_dir}{os.sep}azblobcred.json"

# On Colab the raw data is read from the mounted Google Drive instead.
if IN_COLLAB:
  raw_data_dir = "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data"

# Input: monthly dataset. Output: gap-filled / interpolated monthly checkpoint.
monthly_data_filename = os.sep.join([data_dir, 'data_monthly_v1_0.csv'])
interpolated_monthly_data_filename = os.sep.join([raw_data_dir, "monthly-interpolated-v2.csv"])

# Load Monthly Data

Full features from the monthly data:
```
['SITE_ID', 'year', 'month', 'time', 'TIMESTAMP', 'dataset',
'LOCATION_LAT', 'LOCATION_LONG',
'TA_F', 'VPD_F', 'P_F', 'NETRAD',
'NEE_VUT_REF', 'NEE_VUT_REF_QC', 'NEE_CUT_REF', 'NEE_CUT_REF_QC', 'GPP_NT_VUT_REF', 'GPP_DT_VUT_REF', 'GPP_NT_CUT_REF', 'GPP_DT_CUT_REF', 'RECO_NT_VUT_REF', 'RECO_DT_VUT_REF', 'RECO_NT_CUT_REF', 'RECO_DT_CUT_REF',
'ET', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily', 'CSIF-SIFinst', 'PET', 'Ts', 'Tmean', 'prcp', 'vpd', 'prcp-lag3', 'ESACCI-sm', 'MODIS_LC', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'EVI', 'GCI', 'NDVI', 'NDWI', 'NIRv', 'kNDVI',
'Percent_Snow', 'Fpar', 'Lai', 'LST_Day', 'LST_Night',
'SITE_IGBP', 'MODIS_IGBP','MODIS_PFT', 'koppen_sub', 'koppen', 'CO2_concentration']
```

In [5]:
# Columns to read from the monthly CSV (order kept: later cells iterate this list).
included_features = [
    # identifiers / time keys
    'SITE_ID', 'year', 'month', 'TIMESTAMP',
    # water, snow and evaporation
    'ESACCI-sm',     # ESACCI soil moisture (%)
    'Percent_Snow',  # percentage of snow cover (%)
    'NDWI',          # Normalized Difference Water Index
    'PET',           # potential ET (m)
    # land-cover classes
    'MODIS_PFT',     # plant functional type
    'MODIS_LC',      # MODIS land cover
    # temperature
    'Ts',            # skin temperature (K) -- unit not confirmed
    'LST_Day',       # daytime land surface temperature (K)
    'LST_Night',     # nighttime land surface temperature (K)
    # vegetation
    'Lai',           # leaf area index (LAI)
    'Fpar',          # fraction of photosynthetically active radiation (fPAR)
    'CSIF-SIFdaily', # all-sky daily average SIF
    # radiation
    'BESS-PAR',      # photosynthetically active radiation, PAR (W/m^2)
    'BESS-PARdiff',  # diffuse PAR (W/m^2)
    'BESS-RSDN',     # shortwave downwelling radiation (W/m^2)
]

month_df = pd.read_csv(monthly_data_filename, usecols=included_features)

# Parse the YYYYMM stamp into a real datetime column.
month_df['date'] = pd.to_datetime(month_df['TIMESTAMP'], format="%Y%m")

# Every distinct site present in the file; the gap-filling loop iterates over these.
sites = month_df['SITE_ID'].unique()

print(f"size:{month_df.shape}")
month_df.head(2)

size:(19015, 20)


Unnamed: 0,SITE_ID,year,month,TIMESTAMP,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT,date
0,AR-SLu,2010,1,201001,154,40,336,0.20432,-0.01339,302.46967,0.15152,7,0.03542,0.0,0.49,1.2,313.84,293.58,SH,2010-01-01
1,AR-SLu,2010,2,201002,120,46,258,0.14553,-0.00894,298.78864,0.16656,7,0.0004,0.0,0.43,0.9,309.86,292.96,SH,2010-02-01


# Check NA

In [14]:
# For each feature, count the sites where it is (almost) entirely missing,
# i.e. where more than `thresh` (99%) of that site's rows are NaN.
import numpy as np
thresh = 0.99
fully_missing_counts = dict(zip(included_features, np.zeros(len(included_features), dtype=int)))

for group_id, group_data in month_df.groupby('SITE_ID'):
    # Fraction of this site's rows that are NaN, per column.
    null_fraction = group_data.isna().sum()/len(group_data)
    cols_missing = list(null_fraction[null_fraction > thresh].index)
    # The loop variable is deliberately not called `col`, which would shadow
    # the `col` imported from pyspark.sql.functions.
    for feature in cols_missing:
        # .get(): month_df also holds columns that are not in included_features
        # (e.g. the derived 'date'); count them instead of raising KeyError.
        fully_missing_counts[feature] = fully_missing_counts.get(feature, 0) + 1

fully_missing_counts

{'SITE_ID': 0,
 'year': 0,
 'month': 0,
 'TIMESTAMP': 0,
 'ESACCI-sm': 11,
 'Percent_Snow': 0,
 'NDWI': 1,
 'PET': 1,
 'MODIS_PFT': 0,
 'MODIS_LC': 0,
 'Ts': 1,
 'LST_Day': 0,
 'LST_Night': 0,
 'Lai': 0,
 'Fpar': 0,
 'CSIF-SIFdaily': 2,
 'BESS-PAR': 0,
 'BESS-PARdiff': 0,
 'BESS-RSDN': 0}

In [15]:
# Number of missing values per column across the whole monthly table
month_df.isnull().sum()

SITE_ID             0
year                0
month               0
TIMESTAMP           0
BESS-PAR            0
BESS-PARdiff        0
BESS-RSDN           0
CSIF-SIFdaily     121
PET               101
Ts                101
ESACCI-sm        1610
MODIS_LC            0
NDWI              406
Percent_Snow      143
Fpar              935
Lai               935
LST_Day             0
LST_Night           0
MODIS_PFT           0
date                0
dtype: int64

## Fill NaNs

In [34]:
# NOTE: scratch cell. Every statement below is commented out, so running this
# cell does nothing; the output shown under it was captured from an earlier run
# when the code was live. It walks one site (`group_data`, the last group left
# over from the NA-check loop) through a monthly resample and date relabelling.
# NOTE(review): this example resamples with .first(); the per-site loop in the
# next cell uses .mean() — confirm which one is intended before reviving it.
# # One site example
# site_df = group_data.copy()
# site_df = site_df.reset_index(drop = True) # <---- ADDED DROP
# site_df.set_index('date', inplace=True)
# site_df = site_df.resample('M').first()
# print(len(site_df))
# print(len(group_data))
# site_df['year'] = site_df.index.year.astype(int) # <----- REPLACED
# site_df['month'] = site_df.index.month.astype(int)# <----- REPLACED
# site_df['TIMESTAMP'] = site_df['year'].astype(str) + site_df['month'].astype(str) # <----- REPLACED

# if site_df.isna().sum().sum() != 0:
#     pft = site_df['MODIS_PFT'][0]

# print(pft)

# #site_df.interpolate(method='linear', limit_direction='both', inplace=True)
# #site_df.head()

21
20
GRA


In [56]:
## Fill in missing month gaps, and interpolate values at site-level
def _fill_site_gaps(site_rows, site_id):
  """Return one site's records on a gap-free monthly index.

  Args:
    site_rows: rows of `month_df` belonging to a single site; must contain a
      'date' column and a 'MODIS_PFT' column.
    site_id: the site identifier written back into the 'SITE_ID' column.

  Returns:
    DataFrame indexed by month-end 'date' (the label produced by
    resample('M')), one row per calendar month between the site's first and
    last record. Numeric features are linearly interpolated across gaps; a
    feature that is NaN for every row of the site stays NaN.
  """
  site_df = site_rows.reset_index(drop=True).set_index('date')

  # Read the site's PFT label before resampling; .iloc is explicit positional
  # access (plain [0] on a datetime-indexed Series is deprecated).
  pft = site_df['MODIS_PFT'].iloc[0]

  # Resample numeric columns only: averaging string columns is silently
  # dropped by old pandas and raises on pandas >= 2.0. Months with no source
  # row come out as all-NaN rows.
  # NOTE(review): this also averages/interpolates MODIS_LC, which looks like a
  # categorical class code — confirm that fractional values are acceptable.
  site_df = site_df.select_dtypes(include='number').resample('M').mean()

  # Interpolate while the frame is still purely numeric (interpolating object
  # columns is deprecated). 'both' also fills leading and trailing gaps.
  if site_df.isna().sum().sum() != 0:
    site_df = site_df.interpolate(method='linear', limit_direction='both')

  # Rebuild the date keys from the index so inserted months get valid values.
  site_df['year'] = site_df.index.year.astype(int)
  site_df['month'] = site_df.index.month.astype(int)
  # Zero-padded YYYYMM, matching the source TIMESTAMP format. Plain string
  # concatenation of year and month produced e.g. '20101' for January 2010.
  site_df['TIMESTAMP'] = site_df.index.strftime('%Y%m')
  site_df['MODIS_PFT'] = pft
  site_df['SITE_ID'] = site_id
  return site_df


pd.options.mode.chained_assignment = None
try:
  # Collect the per-site frames and concatenate once; growing a DataFrame with
  # concat inside the loop re-copies all previous rows on every iteration.
  site_frames = []
  for s in tqdm(sites):
    site_frames.append(_fill_site_gaps(month_df[month_df['SITE_ID'] == s], s))
finally:
  # Restore the warning even if a site fails part-way through.
  pd.options.mode.chained_assignment = 'warn'

# Keep the original "None when there are no sites" outcome.
data_df = pd.concat(site_frames) if site_frames else None

243it [00:00, 255.53it/s]


In [57]:
# Site-level interpolation cannot fill a feature that is missing for every row
# of a site, so some NAs are still expected at this point.
data_df.isnull().sum()

year                0
month               0
TIMESTAMP           0
BESS-PAR            0
BESS-PARdiff        0
BESS-RSDN           0
CSIF-SIFdaily     150
PET               125
Ts                125
ESACCI-sm        1144
MODIS_LC            0
NDWI               42
Percent_Snow        0
Fpar                0
Lai                 0
LST_Day             0
LST_Night           0
MODIS_PFT           0
SITE_ID             0
dtype: int64

In [58]:
# Second pass: linear interpolation over the stacked all-sites frame, to fill
# features that were missing for an entire site.
# NOTE(review): interpolation runs down the concatenated rows, so those fills
# are derived from whichever sites happen to be adjacent in `data_df` —
# confirm this is the intended "global level" imputation.
# Only numeric columns that still contain NAs are touched: interpolating the
# string columns (SITE_ID, MODIS_PFT, TIMESTAMP) is deprecated in recent
# pandas, and skipping complete columns leaves their integer dtypes unchanged.
numeric_cols = data_df.select_dtypes(include='number').columns
cols_to_fill = [c for c in numeric_cols if data_df[c].isna().any()]
if cols_to_fill:
    # .to_numpy() assigns by position; the date index repeats across sites.
    data_df[cols_to_fill] = (
        data_df[cols_to_fill]
        .interpolate(method='linear', limit_direction='both')
        .to_numpy()
    )

In [59]:
# Re-count missing values per column after the global interpolation pass
data_df.isnull().sum()

year             0
month            0
TIMESTAMP        0
BESS-PAR         0
BESS-PARdiff     0
BESS-RSDN        0
CSIF-SIFdaily    0
PET              0
Ts               0
ESACCI-sm        0
MODIS_LC         0
NDWI             0
Percent_Snow     0
Fpar             0
Lai              0
LST_Day          0
LST_Night        0
MODIS_PFT        0
SITE_ID          0
dtype: int64

In [61]:
# Checkpoint: write the interpolated monthly table (index included) to CSV
data_df.to_csv(path_or_buf=interpolated_monthly_data_filename)