# Notebook Setup

In [None]:
# True when the active IPython shell reports itself as a Google Colab kernel.
IN_COLLAB = 'google.colab' in str(get_ipython())

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"

# On Colab, mount Google Drive; the default MY_HOME_ABS_PATH above lives under
# this mount point.
if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

Mounted at /content/drive/


## Import Modules

In [None]:
# install required modules quietly
required_packages = ['azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
# Work from the project root so the relative "./code/src/tools" path below
# resolves. This single chdir covers both Colab and local runs.
os.chdir(MY_HOME_ABS_PATH)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load local custom modules
import sys
tools_dir = os.path.abspath("./code/src/tools")
# Only register the tools folder once so re-running this cell does not keep
# growing sys.path with duplicate entries.
if tools_dir not in sys.path:
  if IN_COLLAB:
    # On Colab the project tools go first on the search path
    sys.path.insert(0, tools_dir)
  else:
    sys.path.append(tools_dir)

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

# NOTE(review): `pd` is not imported explicitly in this cell; presumably it is
# exposed by the star import from data_pipeline_lib above — confirm.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)


# Define paths (all anchored at the project root)
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.sep.join([root_dir, '.tmp'])
data_dir = os.sep.join([root_dir, 'data'])
cred_dir = os.sep.join([root_dir, '.cred'])
az_cred_file = os.sep.join([cred_dir, 'azblobcred.json'])
# Raw inputs are read from a separate Drive folder on Colab; everywhere else
# they are read from the same folder as the prepared data.
raw_data_dir = "/content/drive/MyDrive/W210/Data" if IN_COLLAB else os.sep.join([root_dir, 'data'])

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/387.8 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━[0m [32m245.8/387.8 KB[0m [31m7.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m387.8/387.8 KB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 KB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.5/174.5 KB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25h

# Constant Definitions

In [None]:
# Select monthly features to use
included_features = (
    # Site and time columns
    ['SITE_ID', 'year', 'month', 'TIMESTAMP']
    + [
        'ESACCI-sm',      # ESACCI Soil Moisture (%)
        'Percent_Snow',   # Percentage of snow cover (%)
        'NDWI',           # Normalized Difference Water Index (NDWI)
        'PET',            # Potential ET (m)
        'MODIS_PFT',      # Plant Functional Type
        'MODIS_LC',       # MODIS Land Cover
        'MODIS_IGBP',     # MODIS IGBP
        'Ts',             # Skin temperature (K) ?? (description unconfirmed)
        'LST_Day',        # Daytime land surface temperature (K)
        'LST_Night',      # Nighttime land surface temperature (K)
        'Lai',            # Leaf Area Index (LAI)
        'Fpar',           # Fraction of photosynthetically active radiation (fPAR)
        'CSIF-SIFdaily',  # All-sky daily average SIF
        'BESS-PAR',       # Photosynthetic Active Radiation (PAR) (W/m^2)
        'BESS-PARdiff',   # Diffuse PAR (W/m^2)
        'BESS-RSDN',      # Shortwave downwelling radiation (W/m^2)
    ]
)

# Monthly data files: raw CSV that is read in, prepared CSV that is written out
monthly_data_input_fname = os.sep.join([raw_data_dir, 'data_monthly_v1_0.csv'])
monthly_data_output_fname = os.sep.join([data_dir, "monthly-mvp.csv"])

# Settings for filling NA values and for gap-filling missing months.
# All of these are handed to PrepareMonthlyData.run() in the execution cell.
impute = True
impute_method = 'knn'  # supported values: 'interpolate', 'knn', 'constant' or None
resample_monthly = True

# Columns handed to the KNN imputer
knn_imp_cols = [
    'year', 'month',
    'ESACCI-sm', 'Percent_Snow', 'NDWI', 'PET', 'MODIS_LC', 'Ts',
    'LST_Day', 'LST_Night', 'Lai', 'Fpar',
    'CSIF-SIFdaily', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN',
]

# presumably the neighbour count and weighting scheme for 'knn' — confirm in PrepareMonthlyData
k = 5
weights = 'uniform'
c = -1  # if impute_method = 'constant'

# Execute and Save Out

In [None]:
# Execute Monthly Preparation
# Build the preparer from the feature list, the monthly input CSV and the
# half-hourly data folder, then run it with the imputation settings above.
PrepMonthly = PrepareMonthlyData(
    included_features,
    monthly_data_input_fname,
    os.sep.join([raw_data_dir, 'half_hourly_data']),
)
monthly_df_out = PrepMonthly.run(
    impute, impute_method, resample_monthly, knn_imp_cols, k, weights, c
)

Impute method: knn
Resampling and gap filling missing months: True
# sites dropped bc not available in data_dir: 9


33it [01:21,  1.89it/s]

CA-TP3 has column(s) with only NAN: ['ESACCI-sm']


35it [01:22,  2.56it/s]

CG-Tch has column(s) with only NAN: ['NDWI']


61it [01:32,  2.50it/s]

GL-ZaF has column(s) with only NAN: ['ESACCI-sm']


62it [01:33,  2.28it/s]

GL-ZaH has column(s) with only NAN: ['ESACCI-sm']


68it [01:36,  2.14it/s]

IT-Cpz has column(s) with only NAN: ['ESACCI-sm']


69it [01:37,  1.75it/s]

IT-Noe has column(s) with only NAN: ['PET', 'Ts', 'ESACCI-sm']


93it [01:47,  2.33it/s]

US-KS2 has column(s) with only NAN: ['ESACCI-sm']


116it [02:02,  1.46it/s]

CA-Ca3 has column(s) with only NAN: ['ESACCI-sm']


128it [02:09,  1.42it/s]

CA-TP4 has column(s) with only NAN: ['ESACCI-sm']


203it [03:26,  1.21it/s]

FI-Qvd has column(s) with only NAN: ['CSIF-SIFdaily']


208it [03:31,  1.04it/s]

FR-FBn has column(s) with only NAN: ['ESACCI-sm']


215it [03:38,  1.01it/s]

GF-Guy has column(s) with only NAN: ['ESACCI-sm']


218it [03:42,  1.10s/it]

IT-Cp2 has column(s) with only NAN: ['ESACCI-sm']


232it [03:57,  1.25it/s]

DE-Hte has column(s) with only NAN: ['CSIF-SIFdaily']


234it [04:00,  1.03s/it]


Imputing values where site has 100 percent of feature missing
# of NA features before global impute: 1864
# of NA features after global impute: 0
Confirmed: No NA values remain


In [None]:
# Save out
# Create the destination folder first so a missing directory cannot fail the
# write after the long-running preparation step has already finished.
import os
output_dir = os.path.dirname(monthly_data_output_fname)
if output_dir:
  os.makedirs(output_dir, exist_ok=True)
monthly_df_out.to_csv(monthly_data_output_fname, index=False)

In [None]:
# Release the Colab runtime once the output has been saved.
# Guarded by IN_COLLAB because google.colab cannot be imported elsewhere, and
# the unguarded import would make a local "Run All" end with an ImportError.
if IN_COLLAB:
  from google.colab import runtime
  runtime.unassign()