# Notebook Setup

In [17]:
# TODO: point this at your local checkout of the repository, either by
# editing the default below or by setting the MY_HOME_ABS_PATH environment
# variable before starting the kernel (the environment variable wins).
import os

MY_HOME_ABS_PATH = os.environ.get(
    "MY_HOME_ABS_PATH",
    "/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling",
)

In [18]:
# Flag whether this kernel is a Google Colab runtime, judged by the string
# form of the active IPython shell object.
IN_COLLAB = 'google.colab' in str(get_ipython())

# On Colab, mount Google Drive at /content/drive/.
if IN_COLLAB:
    from google.colab import drive
    drive.mount('/content/drive/')

## Import Modules

In [19]:
# install required modules quietly
required_packages = ['azure-storage-blob']
for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

import sys
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
os.chdir(MY_HOME_ABS_PATH)
from io import BytesIO
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load locale custome modules
if IN_COLLAB:
  sys.path.insert(0,os.path.abspath("./code/src/tools"))
else:
  sys.path.append(os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [25]:
# Directory layout, all rooted at the repository checkout.
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.sep.join([root_dir, '.tmp'])
raw_data_dir = os.sep.join([root_dir, 'data'])
monthly_data_dir = os.sep.join([raw_data_dir, 'datasets'])
data_dir = os.sep.join([root_dir, 'data'])

# Azure credentials live in a JSON file under .cred
cred_dir = os.sep.join([root_dir, '.cred'])
az_cred_file = os.sep.join([cred_dir, 'azblobcred.json'])

# NOTE(review): this override runs after monthly_data_dir was derived from
# raw_data_dir, so monthly_data_dir still points under root_dir on Colab.
# Confirm that is intended.
if IN_COLLAB:
    raw_data_dir = "/content/drive/MyDrive/W210/Data"

# Constant Definitions

In [26]:
# Columns kept from the monthly dataset: site/time keys first, then the
# predictor variables.
included_features = [
    'SITE_ID', 'year', 'month', 'TIMESTAMP',
    'ESACCI-sm',      # ESACCI soil moisture (%)
    'Percent_Snow',   # Percentage of snow cover (%)
    'NDWI',           # Normalized Difference Water Index (NDWI)
    'PET',            # Potential ET (m)
    'MODIS_PFT',      # Plant Function Type
    'MODIS_LC',       # MODIS Land Cover
    'MODIS_IGBP',     # MODIS IGBP
    'Ts',             # Skin temperature (K) -- TODO confirm
    'LST_Day',        # Daytime land surface temperature (K)
    'LST_Night',      # Nighttime land surface temperature (K)
    'Lai',            # Leaf Area Index (LAI)
    'Fpar',           # Fraction of photosynthetically active radiation (fPAR)
    'CSIF-SIFdaily',  # All-sky daily average SIF
    'BESS-PAR',       # Photosynthetic Active Radiation (PAR) (W/m^2)
    'BESS-PARdiff',   # Diffuse PAR (W/m^2)
    'BESS-RSDN',      # Shortwave downwelling radiation (W/m^2)
]

# How NA values are filled (impute / impute_method) and whether missing
# months are resampled and gap-filled (resample_monthly).
impute = True
impute_method = 'knn'  # supported values: 'interpolate', 'knn', 'constant' or None
resample_monthly = True

# Columns used when impute_method is 'knn'.
knn_imp_cols = [
    'year', 'month',
    'ESACCI-sm', 'Percent_Snow', 'NDWI', 'PET', 'MODIS_LC', 'Ts',
    'LST_Day', 'LST_Night', 'Lai', 'Fpar',
    'CSIF-SIFdaily', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN',
]

# Presumably the KNN neighbour count and weighting scheme -- verify against
# PrepareMonthlyData.
k = 5
weights = 'uniform'

# Fill value used if impute_method = 'constant'
c = -1

In [27]:
# Monthly data input: the raw monthly CSV inside the monthly data directory.
monthly_data_input_fname = monthly_data_dir + os.sep + 'data_monthly_v1_0.csv'

# Azure file naming: the blob is "<blob_name_base>.<ext>" inside `container`.
container = "all-sites-data"
ext = "parquet"
ver = "mvp"

blob_name_base = f"monthly-{ver}"
blob_name = f"{blob_name_base}.{ext}"

# Local file naming: derived from blob_name_base so the local CSV and the
# Azure blob stay in step when `ver` changes (previously hard-coded to
# "monthly-mvp.csv", which is the same value for ver = "mvp").
monthly_data_output_fname = data_dir + os.sep + f"{blob_name_base}.csv"

# Execute and Save Out

In [29]:
# Build the monthly preparation object from the selected feature list, the
# input CSV path and the temp directory.
PrepMonthly = PrepareMonthlyData(
    included_features,
    monthly_data_input_fname,
    tmp_dir,
)

# Run it with the imputation / resampling settings defined in the constants
# cell; the result is kept for the save and upload steps.
monthly_df_out = PrepMonthly.run(
    impute,
    impute_method,
    resample_monthly,
    knn_imp_cols,
    k,
    weights,
    c,
)

Impute method: knn
Resampling and gap filling missing months: True
# sites dropped bc not available in data_dir: 9
CA-TP3 has column(s) with only NAN: ['ESACCI-sm']
CG-Tch has column(s) with only NAN: ['NDWI']
GL-ZaF has column(s) with only NAN: ['ESACCI-sm']
GL-ZaH has column(s) with only NAN: ['ESACCI-sm']
IT-Cpz has column(s) with only NAN: ['ESACCI-sm']
IT-Noe has column(s) with only NAN: ['PET', 'Ts', 'ESACCI-sm']
US-KS2 has column(s) with only NAN: ['ESACCI-sm']
CA-Ca3 has column(s) with only NAN: ['ESACCI-sm']
CA-TP4 has column(s) with only NAN: ['ESACCI-sm']
FI-Qvd has column(s) with only NAN: ['CSIF-SIFdaily']
FR-FBn has column(s) with only NAN: ['ESACCI-sm']
GF-Guy has column(s) with only NAN: ['ESACCI-sm']
IT-Cp2 has column(s) with only NAN: ['ESACCI-sm']
DE-Hte has column(s) with only NAN: ['CSIF-SIFdaily']
Imputing values where site has 100 percent of feature missing
# of NA features before global impute: 1864
# of NA features after global impute: 0
Confirmed: No NA values

## Write out to local disk and/or Azure blob

In [30]:
# Write the prepared monthly data to the local CSV path, omitting the index.
monthly_df_out.to_csv(
    monthly_data_output_fname,
    index=False,
)

In [31]:
# Toggle for uploading the prepared monthly data to Azure Blob Storage.
monthly_to_azure = True

if monthly_to_azure:
    # Serialize to an in-memory parquet buffer, then rewind it so the upload
    # reads from the first byte.
    parquet_file = BytesIO()
    monthly_df_out.to_parquet(parquet_file, engine='pyarrow')
    parquet_file.seek(0)

    # Create the client from the credentials file and upload the buffer;
    # overwrite=True is passed (presumably replaces an existing blob).
    azStorageClient = AzStorageClient(az_cred_file)
    azStorageClient.uploadBlob(
        container,
        blob_name,
        parquet_file,
        overwrite=True,
    )

File uploaded to all-sites-data/monthly-mvp.parquet


In [None]:
# Release the Colab runtime once the notebook has finished. The import is
# guarded by IN_COLLAB because google.colab is only importable on Colab;
# unguarded, this cell raised ImportError on a local Run All.
if IN_COLLAB:
    from google.colab import runtime
    runtime.unassign()