# Notebook Setup

In [1]:
if 'google.colab' in str(get_ipython()):
  IN_COLLAB = True
else:
  IN_COLLAB = False

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
#MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"
MY_HOME_ABS_PATH = "/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling"

In [2]:
if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

## Import Modules

In [3]:
# install required modules quietly
required_packages = ['geopandas', 'pyspark', 'azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
os.chdir(MY_HOME_ABS_PATH) # <------------------ ADDED
import math
import json

import pandas as pd
from calendar import monthrange
from datetime import datetime
from io import BytesIO
from sklearn.impute import KNNImputer # <----------- ADDED
from tqdm import tqdm # <----------- ADDED

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load locale custome modules
import sys
if IN_COLLAB:
  os.chdir(MY_HOME_ABS_PATH)
  sys.path.insert(0,os.path.abspath("./code/src/tools"))
else:
  sys.path.append(os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [5]:
# # Import SparkSession
# from pyspark.sql import SparkSession
# # Create a Spark Session
# spark = SparkSession.builder.master("local[*]").config(
#     "spark.jars.packages", 
#     "org.apache.hadoop:hadoop-azure:3.3.1,com.microsoft.azure:azure-storage:8.6.6"
#     ).getOrCreate()
# # Check Spark Session Information
# spark

# Define Constants

In [6]:
root_dir =  MY_HOME_ABS_PATH
tmp_dir =  root_dir + os.sep + '.tmp'
raw_data_dir = tmp_dir
data_dir = root_dir + os.sep + 'data'
cred_dir = root_dir + os.sep + '.cred'
az_cred_file = cred_dir + os.sep + 'azblobcred.json'

if IN_COLLAB:
  raw_data_dir = "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data/half_hourly_data"

site_metadata_filename = data_dir + os.sep + 'site-metadata.csv'
monthly_data_filename = data_dir + os.sep + 'monthly-interpolated-v2.csv'

In [7]:
# File
container = "baseline-data"
ext = "parquet"
ver = "1-i"
blob_name_base = f"baseline_all_v_{ver}"
train_blob_name_base = f"baseline-train-v-{ver}"
test_blob_name_base = f"baseline-test-v-{ver}"

In [8]:
# "Golden" Sites
tier1_sites = ["IT-Lav", "US-NR1"]#["US-MMS", "US-Vcp", "FR-Pue", "CH-Lae", "US-Var", "US-Ne2", "ES-LJu", "US-Ton"]
#tier2_sites = ["US-UMB", "US-Me2", "FI-Hyy", "US-NR1", "IT-Lav", "US-Wkg", "US-ARM", "US-SRM"]

train_sites = tier1_sites# + tier2_sites

# Selected Test Sites
test_sites = ['IT-Lsn']
#test_sites = # ["US-GLE", # ENF, Cold
              # "US-AR1", # GRA, Temperate
              # "US-Seg", # GRA, Arid
            #   "US-FR2", # WSA, Temperate
            #   "ES-LM2", # WSA, Arid
            #   "CA-Cbo", # DBF, Cold
            #   "FR-Lam", # CRO, Temperate
            #   "IT-Cpz", # EBF, Temperate
            #   "CN-Cha", # MF Cold
            #   "IT-Lsn", # OSH, Temperate
            #   ]

In [9]:
# Define features and target variables of the data pipelines
included_features = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
                     'datetime', 'year', 'month', 'day', 'hour', 'date',
                     'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 
                     'IGBP', 'koppen']
target_variable_qc = 'NEE_VUT_REF_QC'
target_variable = 'GPP_NT_VUT_REF'

# Define the features to use in KNN imputer, only using real values as cat are same per site
knn_exclude_cols = ['date', 'datetime', 'year', 'month', 'hour', 'day', 'minute', 'site_id', 'IGBP', 'koppen']
knn_imp_cols = [x for x in included_features + ['GPP_NT_VUT_REF'] if x not in knn_exclude_cols]
print(knn_imp_cols)

['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'GPP_NT_VUT_REF']


# Get Gold Sample Site Data

In [10]:
# Load site metadata
included_site_features = ['site_id', 'filename', 'lat', 'long', #'elevation', <---- REMOVED
                          'koppen_sub', 'koppen_main', 'koppen_name',
                          'c3c4', 'c4_percent', 'monthly_data_available']
site_metadata_df = pd.read_csv(site_metadata_filename, usecols = included_site_features)

# only focus on target sites
site_metadata_df = site_metadata_df.loc[(site_metadata_df['site_id'].isin(train_sites + test_sites)) & (site_metadata_df['monthly_data_available']=='Yes'), ]
print(f"size:{site_metadata_df.shape}")
site_metadata_df.drop(columns='monthly_data_available', inplace=True)
site_metadata_df.reset_index(inplace=True, drop=True)
site_metadata_df

size:(3, 10)


Unnamed: 0,site_id,lat,long,koppen_sub,koppen_main,koppen_name,c3c4,c4_percent,filename
0,US-NR1,40.0329,-105.5464,27,4,Dfc,C3,0.35,data_full_half_hourly_raw_v0_1_US-NR1.csv
1,IT-Lav,45.9562,11.28132,26,4,Dfb,C3,3.57,data_full_half_hourly_raw_v0_1_IT-Lav.csv
2,IT-Lsn,45.74048,12.7503,14,3,Cfa,C3,2.7,data_full_half_hourly_raw_v0_1_IT-Lsn.csv


# Get Monthly Data

In [11]:
# Load monthly metadata
monthly_df = pd.read_csv(monthly_data_filename)

# only focus on target sites
monthly_df = monthly_df.loc[monthly_df['SITE_ID'].isin(train_sites + test_sites)]
print(f"size:{monthly_df.shape}")
monthly_df.reset_index(inplace=True, drop=True)
monthly_df[['year','month']] = monthly_df[['year','month']].astype('int')
monthly_df[['MODIS_LC']] = monthly_df[['MODIS_LC']].astype('int')
monthly_df.head()

size:(461, 19)


Unnamed: 0,datetime,year,month,SITE_ID,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT
0,1999-05-01,1999,5,US-NR1,37.0,16.0,87.0,0.07419,-0.00427,262.2357,0.27341,8,0.42081,16.51613,0.53,0.8,270.22,262.84,SA
1,1999-06-01,1999,6,US-NR1,37.0,16.0,87.0,0.07419,-0.00427,262.2357,0.27341,8,0.42081,16.51613,0.53,0.8,270.22,262.84,SA
2,1999-07-01,1999,7,US-NR1,37.0,16.0,87.0,0.07419,-0.00427,262.2357,0.27341,8,0.42081,16.51613,0.53,0.8,270.22,262.84,SA
3,1999-08-01,1999,8,US-NR1,37.0,16.0,87.0,0.07419,-0.00427,262.2357,0.27341,8,0.42081,16.51613,0.53,0.8,270.22,262.84,SA
4,1999-09-01,1999,9,US-NR1,37.0,16.0,87.0,0.07419,-0.00427,262.2357,0.27341,8,0.42081,16.51613,0.53,0.8,270.22,262.84,SA


# Stage 1: Trim and Merge Site Metadata

All available features from Half=hourly data:
```
'TIMESTAMP_START', 'TIMESTAMP_END', 'TA_F', 'TA_F_QC', 'TA_ERA',
'SW_IN_POT', 'SW_IN_F', 'SW_IN_F_QC', 'SW_IN_ERA', 'LW_IN_F',
'LW_IN_F_QC', 'LW_IN_ERA', 'VPD_F', 'VPD_F_QC', 'VPD_ERA', 'P_F',
'P_F_QC', 'P_ERA', 'PA_F', 'PA_F_QC', 'PA_ERA', 'NETRAD', 'PPFD_IN',
'G_F_MDS', 'G_F_MDS_QC', 'LE_F_MDS', 'LE_F_MDS_QC', 'LE_CORR',
'H_F_MDS', 'H_F_MDS_QC', 'H_CORR', 'NEE_VUT_REF', 'NEE_VUT_REF_QC',
'NEE_CUT_REF', 'NEE_CUT_REF_QC', 'GPP_NT_VUT_REF', 'GPP_DT_VUT_REF',
'GPP_NT_CUT_REF', 'GPP_DT_CUT_REF', 'RECO_NT_VUT_REF',
'RECO_DT_VUT_REF', 'RECO_NT_CUT_REF', 'RECO_DT_CUT_REF', 'datetime',
'year', 'month', 'day', 'hour', 'SITE_ID', 'date', 'NEE_VUT_REF_qa',
'SW_DIF', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6',
'b7', 'IGBP', 'koppen'
``` 

In [12]:
def knn_impute(site_df, imp_cols, k=5, weights='uniform', n=50000):

    # Fit and transform the data using KNNImputer, format as DF
    group_knn_df = site_df[imp_cols].copy()
    group_knn_df = group_knn_df.dropna(axis=1, how='all') # drop col if all NA, need to globally impute later

    # Get subset of rows to speed up impute time (instead of fitting on every single record)
    na_mask = group_knn_df.isna().any(axis=1)
    na_rows = group_knn_df[na_mask]
    not_na_rows = group_knn_df.dropna().sample(n=n)
    print(f"na_rows count: {len(na_mask)}")

    # Execute imputation
    imputer = KNNImputer(n_neighbors=k, weights=weights)
    imputer.fit(not_na_rows)
    imputed_group = imputer.transform(na_rows)
    imputed_group = pd.DataFrame(imputed_group, columns=group_knn_df.columns)

    # Reinsert NA rows
    group_knn_df.loc[na_mask] = imputed_group

    # Fill NA in initial site/group df
    #site_df.set_index(inds)
    site_df.fillna(group_knn_df, inplace=True)

    return site_df

In [25]:
def data_cleanup_new(data_dir, site_id_file_df, target, target_qc, features, imp_cols, k=5, weights='uniform', n=50000):
  data_df = None
  # qc_flag_dtype = CategoricalDtype([0, 1, 2, 3], ordered=True)
  qc_flags_features = [s for s in features if "_QC" in s]

  # Iterate through each site:
  for i, r in tqdm(site_id_file_df.iterrows()):        
    if not r.filename or type(r.filename) != type(""):
      print(f'\nERROR: {r.site_id} is mssing hourly data.')
      continue

    # Get only `features` from file
    print(r.site_id)
    local_filename = data_dir + os.sep + r.filename
    site_df = pd.read_csv(local_filename, usecols = [target, target_qc] + features)
    site_df['datetime'] = pd.to_datetime(site_df['datetime'])
    site_df['date'] = pd.to_datetime(site_df['date'])
    site_df['minute'] = site_df['datetime'].dt.minute
    if len(qc_flags_features) != 0:
      site_df[qc_flags_features] = site_df[qc_flags_features].astype('int')
    site_df['site_id'] = r.site_id

    # Move from HH to H level
    site_df = site_df.loc[site_df['datetime'].dt.minute == 0, ].copy()

    # Remove zero or negative SW 
    #site_df.drop(site_df[site_df['SW_IN_ERA'] <= 0].index, inplace = True) #<-------------- REMOVE OR EDIT LOGIC LATER?

    # Drop rows with NAs for Target Variable
    site_df.dropna(subset=[target], axis=0, inplace=True)

    # Drop rows with bad NEE_VUT_REF_QC (aka bad GPP records)
    site_df.drop(site_df[site_df[target_qc] == 3].index, inplace = True)
    site_df.drop([target_qc], axis=1, inplace=True)

    # Drop rows with any NA
    #site_df.dropna(axis=0, inplace=True) 

    # Impute missing values
    site_df = knn_impute(site_df, imp_cols, k, weights, n) # <------ can use weights='distance' as well

    # Handle DF
    print(f"{r.site_id}: {site_df.shape}")
    if type(data_df) == type(None):
      data_df = site_df
    else:
      data_df = pd.concat([data_df, site_df])
          
          
  return data_df

In [39]:
# Initial data clean and feature selections from raw data
data_df = data_cleanup_new(raw_data_dir, site_metadata_df[['site_id','filename']],
                target_variable, target_variable_qc, included_features,
                knn_imp_cols, k=3, weights='uniform', n=20000)
print(f"Data size after cleanup: {data_df.shape}")

0it [00:00, ?it/s]

US-NR1
na_rows count: 68009


1it [01:42, 102.62s/it]

US-NR1: (68009, 27)
IT-Lav
na_rows count: 79001


2it [02:49, 81.39s/it] 

IT-Lav: (79001, 27)
IT-Lsn
na_rows count: 20923


3it [02:50, 56.98s/it]

IT-Lsn: (20923, 27)
Data size after cleanup: (167933, 27)





In [40]:
# Merge with site metadata
data_df = merge_site_metadata(data_df, site_metadata_df.drop(['filename', 'koppen_main', 'koppen_name'], axis=1))
print(f"Data size after after merged with site metadata: {data_df.shape}")

# Merge with monthly data
print(len(data_df))
data_df = data_df.merge(monthly_df.drop('datetime', axis=1), how='left',
                        left_on =['site_id', 'year', 'month'],
                        right_on=['SITE_ID', 'year', 'month'])
data_df.drop('SITE_ID', axis=1, inplace=True)
print(f"Data size after after merged with monthly data: {data_df.shape}")

# Drop rows with NA
#check_and_drop_na(data_df) # <----------------------------- REMOVED
#print(f"Data size after after final drop: {data_df.shape}")

#reorder columns
features = data_df.columns.to_list()
features.remove(target_variable)
data_df = data_df[([target_variable] + features)]

display(data_df.head())

Data size after after merged with site metadata: (167933, 32)
167933
Data size after after merged with monthly data: (167933, 47)


Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,lat,long,koppen_sub,c3c4,c4_percent,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT
0,0.55824,-1.987,12.821,272.749,2.025,0.0,69.262,1999-05-02 05:00:00,1999,5,2,5,1999-05-02,0.28553,0.35211,0.08608,0.1187,0.24433,0.11443,0.12237,0.16297,0.07587,0.03173,ENF,Cold,0,US-NR1,40.0329,-105.5464,27,C3,0.35,37.0,16.0,87.0,0.07419,-0.00427,262.2357,0.27341,8,0.42081,16.51613,0.53,0.8,270.22,262.84,SA
1,0.51798,-0.963,98.803,272.749,2.354,0.178,69.242,1999-05-02 06:00:00,1999,5,2,6,1999-05-02,0.24555,0.47712,0.08027,0.1198,0.2251,0.1161,0.13587,0.15557,0.07983,0.0383,ENF,Cold,0,US-NR1,40.0329,-105.5464,27,C3,0.35,37.0,16.0,87.0,0.07419,-0.00427,262.2357,0.27341,8,0.42081,16.51613,0.53,0.8,270.22,262.84,SA
2,1.74363,0.06,183.205,272.749,2.682,0.178,69.222,1999-05-02 07:00:00,1999,5,2,7,1999-05-02,0.2549,0.40819,0.08061,0.11847,0.23417,0.1077,0.11887,0.16413,0.08347,0.04187,ENF,Cold,0,US-NR1,40.0329,-105.5464,27,C3,0.35,37.0,16.0,87.0,0.07419,-0.00427,262.2357,0.27341,8,0.42081,16.51613,0.53,0.8,270.22,262.84,SA
3,3.73694,1.084,492.858,247.259,3.011,0.0,69.202,1999-05-02 08:00:00,1999,5,2,8,1999-05-02,0.23949,0.6332,0.09441,0.03423,0.15007,0.02037,0.0374,0.1608,0.07597,0.0383,ENF,Cold,0,US-NR1,40.0329,-105.5464,27,C3,0.35,37.0,16.0,87.0,0.07419,-0.00427,262.2357,0.27341,8,0.42081,16.51613,0.53,0.8,270.22,262.84,SA
4,1.59155,2.136,614.988,247.259,3.539,0.023,69.196,1999-05-02 09:00:00,1999,5,2,9,1999-05-02,0.26609,0.62955,0.10705,0.03927,0.17003,0.02397,0.0425,0.1597,0.0984,0.0577,ENF,Cold,0,US-NR1,40.0329,-105.5464,27,C3,0.35,37.0,16.0,87.0,0.07419,-0.00427,262.2357,0.27341,8,0.42081,16.51613,0.53,0.8,270.22,262.84,SA


In [41]:
data_df.isna().sum()/len(data_df)

GPP_NT_VUT_REF   0.00000
TA_ERA           0.05231
SW_IN_ERA        0.05231
LW_IN_ERA        0.05231
VPD_ERA          0.05231
P_ERA            0.05231
PA_ERA           0.05231
datetime         0.00000
year             0.00000
month            0.00000
day              0.00000
hour             0.00000
date             0.00000
EVI              0.09121
NDVI             0.08705
NIRv             0.08705
b1               0.08705
b2               0.08691
b3               0.09121
b4               0.08703
b5               0.09080
b6               0.14215
b7               0.08691
IGBP             0.00000
koppen           0.00000
minute           0.00000
site_id          0.00000
lat              0.00000
long             0.00000
koppen_sub       0.00000
c3c4             0.00000
c4_percent       0.00000
BESS-PAR         0.00000
BESS-PARdiff     0.00000
BESS-RSDN        0.00000
CSIF-SIFdaily    0.00000
PET              0.00000
Ts               0.00000
ESACCI-sm        0.00000
MODIS_LC         0.00000


### Handle Features Missing for Entire Site

In [85]:
def knn_impute_global(df, imp_cols, k=5, weights='uniform', n=50000):
    # Create copy
    df.reset_index(drop=True, inplace=True)
    data_df_copy = df[imp_cols].copy()

    # Use Global Imputing for Sites that have 100% of one feature missing (couldn't impute at site-level)
    na_mask = data_df_copy.isna().any(axis=1)
    na_inds = na_mask[na_mask==True].index
    na_rows = data_df_copy.loc[na_inds, ].copy()
    not_na_rows = data_df_copy.dropna().sample(n=n)

    # Execute imputation
    imputer = KNNImputer(n_neighbors=k, weights=weights)
    imputer.fit(not_na_rows)
    imputed_group1 = imputer.transform(na_rows)
    imputed_group = pd.DataFrame(imputed_group1, columns=na_rows.columns)

    # Reinsert NA rows
    imputed_group.set_index(na_inds, inplace=True)
    data_df_copy.loc[na_inds] = imputed_group

    # Fill NA in initial site/group df
    #site_df.set_index(inds)
    df.fillna(data_df_copy, inplace=True)

    return df

data_df = knn_impute_global(data_df, knn_imp_cols, k=3, weights='uniform', n=40000)

na_inds: 32707
na_rows: 32707
imputed_group1: 32707
imputed_group2: 32707


### Check Imputation Work

In [103]:
site_id = train_sites[0]

# Copy post-imp site_df
df_imputed = data_df.loc[data_df['site_id']==site_id, ].copy()

# Copy pre-imp site_df
qc_flags_features = [s for s in features if "_QC" in s]
target_qc = 'NEE_VUT_REF_QC'
target = 'GPP_NT_VUT_REF'
filename = f"data_full_half_hourly_raw_v0_1_{site_id}.csv"
local_filename = tmp_dir + os.sep + filename
site_df = pd.read_csv(local_filename, usecols = [target, target_qc] + included_features)
site_df['datetime'] = pd.to_datetime(site_df['datetime'])
site_df['date'] = pd.to_datetime(site_df['date'])
site_df['minute'] = site_df['datetime'].dt.minute
if len(qc_flags_features) != 0:
    site_df[qc_flags_features] = site_df[qc_flags_features].astype('int')
site_df['site_id'] = site_id

# Move from HH to H level
site_df = site_df.loc[site_df['datetime'].dt.minute == 0, ].copy()

# Remove zero or negative SW 
site_df.drop(site_df[site_df['SW_IN_ERA'] <= 0].index, inplace = True) # <------------- REMOVED FOR NOW DUE TO GAP-FILLING

# Drop rows with NAs for Target Variable
site_df.dropna(subset=[target], axis=0, inplace=True)

# Drop rows with bad NEE_VUT_REF_QC (aka bad GPP records)
site_df.drop(site_df[site_df[target_qc] == 3].index, inplace = True)
site_df.drop([target_qc], axis=1, inplace=True)
site_df.drop(columns='datetime', inplace=True)
df_init = site_df.copy()

# Align dataframes for comparison
df_init.reset_index(drop=True, inplace=True)
df_imputed.reset_index(drop=True, inplace=True)
df_imputed = df_imputed[df_init.columns]

In [106]:
## Compare data_df init to data_df_copy with filled NA values
# confirm no NAs remain in new df
na_rows_post = df_imputed.isna().any(axis=1).sum()
print(f"Number of NA rows post imputation: {na_rows_post}")

# Drop NA rows from both (using indices) confirm they are same df now
drop_na = df_init.dropna(how='any')
drop_imp = df_imputed.loc[drop_na.index, ]
drop_na.reset_index(inplace=True, drop=True)
drop_imp.reset_index(inplace=True, drop=True)
print(f"Are all rows with no NAs the same as before? {drop_na.equals(drop_imp)}")

# Check that 50 rows that initiall had NA are the same in non-NA cols
na_inds = df_init.loc[df_init.isna().any(axis=1), ].index
errors = 0
for ind in na_inds[:50]:
    check_ind = pd.concat([df_init.iloc[ind], df_imputed.iloc[ind]], axis=1).dropna()
    check_ind.columns = ['initial', 'post_imp']
    if not check_ind['initial'].equals(check_ind['post_imp']):
        errors += 1
        print(ind)
print(f"Number of non-NA values changed by error: {errors}")

# DF length is the same 
print(f"DF is same length as before: {len(df_init) == len(df_imputed)}")

Number of NA rows post imputation: 17967
Are all rows with no NAs the same as before? True
Number of non-NA values changed by error: 0
DF is same length as before: True


# CheckPoint: Upload Data to Azure Storage Blob as Parquet

In [19]:
# # Upload to Azure Storage Blob
# # ref: https://stackoverflow.com/a/54666079
# data_cleanup_checkpoint = True
# tag = "raw"
# blob_name = f"{blob_name_base}_{tag}.{ext}"

# if data_cleanup_checkpoint:

#   parquet_file = BytesIO()
#   data_df.to_parquet(parquet_file, engine='pyarrow')
#   parquet_file.seek(0)

#   azStorageClient = AzStorageClient(az_cred_file)
#   azStorageClient.uploadBlob(container, blob_name, parquet_file, overwrite=True)

In [20]:
# # Save no transformed data
# get_non_transform_train_test = True
# if get_non_transform_train_test:
#   data_transformer = TFTDataTransformer(train_sites, test_sites,
#                                         None, data_df)
#   train_df, test_df = data_transformer.get_test_train_raw()
#   print("Train data peak:")
#   display(train_df.head(5))
#   print("Test data peak:")
#   display(test_df.head(5))

#   train_blob_name= f"{train_blob_name_base}-{tag}.{ext}"
#   test_blob_name= f"{test_blob_name_base}-{tag}.{ext}"
#   data_transformer.upload_train_test_to_azure(az_cred_file, container,\
#                                               train_blob_name, test_blob_name)

# Stage 2: Data Transform - Convert to Model Ready Data
- Encode data
- Split into train and test
- Assemble data through VectorAssembler
- Normalized data through minmax

In [21]:
load_data_from_previous_checkpoint = False
useSpark = False

raw_data_file_path = None
if load_data_from_previous_checkpoint:
  data_df = None
  raw_data_file_path = tmp_dir + os.sep + blob_name
  print(f"loading {raw_data_file_path}...")
  if not (os.path.exists(raw_data_file_path)):
      if not (os.path.exists(tmp_dir)):
          os.mkdir(tmp_dir)
      azStorageClient = AzStorageClient(az_cred_file)
      file_stream = azStorageClient.downloadBlob2Stream(container, blob_name)
      data_df = pd.read_parquet(file_stream, engine='pyarrow')
      data_df.to_parquet(raw_data_file_path)
  
if useSpark:
  data_transformer = PySparkMLDataTransformer(spark, train_sites, test_sites,
                                              raw_data_file_path, data_df)
else:
  data_transformer = TFTDataTransformer(train_sites, test_sites,
                                              raw_data_file_path, data_df)

Data size: (335686, 46).


In [22]:
timestamp_col = 'datetime'
target_col = 'GPP_NT_VUT_REF'

if useSpark: # Spark ML Data Transformer
  categorical_cols = ['IGBP', 'c3c4', 'koppen_sub', 'koppen', 'MODIS_PFT', 'MODIS_LC'] 
  data_transformer.data_transform(categorical_cols, timestamp_col, target_col)

  print("Train data peak:")
  data_transformer.train_df.show(5, False)
  print("Test data peak:")
  data_transformer.test_df.show(5, False)

  train_blob_name= f"{train_blob_name_base}"
  test_blob_name= f"{test_blob_name_base}"

else: # TFT Data Transformer
  categorical_cols = ['IGBP', 'c3c4', 'koppen_sub', 'koppen', 'site_id',
                      'year', 'month', 'day', 'hour', 'minute',
                      'MODIS_PFT', 'MODIS_LC']
  realNum_cols = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA', 
                  'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
                  'lat', 'long', 'c4_percent', #'elevation', <----------------------- REMOVED
                  'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily',
                  'PET', 'Ts', 'ESACCI-sm', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai',
                  'LST_Day', 'LST_Night']
  backup_cols = ['IGBP', 'koppen','site_id']
  data_transformer.data_transform(categorical_cols, realNum_cols, backup_cols,\
                                  timestamp_col, target_col)

  print("Train data peak:")
  display(data_transformer.train_df.head(5))
  print("Test data peak:")
  display(data_transformer.test_df.head(5))

  train_blob_name= f"{train_blob_name_base}.{ext}"
  test_blob_name= f"{test_blob_name_base}.{ext}"

Data size: (335686, 49).
Data size after encoding: (335686, 49)


Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,datetime,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,lat,long,koppen_sub,c3c4,c4_percent,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT,IGBP_name,koppen_name,site_id_name
0,0.55824,-1.987,12.821,272.749,2.025,0.0,69.262,1999-05-02 05:00:00,0,4,1,5,0.27418,0.28167,0.06784,0.13883,0.2432,0.1509,0.1596,0.14577,0.05903,0.02537,0,0,0,2,40.0329,-105.5464,2,0,0.35,37.0,16.0,87.0,0.07419,-0.00427,262.2357,0.27341,1,0.42081,16.51613,0.53,0.8,270.22,262.84,2,ENF,Cold,US-NR1
1,0.55824,-1.475,55.642,272.749,2.189,0.0,69.252,1999-05-02 05:30:00,0,4,1,5,0.23226,0.37881,0.07672,0.09573,0.2071,0.07647,0.09943,0.1444,0.08143,0.03813,0,0,1,2,40.0329,-105.5464,2,0,0.35,37.0,16.0,87.0,0.07419,-0.00427,262.2357,0.27341,1,0.42081,16.51613,0.53,0.8,270.22,262.84,2,ENF,Cold,US-NR1
2,0.51798,-0.963,98.803,272.749,2.354,0.178,69.242,1999-05-02 06:00:00,0,4,1,6,0.23576,0.23552,0.06801,0.18863,0.29767,0.1671,0.19463,0.14563,0.06363,0.03077,0,0,0,2,40.0329,-105.5464,2,0,0.35,37.0,16.0,87.0,0.07419,-0.00427,262.2357,0.27341,1,0.42081,16.51613,0.53,0.8,270.22,262.84,2,ENF,Cold,US-NR1
3,0.51798,-0.452,141.568,272.749,2.518,0.178,69.232,1999-05-02 06:30:00,0,4,1,6,0.28043,0.49979,0.10296,0.07343,0.21043,0.0572,0.0771,0.18233,0.1172,0.06437,0,0,1,2,40.0329,-105.5464,2,0,0.35,37.0,16.0,87.0,0.07419,-0.00427,262.2357,0.27341,1,0.42081,16.51613,0.53,0.8,270.22,262.84,2,ENF,Cold,US-NR1
4,1.74363,0.06,183.205,272.749,2.682,0.178,69.222,1999-05-02 07:00:00,0,4,1,7,0.28043,0.49979,0.10296,0.07343,0.21043,0.0572,0.0771,0.18233,0.1172,0.06437,0,0,0,2,40.0329,-105.5464,2,0,0.35,37.0,16.0,87.0,0.07419,-0.00427,262.2357,0.27341,1,0.42081,16.51613,0.53,0.8,270.22,262.84,2,ENF,Cold,US-NR1


ValueError: list.remove(x): x not in list

# Checkpoint: Upload train and test to Azure Blob Storage

In [None]:
# final_checkpoint = True

# if final_checkpoint:
#   data_transformer.upload_train_test_to_azure(az_cred_file, container, \
#                                             train_blob_name, test_blob_name)