# Notebook Setup

In [1]:
if 'google.colab' in str(get_ipython()):
  IN_COLLAB = True
else:
  IN_COLLAB = False

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
#MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"
MY_HOME_ABS_PATH = "/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling"

if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

## Import Modules

In [2]:
# install required modules quietly
required_packages = ['geopandas', 'pyspark', 'azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
os.chdir(MY_HOME_ABS_PATH) # <------------------ ADDED
import math
import json

import pandas as pd
from calendar import monthrange
from datetime import datetime
from io import BytesIO
from sklearn.impute import KNNImputer # <----------- ADDED
from tqdm import tqdm # <----------- ADDED

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load locale custome modules
import sys
if IN_COLLAB:
  os.chdir(MY_HOME_ABS_PATH)
  sys.path.insert(0,os.path.abspath("./code/src/tools"))
else:
  sys.path.append(os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Define Constants

In [3]:
root_dir =  MY_HOME_ABS_PATH
tmp_dir =  root_dir + os.sep + '.tmp'
raw_data_dir = tmp_dir
data_dir = root_dir + os.sep + 'data'
cred_dir = root_dir + os.sep + '.cred'
az_cred_file = cred_dir + os.sep + 'azblobcred.json'

if IN_COLLAB:
  raw_data_dir = "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data/half_hourly_data"

site_metadata_filename = data_dir + os.sep + 'site-metadata.csv'
monthly_data_filename = data_dir + os.sep + 'monthly-interpolated-v3.csv'

# File
container = "baseline-data"
ext = "parquet"
ver = "1-i"
blob_name_base = f"baseline_all_v_{ver}"
train_blob_name_base = f"baseline-train-v-{ver}"
test_blob_name_base = f"baseline-test-v-{ver}"

In [10]:
# Define features and target variables of the data pipelines
included_features = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
                     'datetime', 'year', 'month', 'day', 'hour', 'date',
                     'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 
                     'IGBP', 'koppen']
target_variable_qc = 'NEE_VUT_REF_QC'
target_variable = 'GPP_NT_VUT_REF'
metadata_features = ['site_id', 'filename', 'lat', 'long', 'koppen_sub', 'koppen_main',
                     'koppen_name', 'c3c4', 'c4_percent', 'monthly_data_available']

# Define the features to use in KNN imputer, only using real values as cat are same per site
knn_exclude_cols = ['date', 'datetime', 'year', 'month', 'hour', 'day', 'minute', 'site_id', 'IGBP', 'koppen']
knn_imp_cols = [x for x in included_features + ['GPP_NT_VUT_REF'] if x not in knn_exclude_cols]

In [11]:
# "Golden" Sites
tier1_sites = ["IT-Lav", "US-NR1", "US-Vcp"]#, "FR-Pue", "CH-Lae", "US-Var", "US-Ne2", "ES-LJu", "US-Ton"]
#tier2_sites = ["US-UMB", "US-Me2", "FI-Hyy", "US-NR1", "IT-Lav", "US-Wkg", "US-ARM", "US-SRM"]

train_sites = tier1_sites# + tier2_sites

# Selected Test Sites
test_sites = ['IT-Lsn']
#test_sites = # ["US-GLE", # ENF, Cold
              # "US-AR1", # GRA, Temperate
              # "US-Seg", # GRA, Arid
            #   "US-FR2", # WSA, Temperate
            #   "ES-LM2", # WSA, Arid
            #   "CA-Cbo", # DBF, Cold
            #   "FR-Lam", # CRO, Temperate
            #   "IT-Cpz", # EBF, Temperate
            #   "CN-Cha", # MF Cold
            #   "IT-Lsn", # OSH, Temperate
            #   ]

In [12]:
# Define imput params
impute = False
resample = True
impute_method = 'ffill'
impute_global = False
time_col = 'datetime'
duration = 'H'

# KNNImputer params (if used)
k=5
weights='uniform'
n_fit=20000

# Get Gold Sample Site Data

In [13]:
# Load site metadata
site_metadata_df = pd.read_csv(site_metadata_filename, usecols = metadata_features)

# Keep sites that are in train/test AND have monthly data available
site_metadata_df = site_metadata_df.loc[site_metadata_df['site_id'].isin(train_sites + test_sites), ]
site_metadata_df = site_metadata_df.loc[site_metadata_df['monthly_data_available']=='Yes', ]
site_metadata_df.drop(columns='monthly_data_available', inplace=True)
site_metadata_df.reset_index(inplace=True, drop=True)
site_metadata_df.head(3)

Unnamed: 0,site_id,lat,long,koppen_sub,koppen_main,koppen_name,c3c4,c4_percent,filename
0,US-NR1,40.0329,-105.5464,27,4,Dfc,C3,0.35,data_full_half_hourly_raw_v0_1_US-NR1.csv
1,US-Vcp,35.8624,-106.5974,26,4,Dfb,C3,0.04,data_full_half_hourly_raw_v0_1_US-Vcp.csv
2,IT-Lav,45.9562,11.28132,26,4,Dfb,C3,3.57,data_full_half_hourly_raw_v0_1_IT-Lav.csv


# Get Monthly Data

In [14]:
# Load monthly metadata
monthly_df = pd.read_csv(monthly_data_filename)
monthly_df = monthly_df.loc[monthly_df['SITE_ID'].isin(train_sites + test_sites)]
monthly_df.reset_index(inplace=True, drop=True)
monthly_df[['year','month', 'MODIS_LC']] = monthly_df[['year','month', 'MODIS_LC']].astype('int')
monthly_df.head(2)

Unnamed: 0,SITE_ID,year,month,TIMESTAMP,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT,date
0,US-NR1,2001,1,200101,37,16,87,0.07419,-0.00427,262.2357,-1.0,8,0.42081,16.51613,0.53,0.8,270.22,262.84,SA,2001-01-01
1,US-NR1,2001,2,200102,47,24,110,0.07939,-0.00625,264.60532,-1.0,8,0.57521,10.28571,0.48,0.8,270.56,262.52,SA,2001-02-01


# Stage 1: Trim and Merge Site Metadata

In [21]:
class PrepareAllSitesHourly:
    def __init__(self, site_metadata_df, monthly_df, included_features, target_variable_qc,
                 target_variable, knn_imp_cols, train_sites, test_sites,
                 resample, impute, impute_method, impute_global, k, weights, n_fit, data_dir, time_col, duration):
        self.site_metadata_df = site_metadata_df
        self.monthly_df = monthly_df
        self.included_features = included_features
        self.target_variable_qc = target_variable_qc
        self.target_variable = target_variable
        self.train_sites = train_sites
        self.test_sites = test_sites
        self.resample = resample
        self.impute = impute
        self.impute_method = impute_method
        self.imp_cols = knn_imp_cols
        self.impute_global = impute_global
        self.k = k
        self.n_fit = n_fit
        self.weights = weights
        self.data_dir = data_dir
        self.time_col = time_col
        self.duration = duration


    def add_time_index(self, df):
        df['gap_flag'] = int(0)
        df.sort_values(self.time_col, inplace=True)
        df.set_index(self.time_col, inplace=True)
        df = df.resample(self.duration).first()
        df = df.reset_index()
        df.index.name='timestep_idx'
        df = df.reset_index()
        df['gap_flag'].fillna(1, inplace=True)
        #df.loc[df['gap_flag']!= 0, 'gap_flag'] = int(1) # add flag to new records

        # Fix time records that are NA for new rows
        df['year'] = df['datetime'].dt.year.astype(int)
        df['month'] = df['datetime'].dt.month.astype(int)
        df['day'] = df['datetime'].dt.day.astype(int)
        df['hour'] = df['datetime'].dt.hour.astype(int)
        df['date'] = df['datetime'].dt.date

        return df


    def knn_impute_site(self, site_df):
        # Fit and transform the data using KNNImputer, format as DF
        group_knn_df = site_df[self.imp_cols].copy()
        group_knn_df = group_knn_df.dropna(axis=1, how='all') # drop col if all NA, need to globally impute later

        # Get subset of rows to speed up impute time (instead of fitting on every single record)
        na_mask = group_knn_df.isna().any(axis=1)
        na_rows = group_knn_df[na_mask]
        not_na_rows = group_knn_df.dropna().sample(n=self.n)

        # Execute imputation
        imputer = KNNImputer(n_neighbors=self.k, weights=self.weights)
        imputer.fit(not_na_rows)
        imputed_group = imputer.transform(na_rows)
        imputed_group = pd.DataFrame(imputed_group, columns=group_knn_df.columns)

        # Reinsert NA rows
        group_knn_df.loc[na_mask] = imputed_group

        # Fill NA in initial site/group df
        site_df.fillna(group_knn_df, inplace=True)

        return site_df


    def knn_impute_global(self, df):
        print("Begin global imputing for fully missing features at site-level")
        print(f"NA values remaining before global impute: {df.isna().sum().sum()}")
        
        # Create copy
        df_inds = df.index
        data_df_copy = df[self.imp_cols].copy()
        data_df_copy.reset_index(drop=True, inplace=True)

        # Use Global Imputing for Sites that have 100% of one feature missing (couldn't impute at site-level)
        na_mask = data_df_copy.isna().any(axis=1)
        na_inds = na_mask[na_mask==True].index
        na_rows = data_df_copy.loc[na_inds, ].copy()
        not_na_rows = data_df_copy.dropna().sample(n=n)

        # Execute imputation
        imputer = KNNImputer(n_neighbors=self.k, weights=self.weights)
        imputer.fit(not_na_rows)
        na_rows_imp = imputer.transform(na_rows)
        na_rows_imp = pd.DataFrame(na_rows_imp, columns=na_rows.columns)

        # Reinsert NA rows
        na_rows_imp.set_index(na_inds, inplace=True)
        data_df_copy.loc[na_inds] = na_rows_imp
        data_df_copy.set_index(df_inds, inplace=True)

        # Fill NA in initial site/group df
        df.fillna(data_df_copy, inplace=True)
        print(f"Final NA Count: {df.isna().sum().sum()}")

        return df
    

    def site_data_cleanup(self):
        data_df = None
        qc_flags_features = [s for s in self.included_features if "_QC" in s]

        ## PRINT THE PLAN
        if self.impute:
            print(f"Filling missing values with {self.impute_method} at site-level, then at global-level at end")
        else:
            print("Filling all NA values with -1")

        ## SITE-LEVEL CLEANING -> CONCATENATE
        for i, r in tqdm(self.site_metadata_df[['site_id','filename']].iterrows()):        
            if not r.filename or type(r.filename) != type(""):
                print(f'ERROR: {r.site_id} is mssing hourly data.')
                continue

            # Prepare hourly site df
            local_filename = self.data_dir + os.sep + r.filename
            site_df = pd.read_csv(local_filename, usecols = [self.target_variable, self.target_variable_qc] + self.included_features)

            # Format columns
            site_df['datetime'] = pd.to_datetime(site_df['datetime'])
            site_df['date'] = pd.to_datetime(site_df['date'])
            site_df['minute'] = site_df['datetime'].dt.minute
            if len(qc_flags_features) != 0:
                site_df[qc_flags_features] = site_df[qc_flags_features].astype('int')
            site_df['site_id'] = r.site_id


            # ----------------- #
            # LATER: FILTER SITE-DF TO THE BEST SEQUENCE OF X YEARS (e.g., if we only use 1.5 years per site)
            # ----------------- #


            # Move from HH to H level
            site_df = site_df.loc[site_df['datetime'].dt.minute == 0, ].copy()

            # Drop rows with NAs, or bad NEE_VUT_REF_QC for Target Variable <-------------------- MAKE BAD_QC_FLAGS==3 -> NAN to be imputed
            #site_df.dropna(subset=[self.target_variable], axis=0, inplace=True)
            #site_df.drop(site_df[site_df[self.target_variable_qc] == 3].index, inplace = True)
            #site_df.drop([self.target_variable_qc], axis=1, inplace=True)

            # Resample to add rows for missing timesteps, assign timestep_idx and "gap_flag"
            if self.resample:
                site_df = self.add_time_index(site_df)
            else:
                site_df.sort_values(self.time_col, inplace=True)
                site_df = site_df.reset_index()
                site_df.index.name='timestep_idx'
                site_df = site_df.reset_index()

            # Impute missing values at site-level, otherwise fillna w/ -1 at very end
            if self.impute:
                if self.impute_method=='ffill': # select most recent record
                    site_df.sort_values(self.time_col, ascending=True, inplace=True)
                    site_df.fillna(method="ffill", inplace=True)
                    
                elif self.impute_method=='knn': # use KNNImputer
                    site_df = self.knn_impute_site(site_df, self.imp_cols, self.k, self.weights, self.n)

            # When done cleaning site -> concatenate site_dfs together into global data_df
            if type(data_df) == type(None):
                data_df = site_df
            else:
                data_df = pd.concat([data_df, site_df])

        ## Handle Global Data
        # If we imputed at site-level already, there may be some features 100% missing for site...
        # ... -> thus, we need to impute using global data to fill these
        if data_df.isna().sum().sum() != 0:
            if self.impute_global:
                print("Filling global NA with KNNImpute")
                data_df = self.knn_impute_global(data_df, self.imp_cols, self.k, self.weights, self.n)
            elif type(self.impute_method) != type(None):
                data_df.fillna(-1, inplace=True)
                print("Filling global NA w/ -1")
            elif type(self.impute_method) == type(None):
                print("Not filling global NA values")
        print(f"NA values remaining at end of cleanup: {data_df.isna().sum().sum()}")

        return data_df


    def merge_site_metadata(self, data_df, site_metadata):
        data_df = data_df.merge(site_metadata_df, how='left', left_on='site_id', right_on='site_id')

        return data_df


    def all_sites_all_sources(self):
        data_df = self.site_data_cleanup()

        # Merge with site metadata
        data_df = self.merge_site_metadata(data_df, self.site_metadata_df.drop(['filename', 'koppen_main', 'koppen_name'], axis=1))
        print(f"Data size after after merged with site metadata: {data_df.shape}")

        # Merge with monthly data
        data_df = data_df.merge(monthly_df.drop('date', axis=1), how='left',
                                left_on =['site_id', 'year', 'month'],
                                right_on=['SITE_ID', 'year', 'month'])
        data_df.drop('SITE_ID', axis=1, inplace=True)
        print(f"Data size after after merged with monthly data: {data_df.shape}")

        #reorder columns
        features = data_df.columns.to_list()
        features.remove(target_variable)
        data_df = data_df[([target_variable] + features)]

        return data_df


In [22]:
prep_hourly = PrepareAllSitesHourly(site_metadata_df, monthly_df, included_features, target_variable_qc,
                 target_variable, knn_imp_cols, train_sites, test_sites, resample, impute, impute_method, 
                 impute_global, k, weights, n_fit, raw_data_dir, time_col, duration)
data_df = prep_hourly.all_sites_all_sources()

Filling all NA values with -1


4it [00:02,  1.54it/s]


NA values remaining at end of cleanup: 0
Data size after after merged with site metadata: (435432, 37)
Data size after after merged with monthly data: (435432, 53)


## Dev Resampling

In [23]:
# Choose sample site
site_id = 'US-NR1'
filename = f'data_full_half_hourly_raw_v0_1_{site_id}.csv'

# Prepare hourly site df
local_filename = raw_data_dir + os.sep + filename
site_df = pd.read_csv(local_filename, usecols = [target_variable, target_variable_qc] + included_features)
site_df['site_id'] = site_id

# Move from HH to H level
site_df['datetime'] = pd.to_datetime(site_df['datetime'])
site_df = site_df.loc[site_df['datetime'].dt.minute == 0, ].copy()


In [24]:
# Resampling
def add_time_index(df_init, time_col, duration):
    df_init['gap_flag'] = int(0)
    df = df_init.sort_values(by=[time_col])
    df.set_index(time_col, inplace=True)
    df = df.resample(duration).first() # <-- changed to first to preserve categorical features
    df = df.reset_index()
    df.index.name='timestep_idx'
    df = df.reset_index()
    df.loc[df['gap_flag']!= 0, 'gap_flag'] = int(1) # add flag to new records

    # Fix time records that are NA for new rows
    df['year'] = df['datetime'].dt.year.astype(int)
    df['month'] = df['datetime'].dt.month.astype(int)
    df['day'] = df['datetime'].dt.day.astype(int)
    df['hour'] = df['datetime'].dt.hour.astype(int)
    df['date'] = df['datetime'].dt.date
    
    return df


def impute_ffill(df, time_col, method):
    if method == 'ffill':
        df.sort_values(time_col, ascending=True, inplace=True)
        df.fillna(method="ffill", inplace=True)

    return df

In [29]:
print(f"Initial len: {site_df.shape}")
resampled_df = add_time_index(site_df, 'datetime', 'H')
print(f"New len: {resampled_df.shape}")
display(resampled_df.loc[resampled_df['gap_flag']!= 0, ].head())


df_ffill = impute_ffill(resampled_df.copy(), 'datetime', 'ffill')
print("\n")
display(df_ffill.loc[df_ffill['gap_flag']!= 0, ].head())

Initial len: (135384, 28)
New len: (137352, 29)


Unnamed: 0,timestep_idx,datetime,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,NEE_VUT_REF_QC,GPP_NT_VUT_REF,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,site_id,gap_flag
1896,1896,1999-07-20 00:00:00,,,,,,,,,1999,7,20,0,1999-07-20,,,,,,,,,,,,,,1.0
1897,1897,1999-07-20 01:00:00,,,,,,,,,1999,7,20,1,1999-07-20,,,,,,,,,,,,,,1.0
1898,1898,1999-07-20 02:00:00,,,,,,,,,1999,7,20,2,1999-07-20,,,,,,,,,,,,,,1.0
1899,1899,1999-07-20 03:00:00,,,,,,,,,1999,7,20,3,1999-07-20,,,,,,,,,,,,,,1.0
1900,1900,1999-07-20 04:00:00,,,,,,,,,1999,7,20,4,1999-07-20,,,,,,,,,,,,,,1.0






Unnamed: 0,timestep_idx,datetime,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,NEE_VUT_REF_QC,GPP_NT_VUT_REF,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,site_id,gap_flag
1896,1896,1999-07-20 00:00:00,6.559,0.0,286.49,2.154,0.0,71.058,1.0,0.34003,1999,7,20,0,1999-07-20,,,,,,,,,,,ENF,Cold,US-NR1,1.0
1897,1897,1999-07-20 01:00:00,6.559,0.0,286.49,2.154,0.0,71.058,1.0,0.34003,1999,7,20,1,1999-07-20,,,,,,,,,,,ENF,Cold,US-NR1,1.0
1898,1898,1999-07-20 02:00:00,6.559,0.0,286.49,2.154,0.0,71.058,1.0,0.34003,1999,7,20,2,1999-07-20,,,,,,,,,,,ENF,Cold,US-NR1,1.0
1899,1899,1999-07-20 03:00:00,6.559,0.0,286.49,2.154,0.0,71.058,1.0,0.34003,1999,7,20,3,1999-07-20,,,,,,,,,,,ENF,Cold,US-NR1,1.0
1900,1900,1999-07-20 04:00:00,6.559,0.0,286.49,2.154,0.0,71.058,1.0,0.34003,1999,7,20,4,1999-07-20,,,,,,,,,,,ENF,Cold,US-NR1,1.0


# CheckPoint: Upload Data to Azure Storage Blob as Parquet

In [11]:
# Upload to Azure Storage Blob
# ref: https://stackoverflow.com/a/54666079
data_cleanup_checkpoint = False
tag = "raw"
blob_name = f"{blob_name_base}_{tag}.{ext}"

if data_cleanup_checkpoint:
  parquet_file = BytesIO()
  data_df.to_parquet(parquet_file, engine='pyarrow')
  parquet_file.seek(0)

  print(f"Uploading raw data checkpoint to Azure")
  azStorageClient = AzStorageClient(az_cred_file)
  azStorageClient.uploadBlob(container, blob_name, parquet_file, overwrite=True)

In [12]:
# Save non-transformed data
get_non_transform_train_test = False
if get_non_transform_train_test:
  data_transformer = TFTDataTransformer(train_sites, test_sites,
                                        None, data_df)
  train_df, test_df = data_transformer.get_test_train_raw()
  print("Train data peak:")
  display(train_df.head(5))
  print("Test data peak:")
  display(test_df.head(5))

  train_blob_name= f"{train_blob_name_base}-{tag}.{ext}"
  test_blob_name= f"{test_blob_name_base}-{tag}.{ext}"
  data_transformer.upload_train_test_to_azure(az_cred_file, container,\
                                              train_blob_name, test_blob_name)

# Stage 2: Data Transform - Convert to Model Ready Data
- Encode data
- Split into train and test
- Assemble data through VectorAssembler
- Normalized data through minmax

In [13]:
load_data_from_previous_checkpoint = False
useSpark = False

raw_data_file_path = None
if load_data_from_previous_checkpoint:
  data_df = None
  raw_data_file_path = tmp_dir + os.sep + blob_name
  print(f"loading {raw_data_file_path}...")
  if not (os.path.exists(raw_data_file_path)):
      if not (os.path.exists(tmp_dir)):
          os.mkdir(tmp_dir)
      azStorageClient = AzStorageClient(az_cred_file)
      file_stream = azStorageClient.downloadBlob2Stream(container, blob_name)
      data_df = pd.read_parquet(file_stream, engine='pyarrow')
      data_df.to_parquet(raw_data_file_path)
  
if useSpark:
  data_transformer = PySparkMLDataTransformer(spark, train_sites, test_sites,
                                              raw_data_file_path, data_df)
else:
  data_transformer = TFTDataTransformer(train_sites, test_sites,
                                              raw_data_file_path, data_df)

Data size: (411854, 50).


In [14]:
timestamp_col = ['datetime']
target_col = 'GPP_NT_VUT_REF'

if useSpark: # Spark ML Data Transformer
  categorical_cols = ['IGBP', 'c3c4', 'koppen_sub', 'koppen', 'MODIS_PFT', 'MODIS_LC'] 
  data_transformer.data_transform(categorical_cols, timestamp_col, target_col)

  print("Train data peak:")
  data_transformer.train_df.show(5, False)
  print("Test data peak:")
  data_transformer.test_df.show(5, False)

  train_blob_name= f"{train_blob_name_base}"
  test_blob_name= f"{test_blob_name_base}"

else: # TFT Data Transformer
  categorical_cols = ['IGBP', 'c3c4', 'koppen_sub', 'koppen', 'site_id',
                      'year', 'month', 'day', 'hour', 'minute',
                      'MODIS_PFT', 'MODIS_LC']
  realNum_cols = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA', 
                  'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
                  'lat', 'long', 'c4_percent',
                  'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily',
                  'PET', 'Ts', 'ESACCI-sm', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai',
                  'LST_Day', 'LST_Night']
  backup_cols = ['IGBP', 'koppen','site_id']
  data_transformer.data_transform(categorical_cols, realNum_cols, backup_cols,\
                                  timestamp_col, target_col)

  print("Train data peak:")
  display(data_transformer.train_df.head(5))
  print("Test data peak:")
  display(data_transformer.test_df.head(5))

  train_blob_name= f"{train_blob_name_base}.{ext}"
  test_blob_name= f"{test_blob_name_base}.{ext}"

Data size: (411854, 52).
Data size after encoding: (411854, 52)


Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,datetime,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,lat,long,koppen_sub,koppen_main,koppen_name,c3c4,c4_percent,filename,TIMESTAMP,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT,IGBP_name,site_id_name
0,0.38329,-1.298,0.0,292.592,1.998,0.061,69.384,1999-05-02 00:00:00,0,4,1,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0,0,0,2,40.0329,-105.5464,2,4,Cold,0,0.35,data_full_half_hourly_raw_v0_1_US-NR1.csv,,,,,,,,,3,,,,,,,3,ENF,US-NR1
1,0.41899,-1.548,0.0,292.592,2.01,0.061,69.331,1999-05-02 01:00:00,0,4,1,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0,0,0,2,40.0329,-105.5464,2,4,Cold,0,0.35,data_full_half_hourly_raw_v0_1_US-NR1.csv,,,,,,,,,3,,,,,,,3,ENF,US-NR1
2,0.51696,-1.798,0.0,282.547,2.022,0.0,69.278,1999-05-02 02:00:00,0,4,1,2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0,0,0,2,40.0329,-105.5464,2,4,Cold,0,0.35,data_full_half_hourly_raw_v0_1_US-NR1.csv,,,,,,,,,3,,,,,,,3,ENF,US-NR1
3,0.56481,-1.861,0.0,282.547,2.023,0.621,69.273,1999-05-02 03:00:00,0,4,1,3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0,0,0,2,40.0329,-105.5464,2,4,Cold,0,0.35,data_full_half_hourly_raw_v0_1_US-NR1.csv,,,,,,,,,3,,,,,,,3,ENF,US-NR1
4,0.58099,-1.924,0.0,282.547,2.024,0.621,69.267,1999-05-02 04:00:00,0,4,1,4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0,0,0,2,40.0329,-105.5464,2,4,Cold,0,0.35,data_full_half_hourly_raw_v0_1_US-NR1.csv,,,,,,,,,3,,,,,,,3,ENF,US-NR1


Features(47): ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA', 'year', 'month', 'day', 'hour', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'IGBP', 'koppen', 'minute', 'site_id', 'lat', 'long', 'koppen_sub', 'koppen_main', 'c3c4', 'c4_percent', 'filename', 'TIMESTAMP', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily', 'PET', 'Ts', 'ESACCI-sm', 'MODIS_LC', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai', 'LST_Day', 'LST_Night', 'MODIS_PFT']
Unique sites in df: <bound method Series.unique of 0         2
1         2
2         2
3         2
4         2
         ..
411849    1
411850    1
411851    1
411852    1
411853    1
Name: site_id, Length: 411854, dtype: int64>
Passed train: ['IT-Lav', 'US-NR1', 'US-Vcp']
Passed test: ['IT-Lsn']
Train data size: (0, 52).
Test data size: (0, 52).
Normalizinf features (35): ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'lat', 'long'

ValueError: Found array with 0 sample(s) (shape=(0, 35)) while a minimum of 1 is required by StandardScaler.

# Checkpoint: Upload train and test to Azure Blob Storage

In [None]:
final_checkpoint = True

if final_checkpoint:
  data_transformer.upload_train_test_to_azure(az_cred_file, container, \
                                            train_blob_name, test_blob_name)