# Notebook Setup

In [1]:
if 'google.colab' in str(get_ipython()):
  IN_COLLAB = True
else:
  IN_COLLAB = False

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
#MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"
MY_HOME_ABS_PATH = "/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling"

if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

## Import Modules

In [2]:
# install required modules quietly
required_packages = ['geopandas', 'pyspark', 'azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
os.chdir(MY_HOME_ABS_PATH)
import math
import json
import random

import pandas as pd
import numpy as np
from calendar import monthrange
from datetime import datetime
from io import BytesIO
from sklearn.impute import KNNImputer
from tqdm import tqdm

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load locale custome modules
import sys
if IN_COLLAB:
  os.chdir(MY_HOME_ABS_PATH)
  sys.path.insert(0,os.path.abspath("./code/src/tools"))
else:
  sys.path.append(os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Define Constants

In [3]:
root_dir =  MY_HOME_ABS_PATH
tmp_dir =  root_dir + os.sep + '.tmp'
raw_data_dir = tmp_dir
data_dir = root_dir + os.sep + 'data'
cred_dir = root_dir + os.sep + '.cred'
az_cred_file = cred_dir + os.sep + 'azblobcred.json'

if IN_COLLAB:
  raw_data_dir = "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data/half_hourly_data"

site_metadata_filename = data_dir + os.sep + 'site-metadata.csv'
monthly_data_filename = data_dir + os.sep + "monthly-imputed-v1-i.csv"

# File
container = "baseline-data"
ext = "parquet"
ver = "1-i"
blob_name_base = f"baseline_all_v_{ver}"
train_blob_name = f"baseline-train-v-{ver}.{ext}"
test_blob_name = f"baseline-test-v-{ver}.{ext}"

In [4]:
# Define features and target variables of the data pipelines
target_variable = 'GPP_NT_VUT_REF'
target_variable_qc = 'NEE_VUT_REF_QC'
hourly_features = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
                     'datetime', 'year', 'month', 'day', 'hour', 'date',
                     'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7']
metadata_features = ['site_id', 'filename', 'lat', 'long', 'koppen_sub', 'koppen_main', 'IGBP',
                     'c3c4', 'c4_percent', 'monthly_data_available']

# Define the features to use in KNN imputer, only using real values as cat are same per site
imp_exclude_cols = ['date', 'datetime', 'year', 'month', 'hour', 'day', 'minute', 'site_id', 'IGBP']
imp_cols = [x for x in hourly_features + ['GPP_NT_VUT_REF'] if x not in imp_exclude_cols]

In [5]:
# "Golden" Sites
tier1_sites = ["IT-Lav", "US-NR1", "US-Vcp", "FR-Pue", "CH-Lae", "US-Var", "US-Ne2", "ES-LJu", "US-Ton"]
tier2_sites = ["US-UMB", "US-Me2", "FI-Hyy", "US-NR1", "IT-Lav", "US-Wkg", "US-ARM", "US-SRM"]

train_sites = tier1_sites + tier2_sites

# Selected Test Sites
#test_sites = ['IT-Lsn']
test_sites = ["US-GLE", # ENF, Cold
              "US-AR1", # GRA, Temperate
              "US-Seg", # GRA, Arid
              "US-FR2", # WSA, Temperate
              "ES-LM2", # WSA, Arid
              "CA-Cbo", # DBF, Cold
              "FR-Lam", # CRO, Temperate
              "IT-Cpz", # EBF, Temperate
              "CN-Cha", # MF Cold
              "IT-Lsn", # OSH, Temperate
              ]

# Stage 1: Trim and Merge Site Metadata

In [6]:
class PrepareAllSitesHourly:
    def __init__(self, site_metadata_filename, monthly_data_filename, train_sites, test_sites, 
                hourly_features, metadata_features, target_variable_qc, target_variable, data_dir):
        self.site_metadata_filename = site_metadata_filename
        self.monthly_data_filename = monthly_data_filename
        self.train_sites = train_sites
        self.test_sites = test_sites
        self.hourly_features = hourly_features
        self.metadata_features = metadata_features
        self.target_variable_qc = target_variable_qc
        self.target_variable = target_variable
        self.data_dir = data_dir

    def add_time_index(self, df, time_col, duration, site_id):
        df['gap_flag_hour'] = int(0)
        df.sort_values(time_col, inplace=True)
        df.set_index(time_col, inplace=True)
        df = df.resample(duration).first()
        df = df.reset_index()
        df['gap_flag_hour'].fillna(int(1), inplace=True)

        # Fix time records that are NA for new rows
        df['year'] = df['datetime'].dt.year.astype(int)
        df['month'] = df['datetime'].dt.month.astype(int)
        df['day'] = df['datetime'].dt.day.astype(int)
        df['hour'] = df['datetime'].dt.hour.astype(int)
        df['date'] = df['datetime'].dt.date
        df['site_id'] = site_id

        return df


    def knn_impute(self, df, imp_cols, k, weights, n_fit=20000):
        # Init Imputer
        imputer = KNNImputer(n_neighbors=k, weights=weights)

        # Get subset of rows to speed up impute time (instead of fitting on every single record)
        df_subcols = df[imp_cols].copy()
        na_mask = df_subcols.isna().any(axis=1)
        na_rows = df_subcols[na_mask]

        # If there are at least 10k rows that don't have NA, use them to fit imputer (saves time)
        if (len(df) - len(na_rows)) > 10000:
            not_na_rows = df_subcols.dropna()
            not_na_rows = not_na_rows.sample(n=np.min([n_fit, len(not_na_rows)]))
            imputer.fit(not_na_rows)
            imputed_group = imputer.transform(na_rows)
        else:
            imputed_group = imputer.fit_transform(na_rows)
        imputed_group = pd.DataFrame(imputed_group, columns=df_subcols.columns, index=na_rows.index)

        # Reinsert NA rows
        df_subcols.loc[na_mask] = imputed_group

        # Fill NA in initial site/group df
        df.fillna(df_subcols, inplace=True)

        return df

    
    def check_imputation(self, df_init,  df_imputed):
        # Drop NA rows from both (using indices) confirm they are same df now
        drop_na = df_init.dropna(how='any')
        drop_imp = df_imputed.loc[drop_na.index, ]
        drop_na.reset_index(inplace=True, drop=True)
        drop_imp.reset_index(inplace=True, drop=True)
        if not drop_na.equals(drop_imp):
            print("IMPUTATION ERROR: Non-NA values were affected in imputation")

        # Check that 50 rows that initial had NA are the same in non-NA cols
        na_inds = df_init.loc[df_init.isna().any(axis=1), ].index
        errors = 0
        for ind in na_inds[:50]:
            check_ind = pd.concat([df_init.iloc[ind], df_imputed.iloc[ind]], axis=1).dropna()
            check_ind.columns = ['initial', 'post_imp']
            if not check_ind['initial'].equals(check_ind['post_imp']):
                errors += 1
                print(ind)

        if errors != 0:
            print("IMPUTATION ERROR: Non-NA values in rows with NA were affected by imputation")

        # DF length is the same 
        if len(df_init) != len(df_imputed):
            print("IMPUTATION ERROR: Post imputation df has different row count than initial df")


    def filter_date_range(self, df, start_date, end_date, time_col, missing_thresh=0.2):
        df.set_index(time_col, inplace=True)
        filtered_df = df.loc[start_date:end_date].copy()

        # Remove sites without at least one year of records
        if len(filtered_df) < 365*24:
            return None
        else:
            # Remove sites that have > 20% gaps in sequence
            first_date = filtered_df.index.min()
            last_date = filtered_df.index.max()
            total_expected_count = len(pd.date_range(start=first_date, end=last_date, freq='H'))
            missing_percentage = (total_expected_count - len(filtered_df)) / total_expected_count

            if missing_percentage > missing_thresh:
                return None
            else:
                filtered_df.reset_index(inplace=True)
                return filtered_df
            

    def prep_metadata(self):
        site_metadata_df = pd.read_csv(self.site_metadata_filename, usecols = self.metadata_features)
        site_metadata_df = site_metadata_df.loc[site_metadata_df['site_id'].isin(self.train_sites + self.test_sites), ]
        site_metadata_df = site_metadata_df.loc[site_metadata_df['monthly_data_available']=='Yes', ] # <---- not including sites that have zero monthly data (ask team)
        site_metadata_df.reset_index(inplace=True, drop=True)
        return site_metadata_df


    def merge_site_metadata(self, data_df, site_metadata_df):
        site_metadata_df = site_metadata_df.drop(['filename', 'monthly_data_available'], axis=1)
        data_df = data_df.merge(site_metadata_df, how='left', left_on='site_id', right_on='site_id')
        print(f"Data size after after merged with site metadata: {data_df.shape}")

        if data_df.isna().sum().sum() != 0:
            print(f"Missing values after metadata merge {data_df.isna().sum().sum()} ")

        return data_df


    def merge_monthly_data(self, data_df):
        # Prep monthly
        monthly_df = pd.read_csv(self.monthly_data_filename)
        monthly_df = monthly_df.loc[monthly_df['SITE_ID'].isin(self.train_sites + self.test_sites)]
        monthly_df.reset_index(inplace=True, drop=True)
        monthly_df[['year','month', 'MODIS_LC']] = monthly_df[['year','month', 'MODIS_LC']].astype('int')

        # Merge
        data_df = data_df.merge(monthly_df, how='left',
                        left_on =['site_id', 'year', 'month'],
                        right_on=['SITE_ID', 'year', 'month'])
        data_df.drop('SITE_ID', axis=1, inplace=True)
        print(f"Data size after after merged with site metadata: {data_df.shape}")

        if data_df.isna().sum().sum() != 0:
            print(f"{data_df.isna().sum().sum()} missing values introduced after monthly merge")

        return data_df
    

    def site_data_cleanup(self, site_metadata_df, imp_cols, resample, impute, impute_method,
                         impute_global, k, weights, n_fit, time_col, duration, start_date, end_date, missing_thresh=0.2, c=None):
        data_df = None
        num_records = 0
        available_site_count = 0
        retained_site_count = 0
        qc_flags_features = [s for s in self.hourly_features if "_QC" in s]

        ## SITE-LEVEL CLEANING -> CONCATENATE
        for i, r in tqdm(site_metadata_df[['site_id','filename']].iterrows()):        
            if not r.filename or type(r.filename) != type(""):
                print(f'ERROR: {r.site_id} is missing hourly data.')
                continue
            else:
                available_site_count += 1

            # Prepare hourly site df
            local_filename = self.data_dir + os.sep + r.filename
            site_df = pd.read_csv(local_filename, usecols = [self.target_variable, self.target_variable_qc] + self.hourly_features)

            # Format columns
            site_df['datetime'] = pd.to_datetime(site_df['datetime'])
            site_df['date'] = pd.to_datetime(site_df['date'])
            site_df['minute'] = site_df['datetime'].dt.minute
            if len(qc_flags_features) != 0:
                site_df[qc_flags_features] = site_df[qc_flags_features].astype('int')
            site_df['site_id'] = r.site_id

            # Move from HH to H level
            site_df = site_df.loc[site_df['datetime'].dt.minute == 0, ].copy()
            site_df.drop('minute', axis=1, inplace=True)
            
            # Filter site date-range and drop sites without > 1 year and <20% gaps after trim
            site_df = self.filter_date_range(site_df, start_date, end_date, time_col, missing_thresh)
            if site_df is None:
                continue
            else:
                retained_site_count += 1
                num_records += len(site_df)

            # For records with bad target QC, make NAN and impute
            site_df.loc[site_df[self.target_variable_qc] == 3, self.target_variable] = np.nan
            site_df.drop([self.target_variable_qc], axis=1, inplace=True)

            # Resample to add rows for missing timesteps, assign timestep_idx and "gap_flag"
            if resample:
                site_df = self.add_time_index(site_df, time_col, duration, site_id=r.site_id)
            else:
                site_df.sort_values(time_col, inplace=True)
                site_df = site_df.reset_index()

            # Save site_df pre-imputation to check post-imputation (once per run, random site each time)
            random_check = random.randint(0, len(self.train_sites) + len(self.test_sites))
            if i == random_check:   
                site_df_pre_imp = site_df.copy()

            # Impute missing values at site-level, otherwise fillna w/ -1 at very end
            if (impute) & (site_df.isna().sum().sum() != 0):
                if impute_method=='ffill': # select most recent record
                    site_df.sort_values(time_col, ascending=True, inplace=True)
                    site_df.fillna(method="ffill", inplace=True)
                    
                elif impute_method=='knn': # use KNNImputer
                    site_df = self.knn_impute(site_df, imp_cols, k, weights, n_fit)

                elif impute_method=='constant':
                    site_df[imp_cols] = site_df[imp_cols].fillna(c)

            if i == random_check:
                self.check_imputation(site_df_pre_imp, site_df)

            # Create local timestep_idx
            site_df.sort_values(time_col, ascending=True, inplace=True)
            site_df['timestep_idx_local'] = range(len(site_df))

            # Concatenate site_dfs together into global data_df
            if type(data_df) == type(None):
                data_df = site_df
            else:
                data_df = pd.concat([data_df, site_df])


        ## Global Data-DF Cleanup
        # Create global timestamp inds
        dates = sorted(data_df['datetime'].unique())
        date_to_idx = {date: idx for idx, date in enumerate(dates)}
        data_df['timestep_idx_global'] = data_df['datetime'].map(date_to_idx)

        # Order cols + sort
        data_df.sort_values(['site_id', time_col], ascending=True, inplace=True)

        # Print stats
        print(f"Initial records: {num_records}, Final records after resampling + gap-filling: {len(data_df)}")
        print(f"Total retained sites: {retained_site_count}/{available_site_count} = {retained_site_count/available_site_count:.2f}")

        # Handle remaining missing data (if 100% of feature missing for one site)
        print(f"Missing values after site-level imputation: {data_df.isna().sum().sum()}")
        if (impute_global) & (data_df.isna().sum().sum() != 0):
            if impute_method=='ffill': # select most recent record
                data_df.sort_values(time_col, ascending=True, inplace=True)
                data_df.fillna(method="ffill", inplace=True)
                data_df.fillna(method="bfill", inplace=True) # in rare case of missing first record
                
            elif impute_method=='knn': # use KNNImputer
                data_df = self.knn_impute(data_df, imp_cols, k, weights, n_fit)

            elif impute_method=='constant':
                data_df[imp_cols] = data_df[imp_cols].fillna(c)
        else:
            print("Not imputing missing values at global level")
        print(f"Missing values after global-level imputation: {data_df.isna().sum().sum()}")

        return data_df
    

    def all_sites_all_sources(self, imp_cols, resample, impute, impute_method, impute_global, k,
                            weights, n_fit, time_col, duration, start_date, end_date, missing_thresh, c):
        site_metadata_df = self.prep_metadata()
        data_df = self.site_data_cleanup(site_metadata_df, imp_cols, resample, impute, impute_method, 
                                        impute_global, k, weights, n_fit, time_col, duration, start_date, end_date, missing_thresh, c)

        # Merge with site metadata and monthly data
        data_df = self.merge_site_metadata(data_df, site_metadata_df)
        data_df = self.merge_monthly_data(data_df)

        #reorder columns
        features = data_df.columns.to_list()
        remove_cols = [target_variable, 'site_id', 'timestep_idx_local', 'timestep_idx_global', 'datetime', 'date', 'year', 'month', 'day', 'hour', 'gap_flag_hour', 'gap_flag_month']
        features = list(filter(lambda x: x not in remove_cols, features))
        data_df = data_df[([target_variable, 'site_id', 'timestep_idx_local', 'timestep_idx_global', 'datetime', 'date', 'year', 'month', 'day', 'hour'] + features + ['gap_flag_hour', 'gap_flag_month'])]

        return data_df


In [7]:
# Define imput params
impute = True
impute_method = 'knn'
impute_global = True
resample = True
time_col = 'datetime'
duration = 'H'

# Filter sequence to date range
missing_thresh = 0.2
start_date = '2010-01-01'
end_date = '2015-12-31'


# Impute params (if used)
k=5
weights='uniform'
n_fit=20000
c=-1

In [8]:
prep_hourly = PrepareAllSitesHourly(site_metadata_filename, monthly_data_filename, train_sites, test_sites, 
                                    hourly_features, metadata_features, target_variable_qc, target_variable, raw_data_dir)

data_df = prep_hourly.all_sites_all_sources(imp_cols, resample, impute, impute_method, impute_global,
                                            k, weights, n_fit, time_col, duration, start_date, end_date, missing_thresh, c)

In [9]:
# Get the memory usage of the dataframe in bytes
memory_usage = data_df.memory_usage().sum()
memory_usage_gb = memory_usage / 1_000_000_000
print(f"The dataframe uses {memory_usage_gb:.2f} GB of memory.")

# Stage 2: Data Transform - Convert to Model Ready Data
- Encode data
- Split into train and test
- Assemble data through VectorAssembler
- Normalized data through minmax

In [13]:
raw_data_file_path = None
data_transformer = TFTDataTransformer(train_sites, test_sites, raw_data_file_path, data_df)

Data size: (865920, 50).


In [14]:
non_transform_cols = [target_variable, 'site_id', 'datetime', 'timestep_idx_local', 'timestep_idx_global', 'gap_flag_hour', 'gap_flag_month']
categorical_cols = ['IGBP', 'c3c4', 'koppen_sub', 'koppen_main',
                    'year', 'month', 'day', 'hour', 'MODIS_PFT', 'MODIS_LC']
realNum_cols = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA', 
                'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
                'lat', 'long', 'c4_percent',
                'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily',
                'PET', 'Ts', 'ESACCI-sm', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai',
                'LST_Day', 'LST_Night']
backup_cols = ['IGBP', 'site_id'] # <--- ask mary, not sure what this does in lib
data_transformer.data_transform(categorical_cols, realNum_cols, non_transform_cols)

print("\nTrain data peak:")
display(data_transformer.train_df.head(5))
print("\nTest data peak:")
display(data_transformer.test_df.head(5))

Data size: (865920, 50).
Data size after encoding: (865920, 50)
Number of sites in df: 19
Train Sites: ['IT-Lav', 'US-NR1', 'US-Vcp', 'FR-Pue', 'CH-Lae', 'US-Var', 'US-Ne2', 'ES-LJu', 'US-Ton', 'US-UMB', 'US-Me2', 'FI-Hyy', 'US-NR1', 'IT-Lav', 'US-Wkg', 'US-ARM', 'US-SRM']
Test Sites: ['US-GLE', 'US-AR1', 'US-Seg', 'US-FR2', 'ES-LM2', 'CA-Cbo', 'FR-Lam', 'IT-Cpz', 'CN-Cha', 'IT-Lsn']
Normalizing real features (32)
Train data size: (631032, 50).
Test data size: (234888, 50).

Train data peak:


Unnamed: 0,GPP_NT_VUT_REF,site_id,timestep_idx_local,timestep_idx_global,datetime,year,month,day,hour,timestep_idx,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,lat,long,koppen_sub,koppen_main,c3c4,c4_percent,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT,gap_flag_hour,gap_flag_month
0,8.0007,CH-Lae,0,0,2010-01-01 00:00:00,0,0,0,0,0,-0.93218,-0.7129,-1.0638,-0.86293,-0.20824,0.05816,0.38289,0.83299,0.36225,-0.786,-0.4303,-0.5928,-0.84128,-0.51941,-0.62359,-0.75529,5,0.77039,1.18369,5,2,0,-0.58974,-1.62789,-1.41854,-1.63492,-0.86451,1.08868,-1.48659,0.93231,5,0.15866,2.32445,-1.47498,-0.96933,-1.47888,-1.10114,5,0.0,0.0
1,8.04417,CH-Lae,1,1,2010-01-01 01:00:00,0,0,0,1,1,-0.96832,-0.7129,-1.0757,-0.86145,-0.20824,0.05344,0.38289,0.83299,0.36225,-0.786,-0.4303,-0.5928,-0.84128,-0.51941,-0.62359,-0.75529,5,0.77039,1.18369,5,2,0,-0.58974,-1.62789,-1.41854,-1.63492,-0.86451,1.08868,-1.48659,0.93231,5,0.15866,2.32445,-1.47498,-0.96933,-1.47888,-1.10114,5,0.0,0.0
2,7.96453,CH-Lae,2,2,2010-01-01 02:00:00,0,0,0,2,2,-0.97005,-0.7129,-1.17348,-0.85556,-0.20824,0.04883,0.38289,0.83299,0.36225,-0.786,-0.4303,-0.5928,-0.84128,-0.51941,-0.62359,-0.75529,5,0.77039,1.18369,5,2,0,-0.58974,-1.62789,-1.41854,-1.63492,-0.86451,1.08868,-1.48659,0.93231,5,0.15866,2.32445,-1.47498,-0.96933,-1.47888,-1.10114,5,0.0,0.0
3,12.2833,CH-Lae,3,3,2010-01-01 03:00:00,0,0,0,3,3,-1.01997,-0.7129,-1.09674,-0.84746,-0.20824,0.04701,0.38289,0.83299,0.36225,-0.786,-0.4303,-0.5928,-0.84128,-0.51941,-0.62359,-0.75529,5,0.77039,1.18369,5,2,0,-0.58974,-1.62789,-1.41854,-1.63492,-0.86451,1.08868,-1.48659,0.93231,5,0.15866,2.32445,-1.47498,-0.96933,-1.47888,-1.10114,5,0.0,0.0
4,13.7571,CH-Lae,4,4,2010-01-01 04:00:00,0,0,0,4,4,-1.05284,-0.7129,-1.02731,-0.85286,-0.20824,0.04121,0.38289,0.83299,0.36225,-0.786,-0.4303,-0.5928,-0.84128,-0.51941,-0.62359,-0.75529,5,0.77039,1.18369,5,2,0,-0.58974,-1.62789,-1.41854,-1.63492,-0.86451,1.08868,-1.48659,0.93231,5,0.15866,2.32445,-1.47498,-0.96933,-1.47888,-1.10114,5,0.0,0.0



Test data peak:


Unnamed: 0,GPP_NT_VUT_REF,site_id,timestep_idx_local,timestep_idx_global,datetime,year,month,day,hour,timestep_idx,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,lat,long,koppen_sub,koppen_main,c3c4,c4_percent,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT,gap_flag_hour,gap_flag_month
0,1.30673,CA-Cbo,0,0,2010-01-01 00:00:00,0,0,0,0,0,-1.17585,-0.7129,0.22906,-0.8768,-0.08017,0.87252,1.35485,-0.72675,-0.22183,1.01709,1.03748,2.44793,1.58673,-0.77128,-1.35134,-1.05483,1,0.3683,-0.34851,5,2,0,-0.52345,-1.14726,-0.63639,-1.16041,-1.0106,0.92265,-1.63887,-0.85179,5,0.49786,3.3538,-0.53467,-0.57577,-1.89649,-2.01633,5,0.0,1.0
1,1.38098,CA-Cbo,1,1,2010-01-01 01:00:00,0,0,0,1,1,-1.18095,-0.7129,0.11042,-0.87655,-0.0654,0.87113,0.82779,-0.72757,-0.47345,1.24961,0.82743,2.63208,1.93234,-0.50577,-1.24273,-0.9793,1,0.3683,-0.34851,5,2,0,-0.52345,-1.14726,-0.63639,-1.16041,-1.0106,0.92265,-1.63887,-0.85179,5,0.49786,3.3538,-0.53467,-0.57577,-1.89649,-2.01633,5,0.0,1.0
2,0.27869,CA-Cbo,2,2,2010-01-01 02:00:00,0,0,0,2,2,-1.19922,-0.7129,0.11042,-0.87385,-0.0654,0.87199,0.80654,-0.70947,-0.45892,1.20096,0.80097,2.54465,1.84862,-0.5316,-1.20639,-0.96197,1,0.3683,-0.34851,5,2,0,-0.52345,-1.14726,-0.63639,-1.16041,-1.0106,0.92265,-1.63887,-0.85179,5,0.49786,3.3538,-0.53467,-0.57577,-1.89649,-2.01633,5,0.0,1.0
3,-1.72097,CA-Cbo,3,3,2010-01-01 03:00:00,0,0,0,3,3,-1.21749,-0.7129,0.11042,-0.87115,-0.0654,0.87285,0.67607,-1.18963,-0.61559,1.81932,1.30657,3.25802,2.58428,-0.16395,-1.1384,-0.97076,1,0.3683,-0.34851,5,2,0,-0.52345,-1.14726,-0.63639,-1.16041,-1.0106,0.92265,-1.63887,-0.85179,5,0.49786,3.3538,-0.53467,-0.57577,-1.89649,-2.01633,5,0.0,1.0
4,0.67984,CA-Cbo,4,4,2010-01-01 04:00:00,0,0,0,4,4,-1.23577,-0.7129,0.00026,-0.86845,0.12178,0.8737,0.57396,-0.72748,-0.4262,0.79934,0.48541,1.83862,1.22247,-0.74313,-1.0439,-0.8316,1,0.3683,-0.34851,5,2,0,-0.52345,-1.14726,-0.63639,-1.16041,-1.0106,0.92265,-1.63887,-0.85179,5,0.49786,3.3538,-0.53467,-0.57577,-1.89649,-2.01633,5,0.0,1.0


# FINISH: Upload train and test to Azure Blob Storage

In [15]:
final_checkpoint = True

if final_checkpoint:
  data_transformer.upload_train_test_to_azure(az_cred_file, container, train_blob_name, test_blob_name)

Uploading train dataset to baseline-train-v-1-i.parquet...
File uploaded to baseline-data/baseline-train-v-1-i.parquet
Uploading test dataset to baseline-test-v-1-i.parquet...
File uploaded to baseline-data/baseline-test-v-1-i.parquet
