# Notebook Setup

In [1]:
if 'google.colab' in str(get_ipython()):
  IN_COLLAB = True
else:
  IN_COLLAB = False

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"

In [2]:
if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

Mounted at /content/drive/


## Import Modules

In [3]:
# install required modules quietly
required_packages = ['geopandas', 'pyspark', 'azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m87.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.6/96.6 KB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
flask 1.1.4 requires click<8.0,>=5.1, but you have click 8.1.3 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [4]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import math
import json

import pandas as pd
from calendar import monthrange
from datetime import datetime
from io import BytesIO

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load locale custome modules
import sys
if IN_COLLAB:
  os.chdir(MY_HOME_ABS_PATH)
  sys.path.insert(0,os.path.abspath("./code/src/tools"))
else:
  sys.path.append(os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [5]:
# Import SparkSession
from pyspark.sql import SparkSession
# Create a Spark Session
spark = SparkSession.builder.master("local[*]").config(
    "spark.jars.packages", 
    "org.apache.hadoop:hadoop-azure:3.3.1,com.microsoft.azure:azure-storage:8.6.6"
    ).getOrCreate()
# Check Spark Session Information
spark

# Define Constants

In [6]:
root_dir =  MY_HOME_ABS_PATH
tmp_dir =  root_dir + os.sep + '.tmp'
raw_data_dir = tmp_dir
data_dir = root_dir + os.sep + 'data'
cred_dir = root_dir + os.sep + '.cred'
az_cred_file = cred_dir + os.sep + 'azblobcred.json'

if IN_COLLAB:
  raw_data_dir = "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data/half_hourly_data"

site_metadata_filename = data_dir + os.sep + 'site-metadata.csv'
monthly_data_filename = data_dir + os.sep + 'monthly-interpolated.csv'

In [7]:
# File
container = "baseline-data"
ext = "parquet"
ver = "1"
blob_name_base = f"baseline_all_v_{ver}"
train_blob_name_base = f"baseline-train-v-{ver}"
test_blob_name_base = f"baseline-test-v-{ver}"

In [8]:
# "Golden" Sites
tier1_sites = ["US-MMS", "US-Vcp", "FR-Pue", "CH-Lae", "US-Var", "US-Ne2", "ES-LJu", "US-Ton"]
tier2_sites = ["US-UMB", "US-Me2", "FI-Hyy", "US-NR1", "IT-Lav", "US-Wkg", "US-ARM", "US-SRM"]

train_sites = tier1_sites + tier2_sites

# Selected Test Sites
test_sites = ["US-GLE", # ENF, Cold
              "US-AR1", # GRA, Temperate
              "US-Seg", # GRA, Arid
              "US-FR2", # WSA, Temperate
              "ES-LM2", # WSA, Arid
              "CA-Cbo", # DBF, Cold
              "FR-Lam", # CRO, Temperate
              "IT-Cpz", # EBF, Temperate
              "CN-Cha", # MF Cold
              "IT-Lsn", # OSH, Temperate
              ]

In [9]:
# Define features and target variables of the data pipelines
included_features = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
                     'datetime', 'year', 'month', 'day', 'hour', 'date',
                     'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 
                     'IGBP', 'koppen']
target_variable_qc = 'NEE_VUT_REF_QC'
target_variable = 'GPP_NT_VUT_REF'

# Get Gold Sample Site Data

In [10]:
# Load site metadata
included_site_features = ['site_id', 'filename', 'elevation', 'lat', 'long',
                          'koppen_sub', 'koppen_main', 'koppen_name',
                          'c3c4', 'c4_percent']
site_metadata_df = pd.read_csv(site_metadata_filename, usecols = included_site_features)

# only focus on target sites
site_metadata_df = site_metadata_df.loc[site_metadata_df['site_id'].isin(train_sites + test_sites)]
print(f"size:{site_metadata_df.shape}")
site_metadata_df.reset_index(inplace=True, drop=True)
site_metadata_df

size:(26, 10)


Unnamed: 0,site_id,elevation,lat,long,koppen_sub,koppen_main,koppen_name,c3c4,c4_percent,filename
0,CN-Cha,,42.4025,128.0958,22,4,Dwb,C3,12.17,data_full_half_hourly_raw_v0_1_CN-Cha.csv
1,FR-Pue,270.0,43.7413,3.5957,8,3,Csa,C3,6.59,data_full_half_hourly_raw_v0_1_FR-Pue.csv
2,IT-Cpz,68.0,41.70525,12.37611,8,3,Csa,C3,0.0,data_full_half_hourly_raw_v0_1_IT-Cpz.csv
3,US-GLE,3197.0,41.36653,-106.2399,27,4,Dfc,C3,0.16,data_full_half_hourly_raw_v0_1_US-GLE.csv
4,US-NR1,3050.0,40.0329,-105.5464,27,4,Dfc,C3,0.35,data_full_half_hourly_raw_v0_1_US-NR1.csv
5,US-Ne2,362.0,41.16487,-96.4701,25,4,Dfa,rotation,48.91,
6,US-SRM,1120.0,31.8214,-110.8661,6,2,BSh,C3,55.39,data_full_half_hourly_raw_v0_1_US-SRM.csv
7,US-Ton,177.0,38.4316,-120.96598,8,3,Csa,C3,0.0,data_full_half_hourly_raw_v0_1_US-Ton.csv
8,US-Var,129.0,38.4133,-120.9507,8,3,Csa,C3,0.0,data_full_half_hourly_raw_v0_1_US-Var.csv
9,US-Wkg,1531.0,31.7365,-109.9419,7,2,BSk,C4,40.93,data_full_half_hourly_raw_v0_1_US-Wkg.csv


# Get Monthly Data

In [11]:
# Load monthly metadata
monthly_df = pd.read_csv(monthly_data_filename)

# only focus on target sites
monthly_df = monthly_df.loc[monthly_df['SITE_ID'].isin(train_sites + test_sites)]
print(f"size:{monthly_df.shape}")
monthly_df.reset_index(inplace=True, drop=True)
monthly_df[['year','month', 'TIMESTAMP']] = monthly_df[['year','month', 'TIMESTAMP']].astype('int')
monthly_df[['MODIS_LC']] = monthly_df[['MODIS_LC']].astype('int')
monthly_df

size:(3246, 20)


Unnamed: 0,date,SITE_ID,year,month,TIMESTAMP,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT
0,2003-02-28,CN-Cha,2003,2,200302,48.00000,20.00000,112.00000,0.00914,-0.00601,263.63275,0.22836,4,-0.08581,66.89286,0.12000,0.10000,266.30000,258.42000,DBF
1,2003-03-31,CN-Cha,2003,3,200303,69.00000,33.00000,161.00000,0.02139,-0.01054,268.19214,0.22836,4,-0.08581,57.19355,0.20000,0.40000,273.90000,262.30000,DBF
2,2003-04-30,CN-Cha,2003,4,200304,82.00000,41.00000,187.00000,0.03646,-0.01813,277.78570,0.23438,4,-0.17377,0.00000,0.25000,0.50000,290.54000,273.06000,DBF
3,2003-05-31,CN-Cha,2003,5,200305,92.00000,46.00000,213.00000,0.30082,-0.02293,285.67480,0.24953,4,0.12629,0.00000,0.61000,2.40000,296.58000,278.78000,DBF
4,2003-06-30,CN-Cha,2003,6,200306,95.00000,55.00000,212.00000,0.53407,-0.01825,290.13977,0.28139,4,0.30743,0.00877,0.75000,3.50000,296.22000,286.08000,DBF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3241,2020-08-01,IT-Lsn,2020,8,202008,419.00000,168.00000,198.00000,0.46822,-0.00736,297.98160,0.31786,12,0.28867,0.00000,0.73000,2.40000,304.40000,290.70000,CRO
3242,2020-09-01,IT-Lsn,2020,9,202009,336.00000,137.00000,161.00000,0.20971,-0.00635,293.63483,0.32494,12,0.10983,0.00000,0.47000,1.00000,301.16000,286.78000,CRO
3243,2020-10-01,IT-Lsn,2020,10,202010,186.00000,88.00000,89.00000,0.07442,-0.00285,287.04584,0.34953,12,-0.00192,0.00000,0.38000,0.50000,291.60000,280.70000,CRO
3244,2020-11-01,IT-Lsn,2020,11,202011,130.00000,55.00000,63.00000,0.05041,-0.00234,282.16028,0.35629,12,-0.05228,0.00000,0.32000,0.40000,287.92000,275.82000,CRO


In [12]:
monthly_df.columns

Index(['date', 'SITE_ID', 'year', 'month', 'TIMESTAMP', 'BESS-PAR',
       'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily', 'PET', 'Ts', 'ESACCI-sm',
       'MODIS_LC', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai', 'LST_Day',
       'LST_Night', 'MODIS_PFT'],
      dtype='object')

In [13]:
monthly_df.isna().sum()

date              0
SITE_ID           0
year              0
month             0
TIMESTAMP         0
BESS-PAR          0
BESS-PARdiff      0
BESS-RSDN         0
CSIF-SIFdaily     0
PET               0
Ts                0
ESACCI-sm        95
MODIS_LC          0
NDWI              0
Percent_Snow      0
Fpar              0
Lai               0
LST_Day           0
LST_Night         0
MODIS_PFT         0
dtype: int64

# Stage 1: Trim and Merge Site Metadata

All available features from Half=hourly data:
```
'TIMESTAMP_START', 'TIMESTAMP_END', 'TA_F', 'TA_F_QC', 'TA_ERA',
'SW_IN_POT', 'SW_IN_F', 'SW_IN_F_QC', 'SW_IN_ERA', 'LW_IN_F',
'LW_IN_F_QC', 'LW_IN_ERA', 'VPD_F', 'VPD_F_QC', 'VPD_ERA', 'P_F',
'P_F_QC', 'P_ERA', 'PA_F', 'PA_F_QC', 'PA_ERA', 'NETRAD', 'PPFD_IN',
'G_F_MDS', 'G_F_MDS_QC', 'LE_F_MDS', 'LE_F_MDS_QC', 'LE_CORR',
'H_F_MDS', 'H_F_MDS_QC', 'H_CORR', 'NEE_VUT_REF', 'NEE_VUT_REF_QC',
'NEE_CUT_REF', 'NEE_CUT_REF_QC', 'GPP_NT_VUT_REF', 'GPP_DT_VUT_REF',
'GPP_NT_CUT_REF', 'GPP_DT_CUT_REF', 'RECO_NT_VUT_REF',
'RECO_DT_VUT_REF', 'RECO_NT_CUT_REF', 'RECO_DT_CUT_REF', 'datetime',
'year', 'month', 'day', 'hour', 'SITE_ID', 'date', 'NEE_VUT_REF_qa',
'SW_DIF', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6',
'b7', 'IGBP', 'koppen'
``` 

In [14]:
# Initial data clean and feature selections from raw data
data_df = data_cleanup(raw_data_dir, site_metadata_df[['site_id','filename']],
                  target_variable, target_variable_qc,
                  included_features)
print(f"Data size after cleanup: {data_df.shape}")

# Merge with site metadata
data_df = merge_site_metadata(data_df, site_metadata_df.drop(['filename', 'koppen_main', 'koppen_name'], axis=1))
print(f"Data size after after merged with site metadata: {data_df.shape}")

# Merge with monthly data
data_df = data_df.merge(monthly_df.drop(['date', 'TIMESTAMP'], axis=1), how='left',\
                        left_on =['site_id', 'year', 'month'],
                        right_on=['SITE_ID', 'year', 'month'])
data_df.drop('SITE_ID', axis=1, inplace=True)
print(f"Data size after after merged with monthly data: {data_df.shape}")

# Drop rows with NA
check_and_drop_na(data_df)
print(f"Data size after after final drop: {data_df.shape}")

#reorder columns
features = data_df.columns.to_list()
features.remove(target_variable)
data_df = data_df[([target_variable] + features)]

display(data_df.head())

CN-Cha: (16228, 27)
FR-Pue: (117200, 27)
IT-Cpz: (59175, 27)
US-GLE: (54687, 27)
US-NR1: (98652, 27)

ERROR: US-Ne2 is mssing hourly data.
US-SRM: (95419, 27)
US-Ton: (113031, 27)
US-Var: (119950, 27)
US-Wkg: (93319, 27)
CA-Cbo: (79273, 27)
US-AR1: (28956, 27)
US-ARM: (125756, 27)
US-FR2: (30426, 27)

ERROR: US-MMS is mssing hourly data.
US-Me2: (99780, 27)
US-Seg: (91884, 27)
US-UMB: (70639, 27)
US-Vcp: (78491, 27)
CH-Lae: (112718, 27)
ES-LJu: (112724, 27)
ES-LM2: (58806, 27)
FI-Hyy: (127362, 27)
FR-Lam: (115812, 27)
IT-Lav: (120885, 27)
IT-Lsn: (40182, 27)
Data size after cleanup: (2061355, 27)
Data size after after merged with site metadata: (2061355, 33)
Data size after after merged with monthly data: (2111289, 48)
Data has NA.


Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,GPP_NT_VUT_REF,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,c3c4,c4_percent,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20413,0,0,0,0,0,192726,192726,192726,192726,192726,192726,258509,192726,192726,192726,192726,192726,192726,192726,192726


Data size after after final drop: (1832903, 48)


Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,c3c4,c4_percent,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT
20413,-0.53574,5.311,25.016,272.218,1.708,0.0,97.939,2001-01-01 08:30:00,2001,1,1,8,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue,270.0,43.7413,3.5957,8,C3,6.59,22.0,12.0,46.0,0.06816,-0.00377,279.00327,0.35371,8.0,0.04801,0.0,0.52,1.0,281.08,275.98,SA
20414,0.86438,5.744,59.734,272.218,1.738,0.0,97.939,2001-01-01 09:00:00,2001,1,1,9,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,0,FR-Pue,270.0,43.7413,3.5957,8,C3,6.59,22.0,12.0,46.0,0.06816,-0.00377,279.00327,0.35371,8.0,0.04801,0.0,0.52,1.0,281.08,275.98,SA
20415,-0.02627,6.176,91.235,272.218,1.767,0.0,97.939,2001-01-01 09:30:00,2001,1,1,9,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue,270.0,43.7413,3.5957,8,C3,6.59,22.0,12.0,46.0,0.06816,-0.00377,279.00327,0.35371,8.0,0.04801,0.0,0.52,1.0,281.08,275.98,SA
20416,-0.17229,6.608,79.264,333.933,1.797,0.05,97.939,2001-01-01 10:00:00,2001,1,1,10,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,0,FR-Pue,270.0,43.7413,3.5957,8,C3,6.59,22.0,12.0,46.0,0.06816,-0.00377,279.00327,0.35371,8.0,0.04801,0.0,0.52,1.0,281.08,275.98,SA
20417,1.20865,7.043,94.929,333.933,1.817,0.0,97.923,2001-01-01 10:30:00,2001,1,1,10,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue,270.0,43.7413,3.5957,8,C3,6.59,22.0,12.0,46.0,0.06816,-0.00377,279.00327,0.35371,8.0,0.04801,0.0,0.52,1.0,281.08,275.98,SA


In [15]:
data_df.describe()

Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,minute,elevation,lat,long,koppen_sub,c4_percent,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night
count,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0,1832903.0
mean,5.67392,15.12272,378.33696,318.42341,10.3264,0.03863,91.06518,2010.30513,6.55568,15.73565,11.92877,0.29002,0.51413,0.12731,0.08732,0.24753,0.04799,0.07792,0.26581,0.21095,0.12982,14.99852,957.17754,41.49494,-59.2597,15.89936,7.88135,96.00124,34.33081,205.1805,0.12909,-0.01017,287.15449,0.22605,7.78412,0.083,5.42351,0.43945,1.18323,296.59942,279.81335
std,7.48683,9.22053,268.43576,49.41909,9.79793,0.18858,9.46003,4.64866,2.99331,8.79309,3.77279,0.13102,0.22078,0.07084,0.06208,0.06314,0.04626,0.04871,0.07049,0.09639,0.08566,15.0,928.46135,7.55381,58.27373,8.57306,14.79938,51.36439,17.31205,84.88766,0.10969,0.00783,9.57685,0.07001,3.22635,0.18953,17.74316,0.18591,0.8965,13.27152,7.87586
min,-49.7372,-30.687,0.001,107.639,0.0,0.0,66.031,2001.0,1.0,1.0,3.0,-0.11958,-0.18252,-0.01764,0.0054,0.0305,0.0,0.0039,0.0,0.0132,0.0,0.0,1.0,29.9495,-121.5574,5.0,0.0,0.05451,0.93639,-0.31293,-0.0101,-0.05494,259.00342,0.04904,0.0,-0.21086,-13.41176,0.08,0.1,253.81851,246.43556
25%,0.41299,8.875,139.307,284.03,3.255,0.0,84.977,2007.0,4.0,8.0,9.0,0.19584,0.33047,0.07654,0.0399,0.2023,0.0226,0.0457,0.2076,0.1282,0.0559,0.0,181.0,36.6058,-109.9419,8.0,0.0,63.0,23.0,139.0,0.0461,-0.01229,281.06277,0.17185,7.0,-0.07965,0.0,0.31,0.5,287.92,274.72
50%,2.95963,15.177,345.675,319.885,7.149,0.0,94.374,2010.0,7.0,16.0,12.0,0.28093,0.5355,0.11491,0.0708,0.2374,0.0375,0.0689,0.2713,0.2049,0.1142,0.0,611.0,40.0329,-97.4888,14.0,0.35,98.0,34.0,215.0,0.09454,-0.00803,287.59136,0.22776,9.0,0.06793,0.0,0.45,0.94938,295.64,280.44
75%,9.03701,21.67,588.219,353.926,14.046,0.0,98.939,2014.0,9.0,23.0,15.0,0.35798,0.68307,0.15903,0.1222,0.2864,0.0573,0.0965,0.3221,0.288,0.188,30.0,1531.0,45.5598,3.5957,26.0,6.59,123.0,44.0,269.0,0.19488,-0.00484,294.04456,0.2814,10.0,0.20549,0.0,0.57,1.6,305.64,285.92
max,85.0309,44.087,1094.341,473.085,80.091,15.493,103.921,2020.0,12.0,31.0,23.0,2.38835,0.93551,0.42385,0.812,0.7762,0.7689,0.8053,0.4666,0.4748,0.4287,30.0,3197.0,61.84741,24.29477,27.0,55.39,600.0,217.0,373.0,0.51503,0.00193,310.28552,0.39579,12.0,0.86413,108.24437,0.88,5.1,328.28,297.34


In [16]:
data_df.site_id.unique()

array(['FR-Pue', 'US-GLE', 'US-NR1', 'US-SRM', 'US-Ton', 'US-Var',
       'US-Wkg', 'CA-Cbo', 'US-AR1', 'US-ARM', 'US-FR2', 'US-Me2',
       'US-Seg', 'US-UMB', 'US-Vcp', 'CH-Lae', 'ES-LJu', 'ES-LM2',
       'FI-Hyy', 'FR-Lam', 'IT-Lav', 'IT-Lsn'], dtype=object)

In [17]:
print(f"IGBP: {data_df.IGBP.unique()}")
print(f"koppen: {data_df.koppen.unique()}")
print(f"MODIS_PFT: {data_df.MODIS_PFT.unique()}")
print(f"MODIS_LC: {data_df.MODIS_LC.unique()}")

IGBP: ['EBF' 'ENF' 'WSA' 'GRA' 'DBF' 'CRO' 'MF' 'OSH']
koppen: ['Temperate' 'Cold' 'Arid']
MODIS_PFT: ['SA' 'GRA' 'SH' 'CRO' 'ENF' 'DBF' 'MF']
MODIS_LC: [ 8. 10.  9.  7. 12.  1.  4.  0.  5.]


# CheckPoint: Upload Data to Azure Storage Blob as Parquet

In [18]:
# Upload to Azure Storage Blob
# ref: https://stackoverflow.com/a/54666079
data_cleanup_checkpoint = True
tag = "raw"
blob_name = f"{blob_name_base}_{tag}.{ext}"

if data_cleanup_checkpoint:

  parquet_file = BytesIO()
  data_df.to_parquet(parquet_file, engine='pyarrow')
  parquet_file.seek(0)

  azStorageClient = AzStorageClient(az_cred_file)
  azStorageClient.uploadBlob(container, blob_name, parquet_file, overwrite=True)

File uploaded to baseline-data/baseline_all_v_1_raw.parquet


In [19]:
# Save no transformed data
get_non_transform_train_test = True
if get_non_transform_train_test:
  data_transformer = TFTDataTransformer(train_sites, test_sites,
                                        None, data_df)
  train_df, test_df = data_transformer.get_test_train_raw()
  print("Train data peak:")
  display(train_df.head(5))
  print("Test data peak:")
  display(test_df.head(5))

  train_blob_name= f"{train_blob_name_base}-{tag}.{ext}"
  test_blob_name= f"{test_blob_name_base}-{tag}.{ext}"
  data_transformer.upload_train_test_to_azure(az_cred_file, container,\
                                              train_blob_name, test_blob_name)

Data size: (1832903, 47).
Train data size: (1416803, 47).
Test data size: (416100, 47).
Train data peak:


Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,datetime,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,c3c4,c4_percent,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT
20413,-0.53574,5.311,25.016,272.218,1.708,0.0,97.939,2001-01-01 08:30:00,2001,1,1,8,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue,270.0,43.7413,3.5957,8,C3,6.59,22.0,12.0,46.0,0.06816,-0.00377,279.00327,0.35371,8.0,0.04801,0.0,0.52,1.0,281.08,275.98,SA
20414,0.86438,5.744,59.734,272.218,1.738,0.0,97.939,2001-01-01 09:00:00,2001,1,1,9,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,0,FR-Pue,270.0,43.7413,3.5957,8,C3,6.59,22.0,12.0,46.0,0.06816,-0.00377,279.00327,0.35371,8.0,0.04801,0.0,0.52,1.0,281.08,275.98,SA
20415,-0.02627,6.176,91.235,272.218,1.767,0.0,97.939,2001-01-01 09:30:00,2001,1,1,9,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue,270.0,43.7413,3.5957,8,C3,6.59,22.0,12.0,46.0,0.06816,-0.00377,279.00327,0.35371,8.0,0.04801,0.0,0.52,1.0,281.08,275.98,SA
20416,-0.17229,6.608,79.264,333.933,1.797,0.05,97.939,2001-01-01 10:00:00,2001,1,1,10,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,0,FR-Pue,270.0,43.7413,3.5957,8,C3,6.59,22.0,12.0,46.0,0.06816,-0.00377,279.00327,0.35371,8.0,0.04801,0.0,0.52,1.0,281.08,275.98,SA
20417,1.20865,7.043,94.929,333.933,1.817,0.0,97.923,2001-01-01 10:30:00,2001,1,1,10,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue,270.0,43.7413,3.5957,8,C3,6.59,22.0,12.0,46.0,0.06816,-0.00377,279.00327,0.35371,8.0,0.04801,0.0,0.52,1.0,281.08,275.98,SA


Test data peak:


Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,datetime,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,c3c4,c4_percent,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT
210503,0.22517,-6.647,27.249,209.171,1.796,0.0,68.88,2005-05-03 05:00:00,2005,5,3,5,0.13459,0.03011,0.01128,0.3527,0.3746,0.4112,0.4372,0.1643,0.0494,0.0161,ENF,Cold,0,US-GLE,3197.0,41.36653,-106.2399,27,C3,0.16,107.0,60.0,243.0,0.0713,-0.01983,275.3042,0.17684,10.0,0.6276,66.06451,0.21,0.5,277.32,268.24,GRA
210504,0.01171,-5.555,108.461,209.171,1.999,0.0,68.896,2005-05-03 05:30:00,2005,5,3,5,0.13459,0.03011,0.01128,0.3527,0.3746,0.4112,0.4372,0.1643,0.0494,0.0161,ENF,Cold,30,US-GLE,3197.0,41.36653,-106.2399,27,C3,0.16,107.0,60.0,243.0,0.0713,-0.01983,275.3042,0.17684,10.0,0.6276,66.06451,0.21,0.5,277.32,268.24,GRA
210505,-0.45489,-4.464,190.445,209.171,2.202,0.0,68.912,2005-05-03 06:00:00,2005,5,3,6,0.13459,0.03011,0.01128,0.3527,0.3746,0.4112,0.4372,0.1643,0.0494,0.0161,ENF,Cold,0,US-GLE,3197.0,41.36653,-106.2399,27,C3,0.16,107.0,60.0,243.0,0.0713,-0.01983,275.3042,0.17684,10.0,0.6276,66.06451,0.21,0.5,277.32,268.24,GRA
210506,-0.5207,-3.372,271.799,209.171,2.406,0.0,68.928,2005-05-03 06:30:00,2005,5,3,6,0.13459,0.03011,0.01128,0.3527,0.3746,0.4112,0.4372,0.1643,0.0494,0.0161,ENF,Cold,30,US-GLE,3197.0,41.36653,-106.2399,27,C3,0.16,107.0,60.0,243.0,0.0713,-0.01983,275.3042,0.17684,10.0,0.6276,66.06451,0.21,0.5,277.32,268.24,GRA
210507,0.31839,-2.281,351.132,209.171,2.609,0.0,68.945,2005-05-03 07:00:00,2005,5,3,7,0.13459,0.03011,0.01128,0.3527,0.3746,0.4112,0.4372,0.1643,0.0494,0.0161,ENF,Cold,0,US-GLE,3197.0,41.36653,-106.2399,27,C3,0.16,107.0,60.0,243.0,0.0713,-0.01983,275.3042,0.17684,10.0,0.6276,66.06451,0.21,0.5,277.32,268.24,GRA


Uploading train dataset to baseline-train-v-1-raw.parquet...
File uploaded to baseline-data/baseline-train-v-1-raw.parquet
Uploading test dataset to baseline-test-v-1-raw.parquet...
File uploaded to baseline-data/baseline-test-v-1-raw.parquet


# Stage 2: Data Transform - Convert to Model Ready Data
- Encode data
- Split into train and test
- Assemble data through VectorAssembler
- Normalized data through minmax

In [21]:
load_data_from_previous_checkpoint = False
useSpark = False

raw_data_file_path = None
if load_data_from_previous_checkpoint:
  data_df = None
  raw_data_file_path = tmp_dir + os.sep + blob_name
  print(f"loading {raw_data_file_path}...")
  if not (os.path.exists(raw_data_file_path)):
      if not (os.path.exists(tmp_dir)):
          os.mkdir(tmp_dir)
      azStorageClient = AzStorageClient(az_cred_file)
      file_stream = azStorageClient.downloadBlob2Stream(container, blob_name)
      data_df = pd.read_parquet(file_stream, engine='pyarrow')
      data_df.to_parquet(raw_data_file_path)
  
if useSpark:
  data_transformer = PySparkMLDataTransformer(spark, train_sites, test_sites,
                                              raw_data_file_path, data_df)
else:
  data_transformer = TFTDataTransformer(train_sites, test_sites,
                                              raw_data_file_path, data_df)

Data size: (1832903, 47).


In [22]:
timestamp_col = 'datetime'
target_col = 'GPP_NT_VUT_REF'

if useSpark: # Spark ML Data Transformer
  categorical_cols = ['IGBP', 'c3c4', 'koppen_sub', 'koppen', 'MODIS_PFT', 'MODIS_LC'] 
  data_transformer.data_transform(categorical_cols, timestamp_col, target_col)

  print("Train data peak:")
  data_transformer.train_df.show(5, False)
  print("Test data peak:")
  data_transformer.test_df.show(5, False)

  train_blob_name= f"{train_blob_name_base}"
  test_blob_name= f"{test_blob_name_base}"

else: # TFT Data Transformer
  categorical_cols = ['IGBP', 'c3c4', 'koppen_sub', 'koppen', 'site_id',
                      'year', 'month', 'day', 'hour', 'minute',
                      'MODIS_PFT', 'MODIS_LC']
  realNum_cols = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA', 
                  'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
                  'elevation', 'lat', 'long', 'c4_percent',
                  'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily',
                  'PET', 'Ts', 'ESACCI-sm', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai',
                  'LST_Day', 'LST_Night']
  backup_cols = ['IGBP', 'koppen','site_id']
  data_transformer.data_transform(categorical_cols, realNum_cols, backup_cols,\
                                  timestamp_col, target_col)

  print("Train data peak:")
  display(data_transformer.train_df.head(5))
  print("Test data peak:")
  display(data_transformer.test_df.head(5))

  train_blob_name= f"{train_blob_name_base}.{ext}"
  test_blob_name= f"{test_blob_name_base}.{ext}"

Data size: (1832903, 50).
Data size after encoding: (1832903, 50)


Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,datetime,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,c3c4,c4_percent,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT,IGBP_name,koppen_name,site_id_name
20413,-0.53574,5.311,25.016,272.218,1.708,0.0,97.939,2001-01-01 08:30:00,0,0,0,5,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,2,2,1,6,270.0,43.7413,3.5957,3,0,6.59,22.0,12.0,46.0,0.06816,-0.00377,279.00327,0.35371,5,0.04801,0.0,0.52,1.0,281.08,275.98,5,EBF,Temperate,FR-Pue
20414,0.86438,5.744,59.734,272.218,1.738,0.0,97.939,2001-01-01 09:00:00,0,0,0,6,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,2,2,0,6,270.0,43.7413,3.5957,3,0,6.59,22.0,12.0,46.0,0.06816,-0.00377,279.00327,0.35371,5,0.04801,0.0,0.52,1.0,281.08,275.98,5,EBF,Temperate,FR-Pue
20415,-0.02627,6.176,91.235,272.218,1.767,0.0,97.939,2001-01-01 09:30:00,0,0,0,6,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,2,2,1,6,270.0,43.7413,3.5957,3,0,6.59,22.0,12.0,46.0,0.06816,-0.00377,279.00327,0.35371,5,0.04801,0.0,0.52,1.0,281.08,275.98,5,EBF,Temperate,FR-Pue
20416,-0.17229,6.608,79.264,333.933,1.797,0.05,97.939,2001-01-01 10:00:00,0,0,0,7,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,2,2,0,6,270.0,43.7413,3.5957,3,0,6.59,22.0,12.0,46.0,0.06816,-0.00377,279.00327,0.35371,5,0.04801,0.0,0.52,1.0,281.08,275.98,5,EBF,Temperate,FR-Pue
20417,1.20865,7.043,94.929,333.933,1.817,0.0,97.923,2001-01-01 10:30:00,0,0,0,7,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,2,2,1,6,270.0,43.7413,3.5957,3,0,6.59,22.0,12.0,46.0,0.06816,-0.00377,279.00327,0.35371,5,0.04801,0.0,0.52,1.0,281.08,275.98,5,EBF,Temperate,FR-Pue


Features(45): ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA', 'year', 'month', 'day', 'hour', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'IGBP', 'koppen', 'minute', 'site_id', 'elevation', 'lat', 'long', 'koppen_sub', 'c3c4', 'c4_percent', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily', 'PET', 'Ts', 'ESACCI-sm', 'MODIS_LC', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai', 'LST_Day', 'LST_Night', 'MODIS_PFT']
Train data size: (1416803, 50).
Test data size: (416100, 50).
Normalizinf features (33): ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'elevation', 'lat', 'long', 'c4_percent', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily', 'PET', 'Ts', 'ESACCI-sm', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai', 'LST_Day', 'LST_Night']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


Train data size: (1416803, 50).
Test data size: (416100, 50).
Train data peak:


Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,datetime,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,c3c4,c4_percent,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT,IGBP_name,koppen_name,site_id_name
20413,-0.53574,-1.05663,-1.30947,-0.93066,-0.87456,-0.20843,0.78691,2001-01-01 08:30:00,0,0,0,5,-0.36163,0.88176,-0.34607,-1.01242,-1.5948,-0.90295,-1.05898,-1.56856,-1.08563,-1.50485,2,2,1,6,-0.78965,0.20971,1.09663,3,0,-0.143,-1.63648,-1.52883,-1.85499,-0.56558,0.85564,-0.81712,1.86749,5,-0.23126,-0.31164,0.31827,-0.29653,-1.12868,-0.49273,5,EBF,Temperate,FR-Pue
20414,0.86438,-1.00861,-1.18019,-0.93066,-0.87144,-0.20843,0.78691,2001-01-01 09:00:00,0,0,0,6,-0.36163,0.88176,-0.34607,-1.01242,-1.5948,-0.90295,-1.05898,-1.56856,-1.08563,-1.50485,2,2,0,6,-0.78965,0.20971,1.09663,3,0,-0.143,-1.63648,-1.52883,-1.85499,-0.56558,0.85564,-0.81712,1.86749,5,-0.23126,-0.31164,0.31827,-0.29653,-1.12868,-0.49273,5,EBF,Temperate,FR-Pue
20415,-0.02627,-0.9607,-1.06289,-0.93066,-0.86843,-0.20843,0.78691,2001-01-01 09:30:00,0,0,0,6,-0.36163,0.88176,-0.34607,-1.01242,-1.5948,-0.90295,-1.05898,-1.56856,-1.08563,-1.50485,2,2,1,6,-0.78965,0.20971,1.09663,3,0,-0.143,-1.63648,-1.52883,-1.85499,-0.56558,0.85564,-0.81712,1.86749,5,-0.23126,-0.31164,0.31827,-0.29653,-1.12868,-0.49273,5,EBF,Temperate,FR-Pue
20416,-0.17229,-0.91279,-1.10747,0.37838,-0.86531,0.06955,0.78691,2001-01-01 10:00:00,0,0,0,7,-0.36163,0.88176,-0.34607,-1.01242,-1.5948,-0.90295,-1.05898,-1.56856,-1.08563,-1.50485,2,2,0,6,-0.78965,0.20971,1.09663,3,0,-0.143,-1.63648,-1.52883,-1.85499,-0.56558,0.85564,-0.81712,1.86749,5,-0.23126,-0.31164,0.31827,-0.29653,-1.12868,-0.49273,5,EBF,Temperate,FR-Pue
20417,1.20865,-0.86455,-1.04914,0.37838,-0.86323,-0.20843,0.78513,2001-01-01 10:30:00,0,0,0,7,-0.36163,0.88176,-0.34607,-1.01242,-1.5948,-0.90295,-1.05898,-1.56856,-1.08563,-1.50485,2,2,1,6,-0.78965,0.20971,1.09663,3,0,-0.143,-1.63648,-1.52883,-1.85499,-0.56558,0.85564,-0.81712,1.86749,5,-0.23126,-0.31164,0.31827,-0.29653,-1.12868,-0.49273,5,EBF,Temperate,FR-Pue


Test data peak:


Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,datetime,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,c3c4,c4_percent,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT,IGBP_name,koppen_name,site_id_name
210503,0.22517,-2.38276,-1.30115,-2.26795,-0.86541,-0.20843,-2.44933,2005-05-03 05:00:00,4,4,2,2,-1.24559,-2.31714,-1.68459,4.91758,2.22538,9.40716,8.77348,-1.34219,-1.6002,-1.30056,3,1,0,12,2.53325,-0.08189,-0.74751,7,0,-0.54016,0.30728,1.91961,0.4469,-0.53646,-1.20536,-1.20596,-0.85958,7,2.85238,3.46251,-1.34152,-0.82145,-1.41466,-1.50189,3,ENF,Cold,US-GLE
210504,0.01171,-2.26165,-0.99875,-2.26795,-0.84431,-0.20843,-2.44754,2005-05-03 05:30:00,4,4,2,2,-1.24559,-2.31714,-1.68459,4.91758,2.22538,9.40716,8.77348,-1.34219,-1.6002,-1.30056,3,1,1,12,2.53325,-0.08189,-0.74751,7,0,-0.54016,0.30728,1.91961,0.4469,-0.53646,-1.20536,-1.20596,-0.85958,7,2.85238,3.46251,-1.34152,-0.82145,-1.41466,-1.50189,3,ENF,Cold,US-GLE
210505,-0.45489,-2.14066,-0.69347,-2.26795,-0.82321,-0.20843,-2.44576,2005-05-03 06:00:00,4,4,2,3,-1.24559,-2.31714,-1.68459,4.91758,2.22538,9.40716,8.77348,-1.34219,-1.6002,-1.30056,3,1,0,12,2.53325,-0.08189,-0.74751,7,0,-0.54016,0.30728,1.91961,0.4469,-0.53646,-1.20536,-1.20596,-0.85958,7,2.85238,3.46251,-1.34152,-0.82145,-1.41466,-1.50189,3,ENF,Cold,US-GLE
210506,-0.5207,-2.01956,-0.39054,-2.26795,-0.80201,-0.20843,-2.44398,2005-05-03 06:30:00,4,4,2,3,-1.24559,-2.31714,-1.68459,4.91758,2.22538,9.40716,8.77348,-1.34219,-1.6002,-1.30056,3,1,1,12,2.53325,-0.08189,-0.74751,7,0,-0.54016,0.30728,1.91961,0.4469,-0.53646,-1.20536,-1.20596,-0.85958,7,2.85238,3.46251,-1.34152,-0.82145,-1.41466,-1.50189,3,ENF,Cold,US-GLE
210507,0.31839,-1.89857,-0.09513,-2.26795,-0.78091,-0.20843,-2.44209,2005-05-03 07:00:00,4,4,2,4,-1.24559,-2.31714,-1.68459,4.91758,2.22538,9.40716,8.77348,-1.34219,-1.6002,-1.30056,3,1,0,12,2.53325,-0.08189,-0.74751,7,0,-0.54016,0.30728,1.91961,0.4469,-0.53646,-1.20536,-1.20596,-0.85958,7,2.85238,3.46251,-1.34152,-0.82145,-1.41466,-1.50189,3,ENF,Cold,US-GLE


# Checkpoint: Upload train and test to Azure Blob Storage

In [23]:
final_checkpoint = True

if final_checkpoint:
  data_transformer.upload_train_test_to_azure(az_cred_file, container, \
                                            train_blob_name, test_blob_name)

Uploading train dataset to baseline-train-v-1.parquet...
File uploaded to baseline-data/baseline-train-v-1.parquet
Uploading test dataset to baseline-test-v-1.parquet...
File uploaded to baseline-data/baseline-test-v-1.parquet
