# Notebook Setup

In [3]:
# True when this kernel is a Google Colab runtime (detected from the IPython shell's repr).
IN_COLLAB = 'google.colab' in str(get_ipython())

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
# Absolute path of the project root used by this notebook.
MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"

# On Colab, mount Google Drive (the default MY_HOME_ABS_PATH points under /content/drive).
if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Import Modules

In [4]:
# install required modules quietly
required_packages = ['azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import pandas as pd
import numpy as np
import sys

# Load local custom modules.
# Work from the project root so the relative tools path below resolves correctly.
os.chdir(MY_HOME_ABS_PATH)

# On Colab the tools directory goes first on sys.path; elsewhere it goes last
# (inserting at len(sys.path) is equivalent to appending).
sys.path.insert(
    0 if IN_COLLAB else len(sys.path),
    os.path.abspath("./code/src/tools"),
)

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

# Display settings: show every column, and render floats with 5 decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.5f}'.format)

# Define Constants

In [5]:
# Directory layout, all derived from the project root.
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.sep.join([root_dir, '.tmp'])
data_dir = os.sep.join([root_dir, 'data'])
cred_dir = os.sep.join([root_dir, '.cred'])
az_cred_file = os.sep.join([cred_dir, 'azblobcred.json'])

# Raw data is read from a shared Drive folder on Colab, and from the tmp dir otherwise.
raw_data_dir = (
  "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data/half_hourly_data"
  if IN_COLLAB
  else tmp_dir
)

# Input files
site_metadata_filename = os.sep.join([data_dir, 'site-metadata.csv'])
monthly_data_filename = os.sep.join([data_dir, "monthly-imputed-v1-i.csv"])

# Blob storage container and blob names; `ver` tags the dataset version in every name.
container = "all-sites-data"
ext = "parquet"
ver = "1-i-knn"
blob_name_base = f"full_2010_2015_all_v_{ver}"
train_blob_name = f"full_2010_2015-train-v-{ver}.{ext}"
test_blob_name = f"full_2010_2015-test-v-{ver}.{ext}"

In [6]:
# Define features and target variables of the data pipelines
target_variable = 'GPP_NT_VUT_REF'
target_variable_qc = 'NEE_VUT_REF_QC'

# Columns taken from the hourly site files
hourly_features = [
    # meteorological drivers
    'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
    # time columns
    'datetime', 'year', 'month', 'day', 'hour', 'date',
    # vegetation indices and bands b1-b7
    'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
]

# Columns taken from the site metadata file
metadata_features = [
    'site_id', 'filename', 'lat', 'long', 'koppen_sub', 'koppen_main', 'IGBP',
    'c3c4', 'c4_percent', 'monthly_data_available',
]

# Define the features to use in KNN imputer, only using real values as cat are same per site
imp_exclude_cols = ['date', 'datetime', 'year', 'month', 'hour', 'day', 'minute', 'site_id', 'IGBP']
# Reuse `target_variable` rather than repeating its literal, so the imputer
# columns follow the target if it is ever changed above.
imp_cols = [col for col in hourly_features + [target_variable] if col not in imp_exclude_cols]

# Stage 1: Trim and Merge Site Metadata

In [7]:
# Imputation and resampling switches
impute, impute_method, impute_global = True, 'knn', True
resample = True

# Resampling settings: the timestamp column and the target frequency
# ('H' presumably is the pandas hourly offset alias -- TODO confirm against data_pipeline_lib)
time_col, duration = 'datetime', 'H'

# Date range the sequences are filtered to, and the missing-data threshold
start_date, end_date = '2010-01-01', '2015-12-31'
missing_thresh = 0.2

# Impute params (if used)
k = 5
weights = 'uniform'
n_fit = 20_000
c = -1

# No explicit site lists are given for the train/test split
train_sites = test_sites = None

In [None]:
# Build the all-sites preparation object from the input files, site lists,
# feature definitions and raw data location (arguments are positional).
prep_hourly = PrepareAllSitesHourly(
    site_metadata_filename, monthly_data_filename,
    train_sites, test_sites,
    hourly_features, metadata_features,
    target_variable_qc, target_variable,
    raw_data_dir,
)

# Produce the combined frame for all sites.
# NOTE: long-running -- the recorded progress bar shows 168 iterations in ~48 min.
data_df = prep_hourly.all_sites_all_sources(
    imp_cols,
    resample, impute, impute_method, impute_global,
    k, weights, n_fit,
    time_col, duration,
    start_date, end_date, missing_thresh,
    c,
)

18it [05:45, 19.52s/it]

ERROR: AU-Tum is missing hourly data.


23it [06:08,  8.61s/it]

ERROR: BR-Sa1 is missing hourly data.


46it [09:44,  1.38s/it]

ERROR: CN-Sw2 run into error. Exception: Shape of passed values is (912, 16), indices imply (912, 17)


89it [20:12,  1.02s/it]

ERROR: US-Cop is missing hourly data.


93it [22:22, 24.14s/it]

ERROR: US-Ha1 is missing hourly data.


103it [25:15, 34.15s/it]

ERROR: US-Ne1 is missing hourly data.
ERROR: US-Ne2 is missing hourly data.
ERROR: US-Ne3 is missing hourly data.
ERROR: US-PFa is missing hourly data.


122it [30:06,  7.34s/it]

ERROR: CA-Ca1 run into error. Exception: Shape of passed values is (2736, 16), indices imply (2736, 17)


123it [30:08,  5.76s/it]

ERROR: CA-Ca2 run into error. Exception: Shape of passed values is (2856, 16), indices imply (2856, 17)


152it [39:19, 20.72s/it]

ERROR: US-MMS is missing hourly data.


168it [48:26, 52.18s/it]

In [None]:
# Get the memory usage of the dataframe in bytes.
# deep=True makes pandas measure the actual contents of object-dtype columns
# (e.g. string columns); the default shallow count under-reports them.
memory_usage = data_df.memory_usage(deep=True).sum()
memory_usage_gb = memory_usage / 1_000_000_000
print(f"The dataframe uses {memory_usage_gb:.2f} GB of memory.")

In [None]:
# Distinct site ids present in the prepared frame
data_df.loc[:, 'site_id'].unique()

# CHECKPOINT: Save full raw data

In [None]:
# Upload the full raw frame to Azure Storage Blob as a parquet checkpoint.
# ref: https://stackoverflow.com/a/54666079
from io import BytesIO
data_cleanup_checkpoint = True
tag = "raw"
blob_name = f"{blob_name_base}_{tag}.{ext}"

if data_cleanup_checkpoint:

  # Create the client first so a bad credential file fails before the
  # (expensive) parquet serialization rather than after it.
  azStorageClient = AzStorageClient(az_cred_file)

  # The in-memory buffer holds the entire serialized frame; the context manager
  # closes it after the upload so it does not linger in the kernel's globals.
  # NOTE(review): assumes uploadBlob finishes reading the stream before it returns -- confirm.
  with BytesIO() as parquet_file:
    data_df.to_parquet(parquet_file, engine='pyarrow')
    parquet_file.seek(0)  # rewind so the upload reads from the start
    azStorageClient.uploadBlob(container, blob_name, parquet_file, overwrite=True)

File uploaded to baseline-data/baseline_all_v_1-i-knn_raw.parquet


# Stage 2: Data Transform - Convert to Model Ready Data
- Encode data
- Split into train and test
- Assemble data through VectorAssembler
- Normalize data through min-max scaling

In [None]:
# No raw-data file path is supplied; the in-memory `data_df` is passed instead.
raw_data_file_path = None
data_transformer = TFTDataTransformer(
    train_sites,
    test_sites,
    raw_data_file_path,
    data_df,
)

Data size: (865920, 49).


In [None]:
# Columns passed to the transformer as "non transform": the target, ids, time indices and gap flags
non_transform_cols = [
    target_variable,
    'site_id', 'datetime',
    'timestep_idx_local', 'timestep_idx_global',
    'gap_flag_hour', 'gap_flag_month',
]

# Columns passed to the transformer as categorical
categorical_cols = [
    'IGBP', 'c3c4', 'koppen_sub', 'koppen_main',
    'year', 'month', 'day', 'hour',
    'MODIS_PFT', 'MODIS_LC',
]

# Columns passed to the transformer as real-valued
realNum_cols = [
    'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
    'EVI', 'NDVI', 'NIRv',
    'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
    'lat', 'long', 'c4_percent',
    'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily',
    'PET', 'Ts', 'ESACCI-sm', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai',
    'LST_Day', 'LST_Night',
]

data_transformer.data_transform(categorical_cols, realNum_cols, non_transform_cols)

# Preview the first rows of each split (train first, then test)
for split_label in ("Train", "Test"):
  print(f"\n{split_label} data peak:")
  display(getattr(data_transformer, f"{split_label.lower()}_df").head(5))

Data size: (865920, 49).
Data size after encoding: (865920, 49)
Number of sites in df: 19
Train Sites: ['IT-Lav', 'US-NR1', 'US-Vcp', 'FR-Pue', 'CH-Lae', 'US-Var', 'US-Ne2', 'ES-LJu', 'US-Ton', 'US-UMB', 'US-Me2', 'FI-Hyy', 'US-NR1', 'IT-Lav', 'US-Wkg', 'US-ARM', 'US-SRM']
Test Sites: ['US-GLE', 'US-AR1', 'US-Seg', 'US-FR2', 'ES-LM2', 'CA-Cbo', 'FR-Lam', 'IT-Cpz', 'CN-Cha', 'IT-Lsn']
Normalizing real features (32)
Train data size: (631032, 49).
Test data size: (234888, 49).

Train data peak:


Unnamed: 0,GPP_NT_VUT_REF,site_id,timestep_idx_local,timestep_idx_global,datetime,year,month,day,hour,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,lat,long,koppen_sub,koppen_main,c3c4,c4_percent,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT,gap_flag_hour,gap_flag_month
0,8.0007,CH-Lae,0,0,2010-01-01 00:00:00,0,0,0,0,-0.93215,-0.71272,-1.06375,-0.86296,-0.20802,0.05816,0.38113,0.83348,0.36236,-0.78665,-0.4305,-0.59344,-0.84229,-0.51919,-0.62349,-0.75516,5,0.77039,1.18369,5,2,0,-0.58974,-1.62789,-1.41854,-1.63492,-0.86451,1.08868,-1.48659,0.93231,5,0.15866,2.32445,-1.47498,-0.96933,-1.47888,-1.10114,5,0.0,0.0
1,8.04417,CH-Lae,1,1,2010-01-01 01:00:00,0,0,0,1,-0.96828,-0.71272,-1.07565,-0.86149,-0.20802,0.05344,0.38113,0.83348,0.36236,-0.78665,-0.4305,-0.59344,-0.84229,-0.51919,-0.62349,-0.75516,5,0.77039,1.18369,5,2,0,-0.58974,-1.62789,-1.41854,-1.63492,-0.86451,1.08868,-1.48659,0.93231,5,0.15866,2.32445,-1.47498,-0.96933,-1.47888,-1.10114,5,0.0,0.0
2,7.96453,CH-Lae,2,2,2010-01-01 02:00:00,0,0,0,2,-0.97002,-0.71272,-1.17342,-0.8556,-0.20802,0.04883,0.38113,0.83348,0.36236,-0.78665,-0.4305,-0.59344,-0.84229,-0.51919,-0.62349,-0.75516,5,0.77039,1.18369,5,2,0,-0.58974,-1.62789,-1.41854,-1.63492,-0.86451,1.08868,-1.48659,0.93231,5,0.15866,2.32445,-1.47498,-0.96933,-1.47888,-1.10114,5,0.0,0.0
3,12.2833,CH-Lae,3,3,2010-01-01 03:00:00,0,0,0,3,-1.01993,-0.71272,-1.09669,-0.84749,-0.20802,0.04701,0.38113,0.83348,0.36236,-0.78665,-0.4305,-0.59344,-0.84229,-0.51919,-0.62349,-0.75516,5,0.77039,1.18369,5,2,0,-0.58974,-1.62789,-1.41854,-1.63492,-0.86451,1.08868,-1.48659,0.93231,5,0.15866,2.32445,-1.47498,-0.96933,-1.47888,-1.10114,5,0.0,0.0
4,13.7571,CH-Lae,4,4,2010-01-01 04:00:00,0,0,0,4,-1.0528,-0.71272,-1.02725,-0.8529,-0.20802,0.04121,0.38113,0.83348,0.36236,-0.78665,-0.4305,-0.59344,-0.84229,-0.51919,-0.62349,-0.75516,5,0.77039,1.18369,5,2,0,-0.58974,-1.62789,-1.41854,-1.63492,-0.86451,1.08868,-1.48659,0.93231,5,0.15866,2.32445,-1.47498,-0.96933,-1.47888,-1.10114,5,0.0,0.0



Test data peak:


Unnamed: 0,GPP_NT_VUT_REF,site_id,timestep_idx_local,timestep_idx_global,datetime,year,month,day,hour,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,lat,long,koppen_sub,koppen_main,c3c4,c4_percent,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT,gap_flag_hour,gap_flag_month
0,1.30673,CA-Cbo,0,0,2010-01-01 00:00:00,0,0,0,0,-1.1758,-0.71272,0.22907,-0.87683,-0.07995,0.87251,1.34874,-0.72758,-0.22281,1.01824,1.0366,2.44975,1.58852,-0.77073,-1.35122,-1.05468,1,0.3683,-0.34851,5,2,0,-0.52345,-1.14726,-0.63639,-1.16041,-1.0106,0.92265,-1.63887,-0.85179,5,0.49786,3.3538,-0.53467,-0.57577,-1.89649,-2.01633,5,0.0,1.0
1,1.38098,CA-Cbo,1,1,2010-01-01 01:00:00,0,0,0,1,-1.18091,-0.71272,0.11045,-0.87659,-0.06518,0.87112,0.82468,-0.7276,-0.47349,1.25023,0.8272,2.63323,1.93407,-0.50555,-1.24262,-0.97915,1,0.3683,-0.34851,5,2,0,-0.52345,-1.14726,-0.63639,-1.16041,-1.0106,0.92265,-1.63887,-0.85179,5,0.49786,3.3538,-0.53467,-0.57577,-1.89649,-2.01633,5,0.0,1.0
2,0.27869,CA-Cbo,2,2,2010-01-01 02:00:00,0,0,0,2,-1.19918,-0.71272,0.11045,-0.87389,-0.06518,0.87198,0.82468,-0.7276,-0.47349,1.25023,0.8272,2.63323,1.93407,-0.50555,-1.24262,-0.97915,1,0.3683,-0.34851,5,2,0,-0.52345,-1.14726,-0.63639,-1.16041,-1.0106,0.92265,-1.63887,-0.85179,5,0.49786,3.3538,-0.53467,-0.57577,-1.89649,-2.01633,5,0.0,1.0
3,-1.72097,CA-Cbo,3,3,2010-01-01 03:00:00,0,0,0,3,-1.21745,-0.71272,0.11045,-0.87119,-0.06518,0.87284,0.65217,-1.17147,-0.60127,1.76995,1.27795,3.17049,2.50126,-0.18988,-1.10024,-0.95277,1,0.3683,-0.34851,5,2,0,-0.52345,-1.14726,-0.63639,-1.16041,-1.0106,0.92265,-1.63887,-0.85179,5,0.49786,3.3538,-0.53467,-0.57577,-1.89649,-2.01633,5,0.0,1.0
4,0.67984,CA-Cbo,4,4,2010-01-01 04:00:00,0,0,0,4,-1.23572,-0.71272,0.00028,-0.86849,0.122,0.87369,0.95731,-0.72295,-0.32818,1.09781,0.93626,2.29454,1.57462,-0.74375,-1.12675,-0.88474,1,0.3683,-0.34851,5,2,0,-0.52345,-1.14726,-0.63639,-1.16041,-1.0106,0.92265,-1.63887,-0.85179,5,0.49786,3.3538,-0.53467,-0.57577,-1.89649,-2.01633,5,0.0,1.0


# FINISH: Upload train and test to Azure Blob Storage

In [None]:
# Set to False to skip uploading the train/test datasets.
final_checkpoint = True

if final_checkpoint:
  data_transformer.upload_train_test_to_azure(
      az_cred_file,
      container,
      train_blob_name,
      test_blob_name,
  )

Uploading train dataset to baseline-train-v-1-i-knn.parquet...
File uploaded to baseline-data/baseline-train-v-1-i-knn.parquet
Uploading test dataset to baseline-test-v-1-i-knn.parquet...
File uploaded to baseline-data/baseline-test-v-1-i-knn.parquet
