# Notebook Setup

In [1]:
if 'google.colab' in str(get_ipython()):
  IN_COLLAB = True
else:
  IN_COLLAB = False

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"

if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Import Modules

In [2]:
# install required modules quietly
required_packages = ['azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import pandas as pd
import numpy as np
import sys

# Load locale custome modules
os.chdir(MY_HOME_ABS_PATH)
if IN_COLLAB:
  sys.path.insert(0,os.path.abspath("./code/src/tools"))
else:
  sys.path.append(os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

# Define Constants

In [3]:
root_dir =  MY_HOME_ABS_PATH
tmp_dir =  root_dir + os.sep + '.tmp'
raw_data_dir = tmp_dir
data_dir = root_dir + os.sep + 'data'
cred_dir = root_dir + os.sep + '.cred'
az_cred_file = cred_dir + os.sep + 'azblobcred.json'

if IN_COLLAB:
  raw_data_dir = "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data/half_hourly_data"

site_metadata_filename = data_dir + os.sep + 'site-metadata.csv'
monthly_data_filename = data_dir + os.sep + "monthly-imputed-v1-i.csv"

# File
container = "all-sites-data"
ext = "parquet"
ver = "exp1"
blob_name_base = f"full_2010_2015_all_v_{ver}"
train_blob_name = f"full_2010_2015-train-v-{ver}.{ext}"
test_blob_name = f"full_2010_2015-test-v-{ver}.{ext}"

In [4]:
# Define features and target variables of the data pipelines
target_variable = 'GPP_NT_VUT_REF'
target_variable_qc = 'NEE_VUT_REF_QC'
hourly_features = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
                     'datetime', 'year', 'month', 'day', 'hour', 'date',
                     'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7']
metadata_features = ['site_id', 'filename', 'lat', 'long', 'koppen_sub', 'koppen_main', 'IGBP',
                     'c3c4', 'c4_percent', 'monthly_data_available']

# Define the features to use in KNN imputer, only using real values as cat are same per site
imp_exclude_cols = ['date', 'datetime', 'year', 'month', 'hour', 'day', 'minute', 'site_id', 'IGBP']
imp_cols = [x for x in hourly_features + ['GPP_NT_VUT_REF'] if x not in imp_exclude_cols]

In [5]:
site_splits =[
  ['AR-SLu', 'AU-ASM', 'AU-Cum', 'AU-How', 'CA-TP3', 'CA-TPD', 'CN-Sw2', 'DE-Lnf',
   'IT-CA3', 'NL-Hor', 'US-Syv', 'US-AR2', 'US-ARM', 'US-Vcp', 'CH-Cha', 'CZ-KrP', 
   'CZ-Lnz', 'DE-Geb', 'DE-Obe', 'ES-LJu', 'FI-Let', 'IT-Lav', 'SE-Deg'],
  ['AU-Cpr', 'AU-Wom', 'CZ-BK2', 'DE-SfN', 'IT-CA1', 'IT-CA2', 'IT-Ro2', 'US-IB2', 
   'US-Me6', 'US-Ton', 'CA-Ca3', 'US-CRT', 'US-KFS', 'US-Mpj', 'US-Prr', 'US-Ro1', 
   'US-Tw4', 'BE-Bra', 'CZ-BK1', 'DE-Hai', 'IL-Yat', 'IT-Tor', 'SE-Htm'],
  ['AT-Neu', 'AU-RDF', 'AU-Whr', 'CA-TP1', 'DE-Zrk', 'IT-SRo', 'US-Wkg', 'CA-Ca2',
   'CA-TP4', 'US-Bar', 'US-Fmf', 'US-Oho', 'US-SRG', 'US-Ses', 'CH-Lae', 'CZ-RAJ',
   'CZ-wet', 'DE-Kli', 'DE-RuR', 'ES-LM2', 'FR-Fon', 'FR-Lam'],
  ['AR-Vir', 'AU-DaS', 'AU-Emr', 'AU-Gin', 'AU-Rig', 'AU-TTE', 'DE-Spw', 'FR-Pue',
   'IT-Isp', 'IT-Noe', 'US-Twt', 'US-WPT', 'CA-Cbo', 'US-Vcm', 'BE-Dor', 'BE-Vie',
   'CZ-Stn', 'FI-Hyy', 'SE-Nor', 'SE-Ros', 'NL-Loo', 'SE-Lnn'],
  ['AU-DaP', 'AU-GWW', 'AU-Rob', 'AU-Stp', 'US-GLE', 'US-NR1', 'US-Whs', 'CA-Ca1',
   'CA-Gro', 'US-AR1', 'US-Rws', 'US-UMd', 'US-Wjs', 'CH-Fru', 'CH-Oe2', 'DE-Tha',
   'DK-Sor', 'FR-Bil', 'FR-Hes', 'IT-BCi', 'IT-SR2', 'DE-Hte'],
  ['CA-Oas', 'ES-Amo', 'FI-Sod', 'US-Myb', 'US-SRM', 'US-Tw3', 'US-Var', 'US-WCr',
   'US-Ho1', 'US-Seg', 'US-UMB', 'BE-Lon', 'CH-Dav', 'DE-Gri', 'DE-HoH', 'ES-LM1',
   'FR-Aur', 'FR-FBn', 'GF-Guy', 'IT-MBo', 'IT-Ren', 'RU-Fyo']
]

# Stage 1: Trim and Merge Site Metadata

In [6]:
# Define imput params
impute = True
impute_method = 'knn'
impute_global = True
resample = True
time_col = 'datetime'
duration = 'H'

# Filter sequence to date range
missing_thresh = 0.2
start_date = '2010-01-01'
end_date ='2015-12-31'

# Impute params (if used)
k=5
weights='uniform'
n_fit=20000
c=-1

train_sites = [item for sublist in site_splits[:2] for item in sublist] 
val_sites = site_splits[2]
test_sites = site_splits[3]

# exp v2 dataset
# train_sites = [item for sublist in site_splits[2:4] for item in sublist] 
# val_sites = site_splits[4]
# test_sites = site_splits[5]
# Train(44): ['AT-Neu', 'AU-RDF', 'AU-Whr', 'CA-TP1', 'DE-Zrk', 'IT-SRo', 'US-Wkg', 'CA-Ca2', 'CA-TP4', 'US-Bar', 'US-Fmf', 'US-Oho', 'US-SRG', 'US-Ses', 'CH-Lae', 'CZ-RAJ', 'CZ-wet', 'DE-Kli', 'DE-RuR', 'ES-LM2', 'FR-Fon', 'FR-Lam', 'AR-Vir', 'AU-DaS', 'AU-Emr', 'AU-Gin', 'AU-Rig', 'AU-TTE', 'DE-Spw', 'FR-Pue', 'IT-Isp', 'IT-Noe', 'US-Twt', 'US-WPT', 'CA-Cbo', 'US-Vcm', 'BE-Dor', 'BE-Vie', 'CZ-Stn', 'FI-Hyy', 'SE-Nor', 'SE-Ros', 'NL-Loo', 'SE-Lnn']
# Validation(22): ['AU-DaP', 'AU-GWW', 'AU-Rob', 'AU-Stp', 'US-GLE', 'US-NR1', 'US-Whs', 'CA-Ca1', 'CA-Gro', 'US-AR1', 'US-Rws', 'US-UMd', 'US-Wjs', 'CH-Fru', 'CH-Oe2', 'DE-Tha', 'DK-Sor', 'FR-Bil', 'FR-Hes', 'IT-BCi', 'IT-SR2', 'DE-Hte']
# Test(22): ['CA-Oas', 'ES-Amo', 'FI-Sod', 'US-Myb', 'US-SRM', 'US-Tw3', 'US-Var', 'US-WCr', 'US-Ho1', 'US-Seg', 'US-UMB', 'BE-Lon', 'CH-Dav', 'DE-Gri', 'DE-HoH', 'ES-LM1', 'FR-Aur', 'FR-FBn', 'GF-Guy', 'IT-MBo', 'IT-Ren', 'RU-Fyo']

print(f"Train({len(train_sites)}): {train_sites}")
print(f"Validation({len(val_sites)}): {val_sites}")
print(f"Test({len(test_sites)}): {test_sites}")

Train(46): ['AR-SLu', 'AU-ASM', 'AU-Cum', 'AU-How', 'CA-TP3', 'CA-TPD', 'CN-Sw2', 'DE-Lnf', 'IT-CA3', 'NL-Hor', 'US-Syv', 'US-AR2', 'US-ARM', 'US-Vcp', 'CH-Cha', 'CZ-KrP', 'CZ-Lnz', 'DE-Geb', 'DE-Obe', 'ES-LJu', 'FI-Let', 'IT-Lav', 'SE-Deg', 'AU-Cpr', 'AU-Wom', 'CZ-BK2', 'DE-SfN', 'IT-CA1', 'IT-CA2', 'IT-Ro2', 'US-IB2', 'US-Me6', 'US-Ton', 'CA-Ca3', 'US-CRT', 'US-KFS', 'US-Mpj', 'US-Prr', 'US-Ro1', 'US-Tw4', 'BE-Bra', 'CZ-BK1', 'DE-Hai', 'IL-Yat', 'IT-Tor', 'SE-Htm']
Validation(22): ['AT-Neu', 'AU-RDF', 'AU-Whr', 'CA-TP1', 'DE-Zrk', 'IT-SRo', 'US-Wkg', 'CA-Ca2', 'CA-TP4', 'US-Bar', 'US-Fmf', 'US-Oho', 'US-SRG', 'US-Ses', 'CH-Lae', 'CZ-RAJ', 'CZ-wet', 'DE-Kli', 'DE-RuR', 'ES-LM2', 'FR-Fon', 'FR-Lam']
Test(22): ['AR-Vir', 'AU-DaS', 'AU-Emr', 'AU-Gin', 'AU-Rig', 'AU-TTE', 'DE-Spw', 'FR-Pue', 'IT-Isp', 'IT-Noe', 'US-Twt', 'US-WPT', 'CA-Cbo', 'US-Vcm', 'BE-Dor', 'BE-Vie', 'CZ-Stn', 'FI-Hyy', 'SE-Nor', 'SE-Ros', 'NL-Loo', 'SE-Lnn']


In [7]:
prep_hourly = PrepareAllSitesHourly(site_metadata_filename, monthly_data_filename, train_sites, test_sites, 
                                    hourly_features, metadata_features, target_variable_qc, target_variable, raw_data_dir)

data_df = prep_hourly.all_sites_all_sources(imp_cols, resample, impute, impute_method, impute_global,
                                            k, weights, n_fit, time_col, duration, start_date, end_date, missing_thresh, c)

1it [00:00,  1.90it/s]

Processing: 1. AR-SLu
Processing: 2. AR-Vir


2it [00:24, 14.11s/it]

Processing: 3. AU-ASM


3it [00:30, 10.61s/it]

Processing: 4. AU-Cpr


4it [00:46, 12.79s/it]

Processing: 5. AU-Cum


5it [00:50,  9.50s/it]

Processing: 6. AU-DaS


6it [01:39, 22.98s/it]

Processing: 7. AU-Emr


7it [01:50, 18.97s/it]

Processing: 8. AU-Gin


8it [02:08, 18.85s/it]

Processing: 9. AU-How


9it [03:20, 35.40s/it]

Processing: 10. AU-Rig


10it [03:39, 30.21s/it]

Processing: 11. AU-TTE


11it [03:40, 21.39s/it]

Processing: 12. AU-Wom


12it [04:05, 22.31s/it]

Processing: 13. CA-TP3


13it [05:10, 35.42s/it]

Processing: 14. CA-TPD


14it [06:04, 40.85s/it]

Processing: 15. CN-Sw2


15it [06:04, 28.76s/it]

ERROR: CN-Sw2 run into error. Exception: Shape of passed values is (912, 16), indices imply (912, 17)
Processing: 16. CZ-BK2


16it [07:00, 37.00s/it]

Processing: 17. DE-Lnf


17it [07:53, 41.63s/it]

Processing: 18. DE-SfN


18it [08:10, 34.39s/it]

Processing: 19. DE-Spw


19it [09:37, 50.22s/it]

Processing: 20. FR-Pue


20it [10:02, 42.46s/it]

Processing: 21. IT-CA1


21it [10:20, 35.15s/it]

Processing: 22. IT-CA2


22it [10:34, 28.95s/it]

Processing: 23. IT-CA3


23it [10:58, 27.30s/it]

Processing: 24. IT-Noe


24it [11:44, 32.81s/it]

Processing: 25. IT-Ro2


25it [12:02, 28.43s/it]

Processing: 26. NL-Hor


26it [12:16, 24.09s/it]

Processing: 27. US-IB2


27it [12:36, 23.05s/it]

Processing: 28. US-Me6


28it [13:33, 33.25s/it]

Processing: 29. US-Syv


29it [14:25, 38.69s/it]

Processing: 30. US-Ton


30it [14:36, 30.46s/it]

Processing: 31. US-Twt


31it [15:01, 28.72s/it]

Processing: 32. US-WPT


32it [15:37, 30.86s/it]

Processing: 33. CA-Ca3


33it [15:39, 22.49s/it]

Processing: 34. CA-Cbo


34it [17:25, 47.36s/it]

Processing: 35. US-AR2


35it [17:34, 35.94s/it]

Processing: 36. US-ARM


36it [18:10, 35.95s/it]

Processing: 37. US-CRT


37it [18:28, 30.51s/it]

Processing: 38. US-KFS


38it [19:05, 32.56s/it]

Processing: 39. US-Mpj


39it [19:32, 30.82s/it]

Processing: 40. US-Prr


40it [22:40, 77.81s/it]

Processing: 41. US-Ro1


41it [23:46, 74.56s/it]

Processing: 42. US-Tw4


42it [23:49, 52.94s/it]

Processing: 43. US-Vcm


43it [25:32, 67.99s/it]

Processing: 44. US-Vcp


44it [26:31, 65.28s/it]

Processing: 45. BE-Bra


45it [28:05, 74.01s/it]

Processing: 46. BE-Dor


46it [29:57, 85.30s/it]

Processing: 47. BE-Vie


47it [32:53, 112.37s/it]

Processing: 48. CH-Cha


48it [33:59, 98.70s/it] 

Processing: 49. CZ-BK1


49it [36:38, 116.54s/it]

Processing: 50. CZ-KrP


50it [37:00, 88.29s/it] 

Processing: 51. CZ-Lnz


51it [37:02, 62.35s/it]

Processing: 52. CZ-Stn


52it [39:00, 79.07s/it]

Processing: 53. DE-Geb


53it [40:34, 83.60s/it]

Processing: 54. DE-Hai


54it [43:44, 115.38s/it]

Processing: 55. DE-Obe


55it [46:15, 126.27s/it]

Processing: 56. ES-LJu


56it [46:42, 96.37s/it] 

Processing: 57. FI-Hyy


57it [50:16, 131.62s/it]

Processing: 58. FI-Let


58it [53:44, 154.71s/it]

Processing: 59. IL-Yat


59it [54:40, 124.99s/it]

Processing: 60. IT-Lav


60it [56:27, 119.72s/it]

Processing: 61. IT-Tor


61it [1:00:34, 157.73s/it]

Processing: 62. SE-Deg


62it [1:04:08, 174.70s/it]

Processing: 63. SE-Htm


63it [1:04:11, 123.32s/it]

Processing: 64. SE-Nor


64it [1:04:55, 99.39s/it] 

Processing: 65. SE-Ros


65it [1:04:58, 70.41s/it]

ERROR: SE-Ros run into error. Exception: Shape of passed values is (3744, 16), indices imply (3744, 17)
Processing: 66. NL-Loo


66it [1:08:03, 104.83s/it]

Processing: 67. SE-Lnn


67it [1:08:39, 61.49s/it]


Initial records: 2264256, Final records after resampling + gap-filling: 2389272
Total retained sites: 67/67 = 1.00
Missing values after site-level imputation: 0
Not imputing missing values at global level
Missing values after global-level imputation: 0
Data size after after merged with site metadata: (2389272, 34)
Data size after after merged with monthly data: (2389272, 50)
49152 missing values introduced after monthly merge


In [8]:
# Get the memory usage of the dataframe in bytes
memory_usage = data_df.memory_usage().sum()
memory_usage_gb = memory_usage / 1_000_000_000
print(f"The dataframe uses {memory_usage_gb:.2f} GB of memory.")

The dataframe uses 0.97 GB of memory.


In [9]:
data_df['site_id'].unique()

array(['AR-SLu', 'AR-Vir', 'AU-ASM', 'AU-Cpr', 'AU-Cum', 'AU-DaS',
       'AU-Emr', 'AU-Gin', 'AU-How', 'AU-Rig', 'AU-TTE', 'AU-Wom',
       'BE-Bra', 'BE-Dor', 'BE-Vie', 'CA-Ca3', 'CA-Cbo', 'CA-TP3',
       'CA-TPD', 'CH-Cha', 'CZ-BK1', 'CZ-BK2', 'CZ-KrP', 'CZ-Lnz',
       'CZ-Stn', 'DE-Geb', 'DE-Hai', 'DE-Lnf', 'DE-Obe', 'DE-SfN',
       'DE-Spw', 'ES-LJu', 'FI-Hyy', 'FI-Let', 'FR-Pue', 'IL-Yat',
       'IT-CA1', 'IT-CA2', 'IT-CA3', 'IT-Lav', 'IT-Noe', 'IT-Ro2',
       'IT-Tor', 'NL-Hor', 'NL-Loo', 'SE-Deg', 'SE-Htm', 'SE-Lnn',
       'SE-Nor', 'US-AR2', 'US-ARM', 'US-CRT', 'US-IB2', 'US-KFS',
       'US-Me6', 'US-Mpj', 'US-Prr', 'US-Ro1', 'US-Syv', 'US-Ton',
       'US-Tw4', 'US-Twt', 'US-Vcm', 'US-Vcp', 'US-WPT'], dtype=object)

# CHECKPOINT: Save full raw data

In [10]:
# Upload to Azure Storage Blob
# ref: https://stackoverflow.com/a/54666079
from io import BytesIO
data_cleanup_checkpoint = True
tag = "raw"
blob_name = f"{blob_name_base}_{tag}.{ext}"

# exp_v2 dataset: File uploaded to all-sites-data/full_2010_2015_all_v_exp2_raw.parquet

if data_cleanup_checkpoint:

  parquet_file = BytesIO()
  data_df.to_parquet(parquet_file, engine='pyarrow')
  parquet_file.seek(0)

  azStorageClient = AzStorageClient(az_cred_file)
  azStorageClient.uploadBlob(container, blob_name, parquet_file, overwrite=True)

File uploaded to all-sites-data/full_2010_2015_all_v_exp1_raw.parquet


In [11]:
loaded_df = None
local_file = tmp_dir + os.sep + blob_name
if not (os.path.exists(local_file)):
    azStorageClient = AzStorageClient(az_cred_file)
    file_stream = azStorageClient.downloadBlob2Stream(container, blob_name)
    loaded_df = pd.read_parquet(file_stream, engine='pyarrow')
    loaded_df.to_parquet(local_file)
else:
    loaded_df = pd.read_parquet(local_file)

print(f"Data size: {loaded_df.shape}")

Data size: (2389272, 50)
