# Notebook Setup

In [1]:
if 'google.colab' in str(get_ipython()):
  IN_COLLAB = True
else:
  IN_COLLAB = False

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"

if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

Mounted at /content/drive/


## Import Modules

In [2]:
# install required modules quietly
required_packages = ['azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import pandas as pd
import numpy as np
import sys

# Load locale custome modules
os.chdir(MY_HOME_ABS_PATH)
if IN_COLLAB:
  sys.path.insert(0,os.path.abspath("./code/src/tools"))
else:
  sys.path.append(os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m387.8/387.8 KB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.5/174.5 KB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 KB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
[?25h

# Define Constants

In [3]:
root_dir =  MY_HOME_ABS_PATH
tmp_dir =  root_dir + os.sep + '.tmp'
raw_data_dir = tmp_dir
data_dir = root_dir + os.sep + 'data'
cred_dir = root_dir + os.sep + '.cred'
az_cred_file = cred_dir + os.sep + 'azblobcred.json'

if IN_COLLAB:
  raw_data_dir = "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data/half_hourly_data"

site_metadata_filename = data_dir + os.sep + 'site-metadata.csv'
monthly_data_filename = data_dir + os.sep + "monthly-imputed-v1-i.csv"

# File
container = "all-sites-data"
ext = "parquet"
ver = "exp1"
blob_name_base = f"full_2010_2015_all_v_{ver}"
train_blob_name = f"full_2010_2015-train-v-{ver}.{ext}"
test_blob_name = f"full_2010_2015-test-v-{ver}.{ext}"

In [4]:
# Define features and target variables of the data pipelines
target_variable = 'GPP_NT_VUT_REF'
target_variable_qc = 'NEE_VUT_REF_QC'
hourly_features = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
                     'datetime', 'year', 'month', 'day', 'hour', 'date',
                     'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7']
metadata_features = ['site_id', 'filename', 'lat', 'long', 'koppen_sub', 'koppen_main', 'IGBP',
                     'c3c4', 'c4_percent', 'monthly_data_available']

# Define the features to use in KNN imputer, only using real values as cat are same per site
imp_exclude_cols = ['date', 'datetime', 'year', 'month', 'hour', 'day', 'minute', 'site_id', 'IGBP']
imp_cols = [x for x in hourly_features + ['GPP_NT_VUT_REF'] if x not in imp_exclude_cols]

In [5]:
site_splits =[
  ['AR-SLu', 'AU-ASM', 'AU-Cum', 'AU-How', 'CA-TP3', 'CA-TPD', 'CN-Sw2', 'DE-Lnf',
   'IT-CA3', 'NL-Hor', 'US-Syv', 'US-AR2', 'US-ARM', 'US-Vcp', 'CH-Cha', 'CZ-KrP', 
   'CZ-Lnz', 'DE-Geb', 'DE-Obe', 'ES-LJu', 'FI-Let', 'IT-Lav', 'SE-Deg'],
  ['AU-Cpr', 'AU-Wom', 'CZ-BK2', 'DE-SfN', 'IT-CA1', 'IT-CA2', 'IT-Ro2', 'US-IB2', 
   'US-Me6', 'US-Ton', 'CA-Ca3', 'US-CRT', 'US-KFS', 'US-Mpj', 'US-Prr', 'US-Ro1', 
   'US-Tw4', 'BE-Bra', 'CZ-BK1', 'DE-Hai', 'IL-Yat', 'IT-Tor', 'SE-Htm'],
  ['AT-Neu', 'AU-RDF', 'AU-Whr', 'CA-TP1', 'DE-Zrk', 'IT-SRo', 'US-Wkg', 'CA-Ca2',
   'CA-TP4', 'US-Bar', 'US-Fmf', 'US-Oho', 'US-SRG', 'US-Ses', 'CH-Lae', 'CZ-RAJ',
   'CZ-wet', 'DE-Kli', 'DE-RuR', 'ES-LM2', 'FR-Fon', 'FR-Lam'],
  ['AR-Vir', 'AU-DaS', 'AU-Emr', 'AU-Gin', 'AU-Rig', 'AU-TTE', 'DE-Spw', 'FR-Pue',
   'IT-Isp', 'IT-Noe', 'US-Twt', 'US-WPT', 'CA-Cbo', 'US-Vcm', 'BE-Dor', 'BE-Vie',
   'CZ-Stn', 'FI-Hyy', 'SE-Nor', 'SE-Ros', 'NL-Loo', 'SE-Lnn'],
  ['AU-DaP', 'AU-GWW', 'AU-Rob', 'AU-Stp', 'US-GLE', 'US-NR1', 'US-Whs', 'CA-Ca1',
   'CA-Gro', 'US-AR1', 'US-Rws', 'US-UMd', 'US-Wjs', 'CH-Fru', 'CH-Oe2', 'DE-Tha',
   'DK-Sor', 'FR-Bil', 'FR-Hes', 'IT-BCi', 'IT-SR2', 'DE-Hte'],
  ['CA-Oas', 'ES-Amo', 'FI-Sod', 'US-Myb', 'US-SRM', 'US-Tw3', 'US-Var', 'US-WCr',
   'US-Ho1', 'US-Seg', 'US-UMB', 'BE-Lon', 'CH-Dav', 'DE-Gri', 'DE-HoH', 'ES-LM1',
   'FR-Aur', 'FR-FBn', 'GF-Guy', 'IT-MBo', 'IT-Ren', 'RU-Fyo']
]

# Stage 1: Trim and Merge Site Metadata

In [6]:
# Define imput params
impute = True
impute_method = 'knn'
impute_global = True
resample = True
time_col = 'datetime'
duration = 'H'

# Filter sequence to date range
missing_thresh = 0.2
start_date = '2010-01-01'
end_date ='2015-12-31'

# Impute params (if used)
k=5
weights='uniform'
n_fit=20000
c=-1

train_sites = [item for sublist in site_splits[:2] for item in sublist] 
val_sites = site_splits[2]
test_sites = site_splits[3]

# exp v2 dataset
# train_sites = [item for sublist in site_splits[2:4] for item in sublist] 
# val_sites = site_splits[4]
# test_sites = site_splits[5]
# Train(44): ['AT-Neu', 'AU-RDF', 'AU-Whr', 'CA-TP1', 'DE-Zrk', 'IT-SRo', 'US-Wkg', 'CA-Ca2', 'CA-TP4', 'US-Bar', 'US-Fmf', 'US-Oho', 'US-SRG', 'US-Ses', 'CH-Lae', 'CZ-RAJ', 'CZ-wet', 'DE-Kli', 'DE-RuR', 'ES-LM2', 'FR-Fon', 'FR-Lam', 'AR-Vir', 'AU-DaS', 'AU-Emr', 'AU-Gin', 'AU-Rig', 'AU-TTE', 'DE-Spw', 'FR-Pue', 'IT-Isp', 'IT-Noe', 'US-Twt', 'US-WPT', 'CA-Cbo', 'US-Vcm', 'BE-Dor', 'BE-Vie', 'CZ-Stn', 'FI-Hyy', 'SE-Nor', 'SE-Ros', 'NL-Loo', 'SE-Lnn']
# Validation(22): ['AU-DaP', 'AU-GWW', 'AU-Rob', 'AU-Stp', 'US-GLE', 'US-NR1', 'US-Whs', 'CA-Ca1', 'CA-Gro', 'US-AR1', 'US-Rws', 'US-UMd', 'US-Wjs', 'CH-Fru', 'CH-Oe2', 'DE-Tha', 'DK-Sor', 'FR-Bil', 'FR-Hes', 'IT-BCi', 'IT-SR2', 'DE-Hte']
# Test(22): ['CA-Oas', 'ES-Amo', 'FI-Sod', 'US-Myb', 'US-SRM', 'US-Tw3', 'US-Var', 'US-WCr', 'US-Ho1', 'US-Seg', 'US-UMB', 'BE-Lon', 'CH-Dav', 'DE-Gri', 'DE-HoH', 'ES-LM1', 'FR-Aur', 'FR-FBn', 'GF-Guy', 'IT-MBo', 'IT-Ren', 'RU-Fyo']

print(f"Train({len(train_sites)}): {train_sites}")
print(f"Validation({len(val_sites)}): {val_sites}")
print(f"Test({len(test_sites)}): {test_sites}")

train_sites = train_sites+val_sites

Train(46): ['AR-SLu', 'AU-ASM', 'AU-Cum', 'AU-How', 'CA-TP3', 'CA-TPD', 'CN-Sw2', 'DE-Lnf', 'IT-CA3', 'NL-Hor', 'US-Syv', 'US-AR2', 'US-ARM', 'US-Vcp', 'CH-Cha', 'CZ-KrP', 'CZ-Lnz', 'DE-Geb', 'DE-Obe', 'ES-LJu', 'FI-Let', 'IT-Lav', 'SE-Deg', 'AU-Cpr', 'AU-Wom', 'CZ-BK2', 'DE-SfN', 'IT-CA1', 'IT-CA2', 'IT-Ro2', 'US-IB2', 'US-Me6', 'US-Ton', 'CA-Ca3', 'US-CRT', 'US-KFS', 'US-Mpj', 'US-Prr', 'US-Ro1', 'US-Tw4', 'BE-Bra', 'CZ-BK1', 'DE-Hai', 'IL-Yat', 'IT-Tor', 'SE-Htm']
Validation(22): ['AT-Neu', 'AU-RDF', 'AU-Whr', 'CA-TP1', 'DE-Zrk', 'IT-SRo', 'US-Wkg', 'CA-Ca2', 'CA-TP4', 'US-Bar', 'US-Fmf', 'US-Oho', 'US-SRG', 'US-Ses', 'CH-Lae', 'CZ-RAJ', 'CZ-wet', 'DE-Kli', 'DE-RuR', 'ES-LM2', 'FR-Fon', 'FR-Lam']
Test(22): ['AR-Vir', 'AU-DaS', 'AU-Emr', 'AU-Gin', 'AU-Rig', 'AU-TTE', 'DE-Spw', 'FR-Pue', 'IT-Isp', 'IT-Noe', 'US-Twt', 'US-WPT', 'CA-Cbo', 'US-Vcm', 'BE-Dor', 'BE-Vie', 'CZ-Stn', 'FI-Hyy', 'SE-Nor', 'SE-Ros', 'NL-Loo', 'SE-Lnn']


In [7]:
prep_hourly = PrepareAllSitesHourly(site_metadata_filename, monthly_data_filename, train_sites, test_sites, 
                                    hourly_features, metadata_features, target_variable_qc, target_variable, raw_data_dir)

data_df = prep_hourly.all_sites_all_sources(imp_cols, resample, impute, impute_method, impute_global,
                                            k, weights, n_fit, time_col, duration, start_date, end_date, missing_thresh, c)

0it [00:00, ?it/s]

Processing: 1. AR-SLu


1it [00:05,  5.27s/it]

Processing: 2. AR-Vir


2it [00:20, 11.05s/it]

Processing: 3. AT-Neu


3it [00:56, 22.41s/it]

Processing: 4. AU-ASM


4it [01:02, 16.18s/it]

Processing: 5. AU-Cpr


5it [01:16, 15.26s/it]

Processing: 6. AU-Cum


6it [01:20, 11.32s/it]

Processing: 7. AU-DaS


7it [02:00, 20.88s/it]

Processing: 8. AU-Emr


8it [02:10, 17.31s/it]

Processing: 9. AU-Gin


9it [02:27, 17.14s/it]

Processing: 10. AU-How


10it [03:26, 30.04s/it]

Processing: 11. AU-RDF


11it [03:33, 23.25s/it]

Processing: 12. AU-Rig


12it [03:47, 20.42s/it]

Processing: 13. AU-TTE


13it [03:50, 14.96s/it]

Processing: 14. AU-Whr


14it [03:53, 11.27s/it]

Processing: 15. AU-Wom


15it [04:13, 14.17s/it]

Processing: 16. CA-TP1


16it [05:25, 31.47s/it]

Processing: 17. CA-TP3


17it [06:15, 37.00s/it]

Processing: 18. CA-TPD


18it [07:03, 40.41s/it]

Processing: 19. CN-Sw2


19it [07:05, 28.73s/it]

ERROR: CN-Sw2 run into error. Exception: Shape of passed values is (912, 16), indices imply (912, 17)
Processing: 20. CZ-BK2


20it [07:53, 34.48s/it]

Processing: 21. DE-Lnf


21it [08:53, 42.11s/it]

Processing: 22. DE-SfN


22it [09:09, 34.39s/it]

Processing: 23. DE-Spw


23it [10:37, 50.62s/it]

Processing: 24. FR-Pue


24it [10:55, 40.69s/it]

Processing: 25. IT-CA1


25it [11:10, 32.91s/it]

Processing: 26. IT-CA2


26it [11:22, 26.66s/it]

Processing: 27. IT-CA3


27it [11:39, 23.73s/it]

Processing: 28. IT-Noe


28it [12:18, 28.39s/it]

Processing: 29. IT-Ro2


29it [12:33, 24.29s/it]

Processing: 30. IT-SRo


30it [12:50, 22.20s/it]

Processing: 31. NL-Hor


31it [13:03, 19.40s/it]

Processing: 32. US-IB2


32it [13:19, 18.54s/it]

Processing: 33. US-Me6


33it [14:11, 28.47s/it]

Processing: 34. US-Syv


34it [14:57, 33.64s/it]

Processing: 35. US-Ton


35it [15:08, 26.92s/it]

Processing: 36. US-Twt


36it [15:27, 24.60s/it]

Processing: 37. US-WPT


37it [16:00, 27.04s/it]

Processing: 38. US-Wkg


38it [16:04, 20.22s/it]

Processing: 39. CA-Ca2


39it [16:07, 15.05s/it]

ERROR: CA-Ca2 run into error. Exception: Shape of passed values is (2856, 16), indices imply (2856, 17)
Processing: 40. CA-Ca3


40it [16:11, 11.53s/it]

Processing: 41. CA-Cbo


41it [17:33, 32.88s/it]

Processing: 42. CA-TP4


42it [18:35, 41.66s/it]

Processing: 43. US-AR2


43it [18:45, 32.19s/it]

Processing: 44. US-ARM


44it [19:20, 32.94s/it]

Processing: 45. US-Bar


45it [20:53, 50.92s/it]

Processing: 46. US-CRT


47it [21:13, 29.32s/it]

Processing: 47. US-Fmf
Processing: 48. US-KFS


48it [21:47, 30.81s/it]

Processing: 49. US-Mpj


49it [22:11, 28.55s/it]

Processing: 50. US-Oho


50it [22:48, 31.29s/it]

Processing: 51. US-Prr


51it [25:32, 71.16s/it]

Processing: 52. US-Ro1


52it [26:32, 67.71s/it]

Processing: 53. US-SRG


53it [26:37, 48.75s/it]

Processing: 54. US-Ses


54it [26:44, 36.38s/it]

Processing: 55. US-Tw4


55it [26:48, 26.61s/it]

Processing: 56. US-Vcm


56it [28:05, 41.77s/it]

Processing: 57. US-Vcp


57it [29:10, 48.63s/it]

Processing: 58. BE-Bra


58it [30:43, 61.88s/it]

Processing: 59. BE-Dor


59it [32:27, 74.68s/it]

Processing: 60. BE-Vie


60it [34:55, 96.59s/it]

Processing: 61. CH-Cha


61it [35:52, 84.62s/it]

Processing: 62. CH-Lae


62it [37:02, 80.52s/it]

Processing: 63. CZ-BK1


63it [39:22, 98.09s/it]

Processing: 64. CZ-KrP


64it [39:42, 74.67s/it]

Processing: 65. CZ-Lnz


65it [39:45, 53.18s/it]

Processing: 66. CZ-RAJ


66it [41:03, 60.65s/it]

Processing: 67. CZ-Stn


67it [42:42, 72.13s/it]

Processing: 68. CZ-wet


68it [43:50, 70.96s/it]

Processing: 69. DE-Geb


69it [45:08, 73.02s/it]

Processing: 70. DE-Hai


70it [47:42, 97.38s/it]

Processing: 71. DE-Kli


71it [48:56, 90.54s/it]

Processing: 72. DE-Obe


72it [51:32, 110.07s/it]

Processing: 73. DE-RuR


73it [53:01, 103.71s/it]

Processing: 74. ES-LJu


74it [53:27, 80.47s/it] 

Processing: 75. ES-LM2


75it [53:31, 57.56s/it]

Processing: 76. FI-Hyy


76it [56:30, 93.81s/it]

Processing: 77. FI-Let


77it [59:40, 122.69s/it]

Processing: 78. FR-Fon


78it [1:01:30, 118.90s/it]

Processing: 79. FR-Lam


79it [1:02:02, 92.75s/it] 

Processing: 80. IL-Yat


80it [1:03:01, 82.77s/it]

Processing: 81. IT-Lav


81it [1:04:51, 90.77s/it]

Processing: 82. IT-Tor


82it [1:09:00, 138.30s/it]

Processing: 83. SE-Deg


83it [1:12:28, 159.28s/it]

Processing: 84. SE-Htm


84it [1:12:32, 112.65s/it]

Processing: 85. SE-Nor


85it [1:13:15, 91.92s/it] 

Processing: 86. SE-Ros


86it [1:13:19, 65.49s/it]

ERROR: SE-Ros run into error. Exception: Shape of passed values is (3744, 16), indices imply (3744, 17)
Processing: 87. NL-Loo


87it [1:16:21, 100.35s/it]

Processing: 88. SE-Lnn


88it [1:16:56, 52.47s/it]


Initial records: 3038184, Final records after resampling + gap-filling: 3180792
Total retained sites: 88/88 = 1.00
Missing values after site-level imputation: 0
Not imputing missing values at global level
Missing values after global-level imputation: 0
Data size after after merged with site metadata: (3180792, 34)
Data size after after merged with monthly data: (3180792, 50)
64128 missing values introduced after monthly merge


In [8]:
# Get the memory usage of the dataframe in bytes
memory_usage = data_df.memory_usage().sum()
memory_usage_gb = memory_usage / 1_000_000_000
print(f"The dataframe uses {memory_usage_gb:.2f} GB of memory.")

The dataframe uses 1.30 GB of memory.


In [9]:
data_df['site_id'].unique()

array(['AR-SLu', 'AR-Vir', 'AT-Neu', 'AU-ASM', 'AU-Cpr', 'AU-Cum',
       'AU-DaS', 'AU-Emr', 'AU-Gin', 'AU-How', 'AU-RDF', 'AU-Rig',
       'AU-TTE', 'AU-Whr', 'AU-Wom', 'BE-Bra', 'BE-Dor', 'BE-Vie',
       'CA-Ca3', 'CA-Cbo', 'CA-TP1', 'CA-TP3', 'CA-TP4', 'CA-TPD',
       'CH-Cha', 'CH-Lae', 'CZ-BK1', 'CZ-BK2', 'CZ-KrP', 'CZ-Lnz',
       'CZ-RAJ', 'CZ-Stn', 'CZ-wet', 'DE-Geb', 'DE-Hai', 'DE-Kli',
       'DE-Lnf', 'DE-Obe', 'DE-RuR', 'DE-SfN', 'DE-Spw', 'ES-LJu',
       'ES-LM2', 'FI-Hyy', 'FI-Let', 'FR-Fon', 'FR-Lam', 'FR-Pue',
       'IL-Yat', 'IT-CA1', 'IT-CA2', 'IT-CA3', 'IT-Lav', 'IT-Noe',
       'IT-Ro2', 'IT-SRo', 'IT-Tor', 'NL-Hor', 'NL-Loo', 'SE-Deg',
       'SE-Htm', 'SE-Lnn', 'SE-Nor', 'US-AR2', 'US-ARM', 'US-Bar',
       'US-CRT', 'US-Fmf', 'US-IB2', 'US-KFS', 'US-Me6', 'US-Mpj',
       'US-Oho', 'US-Prr', 'US-Ro1', 'US-SRG', 'US-Ses', 'US-Syv',
       'US-Ton', 'US-Tw4', 'US-Twt', 'US-Vcm', 'US-Vcp', 'US-WPT',
       'US-Wkg'], dtype=object)

# CHECKPOINT: Save full raw data

In [10]:
# Upload to Azure Storage Blob
# ref: https://stackoverflow.com/a/54666079
from io import BytesIO
data_cleanup_checkpoint = True
tag = "raw"
blob_name = f"{blob_name_base}_{tag}.{ext}"

# exp_v2 dataset: File uploaded to all-sites-data/full_2010_2015_all_v_exp2_raw.parquet

if data_cleanup_checkpoint:

  parquet_file = BytesIO()
  data_df.to_parquet(parquet_file, engine='pyarrow')
  parquet_file.seek(0)

  azStorageClient = AzStorageClient(az_cred_file)
  azStorageClient.uploadBlob(container, blob_name, parquet_file, overwrite=True)

File uploaded to all-sites-data/full_2010_2015_all_v_exp1_raw.parquet


In [11]:
loaded_df = None
local_file = tmp_dir + os.sep + blob_name
if not (os.path.exists(local_file)):
    azStorageClient = AzStorageClient(az_cred_file)
    file_stream = azStorageClient.downloadBlob2Stream(container, blob_name)
    loaded_df = pd.read_parquet(file_stream, engine='pyarrow')
    loaded_df.to_parquet(local_file)
else:
    loaded_df = pd.read_parquet(local_file)

print(f"Data size: {loaded_df.shape}")

Data size: (2389272, 50)
