# Notebook Setup

In [1]:
if 'google.colab' in str(get_ipython()):
  IN_COLLAB = True
else:
  IN_COLLAB = False

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"

if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

Mounted at /content/drive/


## Import Modules

In [2]:
# install required modules quietly
required_packages = ['azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import pandas as pd
import numpy as np
import sys

# Load locale custome modules
os.chdir(MY_HOME_ABS_PATH)
if IN_COLLAB:
  sys.path.insert(0,os.path.abspath("./code/src/tools"))
else:
  sys.path.append(os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/387.8 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m307.2/387.8 KB[0m [31m9.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m387.8/387.8 KB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/174.5 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.5/174.5 KB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 KB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m66.3 MB/s[0m eta [36m0:00:00[0m
[?25h

# Define Constants

In [3]:
root_dir =  MY_HOME_ABS_PATH
tmp_dir =  root_dir + os.sep + '.tmp'
raw_data_dir = tmp_dir
data_dir = root_dir + os.sep + 'data'
cred_dir = root_dir + os.sep + '.cred'
az_cred_file = cred_dir + os.sep + 'azblobcred.json'

if IN_COLLAB:
  raw_data_dir = "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data/half_hourly_data"

site_metadata_filename = data_dir + os.sep + 'site-metadata.csv'
monthly_data_filename = data_dir + os.sep + "monthly-imputed-v1-i.csv"

# File
container = "all-sites-data"
ext = "parquet"
ver = "exp"
blob_name_base = f"full_2010_2015_all_v_{ver}"
train_blob_name = f"full_2010_2015-train-v-{ver}.{ext}"
test_blob_name = f"full_2010_2015-test-v-{ver}.{ext}"

In [4]:
# Define features and target variables of the data pipelines
target_variable = 'GPP_NT_VUT_REF'
target_variable_qc = 'NEE_VUT_REF_QC'
hourly_features = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
                     'datetime', 'year', 'month', 'day', 'hour', 'date',
                     'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7']
metadata_features = ['site_id', 'filename', 'lat', 'long', 'koppen_sub', 'koppen_main', 'IGBP',
                     'c3c4', 'c4_percent', 'monthly_data_available']

# Define the features to use in KNN imputer, only using real values as cat are same per site
imp_exclude_cols = ['date', 'datetime', 'year', 'month', 'hour', 'day', 'minute', 'site_id', 'IGBP']
imp_cols = [x for x in hourly_features + ['GPP_NT_VUT_REF'] if x not in imp_exclude_cols]

In [5]:
site_splits =[
  ['AR-SLu', 'AU-ASM', 'AU-Cum', 'AU-How', 'CA-TP3', 'CA-TPD', 'CN-Sw2', 'DE-Lnf',
   'IT-CA3', 'NL-Hor', 'US-Syv', 'US-AR2', 'US-ARM', 'US-Vcp', 'CH-Cha', 'CZ-KrP', 
   'CZ-Lnz', 'DE-Geb', 'DE-Obe', 'ES-LJu', 'FI-Let', 'IT-Lav', 'SE-Deg'],
  ['AU-Cpr', 'AU-Wom', 'CZ-BK2', 'DE-SfN', 'IT-CA1', 'IT-CA2', 'IT-Ro2', 'US-IB2', 
   'US-Me6', 'US-Ton', 'CA-Ca3', 'US-CRT', 'US-KFS', 'US-Mpj', 'US-Prr', 'US-Ro1', 
   'US-Tw4', 'BE-Bra', 'CZ-BK1', 'DE-Hai', 'IL-Yat', 'IT-Tor', 'SE-Htm'],
  ['AT-Neu', 'AU-RDF', 'AU-Whr', 'CA-TP1', 'DE-Zrk', 'IT-SRo', 'US-Wkg', 'CA-Ca2',
   'CA-TP4', 'US-Bar', 'US-Fmf', 'US-Oho', 'US-SRG', 'US-Ses', 'CH-Lae', 'CZ-RAJ',
   'CZ-wet', 'DE-Kli', 'DE-RuR', 'ES-LM2', 'FR-Fon', 'FR-Lam'],
  ['AR-Vir', 'AU-DaS', 'AU-Emr', 'AU-Gin', 'AU-Rig', 'AU-TTE', 'DE-Spw', 'FR-Pue',
   'IT-Isp', 'IT-Noe', 'US-Twt', 'US-WPT', 'CA-Cbo', 'US-Vcm', 'BE-Dor', 'BE-Vie',
   'CZ-Stn', 'FI-Hyy', 'SE-Nor', 'SE-Ros', 'NL-Loo', 'SE-Lnn'],
  ['AU-DaP', 'AU-GWW', 'AU-Rob', 'AU-Stp', 'US-GLE', 'US-NR1', 'US-Whs', 'CA-Ca1',
   'CA-Gro', 'US-AR1', 'US-Rws', 'US-UMd', 'US-Wjs', 'CH-Fru', 'CH-Oe2', 'DE-Tha',
   'DK-Sor', 'FR-Bil', 'FR-Hes', 'IT-BCi', 'IT-SR2', 'DE-Hte'],
  ['CA-Oas', 'ES-Amo', 'FI-Sod', 'US-Myb', 'US-SRM', 'US-Tw3', 'US-Var', 'US-WCr',
   'US-Ho1', 'US-Seg', 'US-UMB', 'BE-Lon', 'CH-Dav', 'DE-Gri', 'DE-HoH', 'ES-LM1',
   'FR-Aur', 'FR-FBn', 'GF-Guy', 'IT-MBo', 'IT-Ren', 'RU-Fyo']
]

# Stage 1: Trim and Merge Site Metadata

In [6]:
# Define imput params
impute = True
impute_method = 'knn'
impute_global = True
resample = True
time_col = 'datetime'
duration = 'H'

# Filter sequence to date range
missing_thresh = 0.2
start_date = '2010-01-01'
end_date ='2015-12-31'

# Impute params (if used)
k=5
weights='uniform'
n_fit=20000
c=-1

train_sites = [item for sublist in site_splits[:4] for item in sublist] 
val_sites = site_splits[4]
test_sites = site_splits[5]

# exp v2 dataset
# train_sites = [item for sublist in site_splits[2:4] for item in sublist] 
# val_sites = site_splits[4]
# test_sites = site_splits[5]
# Train(44): ['AT-Neu', 'AU-RDF', 'AU-Whr', 'CA-TP1', 'DE-Zrk', 'IT-SRo', 'US-Wkg', 'CA-Ca2', 'CA-TP4', 'US-Bar', 'US-Fmf', 'US-Oho', 'US-SRG', 'US-Ses', 'CH-Lae', 'CZ-RAJ', 'CZ-wet', 'DE-Kli', 'DE-RuR', 'ES-LM2', 'FR-Fon', 'FR-Lam', 'AR-Vir', 'AU-DaS', 'AU-Emr', 'AU-Gin', 'AU-Rig', 'AU-TTE', 'DE-Spw', 'FR-Pue', 'IT-Isp', 'IT-Noe', 'US-Twt', 'US-WPT', 'CA-Cbo', 'US-Vcm', 'BE-Dor', 'BE-Vie', 'CZ-Stn', 'FI-Hyy', 'SE-Nor', 'SE-Ros', 'NL-Loo', 'SE-Lnn']
# Validation(22): ['AU-DaP', 'AU-GWW', 'AU-Rob', 'AU-Stp', 'US-GLE', 'US-NR1', 'US-Whs', 'CA-Ca1', 'CA-Gro', 'US-AR1', 'US-Rws', 'US-UMd', 'US-Wjs', 'CH-Fru', 'CH-Oe2', 'DE-Tha', 'DK-Sor', 'FR-Bil', 'FR-Hes', 'IT-BCi', 'IT-SR2', 'DE-Hte']
# Test(22): ['CA-Oas', 'ES-Amo', 'FI-Sod', 'US-Myb', 'US-SRM', 'US-Tw3', 'US-Var', 'US-WCr', 'US-Ho1', 'US-Seg', 'US-UMB', 'BE-Lon', 'CH-Dav', 'DE-Gri', 'DE-HoH', 'ES-LM1', 'FR-Aur', 'FR-FBn', 'GF-Guy', 'IT-MBo', 'IT-Ren', 'RU-Fyo']

print(f"Train({len(train_sites)}): {train_sites}")
print(f"Validation({len(val_sites)}): {val_sites}")
print(f"Test({len(test_sites)}): {test_sites}")

train_sites = train_sites+val_sites

Train(90): ['AR-SLu', 'AU-ASM', 'AU-Cum', 'AU-How', 'CA-TP3', 'CA-TPD', 'CN-Sw2', 'DE-Lnf', 'IT-CA3', 'NL-Hor', 'US-Syv', 'US-AR2', 'US-ARM', 'US-Vcp', 'CH-Cha', 'CZ-KrP', 'CZ-Lnz', 'DE-Geb', 'DE-Obe', 'ES-LJu', 'FI-Let', 'IT-Lav', 'SE-Deg', 'AU-Cpr', 'AU-Wom', 'CZ-BK2', 'DE-SfN', 'IT-CA1', 'IT-CA2', 'IT-Ro2', 'US-IB2', 'US-Me6', 'US-Ton', 'CA-Ca3', 'US-CRT', 'US-KFS', 'US-Mpj', 'US-Prr', 'US-Ro1', 'US-Tw4', 'BE-Bra', 'CZ-BK1', 'DE-Hai', 'IL-Yat', 'IT-Tor', 'SE-Htm', 'AT-Neu', 'AU-RDF', 'AU-Whr', 'CA-TP1', 'DE-Zrk', 'IT-SRo', 'US-Wkg', 'CA-Ca2', 'CA-TP4', 'US-Bar', 'US-Fmf', 'US-Oho', 'US-SRG', 'US-Ses', 'CH-Lae', 'CZ-RAJ', 'CZ-wet', 'DE-Kli', 'DE-RuR', 'ES-LM2', 'FR-Fon', 'FR-Lam', 'AR-Vir', 'AU-DaS', 'AU-Emr', 'AU-Gin', 'AU-Rig', 'AU-TTE', 'DE-Spw', 'FR-Pue', 'IT-Isp', 'IT-Noe', 'US-Twt', 'US-WPT', 'CA-Cbo', 'US-Vcm', 'BE-Dor', 'BE-Vie', 'CZ-Stn', 'FI-Hyy', 'SE-Nor', 'SE-Ros', 'NL-Loo', 'SE-Lnn']
Validation(22): ['AU-DaP', 'AU-GWW', 'AU-Rob', 'AU-Stp', 'US-GLE', 'US-NR1', 'US-Whs', '

In [7]:
prep_hourly = PrepareAllSitesHourly(site_metadata_filename, monthly_data_filename, train_sites, test_sites, 
                                    hourly_features, metadata_features, target_variable_qc, target_variable, raw_data_dir)

data_df = prep_hourly.all_sites_all_sources(imp_cols, resample, impute, impute_method, impute_global,
                                            k, weights, n_fit, time_col, duration, start_date, end_date, missing_thresh, c)

1it [00:03,  3.60s/it]

Processing: 1. AR-SLu
Processing: 2. AR-Vir


2it [00:15,  8.22s/it]

Processing: 3. AT-Neu


3it [00:45, 18.54s/it]

Processing: 4. AU-ASM


4it [00:50, 13.16s/it]

Processing: 5. AU-Cpr


5it [01:01, 12.16s/it]

Processing: 6. AU-Cum


6it [01:03,  8.83s/it]

Processing: 7. AU-DaP


7it [01:45, 19.63s/it]

Processing: 8. AU-DaS


8it [02:17, 23.69s/it]

Processing: 9. AU-Emr


9it [02:24, 18.49s/it]

Processing: 10. AU-Gin


10it [02:39, 17.29s/it]

Processing: 11. AU-How


11it [03:33, 28.56s/it]

Processing: 12. AU-RDF


12it [03:39, 21.56s/it]

Processing: 13. AU-Rig


13it [03:49, 18.32s/it]

Processing: 14. AU-Stp


14it [04:19, 21.71s/it]

Processing: 15. AU-TTE


15it [04:20, 15.57s/it]

Processing: 16. AU-Whr


16it [04:22, 11.40s/it]

Processing: 17. AU-Wom


17it [04:39, 12.99s/it]

Processing: 18. CA-Oas


18it [04:41,  9.84s/it]

Processing: 19. CA-TP1


19it [05:40, 24.53s/it]

Processing: 20. CA-TP3


20it [06:20, 29.24s/it]

Processing: 21. CA-TPD


22it [07:00, 22.63s/it]

Processing: 22. CN-Sw2
ERROR: CN-Sw2 run into error. Exception: Shape of passed values is (912, 16), indices imply (912, 17)
Processing: 23. CZ-BK2


23it [07:45, 29.35s/it]

Processing: 24. DE-Lnf


24it [08:25, 32.59s/it]

Processing: 25. DE-SfN


25it [08:38, 26.69s/it]

Processing: 26. DE-Spw


26it [09:46, 39.25s/it]

Processing: 27. ES-Amo


27it [09:53, 29.43s/it]

Processing: 28. FI-Sod


28it [12:06, 60.63s/it]

Processing: 29. FR-Pue


29it [12:19, 46.44s/it]

Processing: 30. IT-CA1


30it [12:32, 36.22s/it]

Processing: 31. IT-CA2


31it [12:42, 28.26s/it]

Processing: 32. IT-CA3


32it [12:56, 24.03s/it]

Processing: 33. IT-Noe


33it [13:25, 25.68s/it]

Processing: 34. IT-Ro2


34it [13:38, 21.95s/it]

Processing: 35. IT-SRo


35it [13:53, 19.72s/it]

Processing: 36. NL-Hor


36it [14:03, 16.84s/it]

Processing: 37. US-GLE


37it [15:45, 42.25s/it]

Processing: 38. US-IB2


38it [15:58, 33.44s/it]

Processing: 39. US-Me6


39it [16:33, 34.15s/it]

Processing: 40. US-Myb


40it [16:40, 25.80s/it]

Processing: 41. US-NR1


41it [17:23, 31.07s/it]

Processing: 42. US-SRM


42it [17:26, 22.74s/it]

Processing: 43. US-Syv


43it [17:55, 24.54s/it]

Processing: 44. US-Ton


44it [18:03, 19.48s/it]

Processing: 45. US-Twt


45it [18:14, 16.97s/it]

Processing: 46. US-Var


46it [18:19, 13.30s/it]

Processing: 47. US-WCr


47it [20:03, 40.78s/it]

Processing: 48. US-WPT


48it [20:35, 37.99s/it]

Processing: 49. US-Whs


49it [20:36, 27.04s/it]

Processing: 50. US-Wkg


50it [20:39, 19.66s/it]

Processing: 51. CA-Ca1


51it [20:41, 14.39s/it]

ERROR: CA-Ca1 run into error. Exception: Shape of passed values is (2736, 16), indices imply (2736, 17)
Processing: 52. CA-Ca2


52it [20:43, 10.56s/it]

ERROR: CA-Ca2 run into error. Exception: Shape of passed values is (2856, 16), indices imply (2856, 17)
Processing: 53. CA-Ca3


53it [20:45,  7.96s/it]

Processing: 54. CA-Cbo


54it [21:59, 28.01s/it]

Processing: 55. CA-Gro


55it [23:32, 47.38s/it]

Processing: 56. CA-TP4


56it [24:24, 48.69s/it]

Processing: 57. US-AR1


57it [24:40, 38.98s/it]

Processing: 58. US-AR2


58it [24:47, 29.52s/it]

Processing: 59. US-ARM


59it [25:16, 29.24s/it]

Processing: 60. US-Bar


60it [26:33, 43.51s/it]

Processing: 61. US-CRT


62it [26:51, 25.22s/it]

Processing: 62. US-Fmf
Processing: 63. US-Ho1


63it [27:19, 26.10s/it]

Processing: 64. US-KFS


64it [27:36, 23.24s/it]

Processing: 65. US-Mpj


65it [27:49, 20.25s/it]

Processing: 66. US-Oho


66it [28:11, 20.71s/it]

Processing: 67. US-Prr


67it [30:26, 55.22s/it]

Processing: 68. US-Ro1


68it [31:19, 54.52s/it]

Processing: 69. US-Rws


69it [31:20, 38.47s/it]

Processing: 70. US-SRG


70it [31:23, 27.62s/it]

Processing: 71. US-Seg


71it [31:38, 23.93s/it]

Processing: 72. US-Ses


72it [31:43, 18.27s/it]

Processing: 73. US-Tw4


73it [31:45, 13.40s/it]

Processing: 74. US-UMB


74it [33:20, 37.89s/it]

Processing: 75. US-UMd


75it [34:53, 54.41s/it]

Processing: 76. US-Vcm


76it [35:50, 55.03s/it]

Processing: 77. US-Vcp


77it [36:22, 48.37s/it]

Processing: 78. US-Wjs


78it [36:32, 36.67s/it]

Processing: 79. BE-Bra


79it [37:27, 42.15s/it]

Processing: 80. BE-Dor


80it [38:32, 49.07s/it]

Processing: 81. BE-Lon


81it [40:27, 68.87s/it]

Processing: 82. BE-Vie


82it [42:30, 84.98s/it]

Processing: 83. CH-Cha


83it [43:22, 75.27s/it]

Processing: 84. CH-Dav


84it [44:15, 68.52s/it]

Processing: 85. CH-Fru


85it [45:07, 63.72s/it]

Processing: 86. CH-Lae


86it [46:08, 62.89s/it]

Processing: 87. CH-Oe2


87it [47:32, 69.20s/it]

Processing: 88. CZ-BK1


88it [49:21, 81.02s/it]

Processing: 89. CZ-KrP


89it [49:43, 63.43s/it]

Processing: 90. CZ-Lnz


90it [49:45, 44.81s/it]

Processing: 91. CZ-RAJ


91it [50:43, 48.83s/it]

Processing: 92. CZ-Stn


92it [52:06, 59.01s/it]

Processing: 93. CZ-wet


93it [52:59, 57.45s/it]

Processing: 94. DE-Geb


94it [53:57, 57.61s/it]

Processing: 95. DE-Gri


95it [54:44, 54.32s/it]

Processing: 96. DE-Hai


96it [56:15, 65.38s/it]

Processing: 97. DE-HoH


97it [56:17, 46.14s/it]

Processing: 98. DE-Kli


98it [57:04, 46.59s/it]

Processing: 99. DE-Obe


99it [58:28, 57.71s/it]

Processing: 100. DE-RuR


100it [59:34, 60.40s/it]

Processing: 101. DE-Tha


101it [1:00:36, 60.79s/it]

Processing: 102. DK-Sor


102it [1:01:47, 63.70s/it]

Processing: 103. ES-LJu


103it [1:02:07, 50.62s/it]

Processing: 104. ES-LM1


104it [1:02:09, 36.00s/it]

Processing: 105. ES-LM2


105it [1:02:12, 26.10s/it]

Processing: 106. FI-Hyy


106it [1:04:45, 64.16s/it]

Processing: 107. FI-Let


107it [1:07:07, 87.54s/it]

Processing: 108. FR-Aur


108it [1:07:26, 67.08s/it]

Processing: 109. FR-Bil


109it [1:07:29, 47.87s/it]

Processing: 110. FR-FBn


110it [1:07:38, 36.31s/it]

Processing: 111. FR-Fon


111it [1:08:47, 45.97s/it]

Processing: 112. FR-Hes


112it [1:08:56, 34.89s/it]

Processing: 113. FR-Lam


113it [1:09:17, 30.69s/it]

Processing: 114. GF-Guy


114it [1:13:42, 100.91s/it]

Processing: 115. IL-Yat


115it [1:14:22, 82.82s/it] 

Processing: 116. IT-BCi


116it [1:15:06, 71.16s/it]

Processing: 117. IT-Lav


117it [1:16:25, 73.46s/it]

Processing: 118. IT-MBo


118it [1:17:59, 79.64s/it]

Processing: 119. IT-Ren


119it [1:19:22, 80.60s/it]

Processing: 120. IT-SR2


120it [1:19:26, 57.51s/it]

Processing: 121. IT-Tor


121it [1:22:52, 102.15s/it]

Processing: 122. RU-Fyo


122it [1:25:36, 120.72s/it]

Processing: 123. SE-Deg


123it [1:27:59, 127.35s/it]

Processing: 124. SE-Htm


124it [1:28:01, 89.78s/it] 

Processing: 125. SE-Nor


125it [1:28:38, 73.91s/it]

Processing: 126. SE-Ros


126it [1:28:40, 52.32s/it]

ERROR: SE-Ros run into error. Exception: Shape of passed values is (3744, 16), indices imply (3744, 17)
Processing: 127. DE-Hte


127it [1:30:02, 61.27s/it]

Processing: 128. NL-Loo


128it [1:32:34, 88.40s/it]

Processing: 129. SE-Lnn


129it [1:33:06, 43.30s/it]


Initial records: 4613880, Final records after resampling + gap-filling: 4822944
Total retained sites: 129/129 = 1.00
Missing values after site-level imputation: 0
Not imputing missing values at global level
Missing values after global-level imputation: 0
Data size after after merged with site metadata: (4822944, 34)
Data size after after merged with monthly data: (4822944, 50)
72960 missing values introduced after monthly merge


In [8]:
# Get the memory usage of the dataframe in bytes
memory_usage = data_df.memory_usage().sum()
memory_usage_gb = memory_usage / 1_000_000_000
print(f"The dataframe uses {memory_usage_gb:.2f} GB of memory.")

The dataframe uses 1.97 GB of memory.


In [9]:
data_df['site_id'].unique()

array(['AR-SLu', 'AR-Vir', 'AT-Neu', 'AU-ASM', 'AU-Cpr', 'AU-Cum',
       'AU-DaP', 'AU-DaS', 'AU-Emr', 'AU-Gin', 'AU-How', 'AU-RDF',
       'AU-Rig', 'AU-Stp', 'AU-TTE', 'AU-Whr', 'AU-Wom', 'BE-Bra',
       'BE-Dor', 'BE-Lon', 'BE-Vie', 'CA-Ca3', 'CA-Cbo', 'CA-Gro',
       'CA-Oas', 'CA-TP1', 'CA-TP3', 'CA-TP4', 'CA-TPD', 'CH-Cha',
       'CH-Dav', 'CH-Fru', 'CH-Lae', 'CH-Oe2', 'CZ-BK1', 'CZ-BK2',
       'CZ-KrP', 'CZ-Lnz', 'CZ-RAJ', 'CZ-Stn', 'CZ-wet', 'DE-Geb',
       'DE-Gri', 'DE-Hai', 'DE-HoH', 'DE-Hte', 'DE-Kli', 'DE-Lnf',
       'DE-Obe', 'DE-RuR', 'DE-SfN', 'DE-Spw', 'DE-Tha', 'DK-Sor',
       'ES-Amo', 'ES-LJu', 'ES-LM1', 'ES-LM2', 'FI-Hyy', 'FI-Let',
       'FI-Sod', 'FR-Aur', 'FR-Bil', 'FR-FBn', 'FR-Fon', 'FR-Hes',
       'FR-Lam', 'FR-Pue', 'GF-Guy', 'IL-Yat', 'IT-BCi', 'IT-CA1',
       'IT-CA2', 'IT-CA3', 'IT-Lav', 'IT-MBo', 'IT-Noe', 'IT-Ren',
       'IT-Ro2', 'IT-SR2', 'IT-SRo', 'IT-Tor', 'NL-Hor', 'NL-Loo',
       'RU-Fyo', 'SE-Deg', 'SE-Htm', 'SE-Lnn', 'SE-Nor', 'US-A

# CHECKPOINT: Save full raw data

In [10]:
# Upload to Azure Storage Blob
# ref: https://stackoverflow.com/a/54666079
from io import BytesIO
data_cleanup_checkpoint = True
tag = "raw"
blob_name = f"{blob_name_base}_{tag}.{ext}"

# exp_v2 dataset: File uploaded to all-sites-data/full_2010_2015_all_v_exp2_raw.parquet

if data_cleanup_checkpoint:

  parquet_file = BytesIO()
  data_df.to_parquet(parquet_file, engine='pyarrow')
  parquet_file.seek(0)

  azStorageClient = AzStorageClient(az_cred_file)
  azStorageClient.uploadBlob(container, blob_name, parquet_file, overwrite=True)

File uploaded to all-sites-data/full_2010_2015_all_v_exp_raw.parquet


In [11]:
loaded_df = None
local_file = tmp_dir + os.sep + blob_name
if not (os.path.exists(local_file)):
    azStorageClient = AzStorageClient(az_cred_file)
    file_stream = azStorageClient.downloadBlob2Stream(container, blob_name)
    loaded_df = pd.read_parquet(file_stream, engine='pyarrow')
    loaded_df.to_parquet(local_file)
else:
    loaded_df = pd.read_parquet(local_file)

print(f"Data size: {loaded_df.shape}")

Data size: (4822944, 50)


In [None]:
from google.colab import runtime
runtime.unassign()