# Notebook Setup

In [1]:
# Detect whether the kernel is a Google Colab runtime; later cells branch on this flag.
IN_COLLAB = 'google.colab' in str(get_ipython())

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"

# On Colab, mount Google Drive so the project folder above becomes reachable.
if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

Mounted at /content/drive/


## Import Modules

In [2]:
# install required modules quietly
required_packages = ['azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import pandas as pd
import numpy as np
import sys

# Load local custom modules
# Work from the project root so the relative tools path below resolves.
os.chdir(MY_HOME_ABS_PATH)
tools_dir = os.path.abspath("./code/src/tools")
if IN_COLLAB:
  # On Colab the project tools go first on the module search path
  sys.path.insert(0, tools_dir)
else:
  sys.path.append(tools_dir)

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

# Show every column when displaying frames, and render floats with 5 decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.5f}'.format)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m387.8/387.8 KB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 KB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.5/174.5 KB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25h

# Define Constants

In [3]:
# Project directory layout, rooted at the user-configured home path
root_dir = MY_HOME_ABS_PATH
tmp_dir = f"{root_dir}{os.sep}.tmp"
data_dir = f"{root_dir}{os.sep}data"
cred_dir = f"{root_dir}{os.sep}.cred"
az_cred_file = f"{cred_dir}{os.sep}azblobcred.json"

# Raw input directory: a shared Drive folder on Colab, otherwise the temp folder
raw_data_dir = "/content/drive/MyDrive/W210/Data/half_hourly_data" if IN_COLLAB else tmp_dir

site_metadata_filename = f"{data_dir}{os.sep}site-metadata.csv"
monthly_data_filename = f"{data_dir}{os.sep}monthly-mvp.csv"

# File
# Storage container name and the blob names derived from the dataset version
container = "all-sites-data"
ext = "parquet"
ver = "mvp"
blob_name_base = f"full_2010_2015_v_{ver}"
train_blob_name = f"full_2010_2015-train-v-{ver}.{ext}"
test_blob_name = f"full_2010_2015-test-v-{ver}.{ext}"

In [4]:
# Define features and target variables of the data pipelines
target_variable = 'GPP_NT_VUT_REF'
target_variable_qc = 'NEE_VUT_REF_QC'
hourly_features = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
                     'datetime', 'year', 'month', 'day', 'hour', 'date',
                     'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7']
metadata_features = ['site_id', 'filename', 'lat', 'long', 'koppen_sub', 'koppen_main', 'IGBP',
                     'c3c4', 'c4_percent', 'monthly_data_available']

# Define the features to use in KNN imputer, only using real values as cat are same per site
imp_exclude_cols = ['date', 'datetime', 'year', 'month', 'hour', 'day', 'minute', 'site_id', 'IGBP']
# Built from target_variable (not a repeated literal) so the imputation columns
# stay in sync if the target is ever changed above.
imp_cols = [x for x in hourly_features + [target_variable] if x not in imp_exclude_cols]

In [5]:
# Six groups of site IDs. A later cell flattens the first four groups into the
# training set and uses the fifth and sixth groups as the validation and test sets.
site_splits =[
  # group 0
  ['AR-SLu', 'AU-ASM', 'AU-Cum', 'AU-How', 'CA-TP3', 'CA-TPD', 'CN-Sw2', 'DE-Lnf',
   'IT-CA3', 'NL-Hor', 'US-Syv', 'US-AR2', 'US-ARM', 'US-Vcp', 'CH-Cha', 'CZ-KrP', 
   'CZ-Lnz', 'DE-Geb', 'DE-Obe', 'ES-LJu', 'FI-Let', 'IT-Lav', 'SE-Deg'],
  # group 1
  ['AU-Cpr', 'AU-Wom', 'CZ-BK2', 'DE-SfN', 'IT-CA1', 'IT-CA2', 'IT-Ro2', 'US-IB2', 
   'US-Me6', 'US-Ton', 'CA-Ca3', 'US-CRT', 'US-KFS', 'US-Mpj', 'US-Prr', 'US-Ro1', 
   'US-Tw4', 'BE-Bra', 'CZ-BK1', 'DE-Hai', 'IL-Yat', 'IT-Tor', 'SE-Htm'],
  # group 2
  ['AT-Neu', 'AU-RDF', 'AU-Whr', 'CA-TP1', 'DE-Zrk', 'IT-SRo', 'US-Wkg', 'CA-Ca2',
   'CA-TP4', 'US-Bar', 'US-Fmf', 'US-Oho', 'US-SRG', 'US-Ses', 'CH-Lae', 'CZ-RAJ',
   'CZ-wet', 'DE-Kli', 'DE-RuR', 'ES-LM2', 'FR-Fon', 'FR-Lam'],
  # group 3
  ['AR-Vir', 'AU-DaS', 'AU-Emr', 'AU-Gin', 'AU-Rig', 'AU-TTE', 'DE-Spw', 'FR-Pue',
   'IT-Isp', 'IT-Noe', 'US-Twt', 'US-WPT', 'CA-Cbo', 'US-Vcm', 'BE-Dor', 'BE-Vie',
   'CZ-Stn', 'FI-Hyy', 'SE-Nor', 'SE-Ros', 'NL-Loo', 'SE-Lnn'],
  # group 4 (validation)
  ['AU-DaP', 'AU-GWW', 'AU-Rob', 'AU-Stp', 'US-GLE', 'US-NR1', 'US-Whs', 'CA-Ca1',
   'CA-Gro', 'US-AR1', 'US-Rws', 'US-UMd', 'US-Wjs', 'CH-Fru', 'CH-Oe2', 'DE-Tha',
   'DK-Sor', 'FR-Bil', 'FR-Hes', 'IT-BCi', 'IT-SR2', 'DE-Hte'],
  # group 5 (test)
  ['CA-Oas', 'ES-Amo', 'FI-Sod', 'US-Myb', 'US-SRM', 'US-Tw3', 'US-Var', 'US-WCr',
   'US-Ho1', 'US-Seg', 'US-UMB', 'BE-Lon', 'CH-Dav', 'DE-Gri', 'DE-HoH', 'ES-LM1',
   'FR-Aur', 'FR-FBn', 'GF-Guy', 'IT-MBo', 'IT-Ren', 'RU-Fyo']
]

# Stage 1: Trim and Merge Site Metadata

In [None]:
# Define impute / resample switches passed to the preparation pipeline below
impute = True
impute_method = 'knn'
impute_global = True
resample = True
time_col = 'datetime'
duration = 'H'

# Filter sequence to date range
missing_thresh = 0.2
start_date = '2010-01-01'
end_date = '2015-12-31'

# Impute params (if used)
# NOTE(review): exact meaning of n_fit and c is defined by the pipeline library — confirm there.
k = 5
weights = 'uniform'
n_fit = 20000
c = -1

# First four site groups form the training set; the 5th and 6th are validation and test.
train_sites = []
for split in site_splits[:4]:
  train_sites.extend(split)
val_sites, test_sites = site_splits[4], site_splits[5]

for label, sites in (("Train", train_sites), ("Validation", val_sites), ("Test", test_sites)):
  print(f"{label}({len(sites)}): {sites}")

Train(90): ['AR-SLu', 'AU-ASM', 'AU-Cum', 'AU-How', 'CA-TP3', 'CA-TPD', 'CN-Sw2', 'DE-Lnf', 'IT-CA3', 'NL-Hor', 'US-Syv', 'US-AR2', 'US-ARM', 'US-Vcp', 'CH-Cha', 'CZ-KrP', 'CZ-Lnz', 'DE-Geb', 'DE-Obe', 'ES-LJu', 'FI-Let', 'IT-Lav', 'SE-Deg', 'AU-Cpr', 'AU-Wom', 'CZ-BK2', 'DE-SfN', 'IT-CA1', 'IT-CA2', 'IT-Ro2', 'US-IB2', 'US-Me6', 'US-Ton', 'CA-Ca3', 'US-CRT', 'US-KFS', 'US-Mpj', 'US-Prr', 'US-Ro1', 'US-Tw4', 'BE-Bra', 'CZ-BK1', 'DE-Hai', 'IL-Yat', 'IT-Tor', 'SE-Htm', 'AT-Neu', 'AU-RDF', 'AU-Whr', 'CA-TP1', 'DE-Zrk', 'IT-SRo', 'US-Wkg', 'CA-Ca2', 'CA-TP4', 'US-Bar', 'US-Fmf', 'US-Oho', 'US-SRG', 'US-Ses', 'CH-Lae', 'CZ-RAJ', 'CZ-wet', 'DE-Kli', 'DE-RuR', 'ES-LM2', 'FR-Fon', 'FR-Lam', 'AR-Vir', 'AU-DaS', 'AU-Emr', 'AU-Gin', 'AU-Rig', 'AU-TTE', 'DE-Spw', 'FR-Pue', 'IT-Isp', 'IT-Noe', 'US-Twt', 'US-WPT', 'CA-Cbo', 'US-Vcm', 'BE-Dor', 'BE-Vie', 'CZ-Stn', 'FI-Hyy', 'SE-Nor', 'SE-Ros', 'NL-Loo', 'SE-Lnn']
Validation(22): ['AU-DaP', 'AU-GWW', 'AU-Rob', 'AU-Stp', 'US-GLE', 'US-NR1', 'US-Whs', '

In [None]:
# Configure the all-sites hourly preparation object with file locations,
# the site split lists, and the feature/target definitions.
prep_hourly = PrepareAllSitesHourly(
    site_metadata_filename,
    monthly_data_filename,
    train_sites,
    val_sites,
    test_sites,
    hourly_features,
    metadata_features,
    target_variable_qc,
    target_variable,
    raw_data_dir,
)

# Run the full pipeline; arguments are positional, so their order must match
# the library signature exactly.
data_df = prep_hourly.all_sites_all_sources(
    imp_cols,
    resample,
    impute,
    impute_method,
    impute_global,
    k,
    weights,
    n_fit,
    time_col,
    duration,
    start_date,
    end_date,
    missing_thresh,
    c,
)

Sites with missing monthly data: ['DE-Zrk', 'IT-Isp', 'AU-GWW', 'AU-Rob', 'US-Tw3']


1it [00:04,  4.68s/it]

1. AR-SLu: (10800, 27)


2it [00:12,  6.36s/it]

2. AR-Vir: (20448, 27)


3it [00:31, 12.46s/it]

3. AT-Neu: (26304, 27)


4it [00:35,  9.09s/it]

4. AU-ASM: (37944, 27)


5it [00:43,  8.44s/it]

5. AU-Cpr: (38736, 27)


6it [00:47,  6.93s/it]

6. AU-Cum: (19296, 27)


7it [01:11, 12.59s/it]

7. AU-DaP: (32304, 27)


8it [01:32, 15.32s/it]

8. AU-DaS: (43824, 27)


9it [01:38, 12.26s/it]

9. AU-Emr: (22464, 27)


10it [01:47, 11.30s/it]

10. AU-Gin: (28200, 27)


11it [02:24, 19.27s/it]

11. AU-How: (43824, 27)


12it [02:29, 14.83s/it]

12. AU-RDF: (16008, 27)


13it [02:38, 13.01s/it]

13. AU-Rig: (35064, 27)


14it [02:56, 14.77s/it]

14. AU-Stp: (43824, 27)


15it [02:58, 10.81s/it]

15. AU-TTE: (21528, 27)


16it [03:00,  8.08s/it]

16. AU-Whr: (27024, 27)


17it [03:12,  9.32s/it]

17. AU-Wom: (43296, 27)


18it [03:14,  7.26s/it]

18. CA-Oas: (8760, 27)


19it [03:54, 16.99s/it]

19. CA-TP1: (43512, 27)


20it [04:23, 20.45s/it]

20. CA-TP3: (43512, 27)


21it [04:47, 21.64s/it]

21. CA-TPD: (25992, 27)


22it [04:48, 15.47s/it]

  Column(s) with only NAN: ['b6']
22. CN-Sw2: (9600, 27)


23it [05:19, 20.17s/it]

23. CZ-BK2: (26304, 27)


24it [05:45, 21.91s/it]

24. DE-Lnf: (26304, 27)


25it [05:54, 18.00s/it]

25. DE-SfN: (21840, 27)


26it [06:36, 25.13s/it]

26. DE-Spw: (40080, 27)


27it [06:41, 19.19s/it]

27. ES-Amo: (26304, 27)


28it [08:00, 37.21s/it]

28. FI-Sod: (43824, 27)


29it [08:09, 28.73s/it]

29. FR-Pue: (43824, 27)


30it [08:18, 22.55s/it]

30. IT-CA1: (30936, 27)


31it [08:24, 17.78s/it]

31. IT-CA2: (30096, 27)


32it [08:33, 15.22s/it]

32. IT-CA3: (26712, 27)


33it [08:50, 15.69s/it]

33. IT-Noe: (41712, 27)


34it [08:58, 13.44s/it]

34. IT-Ro2: (26136, 27)


35it [09:08, 12.20s/it]

35. IT-SRo: (26304, 27)


36it [09:15, 10.68s/it]

36. NL-Hor: (14736, 27)


37it [10:10, 24.15s/it]

37. US-GLE: (43824, 27)


38it [10:20, 19.82s/it]

38. US-IB2: (17520, 27)


39it [10:43, 20.71s/it]

39. US-Me6: (40104, 27)


40it [10:49, 16.18s/it]

40. US-Myb: (34152, 27)


41it [11:22, 21.30s/it]

41. US-NR1: (43824, 27)


42it [11:25, 15.94s/it]

42. US-SRM: (43824, 27)


43it [11:49, 18.29s/it]

43. US-Syv: (23352, 27)


44it [11:55, 14.69s/it]

44. US-Ton: (43824, 27)


45it [12:05, 13.18s/it]

45. US-Twt: (43008, 27)


46it [12:10, 10.61s/it]

46. US-Var: (43824, 27)


47it [12:58, 21.89s/it]

47. US-WCr: (35904, 27)


48it [13:16, 20.83s/it]

48. US-WPT: (26304, 27)


49it [13:18, 15.07s/it]

49. US-Whs: (43824, 27)


50it [13:20, 11.27s/it]

50. US-Wkg: (43824, 27)


51it [13:22,  8.52s/it]

  Column(s) with only NAN: ['b6']
51. CA-Ca1: (8760, 27)


52it [13:24,  6.47s/it]

  Column(s) with only NAN: ['b6']
52. CA-Ca2: (8760, 27)


53it [13:26,  5.07s/it]

  Column(s) with only NAN: ['b6']
53. CA-Ca3: (8760, 27)


54it [14:04, 14.95s/it]

54. CA-Cbo: (43824, 27)


55it [15:03, 28.15s/it]

55. CA-Gro: (38328, 27)


56it [15:30, 28.01s/it]

56. CA-TP4: (52584, 27)


57it [15:39, 22.06s/it]

57. US-AR1: (26304, 27)


58it [15:44, 17.18s/it]

58. US-AR2: (22680, 27)


59it [15:58, 16.20s/it]

59. US-ARM: (52584, 27)


60it [16:44, 25.09s/it]

60. US-Bar: (52584, 27)


61it [16:55, 20.69s/it]

61. US-CRT: (26304, 27)


62it [16:56, 14.87s/it]

  Column(s) with only NAN: ['b6']
62. US-Fmf: (8760, 27)


63it [17:19, 17.29s/it]

63. US-Ho1: (46680, 27)


64it [17:32, 15.96s/it]

64. US-KFS: (52584, 27)


65it [17:43, 14.44s/it]

65. US-Mpj: (52584, 27)


66it [18:02, 16.07s/it]

66. US-Oho: (35064, 27)


67it [19:16, 33.23s/it]

67. US-Prr: (45360, 27)


68it [19:42, 31.28s/it]

68. US-Ro1: (52584, 27)


69it [19:44, 22.28s/it]

  Column(s) with only NAN: ['b6']
69. US-Rws: (10920, 27)


70it [19:46, 16.34s/it]

70. US-SRG: (52584, 27)


71it [19:54, 13.78s/it]

71. US-Seg: (52584, 27)


72it [19:58, 10.94s/it]

72. US-Ses: (52584, 27)


73it [20:00,  8.24s/it]

73. US-Tw4: (14496, 27)


74it [21:10, 26.70s/it]

74. US-UMB: (52584, 27)


75it [22:08, 35.98s/it]

75. US-UMd: (52584, 27)


76it [22:46, 36.72s/it]

76. US-Vcm: (52584, 27)


77it [23:12, 33.53s/it]

77. US-Vcp: (52584, 27)


78it [23:19, 25.53s/it]

78. US-Wjs: (52584, 27)


79it [24:03, 31.14s/it]

79. BE-Bra: (52584, 27)


80it [24:56, 37.76s/it]

80. BE-Dor: (43824, 27)


81it [26:04, 46.70s/it]

81. BE-Lon: (52584, 27)


82it [27:11, 52.94s/it]

82. BE-Vie: (52584, 27)


83it [27:45, 47.12s/it]

83. CH-Cha: (52584, 27)


84it [28:18, 42.96s/it]

84. CH-Dav: (52584, 27)


85it [28:51, 40.01s/it]

85. CH-Fru: (51000, 27)


86it [29:27, 38.80s/it]

86. CH-Lae: (52584, 27)


87it [30:12, 40.68s/it]

87. CH-Oe2: (52584, 27)


88it [31:10, 45.76s/it]

88. CZ-BK1: (52584, 27)


89it [31:21, 35.43s/it]

89. CZ-KrP: (17520, 27)


90it [31:23, 25.25s/it]

  Column(s) with only NAN: ['b6']
90. CZ-Lnz: (8760, 27)


91it [31:57, 27.94s/it]

91. CZ-RAJ: (35064, 27)


92it [32:43, 33.42s/it]

92. CZ-Stn: (52584, 27)


93it [33:16, 33.19s/it]

93. CZ-wet: (52584, 27)


94it [33:50, 33.46s/it]

94. DE-Geb: (52584, 27)


95it [34:23, 33.45s/it]

95. DE-Gri: (52368, 27)


96it [35:40, 46.35s/it]

96. DE-Hai: (52008, 27)
  Column(s) with only NAN: ['b6']
97. DE-HoH: (8760, 27)


98it [36:21, 34.81s/it]

98. DE-Kli: (52584, 27)


99it [37:25, 43.58s/it]

99. DE-Obe: (52584, 27)


100it [38:01, 41.22s/it]

100. DE-RuR: (40632, 27)


101it [38:37, 39.88s/it]

101. DE-Tha: (52584, 27)


102it [39:14, 39.02s/it]

102. DK-Sor: (52584, 27)


103it [39:25, 30.53s/it]

103. ES-LJu: (52584, 27)


104it [39:27, 21.92s/it]

104. ES-LM1: (15768, 27)


105it [39:29, 16.04s/it]

105. ES-LM2: (15768, 27)


106it [40:48, 34.75s/it]

106. FI-Hyy: (52584, 27)


107it [42:04, 47.11s/it]

107. FI-Let: (52584, 27)


108it [42:17, 36.98s/it]

108. FR-Aur: (52584, 27)


109it [42:20, 26.69s/it]

109. FR-Bil: (12864, 27)


110it [42:27, 20.84s/it]

110. FR-FBn: (52584, 27)


111it [43:12, 28.20s/it]

111. FR-Fon: (52584, 27)


112it [43:19, 21.89s/it]

112. FR-Hes: (17520, 27)


113it [43:33, 19.29s/it]

113. FR-Lam: (52584, 27)


114it [46:53, 73.71s/it]

114. GF-Guy: (52584, 27)
115. IL-Yat: (52536, 27)


116it [47:40, 48.28s/it]

116. IT-BCi: (52584, 27)
117. IT-Lav: (52584, 27)


118it [49:13, 48.03s/it]

118. IT-MBo: (52584, 27)
119. IT-Ren: (52584, 27)


119it [49:54, 45.85s/it]

120. IT-SR2: (26280, 27)


120it [49:57, 33.22s/it]

121. IT-Tor: (52584, 27)


121it [51:43, 55.02s/it]

122. RU-Fyo: (52584, 27)


122it [53:09, 64.28s/it]

123. SE-Deg: (52584, 27)


123it [54:34, 70.42s/it]

124. SE-Htm: (8760, 27)


124it [54:36, 49.94s/it]

125. SE-Nor: (17520, 27)


125it [54:55, 40.77s/it]

  Column(s) with only NAN: ['b6']
126. SE-Ros: (12648, 27)


126it [54:58, 29.24s/it]

127. DE-Hte: (49824, 27)


127it [55:52, 36.66s/it]

128. NL-Loo: (52584, 27)


128it [57:04, 47.41s/it]

129. SE-Lnn: (17520, 27)


129it [57:20, 26.67s/it]


Initial records: 4613880, Final records after resampling + gap-filling: 4862712
Total retained sites: 129/129 = 1.00
Missing values after site-level imputation: 864
Missing values after global-level imputation: 0
Data size after after merged with site metadata: (4862712, 34)
Data size after after merged with monthly data: (4862712, 51)


In [None]:
# Only report when the frame still contains missing values.
if data_df.isna().any().any():
  # Rows holding at least one NaN, counted per site / year / month
  rows_with_na = data_df[data_df.isna().any(axis=1)]
  display(rows_with_na.groupby(['site_id', 'year', 'month']).count())
  # One-row table with the number of NaNs in each column
  display(pd.DataFrame(data_df.isna().sum()).T)

In [None]:
# Total bytes reported by pandas for the frame (default, non-deep accounting)
memory_usage = data_df.memory_usage().sum()
# Convert bytes to decimal gigabytes
memory_usage_gb = memory_usage / 1e9
print(f"The dataframe uses {memory_usage_gb:.2f} GB of memory.")

The dataframe uses 2.02 GB of memory.


In [None]:
# List the distinct site IDs present in the prepared dataset
data_df.site_id.unique()

array(['AR-SLu', 'AR-Vir', 'AT-Neu', 'AU-ASM', 'AU-Cpr', 'AU-Cum',
       'AU-DaP', 'AU-DaS', 'AU-Emr', 'AU-Gin', 'AU-How', 'AU-RDF',
       'AU-Rig', 'AU-Stp', 'AU-TTE', 'AU-Whr', 'AU-Wom', 'BE-Bra',
       'BE-Dor', 'BE-Lon', 'BE-Vie', 'CA-Ca1', 'CA-Ca2', 'CA-Ca3',
       'CA-Cbo', 'CA-Gro', 'CA-Oas', 'CA-TP1', 'CA-TP3', 'CA-TP4',
       'CA-TPD', 'CH-Cha', 'CH-Dav', 'CH-Fru', 'CH-Lae', 'CH-Oe2',
       'CN-Sw2', 'CZ-BK1', 'CZ-BK2', 'CZ-KrP', 'CZ-Lnz', 'CZ-RAJ',
       'CZ-Stn', 'CZ-wet', 'DE-Geb', 'DE-Gri', 'DE-Hai', 'DE-HoH',
       'DE-Hte', 'DE-Kli', 'DE-Lnf', 'DE-Obe', 'DE-RuR', 'DE-SfN',
       'DE-Spw', 'DE-Tha', 'DK-Sor', 'ES-Amo', 'ES-LJu', 'ES-LM1',
       'ES-LM2', 'FI-Hyy', 'FI-Let', 'FI-Sod', 'FR-Aur', 'FR-Bil',
       'FR-FBn', 'FR-Fon', 'FR-Hes', 'FR-Lam', 'FR-Pue', 'GF-Guy',
       'IL-Yat', 'IT-BCi', 'IT-CA1', 'IT-CA2', 'IT-CA3', 'IT-Lav',
       'IT-MBo', 'IT-Noe', 'IT-Ren', 'IT-Ro2', 'IT-SR2', 'IT-SRo',
       'IT-Tor', 'NL-Hor', 'NL-Loo', 'RU-Fyo', 'SE-Deg', 'SE-H

# CHECKPOINT: Save full raw data

In [6]:
# Upload to Azure Storage Blob
# ref: https://stackoverflow.com/a/54666079
from io import BytesIO

# Set to True to publish the in-memory data_df as the "raw" checkpoint blob;
# blob_name is also read by the download cell that follows.
data_cleanup_checkpoint = False
tag = "raw"
blob_name = f"{blob_name_base}_{tag}.{ext}"

if data_cleanup_checkpoint:
  # Serialize the frame into an in-memory parquet buffer
  parquet_file = BytesIO()
  data_df.to_parquet(parquet_file, engine='pyarrow')
  # Rewind so the upload reads the buffer from its start
  parquet_file.seek(0)

  azStorageClient = AzStorageClient(az_cred_file)
  azStorageClient.uploadBlob(container, blob_name, parquet_file, overwrite=True)

In [16]:
# Load the "raw" checkpoint, preferring a local parquet cache over a blob download.
# Rebinding to None first drops the reference to any previously loaded frame.
data_df = None
local_file = tmp_dir + os.sep + blob_name
if not (os.path.exists(local_file)):
    # Cache miss: download the blob, then keep a local copy for later runs
    azStorageClient = AzStorageClient(az_cred_file)
    file_stream = azStorageClient.downloadBlob2Stream(container, blob_name)
    data_df = pd.read_parquet(file_stream, engine='pyarrow')
    # The cache folder may not exist yet (e.g. on a fresh checkout); create it
    # so writing the local copy does not fail.
    os.makedirs(tmp_dir, exist_ok=True)
    data_df.to_parquet(local_file)
else:
    # Cache hit: read the local copy
    data_df = pd.read_parquet(local_file)

print(f"Data size: {data_df.shape}")

Data size: (4862712, 51)


# Train/Val/Test Split

In [17]:
# Load Site data
site_metadata_df = pd.read_csv(site_metadata_filename, usecols=['site_id', 'filename', 'IGBP'])

# only focus on target sites
print(f"size:{site_metadata_df.shape}")
site_metadata_df.dropna(inplace=True)

# Group IGBP: fold WSA into SAV and CSH/OSH into SHB, then drop WAT sites.
# The mapping is applied in one assignment; an in-place replace on a column
# selection does not reliably write back under pandas Copy-on-Write.
print(site_metadata_df.IGBP.unique())
site_metadata_df['gen_IGBP'] = site_metadata_df['IGBP'].replace(
    {'WSA': 'SAV', 'CSH': 'SHB', 'OSH': 'SHB'})
site_metadata_df.drop(site_metadata_df[site_metadata_df['gen_IGBP'] == 'WAT'].index, inplace = True)
print(site_metadata_df.gen_IGBP.unique())

# Get available sites in the datasets
available_sites = data_df['site_id'].unique()
site_data_df = site_metadata_df.loc[site_metadata_df['site_id'].isin(available_sites)]
print(f"available sites: {site_data_df.shape}")

# Conduct k-fold splitting, stratified on the grouped IGBP class
from sklearn.model_selection import StratifiedKFold
n = 5
skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=42) # Add random state for reproducibility
folds = skf.split(site_data_df['site_id'], site_data_df['gen_IGBP'])

site_splits = []

for i, (train_index, test_index) in enumerate(folds):
  print(f"Fold {i+1}:")
  # Use a dedicated name for the fold slice: rebinding data_df here would
  # replace the full dataset with a handful of metadata rows.
  fold_df = site_data_df[['site_id', 'gen_IGBP']].iloc[test_index]
  sites = list(fold_df.site_id.unique())
  print(f"  Count={test_index.shape}")
  print(f"  IGBP ={np.sort(fold_df.gen_IGBP.unique())}")
  print(f"  Sites={sites}")
  print("")

  site_splits.append(sites)

# print all sites
print(site_splits)

size:(286, 3)
['MF' 'ENF' 'GRA' 'SAV' 'WSA' 'EBF' 'WET' 'DBF' 'OSH' 'CRO' 'CSH' 'WAT']
['MF' 'ENF' 'GRA' 'SAV' 'EBF' 'WET' 'DBF' 'SHB' 'CRO']
available sites: (129, 4)
Fold 1:
  Count=(26,)
  IGBP =['CRO' 'DBF' 'EBF' 'ENF' 'GRA' 'MF' 'SAV' 'SHB' 'WET']
  Sites=['AR-SLu', 'AU-ASM', 'AU-Cpr', 'AU-Cum', 'AU-RDF', 'CA-TP3', 'CA-TPD', 'CN-Sw2', 'DE-SfN', 'NL-Hor', 'US-Me6', 'US-Syv', 'US-WCr', 'US-AR2', 'US-Tw4', 'US-UMB', 'US-Vcp', 'CH-Cha', 'CZ-BK1', 'CZ-KrP', 'DE-Obe', 'ES-LJu', 'FI-Let', 'FR-Lam', 'IT-Lav', 'SE-Lnn']

Fold 2:
  Count=(26,)
  IGBP =['CRO' 'DBF' 'EBF' 'ENF' 'GRA' 'MF' 'SAV' 'SHB' 'WET']
  Sites=['CZ-BK2', 'DE-Spw', 'FR-Pue', 'IT-CA3', 'IT-Noe', 'IT-Ro2', 'US-IB2', 'US-Myb', 'US-SRM', 'CA-Ca3', 'US-CRT', 'US-Fmf', 'US-KFS', 'US-Prr', 'US-UMd', 'US-Wjs', 'BE-Bra', 'BE-Lon', 'CH-Lae', 'CZ-RAJ', 'DE-HoH', 'DE-Kli', 'DE-RuR', 'IL-Yat', 'IT-Tor', 'SE-Htm']

Fold 3:
  Count=(26,)
  IGBP =['CRO' 'DBF' 'EBF' 'ENF' 'GRA' 'MF' 'SAV' 'SHB' 'WET']
  Sites=['AR-Vir', 'AT-Neu', 'AU-DaS'

# Terminate Runtime

In [None]:
# Release the Colab runtime once the notebook has finished. Guarded by the
# IN_COLLAB flag so local runs do not fail on the Colab-only import.
if IN_COLLAB:
  from google.colab import runtime
  runtime.unassign()