# Notebook Setup

In [1]:
# True when the active IPython shell is a Google Colab kernel.
IN_COLLAB = 'google.colab' in str(get_ipython())

# TODO: change this to match your own local settings.
MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"

# On Colab the project lives on Google Drive, so the drive has to be mounted first.
if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Import modules

In [2]:
# install required modules quietly
required_packages = ['azure-storage-blob', 'plotly']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

In [3]:
!pip install plotly==5.13.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [19]:
import os
import sys  # needed for the sys.path edits below; without it this cell fails on a fresh kernel

# Work from the project root so the relative module paths below resolve.
os.chdir(MY_HOME_ABS_PATH)

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

from timeit import default_timer
from datetime import datetime
import gc
import pickle

# Load local custom modules: make the credential folder and the project tools importable.
sys.path.append('./.cred')
sys.path.append('./code/src/tools')
sys.path.append(os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
# NOTE(review): the star import hides which names this notebook takes from
# data_pipeline_lib; consider importing the needed names explicitly.
from data_pipeline_lib import *

# Show every column and print floats with five decimals in DataFrame output.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

## Define Constants

In [13]:
# Project directory layout, all derived from the project root.
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.sep.join([root_dir, '.tmp'])
data_dir = os.sep.join([root_dir, 'data'])
model_dir = os.sep.join([data_dir, 'models'])
cred_dir = os.sep.join([root_dir, '.cred'])
az_cred_file = os.sep.join([cred_dir, 'azblobcred.json'])

# Site metadata table shipped in the data directory.
site_metadata_filename = os.sep.join([data_dir, 'site-metadata.csv'])

# Blob coordinates of the full dataset and the local path it maps to.
container = "all-sites-data"
blob_name = "full_2010_2015_v_mvp_raw.parquet"
local_file = os.sep.join([tmp_dir, blob_name])

# Define Site splits

In [7]:
# Site ids grouped into five folds; each inner list is one fold.
SITE_SPLITS = [
    # Fold 1
    [
        'AR-SLu', 'AU-ASM', 'AU-Cpr', 'AU-Cum', 'AU-RDF', 'CA-TP3', 'CA-TPD',
        'CN-Sw2', 'DE-SfN', 'NL-Hor', 'US-Me6', 'US-Syv', 'US-WCr', 'US-AR2',
        'US-Tw4', 'US-UMB', 'US-Vcp', 'CH-Cha', 'CZ-BK1', 'CZ-KrP', 'DE-Obe',
        'ES-LJu', 'FI-Let', 'FR-Lam', 'IT-Lav', 'SE-Lnn',
    ],
    # Fold 2
    [
        'CZ-BK2', 'DE-Spw', 'FR-Pue', 'IT-CA3', 'IT-Noe', 'IT-Ro2', 'US-IB2',
        'US-Myb', 'US-SRM', 'CA-Ca3', 'US-CRT', 'US-Fmf', 'US-KFS', 'US-Prr',
        'US-UMd', 'US-Wjs', 'BE-Bra', 'BE-Lon', 'CH-Lae', 'CZ-RAJ', 'DE-HoH',
        'DE-Kli', 'DE-RuR', 'IL-Yat', 'IT-Tor', 'SE-Htm',
    ],
    # Fold 3
    [
        'AR-Vir', 'AT-Neu', 'AU-DaS', 'AU-TTE', 'AU-Wom', 'CA-TP1', 'IT-CA1',
        'IT-SRo', 'US-WPT', 'US-Wkg', 'CA-Ca2', 'CA-Cbo', 'CA-TP4', 'US-ARM',
        'US-Ro1', 'US-Rws', 'US-SRG', 'US-Vcm', 'BE-Dor', 'BE-Vie', 'CZ-Stn',
        'DE-Geb', 'ES-LM2', 'FR-Fon', 'SE-Ros', 'DE-Hte',
    ],
    # Fold 4
    [
        'AU-DaP', 'AU-Emr', 'AU-Gin', 'AU-How', 'AU-Rig', 'US-GLE', 'US-NR1',
        'US-Twt', 'CA-Ca1', 'CA-Gro', 'US-AR1', 'US-Bar', 'US-Mpj', 'US-Ses',
        'CH-Fru', 'CH-Oe2', 'DE-Hai', 'DK-Sor', 'FI-Hyy', 'FR-Aur', 'FR-Hes',
        'GF-Guy', 'IT-SR2', 'SE-Deg', 'SE-Nor', 'NL-Loo',
    ],
    # Fold 5
    [
        'AU-Stp', 'AU-Whr', 'CA-Oas', 'DE-Lnf', 'ES-Amo', 'FI-Sod', 'IT-CA2',
        'US-Ton', 'US-Var', 'US-Whs', 'US-Ho1', 'US-Oho', 'US-Seg', 'CH-Dav',
        'CZ-Lnz', 'CZ-wet', 'DE-Gri', 'DE-Tha', 'ES-LM1', 'FR-Bil', 'FR-FBn',
        'IT-BCi', 'IT-MBo', 'IT-Ren', 'RU-Fyo',
    ],
]

# Get Target Sites

In [12]:
# Concatenate the per-fold lists into one flat list, keeping fold order.
target_sites = sum(SITE_SPLITS, [])
print(target_sites)

['AR-SLu', 'AU-ASM', 'AU-Cpr', 'AU-Cum', 'AU-RDF', 'CA-TP3', 'CA-TPD', 'CN-Sw2', 'DE-SfN', 'NL-Hor', 'US-Me6', 'US-Syv', 'US-WCr', 'US-AR2', 'US-Tw4', 'US-UMB', 'US-Vcp', 'CH-Cha', 'CZ-BK1', 'CZ-KrP', 'DE-Obe', 'ES-LJu', 'FI-Let', 'FR-Lam', 'IT-Lav', 'SE-Lnn', 'CZ-BK2', 'DE-Spw', 'FR-Pue', 'IT-CA3', 'IT-Noe', 'IT-Ro2', 'US-IB2', 'US-Myb', 'US-SRM', 'CA-Ca3', 'US-CRT', 'US-Fmf', 'US-KFS', 'US-Prr', 'US-UMd', 'US-Wjs', 'BE-Bra', 'BE-Lon', 'CH-Lae', 'CZ-RAJ', 'DE-HoH', 'DE-Kli', 'DE-RuR', 'IL-Yat', 'IT-Tor', 'SE-Htm', 'AR-Vir', 'AT-Neu', 'AU-DaS', 'AU-TTE', 'AU-Wom', 'CA-TP1', 'IT-CA1', 'IT-SRo', 'US-WPT', 'US-Wkg', 'CA-Ca2', 'CA-Cbo', 'CA-TP4', 'US-ARM', 'US-Ro1', 'US-Rws', 'US-SRG', 'US-Vcm', 'BE-Dor', 'BE-Vie', 'CZ-Stn', 'DE-Geb', 'ES-LM2', 'FR-Fon', 'SE-Ros', 'DE-Hte', 'AU-DaP', 'AU-Emr', 'AU-Gin', 'AU-How', 'AU-Rig', 'US-GLE', 'US-NR1', 'US-Twt', 'CA-Ca1', 'CA-Gro', 'US-AR1', 'US-Bar', 'US-Mpj', 'US-Ses', 'CH-Fru', 'CH-Oe2', 'DE-Hai', 'DK-Sor', 'FI-Hyy', 'FR-Aur', 'FR-Hes', 'GF-Guy',

# Get Gold Sample Site Metadata

In [15]:
# Load site metadata, reading only the columns this notebook uses.
included_site_features = [
    'site_id', 'site_name', 'IGBP', 'lat', 'long', 'koppen_main_name',
    'start_time', 'end_time', 'record_count', 'recorded_day_percentage',
]

# Keep the target sites only, drop rows with any missing value, and renumber the index
# last so it has no gaps. Built as one chain so re-running the cell gives the same frame.
train_site_metadata_df = (
    pd.read_csv(site_metadata_filename, usecols=included_site_features)
      .loc[lambda df: df['site_id'].isin(target_sites)]
      .dropna()
      .reset_index(drop=True)
)

# Report the size of the frame that is actually used downstream (after dropna).
print(f"size:{train_site_metadata_df.shape}")
display(train_site_metadata_df)


size:(129, 10)


Unnamed: 0,site_id,IGBP,lat,long,site_name,koppen_main_name,record_count,start_time,end_time,recorded_day_percentage
0,AR-SLu,MF,-33.46480,-66.45980,San Luis,Arid,22128.00000,2009-12-21 00:00:00,2011-03-26 23:30:00,1.00000
1,AR-Vir,ENF,-28.23950,-56.18860,Virasoro,Temperate,33984.00000,2010-02-13 00:00:00,2012-06-13 23:30:00,0.83099
2,AT-Neu,GRA,47.11667,11.31750,Neustift,Cold,188112.00000,2002-01-01 00:00:00,2012-12-31 23:30:00,0.97536
3,AU-ASM,SAV,-22.28300,133.24900,Alice Springs,Arid,74496.00000,2010-09-03 00:00:00,2014-12-31 23:30:00,0.98166
4,AU-Cpr,SAV,-34.00210,140.58910,Calperum,Arid,72384.00000,2010-08-01 00:00:00,2014-12-31 23:30:00,0.93432
...,...,...,...,...,...,...,...,...,...,...
124,SE-Nor,ENF,60.08650,17.47950,Norunda,Cold,120768.00000,2014-01-01 00:00:00,2020-12-31 23:30:00,0.98397
125,SE-Ros,ENF,64.17250,19.73800,Rosinedal-3,Cold,111216.00000,2014-07-23 00:00:00,2020-12-31 23:30:00,0.98428
126,DE-Hte,WET,54.21028,12.17611,Huetelmoor,Temperate,141696.00000,2009-01-01 00:00:00,2018-12-01 23:30:00,0.81502
127,NL-Loo,ENF,52.16658,5.74356,Loobos,Temperate,353760.00000,1996-07-22 00:00:00,2018-09-11 23:30:00,0.91134


In [16]:
# Distinct IGBP classes among the candidate sites, in order of first appearance.
train_site_IGBP = train_site_metadata_df['IGBP'].unique().tolist()
print(train_site_IGBP)

['MF', 'ENF', 'GRA', 'SAV', 'EBF', 'WSA', 'DBF', 'WET', 'OSH', 'CRO', 'CSH']


In [None]:
# Target test dataset record count, as a share of all candidate-site records.
total_record_count = train_site_metadata_df['record_count'].sum()

print(f"Train dataset (untrimmed) size: {total_record_count}")
# The two targets are labelled with their share so the output lines can be told apart.
print(f"Target test dataset (untrimmed) size, 20% of train: {total_record_count*0.2}")
print(f"Target test dataset (untrimmed) size, 10% of train: {total_record_count*0.1}")

Train dataset (untrimmed) size: 3460224.0
Target test dataset (untrimmed) size: 692044.8
Target test dataset (untrimmed) size: 346022.4


# Plot Folds

In [44]:
# For every fold: print its site ids, then show the matching metadata rows ordered by
# ascending record count (ties broken by descending recorded-day share).
for fold_sites in SITE_SPLITS:
  print(fold_sites)
  in_fold = train_site_metadata_df['site_id'].isin(fold_sites)
  fold_df = train_site_metadata_df.loc[in_fold]
  fold_df = fold_df.sort_values(['record_count','recorded_day_percentage'], ascending=[True, False])
  display(fold_df)

['AR-SLu', 'AU-ASM', 'AU-Cpr', 'AU-Cum', 'AU-RDF', 'CA-TP3', 'CA-TPD', 'CN-Sw2', 'DE-SfN', 'NL-Hor', 'US-Me6', 'US-Syv', 'US-WCr', 'US-AR2', 'US-Tw4', 'US-UMB', 'US-Vcp', 'CH-Cha', 'CZ-BK1', 'CZ-KrP', 'DE-Obe', 'ES-LJu', 'FI-Let', 'FR-Lam', 'IT-Lav', 'SE-Lnn']


Unnamed: 0,site_id,IGBP,lat,long,site_name,koppen_main_name,record_count,start_time,end_time,recorded_day_percentage
21,CN-Sw2,GRA,41.7902,111.8971,Siziwang Grazed (SZWG),Arid,19200.0,2010-12-20 00:00:00,2012-01-23 23:30:00,1.0
0,AR-SLu,MF,-33.4648,-66.4598,San Luis,Arid,22128.0,2009-12-21 00:00:00,2011-03-26 23:30:00,1.0
11,AU-RDF,WSA,-14.5636,132.4776,"Red Dirt Melon Farm, Northern Territory",Tropical,29808.0,2011-09-24 00:00:00,2013-07-21 23:30:00,0.93103
5,AU-Cum,EBF,-33.61518,150.72362,Cumberland Plain,Temperate,37728.0,2012-10-19 00:00:00,2014-12-31 23:30:00,0.97761
24,DE-SfN,WET,47.80639,11.3275,Schechenfilz Nord,Cold,39360.0,2012-07-05 00:00:00,2014-12-31 23:30:00,0.9011
20,CA-TPD,DBF,42.63533,-80.55773,Ontario - Turkey Point Mature Deciduous,Cold,50592.0,2012-01-01 00:00:00,2014-12-18 23:30:00,0.97322
57,US-AR2,GRA,36.6358,-99.5975,ARM USDA UNL OSU Woodward Switchgrass 2,Temperate,52416.0,2009-04-30 00:00:00,2012-08-02 23:30:00,0.91688
72,US-Tw4,WET,38.1027,-121.6413,Twitchell East End Wetland,Temperate,59808.0,2013-11-27 00:00:00,2017-12-31 23:30:00,0.83289
38,US-Me6,ENF,44.32328,-121.6078,Metolius Young Pine Burn,Cold,66672.0,2010-06-05 00:00:00,2014-12-31 23:30:00,0.83124
4,AU-Cpr,SAV,-34.0021,140.5891,Calperum,Arid,72384.0,2010-08-01 00:00:00,2014-12-31 23:30:00,0.93432


['CZ-BK2', 'DE-Spw', 'FR-Pue', 'IT-CA3', 'IT-Noe', 'IT-Ro2', 'US-IB2', 'US-Myb', 'US-SRM', 'CA-Ca3', 'US-CRT', 'US-Fmf', 'US-KFS', 'US-Prr', 'US-UMd', 'US-Wjs', 'BE-Bra', 'BE-Lon', 'CH-Lae', 'CZ-RAJ', 'DE-HoH', 'DE-Kli', 'DE-RuR', 'IL-Yat', 'IT-Tor', 'SE-Htm']


Unnamed: 0,site_id,IGBP,lat,long,site_name,koppen_main_name,record_count,start_time,end_time,recorded_day_percentage
31,IT-CA3,DBF,42.38,12.0222,Castel d'Asso3,Temperate,46512.0,2011-11-14 00:00:00,2014-11-30 23:30:00,0.87062
60,US-CRT,CRO,41.6285,-83.3471,Curtice Walter-Berger cropland,Cold,51936.0,2011-01-01 00:00:00,2013-12-31 23:30:00,0.98723
39,US-Myb,WET,38.04978,-121.76506,Mayberry Wetland,Temperate,64320.0,2011-02-08 00:00:00,2014-12-31 23:30:00,0.94167
25,DE-Spw,WET,51.89225,14.03369,Spreewald,Cold,70608.0,2010-06-06 00:00:00,2014-12-31 23:30:00,0.88084
61,US-Fmf,ENF,35.1426,-111.7273,Flagstaff - Managed Forest,Cold,88848.0,2005-07-30 00:00:00,2010-12-31 23:30:00,0.93438
22,CZ-BK2,GRA,49.49443,18.54285,Bily Kriz grassland,Cold,100224.0,2006-05-04 00:00:00,2012-12-31 23:30:00,0.85785
66,US-Prr,ENF,65.1237,-147.4876,Poker Flat Research Range Black Spruce Forest,Cold,101040.0,2010-10-29 00:00:00,2016-12-31 23:30:00,0.93307
123,SE-Htm,ENF,56.09763,13.41897,Hyltemossa,Cold,103488.0,2015-01-01 00:00:00,2020-12-31 23:30:00,0.98358
96,DE-HoH,DBF,52.08656,11.22235,Hohes Holz,Cold,104736.0,2015-01-01 00:00:00,2020-12-31 23:30:00,0.99544
37,US-IB2,GRA,41.84062,-88.24103,Fermi National Accelerator Laboratory- Batavia...,Cold,124560.0,2004-10-07 00:00:00,2011-12-31 23:30:00,0.98221


['AR-Vir', 'AT-Neu', 'AU-DaS', 'AU-TTE', 'AU-Wom', 'CA-TP1', 'IT-CA1', 'IT-SRo', 'US-WPT', 'US-Wkg', 'CA-Ca2', 'CA-Cbo', 'CA-TP4', 'US-ARM', 'US-Ro1', 'US-Rws', 'US-SRG', 'US-Vcm', 'BE-Dor', 'BE-Vie', 'CZ-Stn', 'DE-Geb', 'ES-LM2', 'FR-Fon', 'SE-Ros', 'DE-Hte']


Unnamed: 0,site_id,IGBP,lat,long,site_name,koppen_main_name,record_count,start_time,end_time,recorded_day_percentage
1,AR-Vir,ENF,-28.2395,-56.1886,Virasoro,Temperate,33984.0,2010-02-13 00:00:00,2012-06-13 23:30:00,0.83099
14,AU-TTE,GRA,-22.287,133.64,Ti Tree East,Arid,42576.0,2012-07-18 00:00:00,2014-12-31 23:30:00,0.98885
47,US-WPT,WET,41.46464,-82.99616,Winous Point North Marsh,Cold,52608.0,2011-01-01 00:00:00,2013-12-31 23:30:00,1.0
68,US-Rws,OSH,43.1675,-116.7132,Reynolds Creek Wyoming big sagebrush,Arid,55200.0,2014-10-03 00:00:00,2017-12-31 23:30:00,0.96965
29,IT-CA1,DBF,42.38041,12.02656,Castel d'Asso1,Temperate,56016.0,2011-06-08 00:00:00,2014-12-17 23:30:00,0.90535
16,AU-Wom,EBF,-37.4222,144.0944,Wombat,Temperate,83856.0,2010-01-23 00:00:00,2014-12-31 23:30:00,0.9684
51,CA-Ca2,ENF,49.8705,-125.2909,British Columbia - Clearcut Douglas-fir stand ...,Temperate,110352.0,2001-01-23 00:00:00,2010-12-31 23:30:00,0.63333
125,SE-Ros,ENF,64.1725,19.738,Rosinedal-3,Cold,111216.0,2014-07-23 00:00:00,2020-12-31 23:30:00,0.98428
7,AU-DaS,SAV,-14.1593,131.3881,Daly River Cleared,Tropical,115392.0,2008-01-01 00:00:00,2014-12-31 23:30:00,0.94016
104,ES-LM2,WSA,39.93459,-5.77588,Majadas del Tietar South,Arid,117408.0,2014-03-15 00:00:00,2020-12-31 23:30:00,0.9847


['AU-DaP', 'AU-Emr', 'AU-Gin', 'AU-How', 'AU-Rig', 'US-GLE', 'US-NR1', 'US-Twt', 'CA-Ca1', 'CA-Gro', 'US-AR1', 'US-Bar', 'US-Mpj', 'US-Ses', 'CH-Fru', 'CH-Oe2', 'DE-Hai', 'DK-Sor', 'FI-Hyy', 'FR-Aur', 'FR-Hes', 'GF-Guy', 'IT-SR2', 'SE-Deg', 'SE-Nor', 'NL-Loo']


Unnamed: 0,site_id,IGBP,lat,long,site_name,koppen_main_name,record_count,start_time,end_time,recorded_day_percentage
8,AU-Emr,GRA,-23.8587,148.4746,Emerald,Arid,40896.0,2011-06-10 00:00:00,2013-12-31 23:30:00,0.91026
9,AU-Gin,WSA,-31.3764,115.7138,Gingin,Temperate,48672.0,2011-10-14 00:00:00,2014-12-31 23:30:00,0.86298
56,US-AR1,GRA,36.4267,-99.42,ARM USDA UNL OSU Woodward Switchgrass 1,Temperate,58848.0,2009-04-30 00:00:00,2012-12-31 23:30:00,0.91356
12,AU-Rig,GRA,-36.6499,145.5759,Riggs Creek,Temperate,63648.0,2011-01-01 00:00:00,2014-12-31 23:30:00,0.9076
44,US-Twt,CRO,38.10872,-121.6531,Twitchell Island,Temperate,88848.0,2009-04-04 00:00:00,2014-12-31 23:30:00,0.88227
6,AU-DaP,GRA,-14.0633,131.3181,Daly River Savanna,Tropical,92544.0,2007-09-19 00:00:00,2013-09-07 23:30:00,0.884
124,SE-Nor,ENF,60.0865,17.4795,Norunda,Cold,120768.0,2014-01-01 00:00:00,2020-12-31 23:30:00,0.98397
111,FR-Hes,DBF,48.6741,7.06465,Hesse,Temperate,121536.0,2014-01-01 00:00:00,2020-12-31 23:30:00,0.99022
119,IT-SR2,ENF,43.73202,10.29091,San Rossore 2,Temperate,133920.0,2013-01-01 00:00:00,2020-12-31 23:30:00,0.95483
36,US-GLE,ENF,41.36653,-106.2399,GLEES,Cold,156672.0,2005-04-25 00:00:00,2014-12-31 23:30:00,0.92256


['AU-Stp', 'AU-Whr', 'CA-Oas', 'DE-Lnf', 'ES-Amo', 'FI-Sod', 'IT-CA2', 'US-Ton', 'US-Var', 'US-Whs', 'US-Ho1', 'US-Oho', 'US-Seg', 'CH-Dav', 'CZ-Lnz', 'CZ-wet', 'DE-Gri', 'DE-Tha', 'ES-LM1', 'FR-Bil', 'FR-FBn', 'IT-BCi', 'IT-MBo', 'IT-Ren', 'RU-Fyo']


Unnamed: 0,site_id,IGBP,lat,long,site_name,koppen_main_name,record_count,start_time,end_time,recorded_day_percentage
15,AU-Whr,EBF,-36.6732,145.0294,Whroo,Arid,54048.0,2011-12-02 00:00:00,2014-12-31 23:30:00,1.0
30,IT-CA2,CRO,42.37722,12.02604,Castel d'Asso2,Temperate,55536.0,2011-07-13 00:00:00,2014-12-17 23:30:00,0.92265
26,ES-Amo,OSH,36.83361,-2.25232,Amoladeras,Arid,78576.0,2007-07-03 00:00:00,2012-12-31 23:30:00,0.81483
13,AU-Stp,GRA,-17.1507,133.3502,Sturt Plains,Arid,100176.0,2008-08-28 00:00:00,2014-12-31 23:30:00,0.90073
89,CZ-Lnz,DBF,48.68155,16.94633,Lanzhot,Cold,102384.0,2015-01-01 00:00:00,2020-12-31 23:30:00,0.97308
108,FR-Bil,ENF,44.49365,-0.95609,Bilos,Temperate,111216.0,2014-07-14 00:00:00,2020-12-31 23:30:00,0.98053
103,ES-LM1,WSA,39.94269,-5.77868,Majadas del Tietar North,Arid,119232.0,2014-03-15 00:00:00,2020-12-31 23:30:00,1.0
48,US-Whs,OSH,31.7438,-110.0522,Walnut Gulch Lucky Hills Shrub,Arid,131616.0,2007-06-30 00:00:00,2014-12-31 23:30:00,1.0
23,DE-Lnf,DBF,51.32822,10.3678,Leinefelde,Cold,132144.0,2002-04-11 00:00:00,2012-12-31 23:30:00,0.70265
65,US-Oho,DBF,41.5545,-83.8438,Oak Openings,Cold,163824.0,2004-01-01 00:00:00,2013-12-31 23:30:00,0.9343


In [40]:
# Integer id per IGBP class, used as the marker colour value. It does not depend on the
# fold, so it is built once instead of inside the loop.
igbp_to_id = {igbp: idx for idx, igbp in enumerate(train_site_IGBP)}

# Columns exposed to the hover template; positions match the customdata[...] indices.
hover_columns = ["site_name", "site_id",
                 "record_count", "recorded_day_percentage",
                 "start", "end",
                 "IGBP", "koppen_main_name"]

# recorded_day_percentage holds a fraction (the tables above show values up to 1.0),
# so it is rendered with d3's percent format instead of appending a literal '%'.
hover_template = ("<b>%{customdata[0]} (%{customdata[1]})</b>" +
                  "<br>(%{customdata[6]}, %{customdata[7]})" +
                  "<br>Records: %{customdata[2]} (%{customdata[3]:.1%})" +
                  "<br>%{customdata[4]} ~ %{customdata[5]}")

for i, k in enumerate(SITE_SPLITS):

  # Metadata rows of the sites in this fold, largest sites first, plus plotting columns.
  plot_df = (
      train_site_metadata_df.loc[train_site_metadata_df['site_id'].isin(k)]
      .sort_values(['record_count','recorded_day_percentage'], ascending=[False, False])
      .assign(
          start=lambda df: pd.to_datetime(df['start_time']).dt.date,
          end=lambda df: pd.to_datetime(df['end_time']).dt.date,
          IGBP_id=lambda df: df['IGBP'].map(igbp_to_id),
      )
  )

  # One marker per site: size scales with record count, colour encodes the IGBP class.
  fold_plot = go.Scattergeo(lat = plot_df["lat"], lon = plot_df["long"],
                      marker_size=plot_df["record_count"]/7500,
                      marker_color=plot_df["IGBP_id"],
                      marker_line_color=plot_df["IGBP_id"],
                      marker_line_width=2,
                      opacity=0.75,
                      customdata=plot_df[hover_columns])

  fig = go.Figure(data=[fold_plot])
  fig.update_layout(
      title={
          'text': f"Fold {i+1} Sites Locations",
          'y':0.93,
          'x':0},
      geo = go.layout.Geo(
          resolution = 50,
          scope = "world",
          showframe = True,
          showcoastlines = True,
          landcolor = "rgb(229, 229, 229)",
          countrycolor = "white" ,
          coastlinecolor = "white"
      ),
      margin={"r":0,"t":0,"l":0,"b":0}, height = 400, width = 750,
      legend=dict(orientation="h", y=0.05, xanchor="left", yanchor="bottom", itemwidth=50)
  )
  fig.update_geos(lataxis_range=[-60,90])
  fig.update_traces(hovertemplate = hover_template)

  fig.show()

In [None]:
# Export to HTML
# NOTE(review): `fig` is rebound on every iteration of the fold-plot loop, so at this
# point it holds the last fold's map only; the other folds are not exported.
figures_dir = data_dir + os.sep + "figures"
# pio.write_html does not create missing directories, so make sure the target exists.
os.makedirs(figures_dir, exist_ok=True)
file_name= figures_dir + os.sep + "GoldSites_TrainCandiates_WorldMap.html"
pio.write_html(fig, file = file_name, include_plotlyjs = 'cdn')

# Select Test Sites

In [None]:
# Hand-picked test sites. The trailing note on each entry reads "IGBP class, Koppen main class".
test_sites = [
    "US-GLE",  # ENF, Cold
    "US-AR1",  # GRA, Temperate
    "US-Seg",  # GRA, Arid
    "US-FR2",  # WSA, Temperate
    "ES-LM2",  # WSA, Arid
    "CA-Cbo",  # DBF, Cold
    "FR-Lam",  # CRO, Temperate
    "IT-Cpz",  # EBF, Temperate
    # "CN-Cha",  # MF, Cold -- currently left out
    "IT-Lsn",  # OSH, Temperate
]

## Plot Gold Sample Sites and Selected Test Sites

In [None]:
# NOTE(review): `test_plot_df` and `train_plot_df` are not created by any cell visible in
# this notebook; they must already exist in the kernel for this cell to run.

# Rows of the selected test sites. The explicit copy makes this an independent frame, so
# adding the 'color' column below no longer triggers SettingWithCopyWarning.
final_test_plot_df = test_plot_df.loc[test_plot_df['site_id'].isin(test_sites)].copy()

# Translate each IGBP id into a colour of the qualitative Plotly palette.
# NOTE(review): indexing fails if an IGBP_id is not smaller than the palette length.
colormap = px.colors.qualitative.Plotly
final_test_plot_df['color'] = [ colormap[x] for x in final_test_plot_df["IGBP_id"]]
train_plot_df['color'] = [ colormap[x] for x in train_plot_df["IGBP_id"]]

# Columns exposed to the text/hover templates; positions match the customdata[...] indices.
hover_columns = ["site_name","site_id",
                 "record_count","recorded_day_percentage",
                 "start","end",
                 "IGBP","koppen_main_name"]

# On-map label: site id on the first line, (IGBP, Koppen class) on the second.
text_template= "<b>%{customdata[1]}</b>" + \
          "<br>(%{customdata[6]}, %{customdata[7]})"

# Training sites: arrow markers, labelled below the marker.
train_plot = go.Scattergeo(lat = train_plot_df["lat"], lon = train_plot_df["long"],
                     marker_size=train_plot_df["record_count"]/8000,
                     marker_color=train_plot_df["color"],
                     marker_line_color="#3E3E3E",
                     marker_line_width=1,
                     marker_symbol ="arrow",
                     name = "Train",
                     mode = "markers+text",
                     textposition = "bottom center", texttemplate = "TRAIN: " + text_template,
                     textfont=dict(color="#3E3E3E", size=14),
                     customdata=train_plot_df[hover_columns])

# Test sites: default markers with a green outline, labelled above the marker.
test_plot = go.Scattergeo(lat = final_test_plot_df["lat"], lon = final_test_plot_df["long"],
                     marker_size=final_test_plot_df["record_count"]/8000,
                     marker_color=final_test_plot_df["color"],
                     marker_line_color="#20603C",
                     marker_line_width=1,
                     name = "Test",
                     mode = "markers+text",
                     textposition = "top center", texttemplate = "TEST: " + text_template,
                     textfont=dict(color="#20603C", size=14),
                     customdata=final_test_plot_df[hover_columns])

fig = go.Figure(data=[train_plot, test_plot])
fig.update_layout(
    title={
        'text': "Gold Sample (Training) Sites and Selected Test Sites",
        'y':1,
        'x':0},
    geo = go.layout.Geo(
        resolution = 110,
        scope = "world",
        projection_type="mercator",
        projection_scale=0.8,
        showframe = True,
        showcoastlines = True,
        landcolor = "rgb(229, 229, 229)",
        showcountries=True, countrycolor = "white" ,
        coastlinecolor = "white"
    ),
    margin={"r":0,"t":50,"l":0,"b":0}, height = 1000, width = 1200,
    legend=dict(orientation="h", y=0.97, xanchor="left", yanchor="top", itemwidth=50)
)
#fig.update_geos(lataxis_range=[-60,90], lonaxis_range=[-120,-60])

# recorded_day_percentage holds a fraction (values in the site tables are at most 1.0),
# so it is rendered with d3's percent format instead of appending a literal '%'.
fig.update_traces(hovertemplate = "<b>%{customdata[0]} (%{customdata[1]})</b>" +
                                  "<br>(%{customdata[6]}, %{customdata[7]})" +
                                  "<br>Records: %{customdata[2]} (%{customdata[3]:.1%})" +
                                  "<br>%{customdata[4]} ~ %{customdata[5]}")

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
# Export to HTML
# Writes whichever figure is currently bound to `fig`.
figures_dir = data_dir + os.sep + "figures"
# pio.write_html does not create missing directories, so make sure the target exists.
os.makedirs(figures_dir, exist_ok=True)
file_name= figures_dir + os.sep + "GoldSites_TrainSites_WorldMap.html"
pio.write_html(fig, file = file_name, include_plotlyjs = 'cdn')

In [None]:
# Show the metadata rows of the selected test sites.
final_test_plot_df

Unnamed: 0,site_id,IGBP,lat,long,site_name,koppen_main_name,record_count,start_time,end_time,recorded_day_percentage,start,end,IGBP_id
28,CN-Cha,MF,42.4025,128.0958,Changbaishan,Cold,43680.0,2003-01-01 00:00:00,2005-12-04 23:30:00,0.85126,2003-01-01,2005-12-04,6
52,IT-Cpz,EBF,41.70525,12.37611,Castelporziano,Temperate,132720.0,2000-03-21 00:00:00,2008-12-09 23:30:00,0.86786,2000-03-21,2008-12-09,0
66,US-GLE,ENF,41.36653,-106.2399,GLEES,Cold,156672.0,2005-04-25 00:00:00,2014-12-31 23:30:00,0.92256,2005-04-25,2014-12-31,1
102,CA-Cbo,DBF,44.3167,-79.9333,"Ontario - Mixed Deciduous, Borden Forest Site",Cold,238944.0,1997-12-26 00:00:00,2014-12-31 23:30:00,0.80097,1997-12-26,2014-12-31,5
114,US-AR1,GRA,36.4267,-99.42,ARM USDA UNL OSU Woodward Switchgrass 1,Temperate,58848.0,2009-04-30 00:00:00,2012-12-31 23:30:00,0.91356,2009-04-30,2012-12-31,3
125,US-FR2,WSA,29.9495,-97.9962,Freeman Ranch- Mesquite Juniper,Temperate,65520.0,2005-01-01 00:00:00,2008-12-31 23:30:00,0.93429,2005-01-01,2008-12-31,2
139,US-Seg,GRA,34.3623,-106.7019,Sevilleta grassland,Arid,183984.0,2007-01-01 00:00:00,2017-12-31 23:30:00,0.95396,2007-01-01,2017-12-31,3
177,ES-LM2,WSA,39.93459,-5.77588,Majadas del Tietar South,Arid,117408.0,2014-03-15 00:00:00,2020-12-31 23:30:00,0.9847,2014-03-15,2020-12-31,2
189,FR-Lam,CRO,43.49644,1.23788,Lamasquere,Temperate,259296.0,2005-03-13 00:00:00,2020-12-31 23:30:00,0.93574,2005-03-13,2020-12-31,4
195,IT-Lsn,OSH,45.74048,12.7503,Lison,Temperate,82656.0,2016-01-01 00:00:00,2020-12-31 23:30:00,0.94253,2016-01-01,2020-12-31,7


In [None]:
# Reference targets printed earlier in this notebook (untrimmed record counts):
#   20% of the candidate-site total: 692044.8
#   10% of the candidate-site total: 346022.4

# Total record count of the selected test sites, to compare against the targets above.
final_test_plot_df["record_count"].sum()

1339728.0