# Notebook Setup

In [1]:
# True when the active IPython shell reports itself as a Google Colab runtime.
IN_COLLAB = 'google.colab' in str(get_ipython())

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"

# The home path above lives under the Drive mount point, so mount Drive when on Colab.
if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Import modules

In [2]:
# install required modules quietly
required_packages = ['geopandas', 'pyspark', 'azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

In [3]:
# Spark entry point
from pyspark.sql import SparkSession

# Configure a builder for the "local[*]" master, then create (or reuse) the session
session_builder = SparkSession.builder.master("local[*]")
spark = session_builder.getOrCreate()

# Last expression of the cell: show the session information
spark

In [4]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import math
import json

import pyspark.pandas as pd
from calendar import monthrange
from datetime import datetime
from io import BytesIO

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

# Load local custom modules
import sys

# Location of the project's helper modules, relative to the project root.
tools_rel_path = "./code/src/tools"

if IN_COLLAB:
  # On Colab, switch to the project root first so the relative path resolves there,
  # and give the project's tools the highest import priority.
  os.chdir(MY_HOME_ABS_PATH)
  sys.path.insert(0, os.path.abspath(tools_rel_path))
else:
  # Elsewhere, resolve against the current working directory and add with lowest priority.
  sys.path.append(os.path.abspath(tools_rel_path))

from CloudIO.AzStorageClient import AzStorageClient
from edahelpers import *

# Table display settings for the whole notebook.
# NOTE(review): `pd` is bound to pyspark.pandas in this notebook - confirm that it
# accepts every option name below (they are plain-pandas display options).
display_options = {
    'display.max_rows': 500,
    'display.max_columns': None,
    'display.float_format': lambda x: '%.5f' % x,  # fixed 5 decimal places
}
for option_name, option_value in display_options.items():
  pd.set_option(option_name, option_value)

## Define Constants

In [5]:
# Project folders, all derived from the user-configured home path
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.sep.join([root_dir, '.tmp'])
data_dir = os.sep.join([root_dir, 'data'])
cred_dir = os.sep.join([root_dir, '.cred'])
az_cred_file = os.sep.join([cred_dir, 'azblobcred.json'])

# Raw data location: a Drive folder on Colab, otherwise the local .tmp folder
raw_data_dir = (
    "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data/half_hourly_data"
    if IN_COLLAB
    else tmp_dir
)

site_metadata_filename = os.sep.join([data_dir, 'site-metadata.csv'])

# Output
tag = "1_raw"
container = "gold-samples-data"
blob_name = f"gold_samples_trim_v_{tag}.parquet" #Advisor suggested features only

In [6]:
# "Golden" Sites
# Site ids of the two tiers, kept as separate lists
tier1_sites = [
    "US-MMS", "US-Vcp", "FR-Pue", "CH-Lae",
    "US-Var", "US-Ne2", "ES-LJu", "US-Ton",
]
tier2_sites = [
    "US-UMB", "US-Me2", "FI-Hyy", "US-NR1",
    "IT-Lav", "US-Wkg", "US-ARM", "US-SRM",
]

# All target sites: tier 1 first, then tier 2
target_sites = [*tier1_sites, *tier2_sites]

# Get Gold Sample Site Metadata

In [7]:
# Load site metadata, restricted to the columns used in this notebook
included_site_features = [
    'site_id', 'site_name', 'IGBP', 'lat', 'long', 'koppen_main_name',
    'start_time', 'end_time', 'record_count', 'recorded_day_percentage',
]
all_site_metadata = pd.read_csv(site_metadata_filename, usecols = included_site_features)

# only focus on target sites (the size is reported before rows with missing values are dropped)
is_target_site = all_site_metadata['site_id'].isin(target_sites)
train_site_metadata_df = all_site_metadata.loc[is_target_site]
print(f"size:{train_site_metadata_df.shape}")

# Renumber the rows first, then discard any row that has a missing value
train_site_metadata_df = train_site_metadata_df.reset_index(drop=True).dropna()
display(train_site_metadata_df)


size:(16, 10)


Unnamed: 0,site_id,IGBP,lat,long,site_name,koppen_main_name,record_count,start_time,end_time,recorded_day_percentage
0,FR-Pue,EBF,43.7413,3.5957,Puechabon,Temperate,245760.0,2000-07-26 00:00:00,2014-12-31 23:30:00,0.97117
1,US-NR1,ENF,40.0329,-105.5464,Niwot Ridge Forest (LTER NWT1),Cold,270768.0,1999-05-02 00:00:00,2014-12-31 23:30:00,0.98567
3,US-SRM,WSA,31.8214,-110.8661,Santa Rita Mesquite,Arid,190752.0,2004-01-01 00:00:00,2014-12-31 23:30:00,0.98905
4,US-Ton,WSA,38.4316,-120.96598,Tonzi Ranch,Temperate,230928.0,2001-05-24 00:00:00,2014-12-31 23:30:00,0.96801
5,US-Var,GRA,38.4133,-120.9507,Vaira Ranch- Ione,Temperate,245712.0,2000-11-01 00:00:00,2014-12-31 23:30:00,0.98937
6,US-Wkg,GRA,31.7365,-109.9419,Walnut Gulch Kendall Grasslands,Arid,186768.0,2004-05-07 00:00:00,2014-12-31 23:30:00,1.0
7,US-ARM,CRO,36.6058,-97.4888,ARM Southern Great Plains site- Lamont,Temperate,259104.0,2003-01-01 00:00:00,2018-12-31 23:30:00,0.92368
9,US-Me2,ENF,44.4523,-121.5574,Metolius mature ponderosa pine,Cold,230688.0,2002-01-01 00:00:00,2017-12-31 23:30:00,0.82238
10,US-UMB,DBF,45.5598,-84.7138,Univ. of Mich. Biological Station,Cold,191904.0,2007-01-01 00:00:00,2017-12-31 23:30:00,0.99502
11,US-Vcp,ENF,35.8624,-106.5974,Valles Caldera Ponderosa Pine,Cold,174528.0,2007-01-01 00:00:00,2017-12-31 23:30:00,0.90493


In [8]:
# Distinct values of the 'IGBP' column among the gold sample sites, as a plain list
igbp_values = train_site_metadata_df['IGBP'].unique()
train_site_IGBP = list(igbp_values)
print(train_site_IGBP)

# Distinct values of the 'koppen_main_name' column (kept as returned by unique())
train_site_koppen = train_site_metadata_df['koppen_main_name'].unique()
print(train_site_koppen)

['EBF', 'ENF', 'WSA', 'GRA', 'CRO', 'DBF', 'MF', 'OSH']
['Temperate' 'Cold' 'Arid']


In [9]:
# Total record count of the gold sample (train) sites, followed by
# 20% and 10% of that total as candidate sizes for the test dataset
train_record_total = train_site_metadata_df['record_count'].sum()
print(f"Train dataset (untrimmed) size: {train_record_total}")
print(f"Target test dataset (untrimmed) size: {train_record_total*0.2}")
print(f"Target test dataset (untrimmed) size: {train_record_total*0.1}")

Train dataset (untrimmed) size: 3460224.0
Target test dataset (untrimmed) size: 692044.8
Target test dataset (untrimmed) size: 346022.4


# Get Test Site Candidates

In [10]:
# Load site metadata, restricted to the columns used in this notebook
included_site_features = [
    'site_id', 'site_name', 'IGBP', 'lat', 'long', 'koppen_main_name',
    'start_time', 'end_time', 'record_count', 'recorded_day_percentage',
]
test_site_metadata_df = pd.read_csv(site_metadata_filename, usecols = included_site_features)

# A candidate is not a gold sample site, and both its IGBP and its
# koppen_main_name value occur among the gold sample sites
not_gold_site = ~test_site_metadata_df['site_id'].isin(target_sites)
igbp_seen_in_train = test_site_metadata_df['IGBP'].isin(train_site_IGBP)
koppen_seen_in_train = test_site_metadata_df['koppen_main_name'].isin(train_site_koppen)
cond = not_gold_site & igbp_seen_in_train & koppen_seen_in_train

# Renumber the matching rows, then discard any row that has a missing value
test_site_metadata_df = test_site_metadata_df.loc[cond].reset_index(drop=True).dropna()

# Remove sites whose recorded_day_percentage is 0.8 or lower
low_coverage_rows = test_site_metadata_df[test_site_metadata_df['recorded_day_percentage'] <= 0.8]
test_site_metadata_df = test_site_metadata_df.drop(low_coverage_rows.index)
print(f"size:{test_site_metadata_df.shape}")

# Display the candidates: fewest records first, ties broken by higher recorded_day_percentage
test_site_metadata_df.sort_values(['record_count','recorded_day_percentage'], ascending=[True, False])

size:(169, 10)


Unnamed: 0,site_id,IGBP,lat,long,site_name,koppen_main_name,record_count,start_time,end_time,recorded_day_percentage
171,DK-Gds,ENF,56.0737,9.3341,Gludsted Plantage,Cold,7632.0,2020-07-09 00:00:00,2020-12-14 23:30:00,1.0
95,US-Wi7,OSH,46.64911,-91.06928,Red pine clearcut (RPCC),Cold,8112.0,2005-05-23 00:00:00,2005-11-07 23:30:00,1.0
96,US-Wi8,DBF,46.72233,-91.25242,Young hardwood clearcut (YHW),Cold,8400.0,2002-05-17 00:00:00,2002-11-07 23:30:00,1.0
44,ES-Ln2,OSH,36.9695,-3.47582,Lanjaron-Salvage logging,Cold,9168.0,2009-05-30 00:00:00,2009-12-31 23:30:00,0.88426
88,US-Wi0,ENF,46.61878,-91.08144,Young red pine (YRP),Cold,10272.0,2002-04-27 00:00:00,2002-12-14 23:30:00,0.92241
93,US-Wi5,ENF,46.65308,-91.08581,Mixed young jack pine (MYJP),Cold,11376.0,2004-04-20 00:00:00,2004-12-12 23:30:00,1.0
41,DK-Fou,CRO,56.4842,9.58722,Foulum,Cold,11568.0,2005-01-30 00:00:00,2005-10-12 23:30:00,0.94141
120,US-Bi2,CRO,38.1091,-121.5351,Bouldin Island corn,Temperate,11856.0,2017-04-29 00:00:00,2017-12-31 23:30:00,1.0
70,US-KS1,ENF,28.4583,-80.6709,Kennedy Space Center (slash pine),Temperate,13248.0,2002-03-07 00:00:00,2002-12-31 23:30:00,0.92
33,CN-Du3,GRA,42.0551,116.2809,Duolun Degraded Meadow,Cold,13344.0,2009-08-04 00:00:00,2010-05-08 23:30:00,1.0


In [11]:
# Plot gold sample and test sites candidates

# Columns handed to plotly as customdata; the hover template below addresses them by position.
hover_columns = ["site_name", "site_id",
                 "record_count", "recorded_day_percentage",
                 "start", "end",
                 "IGBP", "koppen_main_name"]

# One integer per IGBP value (its position in train_site_IGBP), used to color the markers.
igbp_ids = list(range(len(train_site_IGBP)))

# NOTE: train_plot_df / test_plot_df are aliases of the metadata frames (no copy),
# so the helper columns below are added to those frames as well.
train_plot_df = train_site_metadata_df
train_plot_df['start'] = pd.to_datetime(train_plot_df['start_time']).dt.date
train_plot_df['end'] = pd.to_datetime(train_plot_df['end_time']).dt.date
train_plot_df['IGBP_id'] = train_plot_df['IGBP'].replace(train_site_IGBP, igbp_ids)

test_plot_df = test_site_metadata_df
test_plot_df['start'] = pd.to_datetime(test_plot_df['start_time']).dt.date
test_plot_df['end'] = pd.to_datetime(test_plot_df['end_time']).dt.date
test_plot_df['IGBP_id'] = test_plot_df['IGBP'].replace(train_site_IGBP, igbp_ids)

# Marker size follows the number of records (record_count / 15000); color follows the IGBP id.
train_plot = go.Scattergeo(lat = train_plot_df["lat"], lon = train_plot_df["long"],
                     marker_size=train_plot_df["record_count"]/15000,
                     marker_color=train_plot_df["IGBP_id"],
                     marker_line_color=train_plot_df["IGBP_id"],
                     marker_line_width=2,
                     marker_symbol ="star-open-dot",
                     name = "Train",
                     opacity=1,
                     customdata=train_plot_df[hover_columns])

test_plot = go.Scattergeo(lat = test_plot_df["lat"], lon = test_plot_df["long"],
                     marker_size=test_plot_df["record_count"]/15000,
                     marker_color=test_plot_df["IGBP_id"],
                     opacity=0.75,
                     name = "Test<br>Candidate",
                     customdata=test_plot_df[hover_columns])

fig = go.Figure(data=[train_plot, test_plot])
fig.update_layout(
    title={
        'text': "Gold Sample (Training) Sites and Test Site Candidates",
        'y':0.925,
        'x':0},
    geo = go.layout.Geo(
        resolution = 50,
        scope = "world",
        showframe = False,
        showcoastlines = True,
        landcolor = "rgb(229, 229, 229)",
        countrycolor = "white" ,
        coastlinecolor = "white"
    )
)
fig.update_geos(lataxis_range=[-60,90])
# recorded_day_percentage is stored as a fraction (e.g. 0.97), so it is rendered with the
# d3 percent format instead of appending a literal '%' to the raw fraction.
fig.update_traces(hovertemplate = "<b>%{customdata[0]} (%{customdata[1]})</b>" +
                                  "<br>(%{customdata[6]}, %{customdata[7]})" +
                                  "<br>Records: %{customdata[2]} (%{customdata[3]:.1%})" +
                                  "<br>%{customdata[4]} ~ %{customdata[5]}")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0}, height = 500, width = 800)
fig.update_layout(legend=dict(orientation="h", y=0.05, xanchor="left", yanchor="bottom", itemwidth=50))

fig.show()

In [12]:
# Export to HTML
figures_dir = data_dir + os.sep + "figures"
# Create the figures folder when it is missing (e.g. on a fresh checkout);
# otherwise the export fails because the target folder does not exist.
os.makedirs(figures_dir, exist_ok=True)
file_name= figures_dir + os.sep + "GoldSites_TrainCandiates_WorldMap.html"
pio.write_html(fig, file = file_name, include_plotlyjs = 'cdn')

# Select Test Sites

In [31]:
# Hand-picked test sites as (site_id, IGBP, koppen_main_name) triples;
# only the site ids are used, the other two fields document each pick.
_test_site_picks = (
    ("US-GLE", "ENF", "Cold"),
    ("US-AR1", "GRA", "Temperate"),
    ("US-Seg", "GRA", "Arid"),
    ("US-FR2", "WSA", "Temperate"),
    ("ES-LM2", "WSA", "Arid"),
    ("CA-Cbo", "DBF", "Cold"),
    ("FR-Lam", "CRO", "Temperate"),
    ("IT-Cpz", "EBF", "Temperate"),
    ("CN-Cha", "MF", "Cold"),
    ("IT-Lsn", "OSH", "Temperate"),
)
test_sites = [site_id for site_id, _igbp, _koppen in _test_site_picks]

## Plot Gold Sample Sites and Selected Test Sites

In [32]:
# Keep only the candidate rows that belong to the hand-picked test sites.
final_test_plot_df = test_plot_df.loc[test_plot_df['site_id'].isin(test_sites)]

# Columns handed to plotly as customdata; the hover template below addresses them by position.
hover_columns = ["site_name", "site_id",
                 "record_count", "recorded_day_percentage",
                 "start", "end",
                 "IGBP", "koppen_main_name"]

# Marker size follows the number of records (record_count / 15000); color follows the IGBP id.
train_plot = go.Scattergeo(lat = train_plot_df["lat"], lon = train_plot_df["long"],
                     marker_size=train_plot_df["record_count"]/15000,
                     marker_color=train_plot_df["IGBP_id"],
                     marker_line_color=train_plot_df["IGBP_id"],
                     marker_line_width=2,
                     marker_symbol ="star-open-dot",
                     name = "Train",
                     opacity=1,
                     customdata=train_plot_df[hover_columns])

test_plot = go.Scattergeo(lat = final_test_plot_df["lat"], lon = final_test_plot_df["long"],
                     marker_size=final_test_plot_df["record_count"]/15000,
                     marker_color=final_test_plot_df["IGBP_id"],
                     name = "Test",
                     customdata=final_test_plot_df[hover_columns])

fig = go.Figure(data=[train_plot, test_plot])
fig.update_layout(
    title={
        'text': "Gold Sample (Training) Sites and Selected Test Sites",
        'y':0.925,
        'x':0},
    geo = go.layout.Geo(
        resolution = 50,
        scope = "world",
        showframe = False,
        showcoastlines = True,
        landcolor = "rgb(229, 229, 229)",
        countrycolor = "white" ,
        coastlinecolor = "white"
    )
)
fig.update_geos(lataxis_range=[-60,90])
# recorded_day_percentage is stored as a fraction (e.g. 0.97), so it is rendered with the
# d3 percent format instead of appending a literal '%' to the raw fraction.
fig.update_traces(hovertemplate = "<b>%{customdata[0]} (%{customdata[1]})</b>" +
                                  "<br>(%{customdata[6]}, %{customdata[7]})" +
                                  "<br>Records: %{customdata[2]} (%{customdata[3]:.1%})" +
                                  "<br>%{customdata[4]} ~ %{customdata[5]}")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0}, height = 500, width = 800)
fig.update_layout(legend=dict(orientation="h", y=0.05, xanchor="left", yanchor="bottom", itemwidth=50))

fig.show()

# Export to HTML
figures_dir = data_dir + os.sep + "figures"
# Create the figures folder when it is missing; otherwise the export fails
# because the target folder does not exist.
os.makedirs(figures_dir, exist_ok=True)
file_name= figures_dir + os.sep + "GoldSites_TrainSites_WorldMap.html"
pio.write_html(fig, file = file_name, include_plotlyjs = 'cdn')

In [33]:
# Show the metadata rows of the selected test sites, including the
# start / end / IGBP_id helper columns that were added for plotting.
final_test_plot_df

Unnamed: 0,site_id,IGBP,lat,long,site_name,koppen_main_name,record_count,start_time,end_time,recorded_day_percentage,start,end,IGBP_id
28,CN-Cha,MF,42.4025,128.0958,Changbaishan,Cold,43680.0,2003-01-01 00:00:00,2005-12-04 23:30:00,0.85126,2003-01-01,2005-12-04,6
52,IT-Cpz,EBF,41.70525,12.37611,Castelporziano,Temperate,132720.0,2000-03-21 00:00:00,2008-12-09 23:30:00,0.86786,2000-03-21,2008-12-09,0
66,US-GLE,ENF,41.36653,-106.2399,GLEES,Cold,156672.0,2005-04-25 00:00:00,2014-12-31 23:30:00,0.92256,2005-04-25,2014-12-31,1
102,CA-Cbo,DBF,44.3167,-79.9333,"Ontario - Mixed Deciduous, Borden Forest Site",Cold,238944.0,1997-12-26 00:00:00,2014-12-31 23:30:00,0.80097,1997-12-26,2014-12-31,5
114,US-AR1,GRA,36.4267,-99.42,ARM USDA UNL OSU Woodward Switchgrass 1,Temperate,58848.0,2009-04-30 00:00:00,2012-12-31 23:30:00,0.91356,2009-04-30,2012-12-31,3
125,US-FR2,WSA,29.9495,-97.9962,Freeman Ranch- Mesquite Juniper,Temperate,65520.0,2005-01-01 00:00:00,2008-12-31 23:30:00,0.93429,2005-01-01,2008-12-31,2
139,US-Seg,GRA,34.3623,-106.7019,Sevilleta grassland,Arid,183984.0,2007-01-01 00:00:00,2017-12-31 23:30:00,0.95396,2007-01-01,2017-12-31,3
177,ES-LM2,WSA,39.93459,-5.77588,Majadas del Tietar South,Arid,117408.0,2014-03-15 00:00:00,2020-12-31 23:30:00,0.9847,2014-03-15,2020-12-31,2
189,FR-Lam,CRO,43.49644,1.23788,Lamasquere,Temperate,259296.0,2005-03-13 00:00:00,2020-12-31 23:30:00,0.93574,2005-03-13,2020-12-31,4
195,IT-Lsn,OSH,45.74048,12.7503,Lison,Temperate,82656.0,2016-01-01 00:00:00,2020-12-31 23:30:00,0.94253,2016-01-01,2020-12-31,7


In [34]:
# Reference values printed earlier in this notebook from the train total:
#   train total * 0.2 (untrimmed): 692044.8
#   train total * 0.1 (untrimmed): 346022.4

# Total (untrimmed) record count across the selected test sites
final_test_plot_df['record_count'].sum()

1339728.0