# Notebook Setup

In [3]:
# Flag is True when the IPython shell's string form mentions 'google.colab',
# i.e. the notebook is running on Google Colab.
IN_COLLAB = 'google.colab' in str(get_ipython())

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
# Absolute path of the project folder; every other path in the notebook derives from it.
MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"

if IN_COLLAB:
  # Mount Google Drive so the paths under /content/drive/ are reachable.
  from google.colab import drive
  drive.mount('/content/drive/')

Mounted at /content/drive/


## Import Modules

In [4]:
# install required modules quietly
required_packages = ['geopandas', 'pyspark', 'azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m75.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.6/96.6 KB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
flask 1.1.4 requires click<8.0,>=5.1, but you have click 8.1.3 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [5]:
import os
# Set ahead of the pyspark imports further down in this cell.
# NOTE(review): presumably needed so pandas-on-Spark does not complain about
# PyArrow timezone handling — confirm against the PySpark documentation.
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import math
import json

from pyspark.sql.functions import col
import pyspark.pandas as pd
from calendar import monthrange
from datetime import datetime
from io import BytesIO

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load local custom modules
import sys
if IN_COLLAB:
  # Move to the project root so the relative tools path below resolves against it.
  os.chdir(MY_HOME_ABS_PATH)

# On Colab the tools directory goes to the front of the module search path;
# elsewhere it goes to the end (inserting at len(sys.path) is an append).
sys.path.insert(0 if IN_COLLAB else len(sys.path), os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

# Display options.
# `pd` in this notebook is pyspark.pandas, which only registers its own option
# keys (such as 'display.max_rows'). 'display.max_columns' and
# 'display.float_format' are plain-pandas options, so they are set on pandas
# itself instead of being passed to pyspark.pandas, where they are not recognised.
# NOTE(review): assumes pandas-on-Spark renders frames through pandas, so the
# pandas-level options take effect on displayed output — confirm.
import pandas as native_pd

pd.set_option('display.max_rows', 500)
native_pd.set_option('display.max_columns', None)
native_pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [6]:
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session on the "local[*]" master, requesting the
# hadoop-azure and azure-storage packages through spark.jars.packages.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-azure:3.3.1,com.microsoft.azure:azure-storage:8.6.6",
    )
    .getOrCreate()
)

# Last expression of the cell: display the session information.
spark

# Constant Definitions

In [7]:
# Project directory layout, all rooted at MY_HOME_ABS_PATH.
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.sep.join([root_dir, '.tmp'])
data_dir = os.sep.join([root_dir, 'data'])
cred_dir = os.sep.join([root_dir, '.cred'])
az_cred_file = os.sep.join([cred_dir, 'azblobcred.json'])

# Raw inputs come from a fixed Drive folder on Colab and from the temp
# directory otherwise.
raw_data_dir = (
  "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data"
  if IN_COLLAB
  else tmp_dir
)

monthly_data_filename = os.sep.join([raw_data_dir, 'data_monthly_v1_0.csv'])

# Load Monthly Data

Full list of features available in the monthly data:
```
['SITE_ID', 'year', 'month', 'time', 'TIMESTAMP', 'dataset',
'LOCATION_LAT', 'LOCATION_LONG',
'TA_F', 'VPD_F', 'P_F', 'NETRAD',
'NEE_VUT_REF', 'NEE_VUT_REF_QC', 'NEE_CUT_REF', 'NEE_CUT_REF_QC', 'GPP_NT_VUT_REF', 'GPP_DT_VUT_REF', 'GPP_NT_CUT_REF', 'GPP_DT_CUT_REF', 'RECO_NT_VUT_REF', 'RECO_DT_VUT_REF', 'RECO_NT_CUT_REF', 'RECO_DT_CUT_REF',
'ET', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily', 'CSIF-SIFinst', 'PET', 'Ts', 'Tmean', 'prcp', 'vpd', 'prcp-lag3', 'ESACCI-sm', 'MODIS_LC', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'EVI', 'GCI', 'NDVI', 'NDWI', 'NIRv', 'kNDVI',
'Percent_Snow', 'Fpar', 'Lai', 'LST_Day', 'LST_Night',
'SITE_IGBP', 'MODIS_IGBP','MODIS_PFT', 'koppen_sub', 'koppen', 'CO2_concentration']
```

In [25]:
# Candidate feature subset from the monthly data.
# NOTE: not applied at load time yet — the read below pulls in every column.
included_features = [
    # keys
    'SITE_ID', 'year', 'month',
    'ESACCI-sm',      # ESACCI soil moisture (%)
    'Percent_Snow',   # Percentage of snow cover (%)
    'NDWI',           # Normalized Difference Water Index (NDWI)
    'PET',            # Potential ET (m)
    'MODIS_PFT',      # Plant Function Type
    'MODIS_LC',       # MODIS Land Cover
    'Ts',             # Skin temperature (K) — TODO confirm
    'LST_Day',        # Daytime land surface temperature (K)
    'LST_Night',      # Nighttime land surface temperature (K)
    'Lai',            # Leaf Area Index (LAI)
    'Fpar',           # Fraction of photosynthetically active radiation (fPAR)
    'CSIF-SIFdaily',  # All-sky daily average SIF
    'BESS-PAR',       # Photosynthetic Active Radiation (PAR) (W/m^2)
    'BESS-PARdiff',   # Diffuse PAR (W/m^2)
    'BESS-RSDN',      # Shortwave downwelling radiation (W/m^2)
]

# Read the full monthly CSV (all columns).
month_df = pd.read_csv(monthly_data_filename)

# Report the frame dimensions, then preview the first rows.
print(f"size:{month_df.shape}")
month_df.head()

size:(19015, 62)


Unnamed: 0,SITE_ID,year,month,TIMESTAMP,dataset,SITE_IGBP,LOCATION_LAT,LOCATION_LONG,TA_F,VPD_F,P_F,NETRAD,NEE_VUT_REF,NEE_VUT_REF_QC,NEE_CUT_REF,NEE_CUT_REF_QC,GPP_NT_VUT_REF,GPP_DT_VUT_REF,GPP_NT_CUT_REF,GPP_DT_CUT_REF,RECO_NT_VUT_REF,RECO_DT_VUT_REF,RECO_NT_CUT_REF,RECO_DT_CUT_REF,time,ET,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,CSIF-SIFinst,PET,Ts,Tmean,prcp,vpd,prcp-lag3,ESACCI-sm,MODIS_LC,b1,b2,b3,b4,b5,b6,b7,EVI,GCI,NDVI,NDWI,NIRv,kNDVI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_IGBP,MODIS_PFT,koppen_sub,koppen,CO2_concentration
0,AR-SLu,2010,1,201001,FLUXNET,MF,-33.4648,-66.4598,28.493,23.378,0.903,188.59881,-5.63278,0.94489,-5.6277,0.94825,10.2095,11.9133,10.089,11.9232,4.46072,7.03163,4.45634,7.06081,1/31/10,9.01454,154,40,336,0.20432,0.51663,-0.01339,302.46967,300.10977,0.00212,2.06618,0.00874,0.15152,7,0.08443,0.26877,0.04532,0.08053,0.30058,0.25053,0.15536,0.32126,2.3492,0.52271,0.03542,0.14051,0.26745,0.0,0.49,1.2,313.84,293.58,OSH,SH,BSk,Arid,387.11
1,AR-SLu,2010,2,201002,FLUXNET,MF,-33.4648,-66.4598,26.673,14.369,1.986,144.2162,-4.47433,0.96949,-4.45358,0.97098,8.16307,9.97563,8.09051,10.1663,3.62522,5.68557,3.6153,6.02964,2/28/10,7.67797,120,46,258,0.14553,0.38726,-0.00894,298.78864,297.27515,0.00313,1.09011,0.00972,0.16656,7,0.0918,0.25245,0.04803,0.08092,0.29245,0.25221,0.15946,0.2783,2.12166,0.46684,0.0004,0.11781,0.21459,0.0,0.43,0.9,309.86,292.96,OSH,SH,BSk,Arid,387.675
2,AR-SLu,2010,3,201003,FLUXNET,MF,-33.4648,-66.4598,25.744,15.167,0.371,125.64314,-3.89288,0.93884,-3.88405,0.93884,7.06222,9.00824,7.07681,9.00492,3.18909,6.51721,3.18613,6.61985,3/31/10,5.89032,107,31,231,0.1098,0.30724,-0.00813,297.54816,296.4367,0.00221,1.16864,0.00745,0.16408,7,0.08556,0.2304,0.04454,0.07464,0.26943,0.24116,0.15364,0.2569,2.08743,0.45835,-0.02286,0.10565,0.20722,0.0,0.41,0.8,309.18,290.52,OSH,SH,BSk,Arid,388.195
3,AR-SLu,2010,4,201004,FLUXNET,MF,-33.4648,-66.4598,18.45,9.185,0.1,71.50069,-3.11159,0.9625,-3.10705,0.9625,5.72781,6.54333,5.6526,6.6073,2.55268,4.14082,2.54582,4.04346,4/30/10,2.34566,81,27,175,0.07673,0.22387,-0.00676,291.69604,290.61377,0.00021,0.9462,0.00555,0.12402,7,0.0874,0.21581,0.0453,0.07454,0.26009,0.23483,0.14689,0.22921,1.89596,0.42359,-0.04202,0.0914,0.17755,0.0,0.36,0.5,303.24,286.34,OSH,SH,BSk,Arid,388.905
4,AR-SLu,2010,5,201005,FLUXNET,MF,-33.4648,-66.4598,13.493,5.823,1.852,41.24915,-1.71633,0.89583,-1.55985,0.91398,3.47763,4.15484,3.56473,4.16886,1.78421,3.35165,1.84822,3.54627,5/31/10,2.208,56,19,122,0.06602,0.20064,-0.00473,287.05652,286.8832,0.00084,0.71629,0.00326,0.14273,7,0.07619,0.19819,0.03887,0.06481,0.23148,0.20282,0.1242,0.22353,2.06111,0.44537,-0.01064,0.08815,0.19592,0.0,0.37,0.5,296.2,277.82,OSH,SH,BSk,Arid,389.32


# Feature EDA

In [33]:
# Collapse the monthly records to one row per site, keeping the first entry
# of each column within every SITE_ID group.
group_df = month_df.groupby('SITE_ID').first()
print(f"size:{group_df.shape}")

# Keep only the sites whose SITE_IGBP label differs from their MODIS_IGBP
# label, and show the land-cover columns for those sites.
label_cols = ['SITE_IGBP', 'MODIS_IGBP', 'MODIS_LC']
igbp_differs = group_df['SITE_IGBP'] != group_df['MODIS_IGBP']
tmp_df = group_df[igbp_differs][label_cols]
print(f"size:{tmp_df.shape}")
tmp_df

size:(243, 61)
size:(162, 3)


Unnamed: 0_level_0,SITE_IGBP,MODIS_IGBP,MODIS_LC
SITE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AR-SLu,MF,OSH,7
AR-Vir,ENF,SAV,9
AT-Neu,GRA,MF,5
AU-ASM,SAV,CSH,6
AU-Ade,WSA,GRA,10
AU-Cpr,SAV,CSH,6
AU-Cum,EBF,SAV,9
AU-DaS,SAV,GRA,10
AU-Dry,SAV,GRA,10
AU-Fog,WET,GRA,10
