# Notebook Setup

In [2]:
# Detect whether the notebook is running inside Google Colab.
# `get_ipython` is only available when the code runs under IPython/Jupyter. If
# this cell is executed as a plain Python script (for example after exporting
# the notebook), the name is undefined, so fall back to "not Colab" instead of
# failing with a NameError.
try:
  IN_COLLAB = 'google.colab' in str(get_ipython())
except NameError:
  IN_COLLAB = False

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
# Absolute path of the project root; the directory constants used by the rest
# of the notebook are derived from this value.
MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"

In [3]:
# Colab only: mount Google Drive at /content/drive/ (MY_HOME_ABS_PATH points
# below that mount point). The import is kept inside the branch because the
# google.colab package is only needed in that environment.
if IN_COLLAB:
    from google.colab import drive
    drive.mount('/content/drive/')

Mounted at /content/drive/


## Import Modules

In [None]:
# install required modules quietly
required_packages = ['geopandas', 'pyspark', 'azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

In [None]:
# Imports and display configuration for the whole notebook.
# Statement order matters in this cell: the environment variable is set before
# pyspark.pandas is imported, and sys.path is extended before the project-local
# modules are imported.
import os
# Set before the pyspark.pandas import below — presumably to silence its
# PyArrow timezone warning; TODO confirm against the pyspark documentation.
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import math
import json

# NOTE: `pd` is pyspark.pandas (the pandas API on Spark), not plain pandas.
import pyspark.pandas as pd
from calendar import monthrange
from datetime import datetime
from io import BytesIO

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Load local custom modules
# On Colab the working directory is first switched to the project root so the
# relative tools path resolves under it, and that path is put at the front of
# sys.path. Elsewhere the path is resolved against the current working
# directory and appended at the end of sys.path.
import sys
if IN_COLLAB:
  os.chdir(MY_HOME_ABS_PATH)
  sys.path.insert(0,os.path.abspath("./code/src/tools"))
else:
  sys.path.append(os.path.abspath("./code/src/tools"))

# Project-local helpers found through the sys.path entry added above.
from CloudIO.AzStorageClient import AzStorageClient
# NOTE(review): wildcard import — the names it provides are not visible here.
from edahelpers import *

# Display options: up to 500 rows, no explicit column limit (None — presumably
# "unlimited"; verify for pyspark.pandas), floats rendered with 5 decimals.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [None]:
# Start a Spark session (or reuse an existing one) with the "local[*]" master.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Leave the session as the last expression so the notebook renders its summary.
spark

## Define Local File System Constants

In [None]:
# Local directory layout, all derived from MY_HOME_ABS_PATH.
# os.path.join is used rather than manual `+ os.sep +` concatenation so a
# trailing separator in MY_HOME_ABS_PATH cannot produce doubled separators.
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.path.join(root_dir, '.tmp')    # local cache for downloaded data
data_dir = os.path.join(root_dir, 'data')
cred_dir = os.path.join(root_dir, '.cred')
az_cred_file = os.path.join(cred_dir, 'azblobcred.json')

# Raw half-hourly data lives in a shared Drive folder on Colab; everywhere else
# it is read from the local cache directory.
if IN_COLLAB:
  raw_data_dir = "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data/half_hourly_data"
else:
  raw_data_dir = tmp_dir

site_metadata_filename = os.path.join(data_dir, 'site-metadata.csv')


## Load Data From Azure If Needed
Download the dataset to the project's `.tmp` directory if there is no local copy.

In [None]:
# Parquet File Properties
tag = "1_raw"
container = "gold-samples-data"
blob_name = f"gold_samples_trim_v_{tag}.parquet"

# Single source of truth for the cached file location: the existence check,
# the write after download and the read of the cached copy must all use the
# same path, otherwise the cache is never hit.
local_parquet_path = os.path.join(tmp_dir, blob_name)

# Download from Azure only if there is no local copy
if os.path.exists(local_parquet_path):
    data_df = pd.read_parquet(local_parquet_path)
else:
    # Create the cache directory (and any missing parents) if needed.
    os.makedirs(tmp_dir, exist_ok=True)
    azStorageClient = AzStorageClient(az_cred_file)
    file_stream = azStorageClient.downloadBlob2Stream(container, blob_name)
    data_df = pd.read_parquet(file_stream, engine='pyarrow')
    # Persist the download so later runs can skip the Azure round trip.
    data_df.to_parquet(local_parquet_path)

print(f"size: {data_df.shape}")
data_df.head()

size: (1485926, 36)


Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,GPP_NT_VUT_REF,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,koppen_main,koppen_name,koppen_main_name,c3c4,c4_percent
0,5.311,25.016,272.218,1.708,0.0,97.939,-0.53574,2001-01-01 08:30:00,2001,1,1,8,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue,270.0,43.7413,3.5957,8,3,Csa,Temperate,C3,6.59
1,5.744,59.734,272.218,1.738,0.0,97.939,0.86438,2001-01-01 09:00:00,2001,1,1,9,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,0,FR-Pue,270.0,43.7413,3.5957,8,3,Csa,Temperate,C3,6.59
2,6.176,91.235,272.218,1.767,0.0,97.939,-0.02627,2001-01-01 09:30:00,2001,1,1,9,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue,270.0,43.7413,3.5957,8,3,Csa,Temperate,C3,6.59
3,6.608,79.264,333.933,1.797,0.05,97.939,-0.17229,2001-01-01 10:00:00,2001,1,1,10,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,0,FR-Pue,270.0,43.7413,3.5957,8,3,Csa,Temperate,C3,6.59
4,7.043,94.929,333.933,1.817,0.0,97.923,1.20865,2001-01-01 10:30:00,2001,1,1,10,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue,270.0,43.7413,3.5957,8,3,Csa,Temperate,C3,6.59


# Check Missing Value

In [None]:
# Count missing values per column and express them as a share of all records.
total_record_count = data_df.shape[0]
na_df = pd.DataFrame(data_df.isna().sum()).rename(columns={0: "count"})

# Scale to 0-100 so the values match the column name "percentage" and the
# per-site missing-data figures, which are also expressed in percent.
na_df["percentage"] = 100 * na_df["count"] / total_record_count

# Show only the columns that actually have missing values, worst first.
na_df.loc[na_df["count"] != 0].sort_values("percentage", ascending=False)

Unnamed: 0,count,percentage


## Get NA percentage per Feature/Site

In [None]:
# Build a (site x feature) table with the percentage of missing values.
sites = data_df['site_id'].unique()
features = data_df.columns

# Every column except the grouping key `site_id` is checked for missing data.
# The table's columns are built from this same list so that each per-site row
# lines up with them exactly and no feature is silently left out.
na_columns = [col for col in features if col != 'site_id']
plot_data = pd.DataFrame(columns=na_columns)

for s in sites:
    site_df = data_df[data_df['site_id'] == s]
    # Share of missing rows per feature for this site, in percent (2 decimals).
    plot_data.loc[s] = (
        100 * (site_df[na_columns].isna().sum() / site_df.shape[0])
    ).round(2)

# Drop columns with no missing data
plot_df = plot_data.loc[:, (plot_data.sum(axis=0) != 0)]

# Release the intermediates; only plot_df is needed by the plotting cell.
del sites
del plot_data
del site_df

In [None]:
# Heatmap of the per-site missing-data percentages held in plot_df
# (rows: sites, columns: features).
fig = px.imshow(
    plot_df,
    color_continuous_scale='amp',
    text_auto=True,
    title="Proportions of Missing Data per Feature and Site",
    labels={"x": "Features", "y": "Site ID", "color": "Missing<br>Data(%)"},
)

# Hover text: "<site> (<feature>)" followed by the missing percentage.
fig.update(data=[{'hovertemplate': '%{y} (%{x})<br>Missing: %{z}%'}])

# Figure size, margins, title position and a compact fixed-size colour bar.
fig.update_layout(
    height=700,
    width=800,
    margin=dict(r=0, t=75, l=0, b=0),
    title=dict(y=0.95, x=0.025),
    coloraxis_colorbar={
        "title_font_size": 14,
        "tickfont_size": 12,
        "lenmode": "pixels",
        "len": 400,
        "thicknessmode": "pixels",
        "thickness": 15,
    },
)

fig.show()

In [None]:
# Drop the missing-value tables from the namespace now that they are plotted.
del plot_df, na_df

# ERA Features EDA

In [None]:
# Reload the cached parquet file, keeping only the site id, the ERA features
# and the GPP target column.
features = ['site_id', 'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA']
target_variable = ['GPP_NT_VUT_REF']
data_df = pd.read_parquet(
    f"{tmp_dir}{os.sep}{blob_name}",
    columns=[*features, *target_variable],
)

In [None]:
# Give every site an integer id: its position in the `sites` list.
sites = list(data_df['site_id'].unique())

# Use a lookup table rather than calling `sites.index(...)` for each row:
# list.index rescans the site list on every call, while the dictionary
# resolves each row with a single lookup.
site_index = {site: idx for idx, site in enumerate(sites)}
data_df['site_num'] = data_df['site_id'].map(site_index)

In [None]:
# Display the columns of the reloaded frame: site id, ERA features, GPP target
# and the derived site_num column.
data_df.columns

Index(['site_id', 'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA',
       'PA_ERA', 'GPP_NT_VUT_REF', 'site_num'],
      dtype='object')

In [None]:
# Pairwise scatter plots of the ERA features and GPP on a random sample of
# 2000 rows, coloured by site. The target column is renamed to "GPP" so the
# axis labels stay short.
features = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA']

fig = px.scatter_matrix(
    data_df.rename(columns={'GPP_NT_VUT_REF': 'GPP'}).sample(2000),
    dimensions=[*features, 'GPP'],
    color="site_id",
)

# Keep only the lower triangle: hide the diagonal and the mirrored upper half.
fig.update_traces(diagonal_visible=False, showupperhalf=False)
fig.update_layout(height=1000, title_text="Scatter Matrix on ERA Features and GPP")
fig.show()

In [None]:
# One GPP-vs-feature scatter panel per ERA feature, laid out on a 2x3 grid.
# Each panel draws its own random sample of 1000 rows and colours the markers
# by the numeric site id.
fig = make_subplots(
    rows=2,
    cols=3,
    shared_yaxes=True,
    subplot_titles=features,
)

for panel_idx, feature_name in enumerate(features):
    # Fill the grid row by row; plotly rows and columns are 1-based.
    grid_row, grid_col = divmod(panel_idx, 3)
    feature_df = data_df[[feature_name, 'GPP_NT_VUT_REF', 'site_num']].sample(n=1000)
    panel_trace = go.Scatter(
        x=feature_df[feature_name],
        y=feature_df['GPP_NT_VUT_REF'],
        mode="markers",
        showlegend=True,
        marker=dict(color=feature_df['site_num']),
    )
    fig.add_trace(panel_trace, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(
    height=600,
    width=800,
    title_text="Side By Side Subplots",
    showlegend=False,
)
fig.show()

In [None]:
# One GPP-vs-feature scatter panel per ERA feature (2x3 grid), with a separate
# trace and colour per site so the sites can be told apart in every panel.
features = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA']

fig = make_subplots(rows=2, cols=3, shared_yaxes=True,
                    subplot_titles=features)
color_swatches = px.colors.qualitative.Dark24
max_points_per_site = 100

for i, s in enumerate(sites):
    site_rows = data_df[data_df['site_id'] == s]
    # Cap the sample size at the number of available rows: asking for more
    # rows than a site has would raise instead of plotting what is there.
    site_df = site_rows.sample(n=min(max_points_per_site, len(site_rows)))
    # Wrap around the palette so that having more sites than swatches reuses
    # colours instead of failing with an IndexError.
    site_color = color_swatches[i % len(color_swatches)]
    for j, f in enumerate(features):
        # Fill the grid row by row; plotly rows and columns are 1-based.
        r, c = divmod(j, 3)
        fig.add_trace(
            go.Scatter(x=site_df[f], y=site_df['GPP_NT_VUT_REF'],
                       mode="markers",
                       # One legend entry per site: only its first panel's trace.
                       showlegend=(j == 0),
                       opacity=0.5, marker_color=site_color,
                       name=s),
            row=r + 1, col=c + 1
        )

fig.update_layout(height=600, width=800, title_text="Side By Side Subplots")
fig.show()