In [1]:
from calendar import monthrange
from datetime import datetime
import pandas as pd
from pandas.api.types import CategoricalDtype
from io import BytesIO
import os
import json

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load locale custome modules
import sys
sys.path.append(os.path.abspath("../tools"))
from CloudIO.AzStorageClient import AzStorageClient
from edahelpers import *

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [2]:
root_dir =  os.path.abspath(os.getcwd()+"..\\..\\..\\..\\")
tmp_dir = root_dir + "\\.tmp"
data_dir = root_dir + "\\data\\"
cred_dir = root_dir + "\\.cred"
az_cred_file = cred_dir + '\\azblobcred.json'

site_metadata_filename = data_dir + "site-metadata.csv"

# Output
container = "gold-samples-data"
# blob_name = "gold_samples_full_data.parquet"
blob_name = "gold_samples_trim_data_0206.parquet" #Advisor suggested features only

In [3]:
# "Golden" Sites
tier1_sites = ["US-MMS", "US-Vcp", "FR-Pue", "CH-Lae", "US-Var", "US-Ne2", "ES-LJu", "US-Ton"]
tier2_sites = ["US-UMB", "US-Me2", "FI-Hyy", "US-NR1", "IT-Lav", "US-Wkg", "US-ARM", "US-SRM"]

target_sites = tier1_sites + tier2_sites

# Get Gold Sample Site Data

In [4]:
# Load site metadata
included_site_features = ['site_id', 'filename','IGBP', 'elevation', 'lat', 'long',
                          'koppen_sub', 'koppen_main', 'koppen_name', 'koppen_main_name',
                          'c3c4', 'c4_percent']
site_metadata_df = pd.read_csv(site_metadata_filename, usecols = included_site_features)

# only focus on target sites
site_metadata_df= site_metadata_df.loc[site_metadata_df['site_id'].isin(target_sites)]
print(f"size:{site_metadata_df.shape}")
site_metadata_df.reset_index(inplace=True, drop=True)
site_metadata_df

size:(16, 12)


Unnamed: 0,site_id,IGBP,elevation,lat,long,koppen_sub,koppen_main,koppen_name,koppen_main_name,c3c4,c4_percent,filename
0,FR-Pue,EBF,270.0,43.7413,3.5957,8,3,Csa,Temperate,C3,6.59,data_full_half_hourly_raw_v0_1_FR-Pue.csv
1,US-NR1,ENF,3050.0,40.0329,-105.5464,27,4,Dfc,Cold,C3,0.35,data_full_half_hourly_raw_v0_1_US-NR1.csv
2,US-Ne2,CRO,362.0,41.16487,-96.4701,25,4,Dfa,Cold,rotation,48.91,
3,US-SRM,WSA,1120.0,31.8214,-110.8661,6,2,BSh,Arid,C3,55.39,data_full_half_hourly_raw_v0_1_US-SRM.csv
4,US-Ton,WSA,177.0,38.4316,-120.96598,8,3,Csa,Temperate,C3,0.0,data_full_half_hourly_raw_v0_1_US-Ton.csv
5,US-Var,GRA,129.0,38.4133,-120.9507,8,3,Csa,Temperate,C3,0.0,data_full_half_hourly_raw_v0_1_US-Var.csv
6,US-Wkg,GRA,1531.0,31.7365,-109.9419,7,2,BSk,Arid,C4,40.93,data_full_half_hourly_raw_v0_1_US-Wkg.csv
7,US-ARM,CRO,314.0,36.6058,-97.4888,14,3,Cfa,Temperate,mix,15.97,data_full_half_hourly_raw_v0_1_US-ARM.csv
8,US-MMS,DBF,275.0,39.3232,-86.4131,25,4,Dfa,Cold,C3,42.28,
9,US-Me2,ENF,1253.0,44.4523,-121.5574,18,4,Dsb,Cold,C3,0.03,data_full_half_hourly_raw_v0_1_US-Me2.csv


In [5]:
all_features = ['TIMESTAMP_START', 'TIMESTAMP_END', 'TA_F', 'TA_F_QC', 'TA_ERA',
       'SW_IN_POT', 'SW_IN_F', 'SW_IN_F_QC', 'SW_IN_ERA', 'LW_IN_F',
       'LW_IN_F_QC', 'LW_IN_ERA', 'VPD_F', 'VPD_F_QC', 'VPD_ERA', 'P_F',
       'P_F_QC', 'P_ERA', 'PA_F', 'PA_F_QC', 'PA_ERA', 'NETRAD', 'PPFD_IN',
       'G_F_MDS', 'G_F_MDS_QC', 'LE_F_MDS', 'LE_F_MDS_QC', 'LE_CORR',
       'H_F_MDS', 'H_F_MDS_QC', 'H_CORR', 'NEE_VUT_REF', 'NEE_VUT_REF_QC',
       'NEE_CUT_REF', 'NEE_CUT_REF_QC', 'GPP_NT_VUT_REF', 'GPP_DT_VUT_REF',
       'GPP_NT_CUT_REF', 'GPP_DT_CUT_REF', 'RECO_NT_VUT_REF',
       'RECO_DT_VUT_REF', 'RECO_NT_CUT_REF', 'RECO_DT_CUT_REF', 'datetime',
       'year', 'month', 'day', 'hour', 'SITE_ID', 'date', 'NEE_VUT_REF_qa',
       'SW_DIF', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6',
       'b7', 'IGBP', 'koppen']
included_features = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
                     'NETRAD', 'PPFD_IN', 'NEE_VUT_REF', 'NEE_VUT_REF_QC',
                     'datetime', 'year', 'month', 'day', 'hour', 'date',
                     'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 
                     'IGBP', 'koppen']
target_variable = ['GPP_NT_VUT_REF']
qc_flag_dtype = CategoricalDtype([0, 1, 2, 3], ordered=True)
qc_flags_features = [s for s in included_features if "_QC" in s]

In [6]:
# Read files
data_df = None
for i, r in site_metadata_df[['site_id','filename']].iterrows():        
    if not r.filename or type(r.filename) != type(""):
        print(f'\nERROR: {r.site_id} is mssing hourly data.')
        continue
    
    # Get only `included_features` from file
    local_filename = tmp_dir + "\\" + r.filename
    site_df = pd.read_csv(local_filename, usecols = included_features +  target_variable)
    site_df['datetime'] = pd.to_datetime(site_df['datetime'])
    site_df['date'] = pd.to_datetime(site_df['date'])
    site_df['minute'] = site_df['datetime'].dt.minute
    site_df[qc_flags_features] = site_df[qc_flags_features].astype(qc_flag_dtype)
    site_df['site_id'] = r.site_id
    
    # Remove zero or negative SW
    site_df.drop(site_df[site_df['SW_IN_ERA'] <= 0].index, inplace = True)
    
    # Drop rows with NAs for Target Variable
    site_df.dropna(subset=target_variable, axis='index', inplace=True)
    
    #TODO: Drop rows with bad QC flags for GPP?
    site_df.drop(['NEE_VUT_REF', 'NEE_VUT_REF_QC'], axis=1, inplace=True)
    
    print(f"{r.site_id}: {site_df.shape}")
    if type(data_df) == type(None):
        data_df = site_df
    else:
        data_df = pd.concat([data_df, site_df])

FR-Pue: (123299, 29)
US-NR1: (135937, 29)

ERROR: US-Ne2 is mssing hourly data.
US-SRM: (95756, 29)
US-Ton: (115546, 29)
US-Var: (122872, 29)
US-Wkg: (93607, 29)
US-ARM: (129363, 29)

ERROR: US-MMS is mssing hourly data.
US-Me2: (109417, 29)
US-UMB: (96484, 29)
US-Vcp: (87395, 29)
CH-Lae: (153502, 29)
ES-LJu: (117181, 29)
FI-Hyy: (216706, 29)
IT-Lav: (158858, 29)


In [7]:
data_df.head()

Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,NETRAD,PPFD_IN,GPP_NT_VUT_REF,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id
11,16.848,18.478,336.152,3.049,0.0,97.965,-27.27,12.08,-0.43772,2000-07-26 05:30:00,2000,7,26,5,2000-07-26,,,,,,,,,,,EBF,Temperate,30,FR-Pue
12,16.932,65.997,336.152,3.134,0.0,97.981,-16.69,74.61,0.87929,2000-07-26 06:00:00,2000,7,26,6,2000-07-26,,,,,,,,,,,EBF,Temperate,0,FR-Pue
13,17.016,114.551,336.152,3.22,0.0,97.996,27.455,88.21,5.44874,2000-07-26 06:30:00,2000,7,26,6,2000-07-26,,,,,,,,,,,EBF,Temperate,30,FR-Pue
14,17.1,252.095,339.113,3.305,0.0,98.012,84.205,274.01,6.43492,2000-07-26 07:00:00,2000,7,26,7,2000-07-26,,,,,,,,,,,EBF,Temperate,0,FR-Pue
15,18.055,326.385,339.113,5.097,0.0,98.029,146.205,387.91,5.70765,2000-07-26 07:30:00,2000,7,26,7,2000-07-26,,,,,,,,,,,EBF,Temperate,30,FR-Pue


In [8]:
data_df.describe()

Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,NETRAD,PPFD_IN,GPP_NT_VUT_REF,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,minute
count,1704611.0,1704611.0,1704611.0,1704611.0,1704611.0,1704611.0,1409825.0,1694537.0,1755923.0,1755923.0,1755923.0,1755923.0,1755923.0,1583379.0,1585494.0,1585494.0,1585536.0,1586560.0,1584243.0,1585482.0,1582550.0,1536679.0,1586253.0,1755923.0
mean,13.51667,358.09109,312.26937,9.24948,0.03987,90.72458,229.39423,698.57499,5.58137,2009.7576,6.48568,15.73964,11.87538,0.30085,0.54377,0.13169,0.07782,0.24083,0.04321,0.07075,0.25498,0.19813,0.11535,14.99975
std,9.66231,266.27605,48.38676,9.36177,0.18144,9.15296,225.46301,583.37977,7.48933,5.15445,3.13876,8.81063,3.86598,0.13447,0.2197,0.07177,0.05632,0.06185,0.0415,0.04346,0.07028,0.09334,0.07817,15.0
min,-29.74,0.001,142.77,0.0,0.0,67.281,-365.20333,-91.4,-49.7372,1996.0,1.0,1.0,0.0,-0.11958,-0.18252,-0.01715,0.0,0.0305,0.0,0.0,0.0,0.0132,0.0,0.0
25%,6.888,120.677,279.488,2.596,0.0,85.237,33.5736,170.2,0.34248,2006.0,4.0,8.0,9.0,0.20765,0.34645,0.08228,0.0325,0.1975,0.019,0.0413,0.1993,0.1179,0.0497,0.0
50%,13.737,317.127,313.775,6.07,0.0,93.312,179.3,563.953,2.86745,2010.0,6.0,16.0,12.0,0.29014,0.57728,0.11928,0.0618,0.23,0.0331,0.0618,0.2565,0.1824,0.0944,0.0
75%,20.341,561.5975,346.2925,12.521,0.0,98.737,396.15,1135.7803,9.059,2014.0,9.0,23.0,15.0,0.36221,0.71267,0.15995,0.1152,0.2771,0.0544,0.0908,0.3132,0.2761,0.1786,30.0
max,42.587,1094.341,473.011,75.684,15.493,103.383,1035.3656,2527.13,85.0309,2020.0,12.0,31.0,23.0,2.38835,1.0,0.42385,0.7971,0.7849,0.7689,0.7865,0.4666,0.428,0.3573,30.0


In [9]:
data_df.shape

(1755923, 29)

In [10]:
data_df.site_id.unique()

array(['FR-Pue', 'US-NR1', 'US-SRM', 'US-Ton', 'US-Var', 'US-Wkg',
       'US-ARM', 'US-Me2', 'US-UMB', 'US-Vcp', 'CH-Lae', 'ES-LJu',
       'FI-Hyy', 'IT-Lav'], dtype=object)

## Merge with Site Metadata

In [11]:
data_df = data_df.merge(site_metadata_df.drop(['filename'], axis=1), how='left', left_on='site_id', right_on='site_id')
data_df.head()

Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,NETRAD,PPFD_IN,GPP_NT_VUT_REF,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP_x,koppen,minute,site_id,IGBP_y,elevation,lat,long,koppen_sub,koppen_main,koppen_name,koppen_main_name,c3c4,c4_percent
0,16.848,18.478,336.152,3.049,0.0,97.965,-27.27,12.08,-0.43772,2000-07-26 05:30:00,2000,7,26,5,2000-07-26,,,,,,,,,,,EBF,Temperate,30,FR-Pue,EBF,270.0,43.7413,3.5957,8,3,Csa,Temperate,C3,6.59
1,16.932,65.997,336.152,3.134,0.0,97.981,-16.69,74.61,0.87929,2000-07-26 06:00:00,2000,7,26,6,2000-07-26,,,,,,,,,,,EBF,Temperate,0,FR-Pue,EBF,270.0,43.7413,3.5957,8,3,Csa,Temperate,C3,6.59
2,17.016,114.551,336.152,3.22,0.0,97.996,27.455,88.21,5.44874,2000-07-26 06:30:00,2000,7,26,6,2000-07-26,,,,,,,,,,,EBF,Temperate,30,FR-Pue,EBF,270.0,43.7413,3.5957,8,3,Csa,Temperate,C3,6.59
3,17.1,252.095,339.113,3.305,0.0,98.012,84.205,274.01,6.43492,2000-07-26 07:00:00,2000,7,26,7,2000-07-26,,,,,,,,,,,EBF,Temperate,0,FR-Pue,EBF,270.0,43.7413,3.5957,8,3,Csa,Temperate,C3,6.59
4,18.055,326.385,339.113,5.097,0.0,98.029,146.205,387.91,5.70765,2000-07-26 07:30:00,2000,7,26,7,2000-07-26,,,,,,,,,,,EBF,Temperate,30,FR-Pue,EBF,270.0,43.7413,3.5957,8,3,Csa,Temperate,C3,6.59


In [12]:
if data_df[data_df['IGBP_x'] != data_df['IGBP_y']].shape[0] != 0:
     raise Exception("Mismatched IGBP")
else:
    data_df.drop(['IGBP_y'], axis=1, inplace=True)
    data_df.rename(columns={'IGBP_x':'IGBP'}, inplace=True)
    display(data_df)

Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,NETRAD,PPFD_IN,GPP_NT_VUT_REF,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,koppen_main,koppen_name,koppen_main_name,c3c4,c4_percent
0,16.84800,18.47800,336.15200,3.04900,0.00000,97.96500,-27.27000,12.08000,-0.43772,2000-07-26 05:30:00,2000,7,26,5,2000-07-26,,,,,,,,,,,EBF,Temperate,30,FR-Pue,270.00000,43.74130,3.59570,8,3,Csa,Temperate,C3,6.59000
1,16.93200,65.99700,336.15200,3.13400,0.00000,97.98100,-16.69000,74.61000,0.87929,2000-07-26 06:00:00,2000,7,26,6,2000-07-26,,,,,,,,,,,EBF,Temperate,0,FR-Pue,270.00000,43.74130,3.59570,8,3,Csa,Temperate,C3,6.59000
2,17.01600,114.55100,336.15200,3.22000,0.00000,97.99600,27.45500,88.21000,5.44874,2000-07-26 06:30:00,2000,7,26,6,2000-07-26,,,,,,,,,,,EBF,Temperate,30,FR-Pue,270.00000,43.74130,3.59570,8,3,Csa,Temperate,C3,6.59000
3,17.10000,252.09500,339.11300,3.30500,0.00000,98.01200,84.20500,274.01000,6.43492,2000-07-26 07:00:00,2000,7,26,7,2000-07-26,,,,,,,,,,,EBF,Temperate,0,FR-Pue,270.00000,43.74130,3.59570,8,3,Csa,Temperate,C3,6.59000
4,18.05500,326.38500,339.11300,5.09700,0.00000,98.02900,146.20500,387.91000,5.70765,2000-07-26 07:30:00,2000,7,26,7,2000-07-26,,,,,,,,,,,EBF,Temperate,30,FR-Pue,270.00000,43.74130,3.59570,8,3,Csa,Temperate,C3,6.59000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1755918,,,,,,,,-0.25343,1.38967,2020-12-31 21:30:00,2020,12,31,21,2020-12-31,0.30255,0.26341,0.08440,0.18680,0.32040,0.17830,0.18210,0.09780,0.03050,0.01600,ENF,Cold,30,IT-Lav,1353.00000,45.95620,11.28132,26,4,Dfb,Cold,C3,3.57000
1755919,,,,,,,,-0.24970,1.37019,2020-12-31 22:00:00,2020,12,31,22,2020-12-31,0.30255,0.26341,0.08440,0.18680,0.32040,0.17830,0.18210,0.09780,0.03050,0.01600,ENF,Cold,0,IT-Lav,1353.00000,45.95620,11.28132,26,4,Dfb,Cold,C3,3.57000
1755920,,,,,,,,-0.25716,1.38881,2020-12-31 22:30:00,2020,12,31,22,2020-12-31,0.30255,0.26341,0.08440,0.18680,0.32040,0.17830,0.18210,0.09780,0.03050,0.01600,ENF,Cold,30,IT-Lav,1353.00000,45.95620,11.28132,26,4,Dfb,Cold,C3,3.57000
1755921,,,,,,,,-0.25343,1.38905,2020-12-31 23:00:00,2020,12,31,23,2020-12-31,0.30255,0.26341,0.08440,0.18680,0.32040,0.17830,0.18210,0.09780,0.03050,0.01600,ENF,Cold,0,IT-Lav,1353.00000,45.95620,11.28132,26,4,Dfb,Cold,C3,3.57000


## Upload Data to Azure Storage Blob as Parquet
**Run with Caution!!!**

In [13]:
# Upload to Azure Storage Blob
# ref: https://stackoverflow.com/a/54666079
parquet_file = BytesIO()
data_df.to_parquet(parquet_file, engine='pyarrow')
parquet_file.seek(0)

azStorageClient = AzStorageClient(az_cred_file)
azStorageClient.uploadBlob(container, blob_name, parquet_file, overwrite=True)

File uploaded to gold-samples-data/gold_samples_trim_data_0206.parquet


## View NA Data

In [16]:
total_record_count = data_df.shape[0]
na_df = pd.DataFrame(data_df.isna().sum())
na_df["percentage"] = (na_df / total_record_count)
na_df.rename(columns={0:"count"}, inplace=True)

In [17]:
na_df.loc[(na_df['count'] != 0)].sort_values("percentage", ascending=False)

Unnamed: 0,count,percentage
NETRAD,346098,0.197103
b6,219244,0.12486
b5,173373,0.098736
EVI,172544,0.098264
b3,171680,0.097772
b4,170441,0.097066
NDVI,170429,0.097059
NIRv,170429,0.097059
b1,170387,0.097036
b7,169670,0.096627


## Get NA Percentage per Sites

In [18]:
sites = data_df['site_id'].unique()
features =  data_df.columns
plot_data = pd.DataFrame(columns=features[:-1])

for i, s in enumerate(data_df['site_id'].unique()):

    site_df = data_df[data_df['site_id'] == s]
    
    site_na_df = pd.DataFrame(site_df.isna().sum())
    site_na_df["percentage"] = round( 100*(site_na_df / site_df.shape[0]), 2)
    site_na_df.drop(["site_id"], inplace=True)
    #display(site_na_df.T)
    site_na_df.drop([0], axis=1, inplace=True)
    
    plot_data.loc[s] = site_na_df["percentage"]

plot_data

Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,NETRAD,PPFD_IN,GPP_NT_VUT_REF,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,koppen_main,koppen_name,koppen_main_name,c3c4
FR-Pue,0.0,0.0,0.0,0.0,0.0,0.0,1.8,8.27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.75,3.71,3.71,3.71,3.71,3.75,3.71,4.1,4.31,3.74,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
US-NR1,0.0,0.0,0.0,0.0,0.0,0.0,0.73,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.58,20.51,20.51,20.51,20.51,20.58,20.54,20.73,27.11,20.51,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
US-SRM,0.0,0.0,0.0,0.0,0.0,0.0,1.4,0.39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.07,0.07,0.07,0.07,0.13,0.07,0.07,0.18,0.13,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
US-Ton,0.0,0.0,0.0,0.0,0.0,0.0,2.49,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.83,1.99,0.8,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
US-Var,0.0,0.0,0.0,0.0,0.0,0.0,2.47,0.22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.54,1.54,1.54,1.54,1.54,1.54,1.54,1.54,2.29,1.54,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
US-Wkg,0.0,0.0,0.0,0.0,0.0,0.0,0.59,0.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.26,0.26,0.26,0.26,0.26,0.26,0.26,0.26,0.31,0.26,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
US-ARM,0.0,0.0,0.0,0.0,0.0,0.0,9.58,8.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.11,1.11,1.11,1.11,1.11,1.11,1.11,1.21,2.36,1.11,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
US-Me2,0.0,0.0,0.0,0.0,0.0,0.0,2.6,1.72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.22,5.22,5.22,5.22,5.22,5.22,5.22,5.41,8.11,5.24,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
US-UMB,0.0,0.0,0.0,0.0,0.0,0.0,5.95,2.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.07,23.04,23.04,23.04,23.04,23.07,23.04,23.9,25.75,23.04,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
US-Vcp,0.0,0.0,0.0,0.0,0.0,0.0,30.42,7.59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.58,5.55,5.55,5.55,5.52,5.58,5.55,6.22,9.11,5.52,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
plot_df = plot_data.loc[:, (plot_data.sum(axis=0) != 0)]
plot_df.shape

(14, 18)

In [21]:
fig = px.imshow(plot_df,
                color_continuous_scale = 'amp',
                text_auto=False,
                title= "Proportions of Missing Data per Feature and Site",
                labels=dict(x="Features", y="Site ID",
                            color="Missing<br>Data(%)"),
               )
fig.update(data=[{'hovertemplate': '%{y} (%{x})<br>Missing: %{z}%'}])
fig.update_layout(
    height=500, width=600,
    margin={"r":0,"t":75,"l":0,"b":0},
    title={'y':0.95,'x':0.025},
    coloraxis_colorbar=dict(
        title_font_size = 14,
        tickfont_size = 12,
        lenmode="pixels", len=300,
        thicknessmode="pixels", thickness=15
    )
)

fig.show()