# Calculate stats for df_tw and df_tw_cloud in the training and validation periods

In [1]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
import cartopy.crs as ccrs
import cartopy

In [2]:
# Lake surface water temperature series indexed by date, one column per lake.
# Point the second path at another cloud-gapped product to change the
# calibration input.
csv_read_options = {"index_col": 0, "parse_dates": True}
df_tw = pd.read_csv(
    "/nas/cee-hydro/laketemp_bias/era5land/water_temp.csv", **csv_read_options
)
df_tw_cloud = pd.read_csv(
    "/nas/cee-hydro/laketemp_bias/era5land/water_temp_cloud_25.csv", **csv_read_options
)

# Calendar-day ranges of the experiment periods (both end dates included).
train_period = pd.date_range(start="2000-01-01", end="2014-12-31")
val_period = pd.date_range(start="2015-01-01", end="2023-12-31")
total_period = pd.date_range(start="2000-01-01", end="2023-12-31")

# Cut each series into its training and validation rows. Selecting with the
# complete list of dates raises KeyError when a date is absent from the index.
df_tw_train, df_tw_val = (df_tw.loc[period] for period in (train_period, val_period))
df_tw_cloud_train, df_tw_cloud_val = (
    df_tw_cloud.loc[period] for period in (train_period, val_period)
)

In [3]:
# IDs of the CCI lakes to analyse, in the order they appear in the depth table.
lake_depth_table = pd.read_csv("../data/cci_lakes_hydrolake_depth.csv")
cci_lake_list = lake_depth_table["CCI ID"].to_numpy()

# Metadata rows for exactly those lakes, kept in the same order as the ID list.
cci_lake_metadata = pd.read_csv(
    "../data/ESA_CCI_static_lake_mask_v2_1km_UoR_metadata_fv2.1_06Oct2021_4laketemp.csv",
    index_col=0,
)
cci_lakes = cci_lake_metadata.loc[cci_lake_list]

# One point per lake, built from the centre longitude/latitude columns.
lake_centre_points = gpd.points_from_xy(cci_lakes["LON CENTRE"], cci_lakes["LAT CENTRE"])
cci_lakes_gdf = gpd.GeoDataFrame(cci_lakes, geometry=lake_centre_points, crs="epsg:4326")
cci_lakes_gdf.index.name = "cci_lake_id"

Calculate temperature statistics for the training and validation periods.

In [4]:
def _annual_max_mean(df_temp):
    """Return, per column, the mean over calendar years of the yearly maximum."""
    return df_temp.groupby(df_temp.index.year).max().mean().to_numpy()


def _check_lake_order(df_temp, lake_index, label):
    """Raise ValueError unless the columns of ``df_temp`` follow ``lake_index``.

    The statistics below are attached to the lakes by position, so a frame
    whose columns are ordered differently would silently assign values to the
    wrong lake. Labels are compared as strings so that text column labels
    (e.g. CSV headers) can be matched against a numeric lake index.
    """
    if df_temp.columns.astype(str).tolist() != lake_index.astype(str).tolist():
        raise ValueError(f"columns of {label} do not match the lake index order")


def _temperature_stats(df_full, df_cloud, lake_index):
    """Build the per-lake statistics table for one period.

    Parameters
    ----------
    df_full : DataFrame of temperatures, date index, one column per lake.
    df_cloud : same layout as ``df_full`` but with cloud gaps (NaN).
    lake_index : index of the returned frame; must list the lakes in the same
        order as the columns of both input frames.

    Returns
    -------
    DataFrame indexed by ``lake_index`` with the columns ``obs_tmax``,
    ``obs_mean``, ``obs_cloud_tmax`` and ``obs_cloud_mean``.

    Raises
    ------
    ValueError if the columns of either frame do not match ``lake_index``.
    """
    _check_lake_order(df_full, lake_index, "the full series")
    _check_lake_order(df_cloud, lake_index, "the cloud-gapped series")
    return pd.DataFrame(
        {
            # full series
            "obs_tmax": _annual_max_mean(df_full),
            "obs_mean": df_full.mean().to_numpy(),
            # cloud-gapped series
            "obs_cloud_tmax": _annual_max_mean(df_cloud),
            "obs_cloud_mean": df_cloud.mean().to_numpy(),
        },
        index=lake_index,
    )


# one row per lake, for the training and for the validation period
obs_stats_train = _temperature_stats(df_tw_train, df_tw_cloud_train, cci_lakes_gdf.index)
obs_stats_val = _temperature_stats(df_tw_val, df_tw_cloud_val, cci_lakes_gdf.index)

# valid proportion (training period only): number of non-missing values in the
# cloud-gapped series divided by the number of non-missing values in the full series
obs_stats_train["valid_proportion"] = (
    df_tw_cloud_train.count().to_numpy() / df_tw_train.count().to_numpy()
)

In [None]:
# calculate stats for the total period (training + validation)
# Restrict both series to total_period so the statistics cover the declared
# period even if the CSV files hold additional dates.
df_tw_total = df_tw.loc[total_period]
df_tw_cloud_total = df_tw_cloud.loc[total_period]

obs_stats_total = pd.DataFrame([], index=cci_lakes_gdf.index)

# full: mean of the yearly maxima, and overall mean, per lake
obs_stats_total["obs_tmax"] = (
    df_tw_total.groupby(df_tw_total.index.year).max().mean().to_numpy()
)
obs_stats_total["obs_mean"] = df_tw_total.mean().to_numpy()

# cloud
obs_stats_total["obs_cloud_tmax"] = (
    df_tw_cloud_total.groupby(df_tw_cloud_total.index.year).max().mean().to_numpy()
)
obs_stats_total["obs_cloud_mean"] = df_tw_cloud_total.mean().to_numpy()

# valid proportion: non-missing count of the cloud-gapped series divided by the
# non-missing count of the full series
# note, this is taken from the training period, not from the total period
obs_stats_total["valid_proportion"] = (
    df_tw_cloud_train.count().to_numpy() / df_tw_train.count().to_numpy()
)
obs_stats_total

Unnamed: 0_level_0,obs_tmax,obs_mean,obs_cloud_tmax,obs_cloud_mean,valid_proportion
cci_lake_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
799,32.577165,20.704133,30.538311,17.201105,0.355904
3114,17.119290,7.208574,16.896219,7.405358,0.793575
7889,17.030177,9.133325,16.966602,9.325340,0.806899
2516,15.825278,6.519729,15.704343,6.644480,0.806169
12262,15.727267,6.415000,15.572343,6.621344,0.786275
...,...,...,...,...,...
473,12.967302,2.781468,12.967302,2.802277,0.981201
309,28.204805,11.806721,27.838501,9.525628,0.605402
141,15.830732,4.017723,15.830732,4.038030,0.981201
212,13.359965,1.700454,12.990317,1.701121,0.596642


In [8]:
# Show the cloud-gapped series for a visual check; NaN cells are missing values.
df_tw_cloud

Unnamed: 0,799,3114,7889,2516,12262,1519,3053,1203,3350,3607,...,300000430,278,293,300000771,378,473,309,141,212,170
2000-01-01,,,,,,,,,,,...,,,,,,,,,,
2000-01-02,,,,,,,,,,,...,,,,,,,,,,
2000-01-03,,,,,,,,,,,...,,,,,,,,,,
2000-01-04,,,,,,,,,,,...,,,,,,,,,,
2000-01-05,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-27,,3.625541,7.035598,4.104998,2.925948,7.781769,21.977073,4.061035,,,...,,6.969979,18.580343,1.398393,3.148971,-0.000342,0.764114,-0.000342,-0.000342,-0.000342
2023-12-28,15.307682,3.667067,,4.063137,2.896821,7.396017,22.026259,,,21.951714,...,,6.988318,17.668394,1.423019,2.999573,-0.000342,,-0.000342,,-0.000342
2023-12-29,15.418274,,6.235015,,,,22.057891,,4.106709,,...,,,16.474875,1.389183,2.998324,-0.000342,,-0.000342,,-0.000342
2023-12-30,,3.492535,5.829605,3.927026,2.643310,6.596595,22.167544,,4.164867,22.942908,...,-0.000342,,14.418345,1.189809,3.031629,-0.000342,0.705289,-0.000342,,-0.000342


In [6]:
# Write each statistics table (lake id as the first CSV column) to ../data.
for stats_frame, csv_path in (
    (obs_stats_train, "../data/obs_stats_train.csv"),
    (obs_stats_val, "../data/obs_stats_val.csv"),
    (obs_stats_total, "../data/obs_stats_total.csv"),
):
    stats_frame.to_csv(csv_path)