# Calculate stats for df_tw and df_tw_cloud in traininig and testing period

In [1]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
import cartopy.crs as ccrs
import cartopy
os.chdir("..")
import tools.marineHeatWaves as mhw

In [2]:
# load cci lakes
cci_lake_list = pd.read_csv("data/cci_lakes_hydrolake_depth.csv")["CCI ID"].to_numpy()
cci_lakes = pd.read_csv("data/ESA_CCI_static_lake_mask_v2_1km_UoR_metadata_fv2.1_06Oct2021_4laketemp.csv", index_col=0).loc[cci_lake_list]
cci_lakes_gdf = gpd.GeoDataFrame(cci_lakes, geometry=gpd.points_from_xy(cci_lakes['LON CENTRE'], cci_lakes['LAT CENTRE']),
                                crs="epsg:4326")
cci_lakes_gdf.index.name = "cci_lake_id"

# Metrics
- Annual mean
- Warmest month temperature
- Coldest month temperature

In [3]:
# northern hemisphere
northern_lakes = cci_lakes_gdf.loc[cci_lakes_gdf["LON CENTRE"] > 0].index.astype(str).to_numpy()
southern_lakes = cci_lakes_gdf.loc[cci_lakes_gdf["LON CENTRE"] < 0].index.astype(str).to_numpy()

def cal_warmest_month_mean(df):
    """
    1. Find the warmest month for each year.
    2. Calculate the average temperature of the warmest months.
    """
    # monthly mean across df period
    monthly_temperature = df.resample("ME").mean()
    # for each year, find the warmest month mean temperature
    # then calculate mean across the df period
    output_df = monthly_temperature.groupby(monthly_temperature.index.year).max().mean()
    
    output_df.index.name = 'cci_lake_id'
    # output_df.index = output_df.index.astype(int)
    return output_df

def cal_summer_mean(df):
    # northern lakes
    df_north = df.loc[:, northern_lakes]
    df_north = df_north.loc[df_north.index.month.isin([6,7,8])]
    df_north_summer_mean = df_north.mean()
    # southern lakes
    df_south = df.loc[:, southern_lakes]
    df_south = df_south.loc[df_south.index.month.isin([12,1,2])]
    df_south_summer_mean = df_south.mean()
    # merge
    df_summer_mean = pd.concat([df_north_summer_mean, df_south_summer_mean])
    # df_summer_mean.columns = ["summer"]
    df_summer_mean.index.name = 'cci_lake_id'
    # df_summer_mean.index = df_summer_mean.index.astype(int)
    return df_summer_mean

def cal_winter_mean(df):
    # northern lakes
    df_north = df.loc[:, northern_lakes]
    df_north = df_north.loc[df_north.index.month.isin([12,1,2])]
    df_north_winter_mean = df_north.mean()
    # southern lakes
    df_south = df.loc[:, southern_lakes]
    df_south = df_south.loc[df_south.index.month.isin([6,7,8])]
    df_south_winter_mean = df_south.mean()
    # merge
    df_winter_mean = pd.concat([df_north_winter_mean, df_south_winter_mean])
    # df_winter_mean.columns = ["winter"]
    df_winter_mean.index.name = 'cci_lake_id'
    # df_winter_mean.index = df_winter_mean.index.astype(int)
    return df_winter_mean

def cal_coldest_month_mean(df):
    """
    1. Find the coldest month for each year.
    2. Calculate the average temperature of the coldest months.
    """
    # monthly mean across df period
    monthly_temperature = df.resample("ME").mean()
    # for each year, find the warmest month mean temperature
    # then calculate mean across the df period
    output_df = monthly_temperature.groupby(monthly_temperature.index.year).min().mean()
    
    output_df.index.name = 'cci_lake_id'
    # output_df.index = output_df.index.astype(int)
    return output_df

def cal_ice_days(df, 
                 threshold = 0.76):
    '''
    A function to calculate number of days covered by ice
    
    Ice-cover: temperature < threshold
    
    Threshold is determined by the maximum RMSE during ice-covered period across study lakes
    '''
    # create a true/false dataframe
    df_ice = df <= threshold
    
    # sum on each year and calculate the mean across years
    ice_days = df_ice.groupby(df_ice.index.year).sum().mean()
    
    return ice_days

In [4]:
# change here to replace with cloud gapped lake surface water temperature for calibration
df_tw = pd.read_csv("/nas/cee-hydro/laketemp_bias/era5land/water_temp.csv", index_col=0, 
                    parse_dates=True).loc[:, cci_lake_list.astype(str)]
df_tw_cloud = pd.read_csv("/nas/cee-hydro/laketemp_bias/era5land/water_temp_cloud.csv", index_col=0, 
                    parse_dates=True).loc[:, cci_lake_list.astype(str)]

train_period = pd.date_range("2003-01-01", "2017-12-31") # 15 years
val_period = pd.date_range("2018-01-01", "2023-12-31") # 6 years
total_period = pd.date_range("2003-01-01", "2023-12-31")

In [5]:
def cal_obs_stats(period):
    # load df for daily and cloudy
    df = df_tw.loc[period]
    df_cloud = df_tw_cloud.loc[period]
    
    # build a dataframe for obs stats
    obs_stats = pd.DataFrame([], index = cci_lakes_gdf.index.astype(str))
    
    # concat
    obs_stats = pd.concat([obs_stats, 
                           df.mean().rename("obs_mean"), 
                           cal_summer_mean(df).rename("obs_summer_mean"),
                           cal_warmest_month_mean(df).rename("obs_warmest_mean"), 
                           cal_coldest_month_mean(df).rename("obs_coldest_mean"),
                           cal_ice_days(df).rename("obs_ice_duration"),
                           
                           df_cloud.mean().rename("obs_cloud_mean"), 
                           cal_summer_mean(df_cloud).rename("obs_cloud_summer_mean"),
                           cal_warmest_month_mean(df_cloud).rename("obs_cloud_warmest_mean"), 
                           cal_coldest_month_mean(df_cloud).rename("obs_cloud_coldest_mean")
                           ], axis = 1)
    # turn the id to int
    obs_stats.index = obs_stats.index.astype(int)
        
    return obs_stats

In [6]:
obs_stats_train = cal_obs_stats(train_period)
obs_stats_val = cal_obs_stats(val_period)
obs_stats_total = cal_obs_stats(total_period)

In [7]:
obs_stats_total

Unnamed: 0_level_0,obs_mean,obs_summer_mean,obs_warmest_mean,obs_coldest_mean,obs_ice_duration,obs_cloud_mean,obs_cloud_summer_mean,obs_cloud_warmest_mean,obs_cloud_coldest_mean
cci_lake_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
799,20.750757,26.303731,28.020937,11.998787,0.000000,23.185528,26.259991,27.999916,12.558893
3114,7.237123,2.882975,13.921182,1.907211,8.476190,7.017820,2.770322,14.417449,1.734563
7889,9.159194,4.422097,15.202979,3.640545,1.619048,8.710076,4.135467,15.393307,3.531228
2516,6.553977,2.792597,12.980301,1.231114,21.285714,6.305155,2.659742,13.509607,1.044246
12262,6.446441,1.782244,13.256771,0.903437,26.523810,6.015920,1.754878,13.599192,0.807727
...,...,...,...,...,...,...,...,...,...
473,2.862553,-0.000342,10.951210,-0.000342,222.619048,2.496811,-0.000342,10.994574,-0.000342
309,11.815620,0.561585,25.614385,0.083371,78.142857,16.688898,0.483486,25.566515,0.084186
141,4.074236,0.000252,12.815278,-0.000342,192.380952,2.505900,-0.000265,13.277473,-0.000342
212,1.754104,-0.000342,9.473323,-0.000342,275.238095,1.527057,-0.000342,9.714437,-0.000342


# Export

In [8]:
obs_stats_train.to_csv("data/obs_stats_train.csv")
obs_stats_val.to_csv("data/obs_stats_val.csv")
obs_stats_total.to_csv("data/obs_stats_total.csv")