# MNTOHA temperature data

In [None]:
import os
import zipfile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm


# Make the project root the working directory. The notebook may be launched
# either from the project root itself or from its notebooks subfolder.
cwd = os.getcwd()
current_folder = os.path.basename(cwd)
if current_folder == 'notebooks':
    # Launched from the notebooks folder: step up one level
    os.chdir('..')
elif current_folder != 'lake-temperature-lstm-static':
    # Neither of the two supported launch locations
    raise FileNotFoundError('Working directory not recognized. Please os.chdir() to project root directory (lake-temperature-lstm-static).')

# Download MNTOHA data

In [None]:
# Ask Snakemake (shell escape, run from the current working directory) to build
# the two target files named at the end of the command: the zipped temperature
# observations and the lake metadata CSV that the cells below read.
!snakemake --snakefile Snakefile -c1 -p --rerun-incomplete 1_fetch/out/obs_mntoha/temperature_observations.zip 1_fetch/out/metadata_mntoha/lake_metadata.csv

# Read data

In [None]:
# Load the MNTOHA temperature observations and the lake metadata

# Folder holding the fetched inputs, scratch folder for unzipped files, and the
# observations archive (path relative to source_dir)
source_dir = '1_fetch/out'
destination_dir = '2_process/tmp'
obs_zipfile = 'obs_mntoha/temperature_observations.zip'

# The archive is unpacked into a folder named after the zip file, minus its extension
obs_zip_path = os.path.join(source_dir, obs_zipfile)
obs_unzip_dir = os.path.join(destination_dir, os.path.splitext(obs_zipfile)[0])
with zipfile.ZipFile(obs_zip_path, 'r') as zf:
    zf.extractall(obs_unzip_dir)

# Temperature observations: read the CSV from the unpacked folder
temperature_observations_file = os.path.join(obs_unzip_dir, 'temperature_observations.csv')
obs = pd.read_csv(temperature_observations_file)

# Lake metadata: a plain CSV that needs no unzipping
lake_metadata_file = os.path.join(source_dir, 'metadata_mntoha/lake_metadata.csv')
lake_metadata = pd.read_csv(lake_metadata_file)

# Plot temperature observations in each lake over time

In [None]:
# View temp plots for one lake at a time
# TIP: use CTRL+Enter to rerun this cell multiple times and view lake after lake

# Advance to the next lake on every run. Only the two expected situations are
# handled explicitly, so genuine errors are not hidden by a bare `except:`.
try:
    i_lake += 1
except NameError:
    # First run in this kernel: no lake has been shown yet
    i_lake = 0
if i_lake >= len(lake_metadata):
    # Went past the last lake: wrap around to the first one
    i_lake = 0
lake = lake_metadata.iloc[i_lake]

# Observations for this lake, with dates parsed so the x axis is a time axis
lake_obs = obs[obs.site_id == lake.site_id].copy()
lake_obs["time"] = pd.to_datetime(lake_obs["date"])

# Scatter of observation depth over time, colored by temperature
fig, ax = plt.subplots()
ax.set_title(lake.lake_name)
p = lake_obs.plot(kind='scatter', x='time', y='depth', c='temp', colormap='viridis', ax=ax)
# Horizontal line at the depth listed for this lake in the metadata
ax.hlines(y=lake.depth, xmin=p.get_xlim()[0], xmax=p.get_xlim()[1], colors='k')
# Flip the y axis so that depth increases downward
ax.invert_yaxis()
plt.tight_layout()
plt.show()

# Display the metadata row of the lake that was just plotted
lake

# Explore observations below nominal maximum lake depth

Many of these lakes have temperatures observed deeper than the nominal maximum depth. What's up with that? How many lakes have temperatures below their max depth?

In [None]:
# Deepest observation recorded in each lake, indexed by site_id
obs_depths = obs.loc[:, ['site_id', 'depth']]
obs_max_depth = obs_depths.groupby('site_id').max()

# Attach that value to the metadata as depth_obs; lakes that have no
# observations keep their row (left join) with a missing depth_obs
metadata_with_obs_depth = lake_metadata.join(
    obs_max_depth,
    on='site_id',
    how='left',
    rsuffix='_obs',
)

# Shape of the subset of lakes observed deeper than their metadata depth
metadata_with_obs_depth.query('depth_obs>depth').shape

526 out of 881 lakes (877 of which have temperature observations) have observations deeper than their nominal maximum depth! What's the distribution of the differences between `depth` and `depth_obs`?

In [None]:
# Attach each lake's deepest observed depth (column depth_obs) to its metadata
augmented_metadata = (
    lake_metadata
     .join(obs
           .loc[:, ['site_id', 'depth']]
           .groupby('site_id')
           .max(),
           how='left',
           on='site_id',
           rsuffix='_obs')
)

# Negative differences are lakes with observations deeper than the metadata depth
depth_difference = augmented_metadata.depth - augmented_metadata.depth_obs

# Draw on an explicit axes and label it so the figure can be read on its own
fig, ax = plt.subplots()
depth_difference.hist(bins=500, ax=ax)
# Zoom in on the central part of the distribution
ax.set_xlim(-10, 10)
ax.set_xlabel('depth - depth_obs')
ax.set_ylabel('# of lakes')
ax.set_title('Nominal maximum depth minus deepest observed depth')
plt.show()

The distribution looks zero-centered and heavy-tailed.

# Plot number of lakes above data thresholds

I wonder:

1. How many observation dates per lake
2. How many depths per observation date

Let's make an image where the color indicates how many lakes meet thresholds of

- x axis: # dates
- y axis: # depths/date

In [None]:
# Number of distinct observation depths for every (lake, date) pair
counts_by_date = (
    obs.loc[:, ['site_id', 'date', 'depth']]
    .drop_duplicates()
    .groupby(['site_id', 'date'])
    .count()
)

# Thresholds to scan: minimum # of observation dates (log-spaced from 1 to 1000)
# and minimum # of depths per date (1 to 29)
min_dates_vals = np.logspace(0,3,31)
min_depths_vals = np.arange(1, 30)

# img_qualifying_lakes[i_date, i_depth] = number of lakes that have at least
# min_dates_vals[i_date] dates with at least min_depths_vals[i_depth] depths each
img_qualifying_lakes = np.zeros((len(min_dates_vals), len(min_depths_vals)))
depths_per_date = counts_by_date['depth']
for i_depth, min_depths in enumerate(min_depths_vals):
    # Per lake, the number of dates that meet the depth threshold. This does not
    # depend on the date threshold, so it is computed once per depth threshold
    # instead of once per (date, depth) threshold pair.
    qualifying_dates_per_lake = (
        depths_per_date[depths_per_date >= min_depths]
        .groupby(level='site_id')
        .count()
        .to_numpy()
    )
    for i_date, min_dates in enumerate(min_dates_vals):
        img_qualifying_lakes[i_date, i_depth] = np.count_nonzero(
            qualifying_dates_per_lake >= min_dates
        )

# Coordinate grids with the same shape as the image (dates on x, depths on y)
Y, X = np.meshgrid(min_depths_vals, min_dates_vals)
font_style = {'font.size': 20}
with plt.style.context([font_style]):
    fig, ax = plt.subplots(figsize=(9,7))
    # Log color scale for the lake counts, with labeled contour lines on top
    p = ax.pcolormesh(X, Y, img_qualifying_lakes, norm=LogNorm(vmin=1, vmax=1000))
    c = ax.contour(X, Y, img_qualifying_lakes, levels=[10, 20, 50, 100, 200, 500, 800], colors='k')
    ax.clabel(c, inline=True, fontsize=14)
    # The date thresholds are log-spaced, so use a log x axis
    ax.set_xscale('log')
    cbar = fig.colorbar(p)
    ax.set_xlabel('Minimum # observation days')
    ax.set_ylabel('Minimum # observation depths')
    cbar.ax.set_ylabel('# of lakes that meet thresholds')
    ax.set_title('Temperature observation availability \nin 881 MNTOHA lakes')
    plt.show()

# Plot occurrence of temperature values

Which temperature values are most common?

In [None]:
# Relative frequency of each distinct temperature value, most common first
temp_counts = obs.temp.value_counts(normalize=True)

# Draw on the explicit axes and label it so the figure can be read on its own
fig, ax = plt.subplots(figsize=(9,7))
# Only show values that account for more than 0.25% of the observations
temp_counts.loc[temp_counts > 0.0025].plot(kind='bar', ax=ax)
ax.set_xlabel('Observed temperature (°C)')
ax.set_ylabel('Fraction of observations')
ax.set_title('Most common temperature values')
plt.show()

Round numbers greater than 4&deg;C are most common, followed by values rounded to the nearest 0.5&deg;C. It's reassuring to see that 0&deg;C isn't too high in the list. It's interesting that 5&deg;C is the second most common value. Maybe lake turnover times are popular times to measure temperatures?