# 2 Explore a Dataset containing Gaps

In [16]:
import pandas as pd
import numpy as np

import panel as pn
import holoviews as hv
from holoviews import opts
# Select Bokeh as the HoloViews plotting backend and initialise Panel for
# use inside the notebook; both must run before any plot or widget is shown.
hv.extension('bokeh')
pn.extension()

Consider the following dataset of daily average water temperatures, taken from the CMAR Water Quality Data and recorded at different stations and depths off the coast of Shelburne County.

In [17]:
# Read the CSV, parsing the 'date' column as datetimes and using it as the
# row index so every remaining column is a date-indexed series.
df = pd.read_csv("dataset.csv", parse_dates=['date'], index_col='date')
# Bare last expression: let the notebook render the frame.
df

Unnamed: 0_level_0,BlueIsland_2m,BlueIsland_5m,BlueIsland_10m,Ingomar_2m,Ingomar_5m,Ingomar_10m,Ingomar_15m,McNuttsIsland_2m,McNuttsIsland_5m,McNuttsIsland_10m,McNuttsIsland_15m,McNuttsIsland_20m,TaylorsRock_2m,TaylorsRock_5m,TaylorsRock_10m,TaylorsRock_15m,TaylorsRock_20m
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2018-05-15,8.634,7.762,6.811,7.664,6.993,6.220,5.637,6.504,6.295,5.991,5.737,4.492,7.141,6.778,6.217,5.922,5.871
2018-05-16,9.009,7.564,6.215,7.347,6.636,5.912,5.390,7.222,6.786,6.184,5.737,4.420,6.980,6.595,6.050,5.732,5.449
2018-05-17,8.074,7.188,6.466,7.621,7.072,6.544,6.034,7.828,7.398,6.245,5.606,4.322,7.346,6.994,6.574,6.328,6.092
2018-05-18,8.441,7.328,6.099,7.993,7.554,7.025,6.501,8.065,7.444,6.599,5.952,4.644,7.312,7.033,6.523,6.207,5.757
2018-05-19,7.649,6.877,6.142,8.180,7.809,7.421,6.901,7.883,7.127,5.962,5.392,4.141,7.667,7.398,7.056,6.856,6.507
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-10,6.555,6.554,6.692,6.617,6.709,6.589,5.733,,,6.604,6.651,6.642,6.400,,6.358,6.396,6.521
2022-05-11,6.754,6.753,6.837,6.816,6.895,6.738,5.896,,,6.720,6.667,6.529,6.442,,6.393,6.424,6.525
2022-05-12,7.026,6.947,7.000,6.809,6.761,6.466,5.617,,,6.645,6.217,6.000,6.523,,6.252,6.139,6.142
2022-05-13,7.610,7.248,7.075,6.691,6.347,5.974,5.079,,,6.017,5.804,5.725,6.465,,5.565,5.470,5.600


## Explore the data

We can plot the temperature at any of these different station-depth locations.


In [18]:
# Create a dropdown selector whose options are the column names of df
# (one column per station/depth combination).
station_depth_selector = pn.widgets.Select(name='Station @ Depth', options=list(df.columns))

def plot_curve(station_depth):
    """Return a HoloViews curve for one column of the notebook-level ``df``.

    Parameters
    ----------
    station_depth : str
        Name of the column in ``df`` to plot; it is also used as the
        curve's label.

    Returns
    -------
    hv.Curve
        The selected column plotted against the frame's index, with the
        fixed size, tools, grid and title configured below.
    """
    # Display options are identical for every column, so keep them together.
    plot_options = dict(
        width=900,
        height=400,
        tools=['hover', 'box_zoom', 'pan', 'wheel_zoom'],
        show_grid=True,
        title="Daily Avg Temperature",
    )
    selected_series = df[station_depth]
    return hv.Curve(selected_series, label=station_depth).opts(**plot_options)

# Bind the dropdown to plot_curve and wrap the bound function in a DynamicMap,
# so the displayed curve is produced from the currently selected column.
interactive_plot = hv.DynamicMap(pn.bind(plot_curve, station_depth_selector))

Browse through this dataset by selecting different stations and depths.

In [19]:
# Lay out the dropdown and the interactive plot together in one column.
pn.Column(station_depth_selector, interactive_plot)

Over the four-year period from May 2018 to May 2022, this dataset contains gaps that vary with the station and the depth.

In [20]:
def highlight_nan_regions(series, label):
    """Plot a series as a curve and shade every run of missing values.

    Parameters
    ----------
    series : pandas.Series
        Values to plot; the index supplies the x positions.
    label : str
        Used both as the curve's label and as the plot title.

    Returns
    -------
    HoloViews overlay
        The curve combined with one translucent red vertical span per run of
        consecutive NaN values. A span starts at the first missing index
        label of a run and ends at the first non-missing label that follows
        it, or at the last index label when the series ends inside a run.
    """
    # Walk the missing-value mask once, recording (start, end) for each run.
    gap_bounds = []
    gap_open_at = None
    for timestamp, is_missing in series.isna().items():
        if is_missing:
            # Only the first missing label of a run opens a new gap.
            if gap_open_at is None:
                gap_open_at = timestamp
        elif gap_open_at is not None:
            # First valid label after a run closes the gap.
            gap_bounds.append((gap_open_at, timestamp))
            gap_open_at = None

    # A run still open here extends to the end of the series.
    if gap_open_at is not None:
        gap_bounds.append((gap_open_at, series.index[-1]))

    # One shaded region per recorded gap.
    shaded = []
    for gap_start, gap_end in gap_bounds:
        shaded.append(hv.VSpan(gap_start, gap_end).opts(color='red', alpha=0.2))

    line = hv.Curve(series, label=label)
    line = line.opts(
        width=900,
        height=250,
        tools=['hover', 'box_zoom', 'pan', 'wheel_zoom'],
        show_grid=True,
        title=label,
    )

    return line * hv.Overlay(shaded)

In [21]:
# Build a gap-highlighted plot for each of three example station/depth columns.
plot1 = highlight_nan_regions(df['BlueIsland_5m'], 'Blue Island @ 5.0m')
plot2 = highlight_nan_regions(df['Ingomar_10m'], 'Ingomar @ 10.0m')
plot3 = highlight_nan_regions(df['McNuttsIsland_10m'], 'McNutts Island @ 10.0m')

# Combine the three plots into a layout with a single column (one plot per row).
layout = (plot1 + plot2 + plot3).cols(1)
layout

This dataset covers four years of daily observations (2018-05-15 to 2022-05-14) for 17 different "sites" (different stations and vertical depths).

In [22]:
# Cast to float32 and transpose, so each row of the array is one column of df
# (a station/depth) and each array column is one date.
image_data = df.astype('float32').T.values

x_labels = df.index.strftime('%Y-%m-%d')  # dates → x-axis
y_labels = list(df.columns)               # station-depths → y-axis

# The image is positioned on integer coordinates; the date and column labels
# are attached to those positions through the xticks/yticks options below.
x_coords = np.arange(len(x_labels))
y_coords = np.arange(len(y_labels))

heatmap = hv.Image((x_coords, y_coords, image_data)).opts(
    xaxis='bottom',
    xlabel='Date',
    ylabel='Station @ Depth',
    xticks=list(zip(x_coords[::30], x_labels[::30])),  # every 30th date
    yticks=list(zip(y_coords, y_labels)),
    xrotation=45,
    cmap='Viridis',
    colorbar=True,
    width=1000,
    height=800,
    tools=['hover']
)
# Bare last expression: render the heatmap in the notebook.
heatmap

Our objective is to fill in the gaps in this dataset.