# 7 Case Study: East Atlantic Coast Aquatic Invasive Species (AIS) Monitoring Program

CKAN record: https://catalogue.cioos.ca/dataset/ca-cioos_b54e1292-7483-4730-9873-4df055bd7edb

In this notebook we repeat the analysis on a different dataset to validate the approach.

## Download and prepare the dataset

In [106]:
from erddapy import ERDDAP
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import panel as pn
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
pn.extension()

::: {note}
The first section can be skipped since the dataset `dataset_ais.csv` has already been included in the git repository.
:::

In [107]:
# Client for the ERDDAP server at erddap.ogsl.ca, queried through the tabledap protocol.
e = ERDDAP(server="https://erddap.ogsl.ca/erddap", protocol="tabledap")

In [108]:
# Describe the request: which dataset, which columns, and which time window.
e.dataset_id = "mpoEaeTemperature"
e.variables = ["time", "location", "sea_water_temperature"]
e.constraints = {
    "time>=": "2015-01-01",
    "time<=": "2020-12-31",
}

In [109]:
# Cache the download on disk so that re-running the notebook does not query
# the ERDDAP server again.
os.makedirs('data', exist_ok=True)

csvfile = "data/ais_tempdata.csv.gz"

if not os.path.exists(csvfile):
    # flush=True makes the message visible before the (slow) download starts
    print("Downloading...", end='', flush=True)
    df = e.to_pandas()
    df.to_csv(csvfile, compression='gzip', index=False)
    print("Done.")
else:
    # pandas infers the gzip compression from the ".gz" extension
    df = pd.read_csv(csvfile)

# Drop the unit suffixes from the column names
df = df.rename(columns={'time (UTC)': 'time',
           'location (unitless)': 'location',
           'sea_water_temperature (degree_C)':'sea_water_temperature'})

# Ensure the time column is in datetime format
df['time'] = pd.to_datetime(df['time'])

In [110]:
# Spot-check five randomly chosen rows (no random_state is set, so the rows differ between runs)
df.sample(5)

Unnamed: 0,time,location,sea_water_temperature
823262,2016-07-31 14:30:00+00:00,Marina de Sainte-Anne-des-Monts,14.9
507533,2018-06-01 18:15:00+00:00,Marina de Cap-aux-Meules,11.236
331381,2016-07-30 20:15:00+00:00,Grande Riviere,19.187
1439480,2018-05-26 23:15:00+00:00,Quai commercial de Cap-aux-Meules,6.877
1850679,2017-06-29 10:45:00+00:00,Site aquicole de Sept-Iles,7.481


In [111]:
# Keep only the calendar day of each timestamp so readings can be grouped per day
df['date'] = df['time'].dt.date

# Mean temperature per (location, day), rounded to 3 decimal places.
# The series is named before reset_index so the value column comes out
# directly as 'daily_avg_temperature'.
daily_avg = (
    df.groupby(['location', 'date'])['sea_water_temperature']
    .mean()
    .round(3)
    .rename('daily_avg_temperature')
    .reset_index()
)

In [112]:
# Reshape to wide form: one row per date, one column per location,
# cells hold the daily average temperature
pivot_df = pd.pivot_table(
    daily_avg,
    values='daily_avg_temperature',
    index='date',
    columns=['location'],
)

In [113]:
# Persist the pivoted table; the exploration section reloads it from this file
pivot_df.to_csv('dataset_ais.csv')

In [114]:
# Preview ten randomly chosen dates of the pivoted table (NaN = no observation for that site and day)
pivot_df.sample(10)

location,Baie au Saumon,Baie de Gaspe,Baie de plaisance,Baie des Belles Amours,Bassin du Havre Aubert,Cascapedia,Grande Riviere,Grande-Entree,Ile Bonaventure,La jetee de Cap-aux-Meules,...,Quai de la Relance,Quai des Pilotes des Escoumins,Quai des pecheurs de Cap-aux-Meules,Riviere au Renard,Site aquicole de Grande Entree,Site aquicole de Paspebiac,Site aquicole de Sept-Iles,St-Simeon,Tourelle,Tracadigache
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-06-14,,,9.888,,12.264,,,12.389,,10.082,...,,7.916,10.861,,,7.706,14.185,,,
2018-11-09,,,,4.338,,,,,,,...,,,,,,,,,,
2015-08-30,,17.615,,15.183,,,15.709,19.935,,,...,15.69,,,15.642,20.33,9.821,,,,14.09
2018-08-29,,15.196,17.828,15.067,22.024,8.401,15.62,,13.509,18.633,...,12.171,3.04,19.42,13.67,21.098,,,,,
2019-07-06,,,,7.221,17.491,,13.852,13.452,,15.539,...,,5.287,15.801,14.903,14.089,16.41,,,15.571,
2016-08-06,,,16.683,,,,16.868,,,18.027,...,16.47,4.88,19.648,17.11,20.535,16.562,,9.573,,
2019-06-02,,,,2.088,10.621,,7.767,8.032,,,...,,,9.887,7.128,10.12,8.201,,,8.16,
2018-10-13,,,,5.1,,9.886,,,,,...,,,10.568,,,,,,,
2015-08-23,,21.63,,13.245,,,16.664,21.468,,,...,15.827,,,18.099,22.016,18.735,,,,20.155
2016-07-19,,,15.513,,,,15.031,,,17.191,...,16.652,7.983,18.879,15.083,17.329,11.443,,9.035,,


## Explore the data

In [115]:
# Reload the pivoted table; the first CSV column (the dates) becomes the index and is parsed as datetimes.
# NOTE: this rebinds `df` -- from here on it is the date x location table, not the raw observations.
df = pd.read_csv('dataset_ais.csv', parse_dates=True, index_col=0)

This dataset only has observations during a portion of each calendar year (no measurements are taken during the winter).

In [116]:
def plot_all_sites(df, xtick_step=30):
    """Draw every site's daily temperature series as a single heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per date and one column per site. The index must be a
        DatetimeIndex or convertible to one with ``pd.to_datetime``.
    xtick_step : int, default 30
        Label every ``xtick_step``-th date on the x-axis.

    Returns
    -------
    holoviews.Image
        Heatmap with dates along x, sites along y and temperature as colour.
    """
    # Transpose so that each site is one image row and each date one column
    image_data = df.astype('float32').T.values

    # The image is drawn on integer positions; the dates and site names are
    # attached afterwards as tick labels.
    x_labels = pd.to_datetime(df.index).strftime('%Y-%m-%d')  # dates → x-axis
    y_labels = list(df.columns)                               # sites → y-axis

    x_coords = np.arange(len(x_labels))
    y_coords = np.arange(len(y_labels))

    heatmap = hv.Image((x_coords, y_coords, image_data)).opts(
        xaxis='bottom',
        xlabel='Date',
        ylabel='Site',
        xticks=list(zip(x_coords[::xtick_step], x_labels[::xtick_step])),
        yticks=list(zip(y_coords, y_labels)),
        xrotation=45,
        cmap='Viridis',
        colorbar=True,
        width=1000,
        height=800,
        tools=['hover']
    )
    return heatmap

plot_all_sites(df)

### Visualize the series data

In [101]:
# Create a dropdown selector
site_selector = pn.widgets.Select(name='Site', options=list(df.columns))


def _nan_ranges(series):
    """Return ``(start, end)`` index labels for each run of consecutive NaNs.

    ``start`` is the label of the first missing value of a run and ``end`` is
    the label of the first valid value after it. A run that reaches the end of
    the series is closed at the last index label.
    """
    missing = series.isna().to_numpy()
    # The same mask shifted one step later: True where the previous value was missing
    previous_missing = np.concatenate(([False], missing[:-1]))

    run_starts = series.index[missing & ~previous_missing]
    run_ends = series.index[~missing & previous_missing]

    ranges = list(zip(run_starts, run_ends))
    if len(run_starts) > len(run_ends):
        # The last run has no valid value after it
        ranges.append((run_starts[-1], series.index[-1]))
    return ranges


def highlight_nan_regions(label):
    """Plot one site's series with its missing-data gaps shaded in red.

    Parameters
    ----------
    label : str
        Column of the notebook-level ``df`` to plot.

    Returns
    -------
    holoviews.Overlay
        The temperature curve overlaid with one ``VSpan`` per gap.
    """
    series = df[label]

    # Create shaded regions
    spans = [
        hv.VSpan(start, end).opts(color='red', alpha=0.2)
        for start, end in _nan_ranges(series)
    ]

    curve = hv.Curve(series, label=label).opts(
        width=900, height=250, tools=['hover', 'box_zoom', 'pan', 'wheel_zoom'],
        show_grid=True, title=label
    )

    return curve * hv.Overlay(spans)


# Redraw the plot whenever a different site is picked in the dropdown
interactive_plot = hv.DynamicMap(pn.bind(highlight_nan_regions, site_selector))

pn.Column(site_selector, interactive_plot, 'Highlighted regions are gaps that need to be imputed.')

## Impute the gaps

We have determined that `MissForest` appears to work reasonably well when imputing artificially large gaps.

We use it to gap fill the missing data in this dataset.

In [117]:
from imputeMF import imputeMF

In [118]:
# Run imputeMF on the raw values and wrap the returned array in a DataFrame
# that carries the same index and columns as `df`.
# NOTE(review): the second argument (10) is presumably the maximum number of
# iterations -- confirm against imputeMF. No random seed is set in this notebook,
# so results may differ between runs if the imputer is stochastic.
df_imputed = pd.DataFrame(imputeMF(df.values, 10, print_stats=True), columns=df.columns, index=df.index)

Statistics:
iteration 1, gamma = 0.014082805565545895
Statistics:
iteration 2, gamma = 0.002128218680933588
Statistics:
iteration 3, gamma = 0.0009831946896805977
Statistics:
iteration 4, gamma = 0.0007002698046187135
Statistics:
iteration 5, gamma = 0.0004505560374552218
Statistics:
iteration 6, gamma = 0.0003560106245824383
Statistics:
iteration 7, gamma = 0.000344451776973629
Statistics:
iteration 8, gamma = 0.0003607540526316762


In [119]:
def highlight_imputed_regions(label):
    """Plot one site's gap-filled series and shade the stretches that were imputed.

    The gaps are located in the notebook-level ``df`` (the observations), while
    the curve is drawn from ``df_imputed``.
    """
    observed = df[label]
    filled = df_imputed[label]

    # Mask of missing observations, plus the same mask shifted one step later,
    # so run boundaries can be read off without an explicit loop.
    missing = observed.isna().to_numpy()
    missing_before = np.concatenate(([False], missing[:-1]))

    gap_starts = observed.index[missing & ~missing_before]  # first NaN of each run
    gap_ends = observed.index[~missing & missing_before]    # first valid value after a run

    gaps = list(zip(gap_starts, gap_ends))
    if len(gap_starts) > len(gap_ends):
        # The final run reaches the end of the series: close it at the last label
        gaps.append((gap_starts[-1], observed.index[-1]))

    # One translucent red band per gap
    shaded = [
        hv.VSpan(left, right).opts(color='red', alpha=0.2)
        for left, right in gaps
    ]

    line = hv.Curve(filled, label=label).opts(
        width=900,
        height=250,
        tools=['hover', 'box_zoom', 'pan', 'wheel_zoom'],
        show_grid=True,
        title=label,
    )

    return line * hv.Overlay(shaded)

# Redraw whenever the site dropdown changes
interactive_plot = hv.DynamicMap(pn.bind(highlight_imputed_regions, site_selector))

pn.Column(site_selector, interactive_plot)

Highlighted regions show where the gaps have been imputed.

Notice that the imputation algorithm also fills gaps in time intervals where there is very limited information from any other site. Care should be taken when interpreting the imputed data.

In [120]:
# Same heatmap as in the exploration section, now drawn from the gap-filled table
plot_all_sites(df_imputed)

```{warning}
Apply caution when using these imputed datasets in subsequent analysis steps.  While the imputed regions appear reasonable, they are not true measurements.  
```