# CMAR Water Quality Datasets

Download from https://data.novascotia.ca/

Counties:

- Annapolis County
- Halifax County
- Lunenburg County
- Guysborough County
- Colchester County
- Pictou County
- Shelburne County
- Inverness County
- Antigonish County
- Yarmouth County
- Digby County
- Queens County
- Richmond County
- Victoria County
- Cape Breton County

In [55]:
import pandas as pd
import os
import numpy as np
from tqdm.notebook import tqdm

Pull out the metadata on these sensor strings

In [2]:
counties = ['Annapolis', 'Halifax', 'Lunenburg', 'Guysborough', 'Colchester',
            'Pictou', 'Shelburne', 'Inverness', 'Antigonish', 'Yarmouth',
            'Digby', 'Queens', 'Richmond', 'Victoria']

In [6]:
metadata = []
for county in counties:
    print(county)
    filename = os.path.join('../CMAR_WaterQualityDatasets', f'{county}_County_Water_Quality_Data_20250130.csv')
    df = pd.read_csv(filename, sep=',', dtype='str', usecols=[0,1,2,3,4,5,6,7,8,10,11])
    df = df.drop_duplicates()
    df = df.reset_index()
    df.insert(0, 'county', county)
    metadata.append(df)

Annapolis
Halifax
Lunenburg
Guysborough
Colchester
Pictou
Shelburne
Inverness
Antigonish
Yarmouth
Digby
Queens
Richmond
Victoria
Cape_Breton


In [7]:
df = pd.concat(metadata)

In [8]:
df

Unnamed: 0,county,index,waterbody,station,lease,latitude,longitude,deployment_range,string_configuration,sensor_type,sensor_serial_number,sensor_depth_at_low_tide_m,depth_crosscheck_flag
0,Annapolis,0,Annapolis Basin,5006,5006,44.669583,-65.647733,2021-May-14 to 2021-Jun-18,attached to gear,hobo,20330413,1.3,
1,Annapolis,360,Annapolis Basin,1042,1042,44.652855,-65.674531,2021-Aug-27 to 2022-May-23,attached to gear,aquameasure,671014,0.5,
2,Annapolis,2126,Annapolis Basin,5005,5005,44.634167,-65.711933,2021-May-14 to 2021-Jun-18,attached to gear,aquameasure,680322,1,
3,Annapolis,16048,Annapolis Basin,Cornwallis,,44.66939,-65.64479,2020-Jun-11 to 2020-Nov-22,sub-surface buoy,aquameasure,670367,1,Pass
4,Annapolis,39618,Annapolis Basin,Cornwallis,,44.66939,-65.64479,2020-Jun-11 to 2020-Nov-22,sub-surface buoy,vr2ar,547089,2,Pass
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8,Victoria,177281,Aspy Bay,North Harbour,,46.91123,-60.47478,2018-Nov-01 to 2020-Sep-10,unknown,tidbit,20397521,5,
9,Victoria,242468,Bras d'Or Lakes,Nyanza Bay E,,46.0652428,-60.8948307,2016-Sep-14 to 2018-Jul-05,attached to fixed structure,hobo,10034841,8,
10,Victoria,258284,Bras d'Or Lakes,Nyanza Bay W,,46.0647787,-60.9023563,2016-Sep-14 to 2018-Jul-05,attached to fixed structure,hobo,10778920,8,
0,Cape_Breton,0,Bras d'Or Lakes,Little Bras d'Or Channel,,46.2962606,-60.2876892,2016-Sep-13 to 2018-May-30,attached to fixed structure,hobo,10034844,8,


In [9]:
df.to_csv('cmar_datasets.csv', index=True)

In [57]:
os.makedirs('data_ns', exist_ok=True)

for county in tqdm(counties):
    csvfile = f"data_ns/{county}.csv"

    if os.path.exists(csvfile):
        continue

    filename = os.path.join('../CMAR_WaterQualityDatasets', f'{county}_County_Water_Quality_Data_20250130.csv')
    df = pd.read_csv(filename, usecols=[0,1,5,9,10,16,21], parse_dates=['timestamp_utc'], date_format="%m/%d/%Y %I:%M:%S %p")
    
    # filter by date range
    df = df[(df.timestamp_utc >= '2020-09-01') & (df.timestamp_utc <= '2024-08-31')]
       
    df.to_csv(csvfile, index=False)

  0%|          | 0/15 [00:00<?, ?it/s]

  df = pd.read_csv(filename, usecols=[0,1,5,9,10,16,21], parse_dates=['timestamp_utc'], date_format="%m/%d/%Y %I:%M:%S %p")
  df = pd.read_csv(filename, usecols=[0,1,5,9,10,16,21], parse_dates=['timestamp_utc'], date_format="%m/%d/%Y %I:%M:%S %p")


In [59]:
!ls -lh data_ns/

total 2.1G
-rw-r--r-- 1 jmunroe jmunroe  28M Jun 11 22:51 Annapolis.csv
-rw-r--r-- 1 jmunroe jmunroe  90M Jun 11 22:52 Antigonish.csv
-rw-r--r-- 1 jmunroe jmunroe  126 Jun 11 22:52 Cape_Breton.csv
-rw-r--r-- 1 jmunroe jmunroe  27M Jun 11 22:52 Colchester.csv
-rw-r--r-- 1 jmunroe jmunroe 167M Jun 11 22:52 Digby.csv
-rw-r--r-- 1 jmunroe jmunroe 783M Jun 11 22:52 Guysborough.csv
-rw-r--r-- 1 jmunroe jmunroe 150M Jun 11 22:51 Halifax.csv
-rw-r--r-- 1 jmunroe jmunroe  34M Jun 11 22:52 Inverness.csv
-rw-r--r-- 1 jmunroe jmunroe 329M Jun 11 22:51 Lunenburg.csv
-rw-r--r-- 1 jmunroe jmunroe  40M Jun 11 22:52 Pictou.csv
-rw-r--r-- 1 jmunroe jmunroe  60M Jun 11 22:52 Queens.csv
-rw-r--r-- 1 jmunroe jmunroe  75M Jun 11 22:52 Richmond.csv
-rw-r--r-- 1 jmunroe jmunroe 119M Jun 11 22:52 Shelburne.csv
-rw-r--r-- 1 jmunroe jmunroe 150K Jun 11 22:52 Victoria.csv
-rw-r--r-- 1 jmunroe jmunroe 155M Jun 11 22:52 Yarmouth.csv


In [67]:
os.makedirs('segments_ns', exist_ok=True)

all_segment_metadata = []
for county in tqdm(counties):

    csvfile = f"data_ns/{county}.csv"

    df = pd.read_csv(csvfile)
    df.rename(columns={'timestamp_utc': 'time (UTC)', 
                       'sensor_depth_at_low_tide_m': 'depth (m)',
                       'temperature_degree_c': 'temperature (degrees_Celsius)',
                       'qc_flag_temperature_degree_c': 'qc_flag_temperature'
                      }, 
              inplace=True)
    
    df['segment'] = df[['waterbody', 'station', 'depth (m)',
                     'deployment_range',
                     ]].agg(lambda x: county + '_' + '_'.join([str(y) for y in x]), axis=1)

    df_metadata = df[['segment', 'waterbody', 'station', 'depth (m)',
                     'deployment_range',
                     ]]

    df_metadata = df_metadata.drop_duplicates()
    all_segment_metadata.append(df_metadata)
    
    df_data = df.drop(columns=['waterbody', 'station', 'depth (m)',
                                 'deployment_range',
                                  ])

    df_data = df_data.sort_values(by=['segment', 'time (UTC)'])
    
    df_data.set_index(['segment', 'time (UTC)'], inplace=True)

    for key, segment_df in df_data.groupby(level=0):
        csvfile = f'segments_ns/{key}.csv'
        segment_df = segment_df.droplevel(0)
        segment_df.to_csv(csvfile)

df_metadata = pd.concat(all_segment_metadata)
df_metadata.set_index('segment', inplace=True)
df_metadata.to_csv('metadata_ns.csv')

  0%|          | 0/15 [00:00<?, ?it/s]

  df_metadata = pd.concat(all_segment_metadata)


In [68]:
!ls -lh segments_ns/ | wc

   1126   15215  135587


In [69]:
df_metadata

Unnamed: 0_level_0,waterbody,station,depth (m),deployment_range
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Annapolis_Annapolis Basin_5006_1.3_2021-May-14 to 2021-Jun-18,Annapolis Basin,5006,1.3,2021-May-14 to 2021-Jun-18
Annapolis_Annapolis Basin_1042_0.5_2021-Aug-27 to 2022-May-23,Annapolis Basin,1042,0.5,2021-Aug-27 to 2022-May-23
Annapolis_Annapolis Basin_5005_1.0_2021-May-14 to 2021-Jun-18,Annapolis Basin,5005,1.0,2021-May-14 to 2021-Jun-18
Annapolis_Annapolis Basin_Cornwallis_1.0_2020-Jun-11 to 2020-Nov-22,Annapolis Basin,Cornwallis,1.0,2020-Jun-11 to 2020-Nov-22
Annapolis_Annapolis Basin_Cornwallis_2.0_2020-Jun-11 to 2020-Nov-22,Annapolis Basin,Cornwallis,2.0,2020-Jun-11 to 2020-Nov-22
...,...,...,...,...
Richmond_Arichat Harbour_Kavanagh Point_5.0_2024-Jul-03 to 2024-Oct-17,Arichat Harbour,Kavanagh Point,5.0,2024-Jul-03 to 2024-Oct-17
Richmond_Arichat Harbour_Kavanagh Point_10.0_2024-Jul-03 to 2024-Oct-17,Arichat Harbour,Kavanagh Point,10.0,2024-Jul-03 to 2024-Oct-17
Richmond_Arichat Harbour_Kavanagh Point_15.0_2024-Jul-03 to 2024-Oct-17,Arichat Harbour,Kavanagh Point,15.0,2024-Jul-03 to 2024-Oct-17
Victoria_Aspy Bay_North Harbour_2_2018-Nov-01 to 2020-Sep-10,Aspy Bay,North Harbour,2.0,2018-Nov-01 to 2020-Sep-10
