# Split data in to separate segements

In [5]:
import os
import pandas as pd
from tqdm.notebook import tqdm

We now have the following `.csv` files stored locally

In [6]:
!ls -lh data/

total 2.0G
-rw-r--r-- 1 jmunroe jmunroe  40M May 22 21:24 Annapolis.csv
-rw-r--r-- 1 jmunroe jmunroe 106M May 22 21:25 Antigonish.csv
-rw-r--r-- 1 jmunroe jmunroe  33M May 22 21:25 Colchester.csv
-rw-r--r-- 1 jmunroe jmunroe 127M May 22 21:26 Digby.csv
-rw-r--r-- 1 jmunroe jmunroe 774M May 22 21:27 Guysborough.csv
-rw-r--r-- 1 jmunroe jmunroe 163M May 22 21:28 Halifax.csv
-rw-r--r-- 1 jmunroe jmunroe  30M May 22 21:28 Inverness.csv
-rw-r--r-- 1 jmunroe jmunroe 236M May 22 21:30 Lunenburg.csv
-rw-r--r-- 1 jmunroe jmunroe  41M May 22 21:30 Pictou.csv
-rw-r--r-- 1 jmunroe jmunroe  89M May 22 21:31 Queens.csv
-rw-r--r-- 1 jmunroe jmunroe  95M May 22 21:31 Richmond.csv
-rw-r--r-- 1 jmunroe jmunroe  96M May 22 21:32 Shelburne.csv
-rw-r--r-- 1 jmunroe jmunroe 241K May 22 21:32 Victoria.csv
-rw-r--r-- 1 jmunroe jmunroe 219M May 22 21:33 Yarmouth.csv


We need to organize and sort the observations so that we are considering only the observation for a single sensor in temporal order.

This will remove all of the duplicated metadata within this `.csv` files.

In [7]:
os.makedirs('segments', exist_ok=True)

all_segment_metadata = []
for index, row in tqdm(list(df_CMAR_datasets.iterrows())):

    csvfile = f"data/{row['county']}.csv"

    df = pd.read_csv(csvfile)

    df.drop(columns=['Unnamed: 0'], inplace=True)
    
    df['segment'] = df[['waterbody', 'station', 
                        #'sensor_serial_number', 'latitude (degrees_north)', 'longitude (degrees_east)',
                     'deployment_start_date (UTC)', 'deployment_end_date (UTC)',
                     'depth (m)']].agg(lambda x: row['county'] + '_' + '_'.join([str(y) for y in x]), axis=1)

    df_metadata = df[['segment', 'waterbody', 'station', 
                      # 'sensor_serial_number',  'latitude (degrees_north)', 'longitude (degrees_east)',
                     'deployment_start_date (UTC)', 'deployment_end_date (UTC)',
                     'depth (m)']]

    df_metadata = df_metadata.drop_duplicates()
    all_segment_metadata.append(df_metadata)
    
    df_data = df.drop(columns=['waterbody', 'station', 
                               #'sensor_serial_number',                     'latitude (degrees_north)', 'longitude (degrees_east)',
                     'deployment_start_date (UTC)', 'deployment_end_date (UTC)',
                     'depth (m)'])
    
    df_data = df_data.sort_values(by=['segment', 'time (UTC)'])

    df_data.set_index(['segment', 'time (UTC)'], inplace=True)

    for key, segment_df in df_data.groupby(level=0):
        csvfile = f'segments/{key}.csv'
        segment_df = segment_df.droplevel(0)
        segment_df.to_csv(csvfile)

df_metadata = pd.concat(all_segment_metadata)
df_metadata.set_index('segment', inplace=True)
df_metadata.to_csv('metadata.csv')

NameError: name 'df_CMAR_datasets' is not defined

In [11]:
!ls -lh segments/ | wc

    805    9197  131954


We have ~800 distinct observational time series taken at various locations and depths around Nova Scotia during the period of 2020-09-01 to 2024-08-31

In [14]:
df_metadata

Unnamed: 0_level_0,waterbody,station,sensor_serial_number,latitude (degrees_north),longitude (degrees_east),deployment_start_date (UTC),deployment_end_date (UTC),depth (m)
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Annapolis_Annapolis Basin_Lobster Ledge_20820380_44.6782_-65.68329_2020-06-11T00:00:00Z_2020-11-22T00:00:00Z_2.0,Annapolis Basin,Lobster Ledge,20820380,44.678200,-65.683290,2020-06-11T00:00:00Z,2020-11-22T00:00:00Z,2.0
Annapolis_Annapolis Basin_Lobster Ledge_670381_44.6782_-65.68329_2020-06-11T00:00:00Z_2020-11-22T00:00:00Z_4.0,Annapolis Basin,Lobster Ledge,670381,44.678200,-65.683290,2020-06-11T00:00:00Z,2020-11-22T00:00:00Z,4.0
Annapolis_Annapolis Basin_Cornwallis_670367_44.66939_-65.64479_2020-06-11T00:00:00Z_2020-11-22T00:00:00Z_1.0,Annapolis Basin,Cornwallis,670367,44.669390,-65.644790,2020-06-11T00:00:00Z,2020-11-22T00:00:00Z,1.0
Annapolis_Annapolis Basin_Cornwallis_670380_44.66942_-65.64477_2020-11-22T00:00:00Z_2021-06-16T00:00:00Z_1.0,Annapolis Basin,Cornwallis,670380,44.669420,-65.644770,2020-11-22T00:00:00Z,2021-06-16T00:00:00Z,1.0
Annapolis_Annapolis Basin_Cornwallis_547089_44.66942_-65.64477_2020-11-22T00:00:00Z_2021-06-16T00:00:00Z_2.0,Annapolis Basin,Cornwallis,547089,44.669420,-65.644770,2020-11-22T00:00:00Z,2021-06-16T00:00:00Z,2.0
...,...,...,...,...,...,...,...,...
Yarmouth_Lobster Bay_Lobster Bay_548559_43.679035_-65.89796_2023-08-02T00:00:00Z_2024-08-07T00:00:00Z_5.0,Lobster Bay,Lobster Bay,548559,43.679035,-65.897960,2023-08-02T00:00:00Z,2024-08-07T00:00:00Z,5.0
Yarmouth_Lamonts Cove_Short Beach_548563_43.930927_-66.167984_2023-08-03T00:00:00Z_2024-08-07T00:00:00Z_15.0,Lamonts Cove,Short Beach,548563,43.930927,-66.167984,2023-08-03T00:00:00Z,2024-08-07T00:00:00Z,15.0
Yarmouth_Careys Cove_Yarmouth Bar_548586_43.831528_-66.17321_2023-08-03T00:00:00Z_2024-08-07T00:00:00Z_15.0,Careys Cove,Yarmouth Bar,548586,43.831528,-66.173210,2023-08-03T00:00:00Z,2024-08-07T00:00:00Z,15.0
Yarmouth_Lobster Bay_Ram Island_548597_43.68223_-65.836205_2023-08-02T00:00:00Z_2024-08-07T00:00:00Z_8.0,Lobster Bay,Ram Island,548597,43.682230,-65.836205,2023-08-02T00:00:00Z,2024-08-07T00:00:00Z,8.0
