# Data pipeline for DiveDB
Uses the `Metadata` and `DataReader` classes to facilitate data intake, processing, and alignment.

In [None]:
# Import libraries and set working directory (adjust to fit your preferences)
import os
import sys
import numpy as np
import pandas as pd
import pytz
from notion_client import Client
from dotenv import load_dotenv
from datareader import DataReader
from metadata import Metadata
import plotly.express as px
import pickle
import nbformat
print(nbformat.__version__)

# Change the current working directory to the repository root.
# The root defaults to the original author's checkout; set the PYOLOGGER_ROOT
# environment variable to run the notebook from another location without
# editing this cell.
os.chdir(os.environ.get("PYOLOGGER_ROOT", "/Users/fbar/Documents/GitHub/pyologger"))
root_dir = os.getcwd()

# Deployment folders are looked up under <root>/data
data_dir = os.path.join(root_dir, "data")

# Verify the current working directory
print(f"Current working directory: {root_dir}")

### Query metadata
Use Notion and the [metadata entry form](https://forms.fillout.com/t/8UNuTLMaRfus) to start a recording and to generate identifiers for the Recording and the Deployment.


In [None]:
# Create the Metadata helper and have it fetch its databases
# (the individual tables are read back with get_metadata in the next cell)
metadata = Metadata()
metadata.fetch_databases()

In [None]:
# Pull the deployment, logger, recording and animal tables in one pass
# (same lookup order as before), then show the deployment table so the
# deployment of interest can be identified.
dep_db, logger_db, rec_db, animal_db = (
    metadata.get_metadata(table_name)
    for table_name in ("dep_DB", "logger_DB", "rec_DB", "animal_DB")
)
dep_db

#### Select a deployment
Enter the index of the deployment (from the table above) that you would like to analyze.

In [None]:
# Step 1: Display relevant information to help the user decide.
# A bare expression is only rendered when it is the last statement of a cell,
# so the table is printed explicitly. The index is reset for display because
# the selection below is positional (iloc).
print("Please choose a deployment by selecting its index:")
print(dep_db[['Deployment Name', 'Notes']].reset_index(drop=True).to_string())

# Step 2: Prompt until a valid position is entered, so that
# `selected_deployment` is always defined for the following cells.
while True:
    raw_choice = input("Enter the index of the deployment you want to work with: ")
    try:
        selected_index = int(raw_choice)
    except ValueError:
        # Non-numeric entry: report it and ask again instead of crashing
        print("Invalid index selected.")
        continue
    if 0 <= selected_index < len(dep_db):
        break
    print("Invalid index selected.")

# Step 3: Process the user's selection
selected_deployment = dep_db.iloc[selected_index]
print(f"You selected the deployment: {selected_deployment['Deployment Name']}")
print(f"Description: {selected_deployment['Notes']}")

#### Checks to see if folder exists
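The code first looks for a folder whose name exactly matches the deployment name. If none exists, it accepts a folder whose name starts with the deployment name (i.e. a suffix has been added). If more than one folder matches, you are prompted to choose one; if no folder matches, an error is reported and processing is aborted.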

In [None]:
# Get to deployment folder: start with the exact-name candidate
deployment_folder = os.path.join(data_dir, selected_deployment['Deployment Name'])

# Show the path that is about to be checked
print(f"Deployment folder path: {deployment_folder}")

# Step 1: Check if the folder exists (it must be a directory, not a file)
if os.path.isdir(deployment_folder):
    print(f"Deployment folder found: {deployment_folder}")
else:
    # Step 2: If not found, search for a folder that starts with the deployment name
    print(f"Folder {deployment_folder} not found. Searching for folders with a similar name...")

    # os.listdir also returns plain files (archives, notes, ...), so keep only
    # directories; sorting makes the numbering shown to the user reproducible.
    possible_folders = sorted(
        entry for entry in os.listdir(data_dir)
        if entry.startswith(selected_deployment['Deployment Name'])
        and os.path.isdir(os.path.join(data_dir, entry))
    )

    if len(possible_folders) == 1:
        # If exactly one match is found, use that folder
        deployment_folder = os.path.join(data_dir, possible_folders[0])
        print(f"Using the found folder: {deployment_folder}")
    elif possible_folders:
        # If multiple matches are found, ask the user to select one
        print("Multiple matching folders found. Please select one:")
        for i, folder in enumerate(possible_folders):
            print(f"{i}: {folder}")
        # A separate name is used so the deployment's `selected_index` is not overwritten
        try:
            folder_index = int(input("Enter the index of the folder you want to use: "))
        except ValueError:
            folder_index = -1  # non-numeric entry is treated as an invalid selection
        if 0 <= folder_index < len(possible_folders):
            deployment_folder = os.path.join(data_dir, possible_folders[folder_index])
            print(f"Using the selected folder: {deployment_folder}")
        else:
            print("Invalid selection. Aborting.")
            deployment_folder = None
    else:
        # If no matches are found, report an error
        print("Error: Folder not found.")
        deployment_folder = None

# Continue processing if a valid folder was found
if deployment_folder:
    # Perform actions with the selected deployment folder
    pass  # Replace with your processing logic
else:
    print("Processing aborted due to missing folder.")

### Read in Data
Uses the `DataReader` class to query the files in the folder by logger ID. Unrecognized file types are flagged, and a summary reports which loggers' data has been recognized. Copies of the processed CSV data are written to an `outputs` folder that is created inside the original deployment folder.

In [None]:
# Point a DataReader at the deployment folder chosen above
datareader = DataReader(deployment_folder)

# Read the folder's files using the metadata; save_csv=True is passed so CSV copies are saved
datareader.read_files(metadata, save_csv=True)

In [None]:
def read_and_concatenate_csvs(folder_path, deployment_name):
    """Concatenate the numbered CSV parts of each logger found in a folder.

    Only files whose name starts with ``deployment_name`` and ends with
    ``.csv`` are considered. File names are split on underscores: the
    second-to-last token is taken as the logger ID and the last token (up to
    its first dot) as an integer sequence number. Files that do not fit this
    pattern are skipped, which covers the ``<deployment>_<logger>_ALL.csv``
    files this function writes, so it can be re-run on the same folder.

    For every logger with more than one part, the parts are read in ascending
    sequence order, concatenated, and saved next to the inputs as
    ``<deployment_name>_<logger_id>_ALL.csv``.

    Args:
        folder_path: Directory containing the CSV files.
        deployment_name: Prefix that selects the files of one deployment.

    Returns:
        A DataFrame stacking every per-logger concatenation that was
        performed (loggers with a single file contribute nothing), or an
        empty DataFrame when no concatenation took place.
    """
    # Map logger ID -> list of (sequence number, file name).
    # The listing is sorted so that grouping and output order are reproducible.
    logger_groups = {}
    for file in sorted(os.listdir(folder_path)):
        if not (file.endswith('.csv') and file.startswith(deployment_name)):
            continue
        tokens = file.split('_')
        if len(tokens) < 2:
            # No underscore: there is no logger ID to extract
            continue
        try:
            sequence_number = int(tokens[-1].split('.')[0])
        except ValueError:
            # Non-numeric suffix, e.g. the "_ALL.csv" output of a previous run
            continue
        logger_groups.setdefault(tokens[-2], []).append((sequence_number, file))

    # Concatenate dataframes for each logger ID that has several parts
    concatenated_dfs = []
    for logger_id, numbered_files in logger_groups.items():
        if len(numbered_files) < 2:
            print(f"Skipping concatenation for logger: {logger_id}, only one file found.")
            continue

        files = [file for _, file in sorted(numbered_files)]
        print(f"Concatenating {len(files)} files for logger: {logger_id}")
        dfs = [pd.read_csv(os.path.join(folder_path, file)) for file in files]
        concatenated_df = pd.concat(dfs, ignore_index=True)

        # Save the concatenated dataframe to a CSV file
        output_filename = f"{deployment_name}_{logger_id}_ALL.csv"
        output_filepath = os.path.join(folder_path, output_filename)
        concatenated_df.to_csv(output_filepath, index=False)
        print(f"Saved concatenated file to: {output_filepath}")
        concatenated_dfs.append(concatenated_df)

    if concatenated_dfs:
        return pd.concat(concatenated_dfs, ignore_index=True)
    return pd.DataFrame()  # nothing was concatenated

### Concatenate consecutive files

Checks to see if there are multiple files from a single logger to concatenate in the outputs folder. If needed, it concatenates them and outputs a concatenated file.

In [None]:
# Processed CSVs are looked up in the deployment's "outputs" subfolder and
# filtered by the selected deployment's name before being concatenated.
folder_path = os.path.join(deployment_folder, "outputs")
deployment_name = selected_deployment['Deployment Name']

concatenated_data = read_and_concatenate_csvs(
    folder_path=folder_path,
    deployment_name=deployment_name,
)


In [None]:
# Display the dataframe returned by read_and_concatenate_csvs
concatenated_data

In [None]:
# matplotlib is needed for the plot at the end of this cell; it was previously
# only imported in a later cell, so running the cells in order raised NameError.
import matplotlib.pyplot as plt

# NOTE(review): `final_df` is not defined in any cell shown above; presumably it
# is the logger dataframe produced earlier (possibly `concatenated_data`) —
# confirm and assign it before running this cell.

# Build a timezone-aware timestamp from the local date and time columns
# (note the leading space in both column names).
final_df['datetime'] = pd.to_datetime(final_df[" Date (local)"] + ' ' + final_df[" Time (local)"], format='%d.%m.%Y %H:%M:%S.%f')
final_df['datetime'] = final_df['datetime'].dt.tz_localize(pytz.timezone('America/Los_Angeles'))
print(final_df['datetime'][0])

# Calculate time differences and cumulative sum of differences
sec_diff = final_df['datetime'].diff().dt.total_seconds()
final_df['cum_diff'] = np.cumsum(sec_diff)

# Check for inconsistencies (time jumps): any step longer than twice the mean step
mean_diff = sec_diff.mean()
time_jumps = sec_diff[sec_diff > mean_diff * 2]  # Define a threshold for time jumps

# Report any inconsistencies; the sampling frequency is only reported when
# no jumps were detected
if not time_jumps.empty:
    print(f"Time jumps detected:\n{time_jumps}")
else:
    print("No significant time jumps detected.")
    print(f"Sampling frequency: {1 / mean_diff} Hz")

# Plot cumulative differences
plt.plot(final_df['datetime'], final_df['cum_diff'])
plt.xlabel('Time')
plt.ylabel('Cumulative Difference (seconds)')
plt.title('Cumulative Difference over Time')
plt.show()

In [None]:
# Plot prep: CO_df is another name for final_df (same object, no copy)
CO_df = final_df

# Spacing of the first two samples gives the sampling frequency;
# the largest gap between consecutive samples is reported alongside it.
CO_first_step = CO_df['datetime'][1] - CO_df['datetime'][0]
print(CO_first_step)
CO_fs = 1 / CO_first_step.total_seconds()
CO_gaps = np.diff(CO_df['datetime'])
CO_max_timediff = np.max(CO_gaps)
print(f"CATS Sampling frequency: {CO_fs} Hz with a maximum time difference of {CO_max_timediff}")

# Restore the pickled data_reader object.
# NOTE: unpickling can execute arbitrary code; only load pickle files you created yourself.
with open('outputs/data_reader.pkl', 'rb') as pickle_file:
    data_reader = pickle.load(pickle_file)

# ECG table for one specific recording; parse its timestamps and mark them as Los Angeles time
ecg_df = data_reader.data_raw['2024-06-17_oror-002-001a_UF-04_001']
ecg_timestamps = pd.to_datetime(ecg_df['timestamp'])
ecg_df['datetime'] = ecg_timestamps.dt.tz_localize(pytz.timezone('America/Los_Angeles'))
print(ecg_df['datetime'][0])
print(ecg_df)

# Same sampling-frequency / largest-gap summary for the ECG data
ecg_first_step = ecg_df['datetime'][1] - ecg_df['datetime'][0]
print(ecg_first_step)
ecg_fs = 1 / ecg_first_step.total_seconds()
ecg_gaps = np.diff(ecg_df['datetime'])
ecg_max_timediff = np.max(ecg_gaps)
print(f"ECG Sampling frequency: {ecg_fs} Hz with a maximum time difference of {ecg_max_timediff}")

In [None]:
import matplotlib.pyplot as plt

# Thin both datasets to roughly new_sampling_rate Hz by keeping every k-th row,
# where k is the measured sampling frequency divided by the target rate
# (truncated to an integer).
new_sampling_rate = 10
ecg_conversion = int(ecg_fs / new_sampling_rate)
CATS_conversion = int(CO_fs / new_sampling_rate)

ecg_df10 = ecg_df.iloc[::ecg_conversion, :]
CO_df10 = CO_df.iloc[::CATS_conversion, :]

# One stacked panel per signal, top to bottom: (dataframe, column, y-axis label)
panels = [
    (CO_df10, 'Accelerometer X [m/s²]', 'Accelerometer X [m/s²]'),
    (CO_df10, 'Accelerometer Y [m/s²]', 'Accelerometer Y [m/s²]'),
    (CO_df10, 'Accelerometer Z [m/s²]', 'Accelerometer Z [m/s²]'),
    (CO_df10, 'Depth (100bar) [m]', 'Depth (100bar) [m]'),
    (ecg_df10, 'ecg', 'ECG [mV]'),
]

fig, axs = plt.subplots(5, 1, figsize=(10, 10))

for ax, (frame, column, label) in zip(axs, panels):
    ax.plot(frame['datetime'], frame[column])
    ax.set_ylabel(label)

# plt.xlabel labels the current axes, i.e. the bottom panel
plt.xlabel('Datetime')
plt.show()

In [None]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Target rates after subsampling: motion/depth data at 10 Hz, ECG at 50 Hz.
new_CATS_sampling_rate = 10
new_ecg_sampling_rate = 50
ecg_conversion = int(ecg_fs / new_ecg_sampling_rate)
CATS_conversion = int(CO_fs / new_CATS_sampling_rate)

# Keep every k-th row, k = measured sampling frequency / target rate (truncated)
ecg_df50 = ecg_df.iloc[::ecg_conversion, :]
CO_df10 = CO_df.iloc[::CATS_conversion, :]

# One subplot row per signal, top to bottom:
# (dataframe, column, trace name / y-axis title, line colour).
# The ECG row uses ecg_df50 computed above; it previously plotted ecg_df10
# left over from the preceding cell, so the 50 Hz subsample was never shown.
traces = [
    (ecg_df50, 'ecg', 'ECG [mV]', 'orange'),
    (CO_df10, 'Depth (100bar) [m]', 'Depth [m]', 'purple'),
    (CO_df10, 'Accelerometer X [m/s²]', 'Accel X [m/s²]', 'blue'),
    (CO_df10, 'Accelerometer Y [m/s²]', 'Accel Y [m/s²]', 'green'),
    (CO_df10, 'Accelerometer Z [m/s²]', 'Accel Z [m/s²]', 'red'),
    (CO_df10, 'Gyroscope X [mrad/s]', 'Gyr X [mrad/s]', 'pink'),
]

# Create subplots
fig = make_subplots(rows=len(traces), cols=1, shared_xaxes=True, vertical_spacing=0.01)

# Add each trace and label its y-axis
for row, (frame, column, label, color) in enumerate(traces, start=1):
    fig.add_trace(
        go.Scatter(x=frame['datetime'], y=frame[column], mode='lines', name=label, line=dict(color=color)),
        row=row,
        col=1,
    )
    fig.update_yaxes(title_text=label, row=row, col=1)

# Depth increases downwards, so flip the depth axis
fig.update_yaxes(autorange="reversed", row=2, col=1)

# Update layout; only the bottom subplot carries the x-axis title
fig.update_layout(height=800, width=1000, title_text="Subsampled Data Plots", showlegend=False)
fig.update_xaxes(title_text="Datetime", row=len(traces), col=1)

# Show plot
fig.show()


In [None]:
# Re-create the Metadata helper and fetch its databases again
metadata = Metadata()
metadata.fetch_databases()

# Logger table
logger_db = metadata.get_metadata("logger_DB")

# Distinct values of the LoggerID column
logger_ids = set(logger_db['LoggerID'])
print(f"Unique Logger IDs: {logger_ids}")

# Number of loggers for each (Manufacturer, Type) pair, as a flat table with a 'Count' column
loggers_per_type = logger_db.groupby(['Manufacturer', 'Type']).size()
logger_breakdown = loggers_per_type.reset_index(name='Count')
print("Logger Breakdown by Manufacturer and Type:")
print(logger_breakdown)

