# Data pipeline for DiveDB
Uses classes `Metadata` and `DataReader` to facilitate data intake, processing, and alignment. 

In [None]:
# Import libraries and set working directory (adjust to fit your preferences)
import os
import sys
import numpy as np
import pandas as pd
import pytz
import matplotlib.pyplot as plt
from notion_client import Client
from dotenv import load_dotenv
from datareader import DataReader
from metadata import Metadata
from datastreams import DataStreams
from loggerdata import LoggerData
import plotly.express as px
import pickle
import nbformat
print(nbformat.__version__)

# Change the current working directory to the root directory of the checkout.
# The location differs per machine (e.g. /Users/fbar/Documents/GitHub/pyologger),
# so it can be supplied through the PYOLOGGER_ROOT environment variable instead
# of editing this cell; when the variable is unset the original path is used.
default_root = "/Users/jessiekb/Documents/GitHub/pyologger"
os.chdir(os.environ.get("PYOLOGGER_ROOT", default_root))
root_dir = os.getcwd()
data_dir = os.path.join(root_dir, "data")
outputs_dir = os.path.join(root_dir, "outputs")

# Verify the current working directory
print(f"Current working directory: {root_dir}")

### Query metadata
Use Notion and the [metadata entry form](https://forms.fillout.com/t/8UNuTLMaRfus) to start a recording and to generate identifiers for the Recording and the Deployment.


In [None]:
# Create the Metadata helper and pull the databases (quietly)
metadata = Metadata()
metadata.fetch_databases(verbose=False)

# Keep one variable per database: deployments, loggers, recordings, animals.
# The generator is consumed left to right, so the lookups happen in this order.
dep_db, logger_db, rec_db, animal_db = (
    metadata.get_metadata(db_name)
    for db_name in ("dep_DB", "logger_DB", "rec_DB", "animal_DB")
)

### Steps for Processing Deployment Data:

1. **Select Deployment Folder**:
   - **Description:** Asks the user to select a deployment folder, which kicks off the data-reading process. The folder name may have any suffix after the Deployment ID. The check stops if two folders match the same Deployment ID.
   - **Function Used:** `check_deployment_folder()`

2. **Initialize Deployment Folder**:
   - **Description:** Starts the main `read_files` process with the selected deployment folder.
   - **Function Used:** `read_files()`

3. **Fetch Metadata**:
   - **Description:** Retrieve necessary data from the metadata database, including logger information.
   - **Function Used:** `metadata.fetch_databases()`

4. **Organize Files by Logger ID**:
   - **Description:** Group files by logger ID for processing.
   - **Function Used:** `read_files()` (This is the main function)

5. **Check for Existing Processed Files**:
   - **Description:** Verify if the outputs folder already contains processed files for each logger. Skip reprocessing if all necessary files are present.
   - **Function Used:** `check_outputs_folder()`

6. **Process UBE Files**:
   - **Description:** For each UFI logger with UBE files, process and save the data.
   - **Function Used:** `process_ube_file()`

7. **Process CSV Files**:
   - **Description:** For each logger with multiple CSV files, concatenate them, and save the combined data.
   - **Function Used:** `concatenate_and_save_csvs()`

8. **Final Outputs**:
   - **Description:** Ensure all processed data is saved in the outputs folder with appropriate filenames.
   - **Functions Used:** `save_data()`

In [None]:
# Find your deployment ID index and remember it for the next cell, where you have to enter it.
# Leaving `dep_db` as the last expression of the cell makes the notebook display the table.
dep_db

In [None]:
# Assuming you have the metadata and dep_db loaded:
# create the reader and let it resolve the deployment folder under data_dir.
datareader = DataReader()
deployment_folder = datareader.check_deployment_folder(dep_db, data_dir)

# Only read the logger files when a deployment folder was returned.
# NOTE(review): read_files() is not passed deployment_folder, so the reader
# presumably remembers the selection made in check_deployment_folder() — confirm.
if deployment_folder:
    datareader.read_files(metadata, save_csv=True)


### Concatenate consecutive files

Checks to see if there are multiple files from a single logger to concatenate in the outputs folder. If needed, it concatenates them and outputs a concatenated file. For any concatenated files, the intermediate files that were previously in `outputs` will be removed.

### Organize data into a `DataStreams` object



In [None]:
# Location of the pickled data_reader object inside the outputs directory
pkl_path = os.path.join(outputs_dir, 'data_reader.pkl')

# Read it back in binary mode; the handle is closed when the block exits
with open(pkl_path, 'rb') as pkl_file:
    data_pkl = pickle.load(pkl_file)

In [None]:
# Initialize the DataStreams object
data = DataStreams()

# Define the folder path to the "outputs" directory
# NOTE(review): this is <deployment_folder>/outputs, whereas outputs_dir set in
# the first cell is <root_dir>/outputs — confirm which location is intended.
folder_path = os.path.join(deployment_folder, "outputs") 

# Load data from the "outputs" folder
# NOTE(review): load_data_from_outputs is neither imported nor defined in the
# visible cells — confirm where it comes from before running this cell.
load_data_from_outputs(folder_path, data)

### Assign a time zone to the deployment
This time zone applies to the whole deployment and is used to align timestamps. Check that the time zone is recognized (the cell below looks it up in `pytz.all_timezones`).

In [None]:
# Time zone recorded for the deployment.
# NOTE(review): selected_deployment is not assigned in the visible cells — confirm its source.
provided_timezone = selected_deployment['Time Zone']
print(pytz.all_timezones)

# Guard clause: stop right away when pytz does not know this zone name
if provided_timezone not in pytz.all_timezones:
    raise ValueError(f"Unrecognized time zone: {provided_timezone}. Please provide a valid time zone from pytz.all_timezones.")
print(f"Time zone recognized:\n{provided_timezone}")

# final_df carries the local date and local time as two text columns
# (note the leading space in both column names); join them and parse once.
local_stamp_text = final_df[" Date (local)"] + ' ' + final_df[" Time (local)"]
final_df['datetime'] = pd.to_datetime(local_stamp_text, format='%d.%m.%Y %H:%M:%S.%f')

# Attach the deployment time zone to the naive timestamps
deployment_tz = pytz.timezone(provided_timezone)
final_df['datetime'] = final_df['datetime'].dt.tz_localize(deployment_tz)

### Check for gaps in timeseries data
Especially with concatenated data, we must check that there are no gaps in the data. 

In [None]:
# Spacing between consecutive samples, kept both as Timedelta and as seconds.
# This cell works on final_df directly: CO_df is only assigned in a later cell,
# so referring to it here fails when the notebook is run top to bottom.
timestamps = final_df['datetime']
time_diff = timestamps.diff()
sec_diff = time_diff.dt.total_seconds()
final_df['cum_diff'] = sec_diff.cumsum()

# Check for inconsistencies (time jumps): any step longer than twice the mean step
mean_diff = sec_diff.mean()
time_jumps = sec_diff[sec_diff > mean_diff * 2]

# Sampling frequency taken from the first two samples. Positional access
# (.iloc) is used so this does not depend on the index labels being 0 and 1.
first_step = timestamps.iloc[1] - timestamps.iloc[0]
print(first_step)
CO_fs = 1 / first_step.total_seconds()
# Largest step, reusing the pandas diff computed above
CO_max_timediff = time_diff.max()
print(f"CATS Sampling frequency: {CO_fs} Hz with a maximum time difference of {CO_max_timediff}")

# Report any inconsistencies
if not time_jumps.empty:
    print(f"Time jumps detected:\n{time_jumps}")
else:
    print("No significant time jumps detected.")
    print(f"Sampling frequency: {1 / mean_diff} Hz")

# Plot cumulative differences
plt.plot(timestamps, final_df['cum_diff'])
plt.xlabel('Time')
plt.ylabel('Cumulative Difference (seconds)')
plt.title('Cumulative Difference over Time')
plt.show()

In [None]:
# Plot prep: the plotting cells below refer to the CATS data as CO_df
CO_df = final_df

# Load the data_reader object from the pickle file.
# The path is built from outputs_dir (as in the earlier pickle-loading cell) so
# it does not depend on the current working directory.
# NOTE: pickle.load can execute arbitrary code; only open pickles written by this pipeline.
with open(os.path.join(outputs_dir, 'data_reader.pkl'), 'rb') as file:
    data_reader = pickle.load(file)

# Get the ECG and timestamp data
ecg_df = data_reader.data_raw['2024-06-17_oror-002-001a_UF-04_001']
ecg_df['datetime'] = pd.to_datetime(ecg_df['timestamp'])
# NOTE(review): the zone is hard-coded here while the CATS data uses the
# deployment's provided_timezone — confirm both should not share one value.
ecg_df['datetime'] = ecg_df['datetime'].dt.tz_localize(pytz.timezone('America/Los_Angeles'))
ecg_timestamps = ecg_df['datetime']
print(ecg_timestamps.iloc[0])
print(ecg_df)

# Sampling frequency from the first two samples (positional access, so the
# index labels do not matter) and the largest step between consecutive samples.
ecg_first_step = ecg_timestamps.iloc[1] - ecg_timestamps.iloc[0]
print(ecg_first_step)
ecg_fs = 1 / ecg_first_step.total_seconds()
ecg_max_timediff = ecg_timestamps.diff().max()
print(f"ECG Sampling frequency: {ecg_fs} Hz with a maximum time difference of {ecg_max_timediff}")

In [None]:
# Target rate (Hz) for the quick-look plot
new_sampling_rate = 10

# Decimation step for each stream: keep one sample out of every `step`.
# round() is used instead of int() because the measured rates are floats
# (a rate of 399.99 Hz would otherwise truncate to a step of 39), and
# max(1, ...) keeps the slice step valid when a stream is already slower
# than the target rate.
ecg_conversion = max(1, round(ecg_fs / new_sampling_rate))
CATS_conversion = max(1, round(CO_fs / new_sampling_rate))

ecg_df10 = ecg_df.iloc[::ecg_conversion, :]  # ECG subsampled to ~new_sampling_rate Hz
CO_df10 = CO_df.iloc[::CATS_conversion, :]  # CATS subsampled to ~new_sampling_rate Hz

# CATS channels for the first four panels; the column name doubles as the y label
cats_channels = [
    'Accelerometer X [m/s²]',
    'Accelerometer Y [m/s²]',
    'Accelerometer Z [m/s²]',
    'Depth (100bar) [m]',
]

# plt is the matplotlib.pyplot import from the first cell
fig, axs = plt.subplots(5, 1, figsize=(10, 10))

for ax, channel in zip(axs, cats_channels):
    ax.plot(CO_df10['datetime'], CO_df10[channel])
    ax.set_ylabel(channel)

# ECG goes on the last panel
axs[4].plot(ecg_df10['datetime'], ecg_df10['ecg'])
axs[4].set_ylabel('ECG [mV]')

# plt.xlabel labels the current axes, i.e. the bottom panel
plt.xlabel('Datetime')
plt.show()

In [None]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Target rates (Hz) for the interactive plot
new_CATS_sampling_rate = 10
new_ecg_sampling_rate = 50

# Decimation steps: round() avoids truncating a float rate to the wrong step,
# and max(1, ...) keeps the slice step valid for streams slower than the target.
ecg_conversion = max(1, round(ecg_fs / new_ecg_sampling_rate))
CATS_conversion = max(1, round(CO_fs / new_CATS_sampling_rate))

ecg_df50 = ecg_df.iloc[::ecg_conversion, :]  # ECG subsampled to ~50 Hz
CO_df10 = CO_df.iloc[::CATS_conversion, :]  # CATS subsampled to ~10 Hz

# Create subplots
fig = make_subplots(rows=6, cols=1, shared_xaxes=True, vertical_spacing=0.01)

# One panel per row: (data frame, column, trace name / y-axis title, line colour).
# The ECG panel uses ecg_df50, the frame subsampled in this cell, not the
# 10 Hz ecg_df10 left over from the matplotlib cell.
panels = [
    (ecg_df50, 'ecg', 'ECG [mV]', 'orange'),
    (CO_df10, 'Depth (100bar) [m]', 'Depth [m]', 'purple'),
    (CO_df10, 'Accelerometer X [m/s²]', 'Accel X [m/s²]', 'blue'),
    (CO_df10, 'Accelerometer Y [m/s²]', 'Accel Y [m/s²]', 'green'),
    (CO_df10, 'Accelerometer Z [m/s²]', 'Accel Z [m/s²]', 'red'),
    (CO_df10, 'Gyroscope X [mrad/s]', 'Gyr X [mrad/s]', 'pink'),
]

for row, (frame, column, label, color) in enumerate(panels, start=1):
    fig.add_trace(
        go.Scatter(x=frame['datetime'], y=frame[column], mode='lines', name=label, line=dict(color=color)),
        row=row,
        col=1,
    )
    fig.update_yaxes(title_text=label, row=row, col=1)

# Depth increases downwards, so flip the y axis of the depth panel
fig.update_yaxes(autorange="reversed", row=2, col=1)

# Update layout
fig.update_layout(height=800, width=1000, title_text="Subsampled Data Plots", showlegend=False)
fig.update_xaxes(title_text="Datetime", row=6, col=1)

# Show plot
fig.show()


In [None]:
# Fresh Metadata instance with the databases fetched again
metadata = Metadata()
metadata.fetch_databases()

# Logger table from the metadata
logger_db = metadata.get_metadata("logger_DB")

# Distinct values of the LoggerID column
logger_ids = set(logger_db['LoggerID'])
print(f"Unique Logger IDs: {logger_ids}")

# Number of loggers per (Manufacturer, Type) pair, flattened into a table
# with a 'Count' column
per_type_counts = logger_db.groupby(['Manufacturer', 'Type']).size()
logger_breakdown = per_type_counts.reset_index(name='Count')
print("Logger Breakdown by Manufacturer and Type:")
print(logger_breakdown)

