# Using `pyologger` data processing pipeline with `DiveDB`
Uses classes `Metadata` and `DataReader` to facilitate data intake, processing, and alignment. 

## Read deployment metadata

In [None]:
# Import libraries and set working directory (adjust to fit your preferences)
import os
import pickle
from pyologger.load_data.datareader import DataReader
from pyologger.load_data.metadata import Metadata
from pyologger.plot_data.plotter import *
from pyologger.calibrate_data.calibrate_acc_mag import *

# Change the current working directory to the pyologger repository root.
# Set the PYOLOGGER_ROOT environment variable to point at your own checkout;
# when it is unset, the previously hard-coded location is used so existing
# setups behave exactly as before.
#os.chdir("/Users/fbar/Documents/GitHub/pyologger")
#os.chdir("/Users/williamgislason/Documents/Whitecap/Seals/Finescale-HR")
os.chdir(os.environ.get("PYOLOGGER_ROOT", "/Users/jessiekb/Documents/GitHub/pyologger"))

# Everything below is resolved relative to the repository root
root_dir = os.getcwd()
data_dir = os.path.join(root_dir, "data")

# Verify the current working directory
print(f"Current working directory: {root_dir}")

In [None]:
# Create the metadata handler, then fetch the databases and their relations
metadata = Metadata()
metadata.fetch_databases(verbose=False)
metadata.find_relations(verbose=False)

# Keep a handle on each database (looked up in the order listed)
(
    deployment_db,
    logger_db,
    recording_db,
    animal_db,
    dataset_db,
) = [
    metadata.get_metadata(db_name)
    for db_name in ("deployment_DB", "logger_DB", "recording_DB", "animal_DB", "dataset_DB")
]

In [None]:
# Display the deployment table ordered by its 'Recording Date' column.
# Swap the last line for one of the commented names to inspect another table.
deployment_db_sorted = deployment_db.sort_values(by='Recording Date')
deployment_db_sorted
#recording_db
#logger_db

## Read Files in Deployment Folder

### Steps for Processing Deployment Data:

1. **Select Deployment Folder**:
   - **Description:** Prompts the user to select a deployment folder to initiate the data reading process. The folder name can include any suffix after the Deployment ID. The function checks for potential conflicts and halts the process if multiple matching folders are found.
   - **Function Used:** `check_deployment_folder()`

2. **Initialize Deployment Folder**:
   - **Description:** Begins the main `read_files` process with the selected deployment folder.
   - **Function Used:** `read_files()`

3. **Fetch Metadata**:
   - **Description:** Retrieves essential data from the metadata database, including logger and animal information.
   - **Functions Used:** `metadata.fetch_databases()`, `get_animal_info()`, `get_dataset_info()`

4. **Organize Files by Logger ID**:
   - **Description:** Groups files by logger ID for processing.
   - **Function Used:** `organize_files_by_logger_id()` (within `read_files()`)

5. **Check for Existing Processed Files**:
   - **Description:** Verifies if the output folder already contains processed files for each logger. Skips reprocessing if all necessary files are present.
   - **Function Used:** `check_outputs_folder()`

6. **Process Manufacturer-Specific Files**:
   - **Description:** Depending on the logger's manufacturer, different processing methods are applied. The `BaseManufacturer` class and its subclasses (`CATSManufacturer`, `UFIManufacturer`) handle the specifics, such as processing `.txt` files for CATS loggers or `.ube` files for UFI loggers.
   - **Functions Used:** `BaseManufacturer.process_files()`, `CATSManufacturer.process_files()`, `UFIManufacturer.process_files()`

7. **Save Processed Data**:
   - **Description:** Saves processed data files in the outputs folder with appropriate filenames, ensuring consistency across different formats such as CSV, Parquet, EDF, and NetCDF.
   - **Functions Used:** `save_data()`, `export_to_edf()`, `save_to_netcdf()`

8. **Finalize and Save DataReader Object**:
   - **Description:** Saves the state of the `DataReader` object, including all processed data and metadata, as a pickle file for easy retrieval and future processing.
   - **Function Used:** `save_datareader_object()`

### Classes and Their Roles:

- **`DataReader` Class:**
  - **Role:** Handles the overall reading, processing, and saving of deployment data files. It manages the deployment folder path, organizes data by logger and sensor, and interfaces with manufacturer-specific processing through the `BaseManufacturer` class.
  - **Methods:** `read_files()`, `save_datareader_object()`, `organize_files_by_logger_id()`, `save_data()`, `export_to_edf()`, `save_to_netcdf()`, and more.

- **`BaseManufacturer` Class:**
  - **Role:** Acts as a template for manufacturer-specific processing classes. It defines common methods for loading custom mappings, renaming columns, and mapping data to sensors. Subclasses like `CATSManufacturer` and `UFIManufacturer` extend this base class to implement specific processing logic.
  - **Methods:** `process_files()`, `load_custom_mapping()`, `rename_columns()`, `map_data_to_sensors()`, `parse_txt_for_intervals()`, and more.

- **`Metadata` Class (from `pyologger.load_data.metadata`):**
  - **Role:** Provides methods for fetching and managing metadata related to deployments, loggers, animals, and datasets. It plays a crucial role in ensuring the correct organization and processing of data.
  - **Methods:** `fetch_databases()`, `get_metadata()`, and others as needed for metadata handling.


In [None]:
# Location of the custom channel-mapping JSON file
channel_mapping_path = os.path.join(root_dir, 'channel_mapping.json')

datareader = DataReader(deployment_folder_path=data_dir)

# Resolve which deployment folder under `data_dir` should be read
deployment_folder = datareader.check_deployment_folder(deployment_db, data_dir)

# EDF filename template, placed in the deployment's `outputs` folder
outputs_dir = os.path.join(datareader.files_info['deployment_folder_path'], 'outputs')
edf_filename_template = os.path.join(outputs_dir, 'edf_test_{sensor}.edf')

# Only NetCDF output is switched on; CSV, Parquet and EDF exports are disabled
read_options = dict(
    save_csv=False,
    save_parq=False,
    save_edf=False,
    custom_mapping_path=channel_mapping_path,
    save_netcdf=True,
    edf_filename_template=edf_filename_template,
    edf_save_from='sensor_data',
)

# Read the files only when a deployment folder was actually resolved
if deployment_folder:
    datareader.read_files(metadata, **read_options)

In [None]:
# Show the sampling frequency stored in the sensor info for the pressure sensor
datareader.sensor_info['pressure']['sampling_frequency']

## Upload data to DiveDB

Make sure your local DiveDB servers are running. To do so:
- Navigate to the DiveDB directory
- Run the command: `make up`
- Wait until all services are running (Django, Postgres, Jupyter)
- Make sure you have run the latest migrations: `make migrate`
- Make sure you have imported the latest logger and animal databases: `make importmetadata`

Then, you're ready to upload data!

In [None]:
# Allow Django to run with async unsafe to run outside of Django server
# (must be set before the DiveDB import below)
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

from DiveDB.services.data_uploader import DataUploader

data_uploader = DataUploader()

# Named `upload_metadata` rather than `metadata` so the `Metadata` instance
# that the `datareader.read_files(metadata, ...)` cell relies on is not
# overwritten when the notebook cells are re-run.
upload_metadata = {
    "animal": datareader.animal_info["Animal ID"],
    "deployment": datareader.deployment_info["Deployment ID"],
    # NOTE(review): picks the second entry of the ", "-separated Recording ID
    # list and raises IndexError when only one ID is present — confirm which
    # recording is intended.
    "recording": datareader.deployment_info["Recording ID"].split(", ")[1]
}

# Upload the NetCDF file of the deployment selected above, so the file always
# belongs to the same deployment the upload metadata was taken from.
# NOTE(review): assumes the reader wrote `outputs/deployment_data.nc` inside
# the selected deployment folder — confirm the output filename.
netcdf_path = os.path.join(deployment_folder, 'outputs', 'deployment_data.nc')

data_uploader.upload_netcdf(netcdf_path, metadata=upload_metadata)

In [None]:
from DiveDB.services.duck_pond import DuckPond

duckpond = DuckPond()

# Signals requested from DuckPond for a single animal
requested_signals = [
    "sensor_data_ecg",
    "sensor_data_light",
    "sensor_data_temperature",
    "sensor_data_depth",
]

df = duckpond.get_delta_data(
    signal_names=requested_signals,
    animal_ids="mian-001",  # Make sure this matches the animal ID you uploaded
    frequency=100,
)

display(df)

In [None]:
import mne

def export_concatenated_to_edf(concatenated_df, highest_sampling_frequency, latest_start_time, edf_filename_template):
    """
    Exports the concatenated DataFrame to an EDF file using MNE.

    Every column of `concatenated_df` becomes one channel of type 'misc',
    sampled at `highest_sampling_frequency`, and the recording's measurement
    date is set from `latest_start_time`.

    Parameters:
    - concatenated_df: The DataFrame containing concatenated data from all loggers.
    - highest_sampling_frequency: The highest sampling frequency among the loggers.
    - latest_start_time: The latest start time among the loggers. Must provide
                         `.timestamp()` and `.microsecond` (e.g. `datetime` or
                         `pandas.Timestamp`).
    - edf_filename_template: Template string for the EDF filename.
                             The string should contain `{sensor}` to be replaced with 'ALL'.

    Returns:
    - None. When there is no data, a message is printed and nothing is written.
    """
    # `DataFrame.empty` is True when either axis has length 0, so this single
    # guard covers both "no rows" and "no channels".
    if concatenated_df is None or concatenated_df.empty:
        print("No data available for export. Exiting.")
        return

    ch_names = concatenated_df.columns.tolist()
    sfreq = highest_sampling_frequency

    info = mne.create_info(ch_names=ch_names, sfreq=sfreq, ch_types='misc')  # Adjust ch_types as necessary

    # (seconds, microseconds) tuple for the measurement date. The microsecond
    # part is read directly from the timestamp object: deriving it from the
    # float `timestamp() % 1` can truncate to one microsecond too low.
    meas_date = (int(latest_start_time.timestamp()), latest_start_time.microsecond)

    # MNE expects data shaped (n_channels, n_samples), hence the transpose
    data = concatenated_df.values.T
    raw = mne.io.RawArray(data, info)
    raw.set_meas_date(meas_date)

    # Fill in the filename template and write the EDF file
    edf_filename = edf_filename_template.format(sensor='ALL')

    print(f"Saving EDF file as {edf_filename} with shape {data.shape}.")

    # NOTE(review): no range check or rescaling is applied before export —
    # confirm the values fit the physical range EDF expects.
    raw.export(edf_filename, fmt='edf')

    print(f"EDF file saved as {edf_filename}")


In [None]:
from pyologger.process_data.sampling import *
import pandas as pd
def concatenate_logger_data(datareader):
    """
    Concatenates data from all loggers stored in `datareader.data`.

    The loggers are cropped to their common time window (latest start to
    earliest end) and lower-rate loggers are upsampled towards the highest
    sampling frequency found.

    NOTE(review): the body of this function is cut off in this cell (see the
    note at the end); the documented return values are only produced on the
    early "no valid data" exit.

    Parameters:
    - datareader: The DataReader object containing logger data in `datareader.data`.

    Returns:
    - concatenated_df: A pandas DataFrame with the concatenated data from all loggers.
    - highest_sampling_frequency: The highest sampling frequency found among the loggers.
    - latest_start_time: The latest start time among the loggers.
    """
    logger_data_info = {}

    # Step 1: Extract start time, end time, and sampling frequency for each logger
    for logger_id, df in datareader.data.items():
        if not isinstance(df, pd.DataFrame):
            print(f"Logger {logger_id} does not contain a valid DataFrame. Skipping.")
            continue

        if 'datetime' not in df.columns:
            print(f"Logger {logger_id} does not have a 'datetime' column. Skipping.")
            continue

        start_time = df['datetime'].iloc[0]
        end_time = df['datetime'].iloc[-1]
        # Sampling frequency is estimated from the mean spacing between
        # consecutive timestamps and rounded to a whole number of Hz.
        # NOTE(review): a rate below 0.5 Hz rounds to 0, which would make the
        # division in Step 4 fail — confirm this cannot occur.
        sampling_frequency = round(1 / df['datetime'].diff().dt.total_seconds().mean())

        logger_data_info[logger_id] = {
            'start_time': start_time,
            'end_time': end_time,
            'sampling_frequency': sampling_frequency
        }

        print(f"Logger {logger_id}: start_time={start_time}, end_time={end_time}, sampling_frequency={sampling_frequency} Hz")

    if not logger_data_info:
        print("No valid logger data found. Exiting.")
        return None, None, None

    # Step 2: Determine the latest start time, earliest end time, and highest sampling frequency
    latest_start_time = max(info['start_time'] for info in logger_data_info.values())
    earliest_end_time = min(info['end_time'] for info in logger_data_info.values())
    highest_sampling_frequency = max(info['sampling_frequency'] for info in logger_data_info.values())

    print(f"Latest start time: {latest_start_time}")
    print(f"Earliest end time: {earliest_end_time}")
    print(f"Highest sampling frequency: {highest_sampling_frequency} Hz")

    # Step 3: Initialize an empty DataFrame for concatenation
    concatenated_df = pd.DataFrame()

    # Step 4: Crop dataframes, upsample as necessary, and concatenate
    for logger_id, df in datareader.data.items():
        # NOTE(review): only non-DataFrame entries are skipped here. A
        # DataFrame without a 'datetime' column was skipped in Step 1 and has
        # no entry in `logger_data_info`, so the lookups below would raise
        # KeyError for it — confirm and guard.
        if not isinstance(df, pd.DataFrame):
            continue

        # Crop dataframe to the window shared by all loggers (inclusive bounds)
        df_cropped = df[(df['datetime'] >= latest_start_time) & (df['datetime'] <= earliest_end_time)]
        print(f"Logger {logger_id}: Cropped data from {len(df)} rows to {len(df_cropped)} rows.")

        # Determine upsampling factor
        upsampling_factor = highest_sampling_frequency / logger_data_info[logger_id]['sampling_frequency']

        if upsampling_factor > 1:
            original_length = len(df_cropped)
            df_cropped = df_cropped.set_index('datetime')

            # Upsample each sensor column that is not "extra"
            for column in df_cropped.columns:
                sensor_info = None

                # Search for the sensor type in `datareader.sensor_info`
                for sensor_name, sensor_details in datareader.sensor_info.items():
                    if column in sensor_details['channels']:
                        sensor_info = sensor_details
                        break

                # Columns that belong to no known sensor are left untouched
                if not sensor_info:
                    continue

                sensor_type = sensor_info['metadata'][column]['sensor']
                if sensor_type != 'extra':
                    print(f"Upsampling column {column} from logger {logger_id} by factor {upsampling_factor}.")
                    # `int(...)` truncates a non-integer factor.
                    # NOTE(review): `upsample` presumably comes from the
                    # `pyologger.process_data.sampling` star import — confirm
                    # its output length matches the frame it is assigned to.
                    df_cropped[column] = upsample(df_cropped[column].values, int(upsampling_factor), original_length)

            df_cropped = df_cropped.reset_index()

        # Remove "extra" sensor columns and append to the concatenated DataFrame
        columns_to_keep = []
        for column in df_cropped.columns:
            sensor_info = None
            # NOTE(review): the function body ends here in this cell. The
            # column filtering, the concatenation into `concatenated_df` and
            # the documented return statement are missing, so as written the
            # function returns None whenever valid logger data exists.

#datareader.sensor_data['accelerometer']
#datareader.files_info

In [None]:
# Optionally peek at the first five entries of the event data (notes) that were read in

datareader.event_data[0:5]
#datareader.logger_data['CC-96']

## Inspect data

In [None]:
# Restore the saved data_reader object from the deployment's outputs folder.
# NOTE: unpickling can execute arbitrary code — only load pickle files that
# you produced yourself.
pkl_path = os.path.join(deployment_folder, 'outputs', 'data.pkl')

with open(pkl_path, 'rb') as file:
    data_pkl = pickle.load(file)

# Report the sampling frequency stored in each logger's datetime metadata
for logger_id, info in data_pkl.logger_info.items():
    sampling_frequency = info.get('datetime_metadata', {}).get('fs', None)
    if sampling_frequency is None:
        print(f"No sampling frequency available for {logger_id}")
        continue
    # The value is printed as stored (no rounding is applied)
    print(f"Sampling frequency for {logger_id}: {sampling_frequency} Hz")

### Plot data
Make an interactive plot

In [None]:
# Inspect the channel information stored for logger 'UF-01'
data_pkl.logger_info['UF-01']['channelinfo']

In [None]:
# Path to the color mappings JSON file
color_mapping_path = os.path.join(root_dir, 'color_mappings.json')

# Loggers providing the IMU and electrophysiology data
imu_logger_to_use = 'CC-96'
ephys_logger_to_use = 'UF-01'

# Time window during which both loggers have data
imu_df = data_pkl.logger_data[imu_logger_to_use]
ephys_df = data_pkl.logger_data[ephys_logger_to_use]
imu_times, ephys_times = imu_df['datetime'], ephys_df['datetime']
overlap_start_time = max(imu_times.min(), ephys_times.min()).to_pydatetime()
overlap_end_time = min(imu_times.max(), ephys_times.max()).to_pydatetime()

# Notes to overlay: note key -> sensor, marker symbol and marker color
notes_to_plot = dict(
    heartbeat_manual_ok=dict(sensor='ecg', symbol='triangle-down', color='blue'),
    exhalation_breath=dict(sensor='depth', symbol='triangle-up', color='blue'),
)

# Build the interactive figure over the shared time window
fig = plot_tag_data_interactive4(
    data_pkl=data_pkl,
    time_range=(overlap_start_time, overlap_end_time),
    note_annotations=notes_to_plot,
    color_mapping_path=color_mapping_path,
    target_sampling_rate=10,
)

fig.show()

In [None]:
# Plot the IMU channels together with the ECG channel over the overlap window.
# NOTE(review): the keywords used here (imu_channels, ephys_channels,
# imu_logger, ephys_logger) are not parameters of the
# `plot_tag_data_interactive` defined further down in this notebook; this call
# presumably targets the version imported from `pyologger.plot_data.plotter`
# and would fail once the notebook-local definition has been run — confirm.
fig = plot_tag_data_interactive(
    data_pkl,
    imu_channels=['depth', 'ax', 'ay', 'az', 'gx', 'gy', 'gz', 'mx', 'my', 'mz'],
    ephys_channels=['ecg'],
    imu_logger=imu_logger_to_use,
    ephys_logger=ephys_logger_to_use,
    time_range=(overlap_start_time, overlap_end_time),
    note_annotations=notes_to_plot,
    color_mapping_path=color_mapping_path,
)

fig.show()

In [None]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import os

def _downsample_for_plot(df, original_fs, target_fs):
    """Thin `df` towards `target_fs` by keeping every k-th row (no filtering)."""
    import math  # local import so this cell does not depend on other cells

    # A frame with fewer than two rows yields a NaN rate (there are no time
    # differences to average) and identical timestamps can yield inf; in both
    # cases `int(original_fs / target_fs)` would raise, so leave the data as is.
    if not math.isfinite(original_fs) or target_fs >= original_fs:
        return df
    conversion_factor = int(original_fs / target_fs)
    return df.iloc[::conversion_factor, :]


def plot_tag_data_interactive(data_pkl, sensors=None, channels=None, time_range=None, note_annotations=None, 
                              color_mapping_path=None, target_sampling_rate=10):
    """
    Function to plot tag data interactively using Plotly.

    One subplot row is created per sensor (shared x-axis). Each sensor's data
    is cropped to `time_range`, thinned towards `target_sampling_rate`, and its
    channels are drawn as lines. Notes are drawn as dotted vertical lines
    spanning the value range of the annotated channel.

    Parameters
    ----------
    data_pkl : object
        The object containing the sensor data and metadata. The attributes
        read here are `sensor_data`, `sensor_info`, `notes_df` and
        `selected_deployment`.
    sensors : list, optional
        List of sensors to plot. If None, plot all available sensors.
    channels : dict, optional
        Dictionary specifying the channels to plot for each sensor.
        E.g., {'ecg': ['ecg'], 'depth': ['depth']}
        If None, plot all channels for the specified sensors.
    time_range : tuple, optional
        Tuple specifying the start and end time for plotting (inclusive).
    note_annotations : dict, optional
        Dictionary mapping a note key to the channel (column name) it should
        be drawn on. E.g., {'heartbeat_manual_ok': 'ecg'}
    color_mapping_path : str, optional
        Path to the JSON file containing the color mappings.
    target_sampling_rate : int, optional
        The target sampling rate to downsample the data for plotting.

    Returns
    -------
    fig
        The Plotly figure produced by `make_subplots`, with all traces added.
    """

    # Load the color mapping
    color_mapping = load_color_mapping(color_mapping_path) if color_mapping_path else {}

    # Determine the sensors to plot
    if sensors is None:
        sensors = list(data_pkl.sensor_data.keys())

    # Set up the figure: one row per sensor
    fig = make_subplots(rows=len(sensors), cols=1, shared_xaxes=True, vertical_spacing=0.03)

    for row, sensor in enumerate(sensors, start=1):
        sensor_df = data_pkl.sensor_data[sensor]
        sensor_info = data_pkl.sensor_info[sensor]

        # Determine the channels to plot for the current sensor
        if channels is None or sensor not in channels:
            sensor_channels = sensor_info['channels']
        else:
            sensor_channels = channels[sensor]

        # Filter data to the time range
        if time_range:
            start_time, end_time = time_range
            sensor_df_filtered = sensor_df[(sensor_df['datetime'] >= start_time) & (sensor_df['datetime'] <= end_time)]
        else:
            sensor_df_filtered = sensor_df

        # Estimate the original sampling rate from the mean timestamp spacing
        original_fs = 1 / sensor_df_filtered['datetime'].diff().dt.total_seconds().mean()

        # Downsample the data
        sensor_df_filtered = _downsample_for_plot(sensor_df_filtered, original_fs, target_sampling_rate)

        # Plot each channel
        for channel in sensor_channels:
            if channel not in sensor_df_filtered.columns:
                continue

            channel_meta = sensor_info['metadata'][channel]
            original_name = channel_meta['original_name']
            y_label = f"{original_name} [{channel_meta['unit']}]"

            # Channels without a configured color get a random one, which is
            # remembered for the rest of this call.
            if original_name not in color_mapping:
                color_mapping[original_name] = generate_random_color()
            color = color_mapping[original_name]

            fig.add_trace(go.Scatter(
                x=sensor_df_filtered['datetime'],
                y=sensor_df_filtered[channel],
                mode='lines',
                name=y_label,
                line=dict(color=color)
            ), row=row, col=1)

        # Handle annotations for this sensor
        if note_annotations:
            for note_type, note_channel in note_annotations.items():
                if note_channel not in sensor_df_filtered.columns:
                    continue
                filtered_notes = data_pkl.notes_df[data_pkl.notes_df['key'] == note_type]
                if filtered_notes.empty:
                    continue

                # The line extent and style are the same for every note of this
                # type, so compute them once instead of once per note.
                y_low = sensor_df_filtered[note_channel].min()
                y_high = sensor_df_filtered[note_channel].max()
                note_color = color_mapping.get(note_type, 'rgba(128, 128, 128, 0.5)')

                # NOTE(review): notes are not restricted to `time_range`, so a
                # note outside the window is still drawn — confirm intended.
                for dt in filtered_notes['datetime']:
                    fig.add_trace(go.Scatter(
                        x=[dt, dt],
                        y=[y_low, y_high],
                        mode='lines',
                        line=dict(color=note_color, width=1, dash='dot'),
                        showlegend=False
                    ), row=row, col=1)

        fig.update_yaxes(title_text=sensor, row=row, col=1)

    fig.update_layout(
        height=200 * len(sensors),
        width=1200,
        hovermode="x unified",  # Enables the vertical hover line across subplots
        title_text=f"{data_pkl.selected_deployment['Deployment Name']}",
        showlegend=True,
        legend=dict(
            orientation="h",  # Horizontal legend
            xanchor='center',  # Anchor the legend horizontally at the center
            yanchor='top'   # Anchor the legend vertically at the top of the legend box
        )
    )

    # Only the bottom subplot carries the x-axis title
    fig.update_xaxes(title_text="Datetime", row=len(sensors), col=1)

    return fig


In [None]:
# Example call of `plot_tag_data_interactive` with explicit choices
sensors_to_plot = ['ecg', 'depth', 'accelerometer']  # Which sensors to plot
channels_to_plot = {'ecg': ['ecg'], 'depth': ['depth']}  # Optional per-sensor channel selection
annotations_to_plot = {'heartbeat_manual_ok': 'ecg', 'exhalation_breath': 'depth'}  # Optional note -> channel

fig = plot_tag_data_interactive(
    data_pkl=data_pkl,
    sensors=sensors_to_plot,
    channels=channels_to_plot,
    time_range=(overlap_start_time, overlap_end_time),  # Optional time window
    note_annotations=annotations_to_plot,
    color_mapping_path=color_mapping_path,  # Optional color mapping file
)

fig.show()

In [None]:
# Plotting again (this takes longer without subplots but allows you to track the time across all plots in a grid)
imu_channels_for_grid = ['depth', 'accX', 'accY', 'accZ', 'gyrX', 'gyrY', 'gyrZ', 'magX', 'magY', 'magZ']

fig = plot_tag_data_interactive2(
    data_pkl,
    imu_channels=imu_channels_for_grid,
    ephys_channels=['ecg'],
    imu_logger=imu_logger_to_use,
    ephys_logger=ephys_logger_to_use,
    time_range=(overlap_start_time, overlap_end_time),
    note_annotations=notes_to_plot,
    color_mapping_path=color_mapping_path,
)

fig.show()

In [None]:
import numpy as np
from scipy.signal import butter, filtfilt

# Low-pass filter used to extract the static (gravity) component of a signal
def low_pass_filter(data, cutoff, fs, order=4):
    """
    Apply a Butterworth low-pass filter forwards and backwards.

    Parameters
    ----------
    data : array-like
        Signal to filter.
    cutoff : float
        Cutoff frequency, in the same unit as `fs`.
    fs : float
        Sampling frequency of `data`.
    order : int, optional
        Order of the Butterworth filter design.

    Returns
    -------
    numpy.ndarray
        The filtered signal, as returned by `scipy.signal.filtfilt`.
    """
    # Express the cutoff as a fraction of the Nyquist frequency (fs / 2)
    wn = cutoff / (0.5 * fs)
    coefficients = butter(order, wn, btype='low', analog=False)
    return filtfilt(*coefficients, data)

# Overall Dynamic Body Acceleration (ODBA) from three acceleration axes
def calculate_odba(accX, accY, accZ, cutoff=0.1, fs=10):
    """
    Compute ODBA as the sum of the absolute dynamic accelerations.

    For each axis the static component is estimated with `low_pass_filter`
    and subtracted from the raw signal; the absolute values of the three
    remainders are then added together.

    Parameters
    ----------
    accX, accY, accZ : array-like
        Acceleration along each axis.
    cutoff : float, optional
        Cutoff frequency passed to `low_pass_filter`.
    fs : float, optional
        Sampling frequency passed to `low_pass_filter`.

    Returns
    -------
    numpy.ndarray
        ODBA value for every sample.
    """
    # Static component per axis, in X, Y, Z order
    static_parts = [low_pass_filter(axis, cutoff, fs) for axis in (accX, accY, accZ)]

    # Dynamic component = raw signal minus its static component
    dyn_x = accX - static_parts[0]
    dyn_y = accY - static_parts[1]
    dyn_z = accZ - static_parts[2]

    return np.abs(dyn_x) + np.abs(dyn_y) + np.abs(dyn_z)

# Loggers providing the IMU and electrophysiology data
imu_logger_to_use = 'CC-96'
ephys_logger_to_use = 'UF-01'

# Example usage with your data: compute ODBA from the adjusted accelerometer axes
accX = data_pkl.data[imu_logger_to_use]['accX_adjusted'].values
accY = data_pkl.data[imu_logger_to_use]['accY_adjusted'].values
accZ = data_pkl.data[imu_logger_to_use]['accZ_adjusted'].values

# NOTE(review): `calculate_odba` runs with its defaults (cutoff=0.1, fs=10) —
# confirm fs=10 matches the actual sampling rate of this logger's data.
odba = calculate_odba(accX, accY, accZ)

# Store ODBA as a new column next to the logger's other channels
data_pkl.data[imu_logger_to_use]['odba'] = odba

imu_channels_to_plot = ['depth', 'accX', 'accY', 'accZ', 'odba', 'pitch_deg', 'roll_deg', 'heading_deg']
ephys_channels_to_plot = []

# Get the overlapping time range
imu_df = data_pkl.data[imu_logger_to_use]
ephys_df = data_pkl.data[ephys_logger_to_use]
start_time = max(imu_df['datetime'].min(), ephys_df['datetime'].min()).to_pydatetime()
end_time = min(imu_df['datetime'].max(), ephys_df['datetime'].max()).to_pydatetime()

# Define notes to plot (note key -> channel)
notes_to_plot = {'exhalation_breath': 'depth'}

# NOTE(review): the keywords below (imu_sampling_rate, ephys_channels,
# imu_logger, ephys_logger) are not parameters of the
# `plot_tag_data_interactive` defined earlier in this notebook, so this call
# presumably targets the version from `pyologger.plot_data.plotter` — confirm.
plot_tag_data_interactive(
    data_pkl,
    imu_channels_to_plot,
    imu_sampling_rate=1,
    ephys_channels=ephys_channels_to_plot,
    imu_logger=imu_logger_to_use,
    ephys_logger=ephys_logger_to_use,
    note_annotations=notes_to_plot,
    time_range=(start_time, end_time),
    color_mapping_path=color_mapping_path,
)