# Using `pyologger` data processing pipeline with `DiveDB`
This notebook uses the `Metadata` and `DataReader` classes to handle data intake, processing, and alignment.

## Read deployment metadata

In [None]:
import json
import os
import pickle
import re

# Import pyologger utilities
from pyologger.utils.folder_manager import *
from pyologger.plot_data.plotter import *
from pyologger.utils.json_manager import ConfigManager
from pyologger.load_data.datareader import DataReader
from pyologger.load_data.metadata import Metadata

# Load important file paths and configurations.
# The returned values are reused by later cells: `config` supplies the local
# repo path, `data_dir` is the root for folder selection and the metadata
# snapshot, `color_mapping_path` is passed to the plotting calls, and
# `montage_path` is passed to the DataReader.
config, data_dir, color_mapping_path, montage_path = load_configuration()

### Fetch metadata

Load metadata stored in Notion databases. Alternatively, supply your own metadata as separate dataframes for deployments, loggers, recordings, animals, and datasets; the `metadata_snapshot.pkl` file contains examples of the expected tables.

In [None]:
# Create the metadata handler that the cells below query for each table.
# NOTE(review): per the text above this presumably reads from Notion, so it
# may need network access and credentials — confirm.
metadata = Metadata()

In [None]:
# Fetch every metadata table in one pass and bind each to its own variable.
# The order of the names on the left must match the order of the keys below.
(
    deployment_db,
    logger_db,
    recording_db,
    animal_db,
    dataset_db,
    procedure_db,
    observation_db,
    collaborator_db,
    location_db,
    montage_db,
    sensor_db,
    attachment_db,
    originalchannel_db,
    standardizedchannel_db,
    derivedsignal_db,
    derivedchannel_db,
) = (
    metadata.get_metadata(table_key)
    for table_key in (
        "deployment_DB",
        "logger_DB",
        "recording_DB",
        "animal_DB",
        "dataset_DB",
        "procedure_DB",
        "observation_DB",
        "collaborator_DB",
        "location_DB",
        "montage_DB",
        "sensor_DB",
        "attachment_DB",
        "originalchannel_DB",
        "standardizedchannel_DB",
        "derivedsignal_DB",
        "derivedchannel_DB",
    )
)

# Keep a handle on the relations map exposed by the metadata object
relations_map = metadata.relations_map

# The map is written next to the local repository, as relations_map.json
local_repo_path = config['paths']['local_repo_path']
relations_map_path = os.path.join(local_repo_path, 'relations_map.json')

# Serialize the map as indented JSON
with open(relations_map_path, 'w') as relations_file:
    json.dump(relations_map, relations_file, indent=4)

print(f"Relations map saved at: {relations_map_path}")

In [None]:
# Display the deployment table for a quick visual check
deployment_db

In [None]:
# Display the standardized-channel table for a quick visual check
standardizedchannel_db

### Optional: Save metadata snapshot as pickle

In [None]:
## OPTIONAL: Save metadata snapshot as a pickle file

# Bundle every metadata table under its variable name so the whole set can be
# restored later from a single dictionary.
metadata_databases = dict(
    deployment_db=deployment_db,
    logger_db=logger_db,
    recording_db=recording_db,
    animal_db=animal_db,
    dataset_db=dataset_db,
    procedure_db=procedure_db,
    observation_db=observation_db,
    collaborator_db=collaborator_db,
    location_db=location_db,
    montage_db=montage_db,
    sensor_db=sensor_db,
    attachment_db=attachment_db,
    originalchannel_db=originalchannel_db,
    standardizedchannel_db=standardizedchannel_db,
    derivedsignal_db=derivedsignal_db,
    derivedchannel_db=derivedchannel_db,
)

# The snapshot lives in the 00_Metadata folder under the data directory
metadata_pickle_path = os.path.join(data_dir, "00_Metadata/metadata_snapshot.pkl")
with open(metadata_pickle_path, "wb") as snapshot_file:
    pickle.dump(metadata_databases, snapshot_file)

print(f"Metadata snapshot saved at: {metadata_pickle_path}")

In [None]:
# Select dataset folder (chosen from the entries under `data_dir`).
# The result is later passed to DataReader as `dataset_folder`.
dataset_folder = select_folder(data_dir, "Select a dataset folder:")

In [None]:
# Select the deployment folder inside the chosen dataset folder.
# Its base name is parsed for the deployment ID in the next cell.
deployment_folder = select_folder(dataset_folder, "Select a deployment folder:")

In [None]:
# Parse the deployment ID (YYYY-MM-DD_xxxx-NNN, case-insensitive) from the
# start of the folder name; any text after it in the name is ignored.
folder_name = os.path.basename(deployment_folder)
match = re.match(r"(\d{4}-\d{2}-\d{2}_[a-z]{4}-\d{3})", folder_name, re.IGNORECASE)
if match is None:
    raise ValueError(f"❌ Unable to extract deployment ID from folder: {deployment_folder}")

deployment_id = match.group(1)
# The animal ID is everything after the single underscore in the deployment ID
animal_id = deployment_id.partition("_")[2]
print(f"✅ Extracted deployment ID: {deployment_id}, Animal ID: {animal_id}")

## Read Files in Deployment Folder

### Steps for Processing Deployment Data: 
The steps below mirror the `DataReader` class (see [`datareader`](../pyologger/load_data/datareader.py)), specifically its `read_files()` method.

1. **Select Deployment Folder**:
   - **Description:** Prompts the user to select a deployment folder to initiate the data reading process. The folder name can include any suffix after the Deployment ID. The function checks for potential conflicts and halts the process if multiple matching folders are found.
   - **Function Used:** `check_deployment_folder()`

2. **Initialize Deployment Folder**:
   - **Description:** Begins the main `read_files` process with the selected deployment folder.
   - **Function Used:** `read_files()`

3. **Fetch Metadata**:
   - **Description:** Retrieves essential data from the metadata database, including logger and animal information.
   - **Functions Used:** `metadata.fetch_databases()`, `get_animal_info()`, `get_dataset_info()`

4. **Organize Files by Logger ID**:
   - **Description:** Groups files by logger ID for processing.
   - **Function Used:** `organize_files_by_logger_id()` (within `read_files()`)

5. **Check for Existing Processed Files**:
   - **Description:** Verifies if the output folder already contains processed files for each logger. Skips reprocessing if all necessary files are present.
   - **Function Used:** `check_outputs_folder()`

6. **Process Manufacturer-Specific Files**:
   - **Description:** Depending on the logger's manufacturer, different processing methods are applied. The `BaseManufacturer` class and its subclasses (`CATSManufacturer`, `UFIManufacturer`) handle the specifics, such as processing `.txt` files for CATS loggers or `.ube` files for UFI loggers.
   - **Functions Used:** `BaseManufacturer.process_files()`, `CATSManufacturer.process_files()`, `UFIManufacturer.process_files()`

7. **Save Processed Data**:
   - **Description:** Saves processed data files in the outputs folder with appropriate filenames, ensuring consistency across different formats such as CSV, Parquet, and EDF.
   - **Functions Used:** `save_data()`, `export_to_edf()`, `save_to_netcdf()`

8. **Finalize and Save DataReader Object**:
   - **Description:** Saves the state of the `DataReader` object, including all processed data and metadata, as a pickle file for easy retrieval and future processing.
   - **Function Used:** `save_datareader_object()`

### Classes and Their Roles:

- **`DataReader` Class:**
  - **Role:** Handles the overall reading, processing, and saving of deployment data files. It manages the deployment folder path, organizes data by logger and sensor, and interfaces with manufacturer-specific processing through the `BaseManufacturer` class.
  - **Methods:** `read_files()`, `save_datareader_object()`, `organize_files_by_logger_id()`, `save_data()`, `export_to_edf()`, `save_to_netcdf()`, and more.

- **`BaseManufacturer` Class:**
  - **Role:** Acts as a template for manufacturer-specific processing classes. It defines common methods for loading custom mappings, renaming columns, and mapping data to sensors. Subclasses like `CATSManufacturer` and `UFIManufacturer` extend this base class to implement specific processing logic.
  - **Methods:** `process_files()`, `load_custom_mapping()`, `rename_columns()`, `map_data_to_sensors()`, `parse_txt_for_intervals()`, and more.

- **`Metadata` Class (from `pyologger.load_data.metadata`):**
  - **Role:** Provides methods for fetching and managing metadata related to deployments, loggers, animals, and datasets. It plays a crucial role in ensuring the correct organization and processing of data.
  - **Methods:** `fetch_databases()`, `get_metadata()`, and others as needed for metadata handling.


In [None]:
# Print extracted values for debugging
print(f"🐳 Deployment ID: {deployment_id}, Animal ID: {animal_id}")

# Look up the deployment record and the loggers used for this deployment ID.
# Both results are passed to `data_pkl.read_files()` further down.
deployment_info, loggers_used = metadata.extract_essential_metadata(deployment_id)

In [None]:
import pytz
# Set to True to replace the metadata fetched above with the hand-written
# values in this cell (useful if you store your metadata differently).
overwrite = False

if overwrite: # Everything below overrides variables set by earlier cells
    # Deployment ID and Animal ID - this is important because it sets the start date and animal ID
    # Your dataset ID is the folder name that this deployment folder is in
    deployment_id = "2019-11-08_apfo-001"
    animal_id = "apfo-001"
    print(f"🔍 Manually setting essential metadata for Deployment ID: {deployment_id}")

    # Manually setting deployment metadata
    # NOTE(review): "Time Zone" is presumably expected to be one of the names
    # printed by list_available_timezones() below — confirm.
    deployment_info = {
        "Deployment Date": "2019-11-08",
        "Deployment Latitude": -77.858933,
        "Deployment Longitude": 166.5139,
        "Time Zone": "Antarctica/McMurdo"
    }
    print(f"📍 Deployment Metadata: {deployment_info}")

    # Manually setting loggers used with Montage ID inside each entry
    loggers_used = [
        {"Logger ID": "CC-35", "Manufacturer": "CATS", "Montage ID": "cats-penguin-video-montage_V1"}
    ]
    print(f"📟 Loggers Used: {loggers_used}")

    # No separate montage_id list anymore, since it's stored in loggers_used

def list_available_timezones():
    """Print a header followed by every time zone name in pytz, one per line."""
    print("\n🌍 Available Time Zones in pytz:\n")
    for zone_name in pytz.all_timezones:
        print(zone_name)


# Show the valid time zone names so one can be chosen for the deployment
list_available_timezones()

In [None]:
# Step 4: Initialize DataReader with dataset folder, deployment ID, and optional data subfolder
# ("01_raw-data" is the subfolder name used for raw files in this notebook).
data_pkl = DataReader(dataset_folder=dataset_folder, deployment_id=deployment_id, data_subfolder="01_raw-data", montage_path=montage_path)
# Step 5: Initialize config manager
config_manager = ConfigManager(deployment_folder=deployment_folder, deployment_id=deployment_id)
# Record that the import has not run yet; a later cell updates this entry once data is imported.
config_manager.add_to_config("current_processing_step", "Processing Step 00: Data import pending.")

In [None]:
# Export the current configuration held by the config manager.
# NOTE(review): destination and format are defined by ConfigManager — confirm there.
config_manager.export_config()

In [None]:
# Pass the deployment metadata and logger list to DataReader and run the import.
# As configured here: Parquet output is off, existing outputs are not
# overwritten, and a NetCDF file is written (it is opened in a later cell).
data_pkl.read_files(
    deployment_info= deployment_info,
    loggers_used= loggers_used,
    save_parq= False,
    overwrite= False,
    save_netcdf= True
)

In [None]:
# Display the stored info for the 'light' sensor.
# NOTE(review): presumably raises KeyError for deployments without a light sensor — confirm.
data_pkl.sensor_info['light']

In [None]:
import xarray as xr

# Step 8: Record in the config that the import step has completed
config_manager.add_to_config("current_processing_step", "Processing Step 00: Data imported.")

# Step 9: Open the processed NetCDF file from the deployment's outputs folder
netcdf_name = f'{deployment_id}_00_processed.nc'
netcdf_path = os.path.join(deployment_folder, "outputs", netcdf_name)
if not os.path.exists(netcdf_path):
    # `data` is left unassigned in this case
    print(f"⚠ NetCDF file not found at {netcdf_path}.")
else:
    data = xr.open_dataset(netcdf_path)
    print(f"📊 NetCDF file loaded: {netcdf_path}")

In [None]:
import pyedflib

# Define the path to the EDF file
# NOTE(review): the file name is hard-coded to one specific recording; change
# it when inspecting a different deployment.
edf_file_path = os.path.join(deployment_folder, '01_raw-data', '2019-10-25_mian-001a_NL-02_001.edf')

# Read the EDF file; try/finally guarantees the reader is closed even if one
# of the header queries fails.
edf_reader = pyedflib.EdfReader(edf_file_path)
try:
    # Signal labels in file order; every channel count below is derived from this list
    current_order = edf_reader.getSignalLabels()

    # Print some basic information about the EDF file
    print(f"Number of signals: {edf_reader.signals_in_file}")
    print(f"Signal labels: {current_order}")
    print(f"Signal frequencies: {edf_reader.getSampleFrequencies()}")
finally:
    edf_reader.close()

# Count the number of EXG channels (start with 'Ch')
exg_channels = [label for label in current_order if label.startswith('Ch')]
num_exg_channels = len(exg_channels)

# Count the number of Acc channels
acc_channels = [label for label in current_order if label.startswith('Acc')]
num_acc_channels = len(acc_channels)

# Count the number of Mag channels
mag_channels = [label for label in current_order if label.startswith('Mag')]
num_mag_channels = len(mag_channels)

# Count the number of Gyr channels
gyr_channels = [label for label in current_order if label.startswith('Gyr')]
num_gyr_channels = len(gyr_channels)

# Everything that matches none of the known prefixes
known_prefixes = ('Ch', 'Acc', 'Mag', 'Gyr')
other_signals = [label for label in current_order if not label.startswith(known_prefixes)]

# Print the counts
print(f"Number of EXG channels: {num_exg_channels}")
print(f"Number of Acc channels: {num_acc_channels}")
print(f"Number of Mag channels: {num_mag_channels}")
print(f"Number of Gyr channels: {num_gyr_channels}")
print(f"Other signals: {other_signals}")

# NOTE(review): machine-specific absolute path that nothing in this cell reads; kept for reference only.
check_path = '/Volumes/WORK-SSD/Datasets/Unpublished/mian-juv-nese_sleep_lml-ano_JKB/2019-10-25_mian-001a/01_raw-data/2019-10-25_mian-001a_NL-02_001.edf'

In [None]:
import pandas as pd
from datetime import timedelta

# Time zone recorded for the deployment ("UTC" when the metadata has none)
timezone = data_pkl.deployment_info.get("Time Zone", "UTC")

# Settings that must all be present before later cells can rely on them
required_time_keys = [
    "overlap_start_time",
    "overlap_end_time",
    "zoom_window_start_time",
    "zoom_window_end_time",
]

# Load time settings; `or {}` keeps the checks below working when nothing is returned
time_settings = config_manager.get_from_config(required_time_keys, section="settings") or {}

if time_settings:
    print("Time settings present.")

# A key counts as missing when it is absent or stored as None, so an empty
# result triggers recomputation instead of being mistaken for a complete set.
missing_time_keys = [key for key in required_time_keys if time_settings.get(key) is None]

if not missing_time_keys:
    print("Time settings not empty.")
else:
    print("Adding timestamps to config.")
    zoom_time_window = 5  # minutes

    if not data_pkl.sensor_data:
        raise ValueError("No sensor data loaded; cannot compute overlap and zoom windows.")

    # Extract start and end times for all sensors
    start_times = [df['datetime'].min() for df in data_pkl.sensor_data.values()]
    end_times = [df['datetime'].max() for df in data_pkl.sensor_data.values()]

    # The overlap is the span covered by every sensor: latest start to earliest end
    overlap_start_time = max(start_times)
    overlap_end_time = min(end_times)

    # Centre the zoom window on the middle of the overlap
    midpoint = overlap_start_time + (overlap_end_time - overlap_start_time) / 2
    half_zoom_window = timedelta(minutes=zoom_time_window / 2)
    zoom_window_start = midpoint - half_zoom_window
    zoom_window_end = midpoint + half_zoom_window

    # Store the values as strings and persist them in the "settings" section
    time_settings = {
        "overlap_start_time": str(overlap_start_time),
        "overlap_end_time": str(overlap_end_time),
        "zoom_window_start_time": str(zoom_window_start),
        "zoom_window_end_time": str(zoom_window_end),
    }
    config_manager.add_to_config(entries=time_settings, section="settings")

# Display the settings now in effect
time_settings

In [None]:
# Plot the full time span covered by the pressure sensor
pressure_times = data_pkl.sensor_data["pressure"]["datetime"]
start, end = pressure_times.min(), pressure_times.max()

# Dive annotations drawn on the depth signal: point markers and shaded states
dive_notes = {"dive": {"signal": "depth", "symbol": "triangle-down", "color": "blue"}}
dive_states = {"dive": {"signal": "depth", "color": "rgba(150, 150, 150, 0.3)"}}

fig = plot_tag_data_interactive(
    data_pkl=data_pkl,
    time_range=(start, end),
    note_annotations=dive_notes,
    state_annotations=dive_states,
    color_mapping_path=color_mapping_path,
    target_sampling_rate=1,
)
fig.show()

In [None]:
# Check if selected start and end times exist in the config file.
# The next cell only crops the data when neither value is None.
truncate_times = config_manager.get_from_config(
    ["selected_start_time", "selected_end_time"],
    section="settings"
)

# Display what was found
truncate_times

In [None]:
# Cropping is applied only when both selected times are present and not None.
# `.get` on a guarded mapping avoids a KeyError/AttributeError when the config
# returned an empty result or nothing at all.
selected_start = (truncate_times or {}).get('selected_start_time')
selected_end = (truncate_times or {}).get('selected_end_time')

if selected_start is not None and selected_end is not None:
    print("Truncating with provided cropping times.")
    # Update overlap window with selected range
    # (tz_convert requires the stored times to carry a UTC offset)
    OVERLAP_START_TIME = pd.Timestamp(selected_start).tz_convert(timezone)
    OVERLAP_END_TIME = pd.Timestamp(selected_end).tz_convert(timezone)

    # Truncate sensor data in place: each sensor's frame is replaced by the
    # rows whose first column falls inside the selected range (inclusive).
    for sensor, df in data_pkl.sensor_data.items():
        timestamps = df.iloc[:, 0]
        in_window = (timestamps >= OVERLAP_START_TIME) & (timestamps <= OVERLAP_END_TIME)
        data_pkl.sensor_data[sensor] = df[in_window].copy()

    # Recalculate Zoom Window (5-minute window in the middle)
    midpoint = OVERLAP_START_TIME + (OVERLAP_END_TIME - OVERLAP_START_TIME) / 2
    ZOOM_WINDOW_START_TIME = midpoint - timedelta(minutes=2.5)
    ZOOM_WINDOW_END_TIME = midpoint + timedelta(minutes=2.5)

    # Save new time settings
    time_settings_update = {
        "overlap_start_time": str(OVERLAP_START_TIME),
        "overlap_end_time": str(OVERLAP_END_TIME),
        "zoom_window_start_time": str(ZOOM_WINDOW_START_TIME),
        "zoom_window_end_time": str(ZOOM_WINDOW_END_TIME)
    }
    config_manager.add_to_config(entries=time_settings_update, section="settings")

    # Persist the truncated DataReader so later cells can reload it
    pkl_path = os.path.join(deployment_folder, 'outputs', 'data.pkl')
    with open(pkl_path, "wb") as file:
        pickle.dump(data_pkl, file)
else:
    # Without cropping times, fall back to the windows already held in
    # `time_settings` so the plotting cell below still has its inputs.
    print("No cropping times selected; keeping the stored overlap and zoom windows.")
    OVERLAP_START_TIME = pd.Timestamp(time_settings["overlap_start_time"])
    OVERLAP_END_TIME = pd.Timestamp(time_settings["overlap_end_time"])
    ZOOM_WINDOW_START_TIME = pd.Timestamp(time_settings["zoom_window_start_time"])
    ZOOM_WINDOW_END_TIME = pd.Timestamp(time_settings["zoom_window_end_time"])

In [None]:
# Plot the overlap window with the zoom window marked.
# Uses the OVERLAP_* and ZOOM_WINDOW_* values assigned by the truncation cell
# above, so run that cell first.
fig = plot_tag_data_interactive(
    data_pkl=data_pkl,
    time_range=(OVERLAP_START_TIME, OVERLAP_END_TIME),
    zoom_start_time= ZOOM_WINDOW_START_TIME,
    zoom_end_time= ZOOM_WINDOW_END_TIME,
    note_annotations={"dive": {"signal": "depth", "symbol": "triangle-down", "color": "blue"}},
    state_annotations={"dive": {"signal": "depth", "color": "rgba(150, 150, 150, 0.3)"}},
    color_mapping_path=color_mapping_path,
    target_sampling_rate=1
)
fig.show()

## Inspect data

In [None]:
# Load the data_reader object from the pickle file
pkl_path = os.path.join(deployment_folder, 'outputs', 'data.pkl')

# NOTE: unpickling can execute arbitrary code; only load pickles written by this pipeline.
with open(pkl_path, 'rb') as file:
    data_pkl = pickle.load(file)

# Report the sampling frequency stored for each logger
for logger_id, info in data_pkl.logger_info.items():
    # `or {}` also covers a 'datetime_metadata' entry that is present but None
    sampling_frequency = (info.get('datetime_metadata') or {}).get('fs', None)
    if sampling_frequency is None:
        print(f"No sampling frequency available for {logger_id}")
        continue

    # Format the sampling frequency to 5 significant digits; values that do
    # not accept a numeric format (e.g. strings) are printed unchanged.
    try:
        fs_text = f"{sampling_frequency:.5g}"
    except (TypeError, ValueError):
        fs_text = str(sampling_frequency)
    print(f"Sampling frequency for {logger_id}: {fs_text} Hz")