# Using `pyologger` data processing pipeline with `DiveDB`
This notebook uses the `Metadata` and `DataReader` classes to facilitate data intake, processing, and alignment.

## Read deployment metadata

In [None]:
import os
import pickle
import pandas as pd

# Import necessary pyologger utilities
from pyologger.utils.config_manager import ConfigManager
from pyologger.load_data.datareader import DataReader
from pyologger.load_data.metadata import Metadata
from pyologger.plot_data.plotter import *
from pyologger.process_data.sampling import *
from pyologger.calibrate_data.tag2animal import *
from pyologger.calibrate_data.zoc import *

# Change the current working directory to the pyologger repository root so
# that the paths derived below (data/, color_mappings.json) resolve.
# Set the PYOLOGGER_ROOT environment variable to point at your own checkout;
# when it is unset, the original hard-coded location is used.
os.chdir(os.environ.get("PYOLOGGER_ROOT", "/Users/jessiekb/Documents/GitHub/pyologger"))

root_dir = os.getcwd()
data_dir = os.path.join(root_dir, "data")
color_mapping_path = os.path.join(root_dir, "color_mappings.json")

# Verify the current working directory
print(f"Current working directory: {root_dir}")

### Fetch metadata

Load metadata stored in Notion databases. Alternatively, supply your own metadata as separate dataframes for deployments, loggers, recordings, animals, and datasets; see the `metadata_snapshot.pkl` file for an example.

In [None]:
# Initialize the metadata class from Notion, then fetch the databases and
# their relations (quietly)
metadata = Metadata()
metadata.fetch_databases(verbose=False)
metadata.find_relations(verbose=False)

# Pull each metadata table out by name, in a fixed order
deployment_db, logger_db, recording_db, animal_db, dataset_db = (
    metadata.get_metadata(table_name)
    for table_name in (
        "deployment_DB",
        "logger_DB",
        "recording_DB",
        "animal_DB",
        "dataset_DB",
    )
)

### Optional: Save metadata snapshot as pickle

In [None]:
## OPTIONAL: Save metadata snapshot as a pickle file

# Bundle the five metadata tables under their snapshot keys
metadata_databases = dict(
    deployment_db=deployment_db,
    logger_db=logger_db,
    recording_db=recording_db,
    animal_db=animal_db,
    dataset_db=dataset_db,
)

# The snapshot is written into data_dir
metadata_pickle_path = os.path.join(data_dir, 'metadata_snapshot.pkl')

# Serialize only the dictionary of tables, not the Metadata object itself
with open(metadata_pickle_path, 'wb') as snapshot_file:
    pickle.dump(metadata_databases, snapshot_file)

print(f"Metadata DataFrames saved as pickle file at: {metadata_pickle_path}")

In [None]:
# Display the deployment table stored in the snapshot dictionary
# (kept as the last expression of the cell so the notebook renders it).
metadata_databases['deployment_db']

In [None]:
# Order the deployment table by its 'Recording Date' column and show it.
deployment_db_sorted = deployment_db.sort_values(by='Recording Date')
deployment_db_sorted

## Read Files in Deployment Folder

### Steps for Processing Deployment Data: 
The steps below mirror the `DataReader` class (defined in [`datareader.py`](../pyologger/load_data/datareader.py)), specifically its `read_files()` method.

1. **Select Deployment Folder**:
   - **Description:** Prompts the user to select a deployment folder to initiate the data reading process. The folder name can include any suffix after the Deployment ID. The function checks for potential conflicts and halts the process if multiple matching folders are found.
   - **Function Used:** `check_deployment_folder()`

2. **Initialize Deployment Folder**:
   - **Description:** Begins the main `read_files` process with the selected deployment folder.
   - **Function Used:** `read_files()`

3. **Fetch Metadata**:
   - **Description:** Retrieves essential data from the metadata database, including logger and animal information.
   - **Functions Used:** `metadata.fetch_databases()`, `get_animal_info()`, `get_dataset_info()`

4. **Organize Files by Logger ID**:
   - **Description:** Groups files by logger ID for processing.
   - **Function Used:** `organize_files_by_logger_id()` (within `read_files()`)

5. **Check for Existing Processed Files**:
   - **Description:** Verifies if the output folder already contains processed files for each logger. Skips reprocessing if all necessary files are present.
   - **Function Used:** `check_outputs_folder()`

6. **Process Manufacturer-Specific Files**:
   - **Description:** Depending on the logger's manufacturer, different processing methods are applied. The `BaseManufacturer` class and its subclasses (`CATSManufacturer`, `UFIManufacturer`) handle the specifics, such as processing `.txt` files for CATS loggers or `.ube` files for UFI loggers.
   - **Functions Used:** `BaseManufacturer.process_files()`, `CATSManufacturer.process_files()`, `UFIManufacturer.process_files()`

7. **Save Processed Data**:
   - **Description:** Saves processed data files in the outputs folder with appropriate filenames, ensuring consistency across different formats such as CSV, Parquet, EDF, and NetCDF.
   - **Functions Used:** `save_data()`, `export_to_edf()`, `save_to_netcdf()`

8. **Finalize and Save DataReader Object**:
   - **Description:** Saves the state of the `DataReader` object, including all processed data and metadata, as a pickle file for easy retrieval and future processing.
   - **Function Used:** `save_datareader_object()`

### Classes and Their Roles:

- **`DataReader` Class:**
  - **Role:** Handles the overall reading, processing, and saving of deployment data files. It manages the deployment folder path, organizes data by logger and sensor, and interfaces with manufacturer-specific processing through the `BaseManufacturer` class.
  - **Methods:** `read_files()`, `save_datareader_object()`, `organize_files_by_logger_id()`, `save_data()`, `export_to_edf()`, `save_to_netcdf()`, and more.

- **`BaseManufacturer` Class:**
  - **Role:** Acts as a template for manufacturer-specific processing classes. It defines common methods for loading custom mappings, renaming columns, and mapping data to sensors. Subclasses like `CATSManufacturer` and `UFIManufacturer` extend this base class to implement specific processing logic.
  - **Methods:** `process_files()`, `load_custom_mapping()`, `rename_columns()`, `map_data_to_sensors()`, `parse_txt_for_intervals()`, and more.

- **`Metadata` Class (from `pyologger.load_data.metadata`):**
  - **Role:** Provides methods for fetching and managing metadata related to deployments, loggers, animals, and datasets. It plays a crucial role in ensuring the correct organization and processing of data.
  - **Methods:** `fetch_databases()`, `get_metadata()`, and others as needed for metadata handling.


In [None]:
# Define the path to your custom mapping file
channel_mapping_path = os.path.join(root_dir, 'channel_mapping.json')
datareader = DataReader(deployment_folder_path=data_dir)

# Select the deployment folder and set up its configuration manager
deployment_folder, deployment_id = datareader.check_deployment_folder(deployment_db, data_dir)
config_manager = ConfigManager(deployment_folder=deployment_folder, deployment_id=deployment_id)

# Record that the import has been requested but not yet completed
current_processing_step = "Processing Step 00 In progress: data import pending."
config_manager.add_to_config("current_processing_step", current_processing_step)

# Filename template handed to read_files(); note save_edf is False below
edf_filename_template = os.path.join(datareader.files_info['deployment_folder_path'], 'outputs', 'edf_test_{sensor}.edf')

if deployment_folder:
    datareader.read_files(metadata, save_csv=False, save_parq=False, save_edf=False, 
                          custom_mapping_path=channel_mapping_path, save_netcdf=True,
                          edf_filename_template=edf_filename_template, edf_save_from='sensor_data')

    # Only mark the import as complete once read_files() has actually run;
    # previously this status was written even when no folder was selected.
    current_processing_step = "Processing Step 00 DATA IMPORTED."
    config_manager.add_to_config("current_processing_step", current_processing_step)
else:
    # Leave the "import pending" status in place and say why nothing was read
    print("No deployment folder selected; data import skipped.")

In [None]:
import xarray as xr

# Locate the NetCDF file inside the deployment's outputs folder
outputs_dir = os.path.join(deployment_folder, 'outputs')
netcdf_path = os.path.join(outputs_dir, 'deployment_data.nc')

# Open the dataset and render its contents in the notebook
data = xr.open_dataset(netcdf_path)
display(data)

## Inspect data

In [None]:
# Load the data_reader object from the pickle file
# NOTE: pickle.load can execute arbitrary code; only open pickle files that
# you created yourself or otherwise trust.
pkl_path = os.path.join(deployment_folder, 'outputs', 'data.pkl')

with open(pkl_path, 'rb') as file:
    data_pkl = pickle.load(file)

# Report the sampling frequency recorded for each logger, when one exists
for logger_id, info in data_pkl.logger_info.items():
    sampling_frequency = info.get('datetime_metadata', {}).get('fs')
    if sampling_frequency is not None:
        # Format the sampling frequency to 5 significant digits
        # (assumes 'fs' is numeric; the original printed the raw value)
        print(f"Sampling frequency for {logger_id}: {sampling_frequency:.5g} Hz")
    else:
        print(f"No sampling frequency available for {logger_id}")