In [None]:
import os
import pickle
import pandas as pd
from collections import defaultdict
from pathlib import Path

from pyologger.utils.folder_manager import load_configuration
from pyologger.utils.folder_manager import select_and_load_deployment

def list_valid_deployments(dataset_folder):
    """Return the deployment folders of a dataset that hold a processed pickle.

    A direct sub-folder of ``dataset_folder`` counts as a valid deployment when
    it contains ``outputs/data.pkl`` as a regular file.

    Parameters
    ----------
    dataset_folder : str or os.PathLike
        Folder whose immediate sub-folders are the candidate deployments.

    Returns
    -------
    list of pathlib.Path
        Valid deployment folders, sorted by path. ``Path.iterdir`` yields
        entries in arbitrary, filesystem-dependent order; sorting keeps the
        processing order (and the row order of anything concatenated from
        these folders) reproducible across machines and runs.

    Raises
    ------
    FileNotFoundError
        If ``dataset_folder`` does not exist (raised by ``Path.iterdir``).
    """
    dataset_path = Path(dataset_folder)
    return sorted(
        p for p in dataset_path.iterdir()
        # is_file() rather than exists(): a directory named data.pkl is not a pickle.
        if p.is_dir() and (p / 'outputs' / 'data.pkl').is_file()
    )

# --- Config & Paths ---
# load_configuration() supplies the config object and the data root used below;
# color_mapping_path and montage_path are unpacked but not used in this cell.
config, data_dir, color_mapping_path, montage_path = load_configuration()
dataset_id = "oror-adult-orca_hr-sr-vid_sw_JKB-PP"
# The dataset folder is the sub-folder of data_dir named after the dataset id.
dataset_folder = os.path.join(data_dir, dataset_id)

# --- Discover valid deployments ---
# Only sub-folders that contain outputs/data.pkl are kept (see list_valid_deployments).
deployment_paths = list_valid_deployments(dataset_folder)
print(f"Found {len(deployment_paths)} deployments in {dataset_id}")

In [None]:
# Display the configuration object returned by load_configuration() for inspection.
config

In [None]:
# --- Prepare storage ---
# Per-deployment frames are collected in lists and concatenated once at the end.
all_event_data = []
all_sensor_data = defaultdict(list)    # sensor name -> list of tagged frames
all_derived_data = defaultdict(list)   # derived signal name -> list of tagged frames
metadata = {
    'deployment_ids': [],
    'sensor_info': {},
    'derived_info': {},
    'logger_info': {},
    'animal_info': {},
    'deployment_info': {},
}

# --- Process deployments ---
for deployment_path in deployment_paths:
    deployment_id = deployment_path.name
    pkl_path = deployment_path / 'outputs' / 'data.pkl'

    # SECURITY NOTE: pickle.load can execute arbitrary code while unpickling.
    # Only run this on data.pkl files produced by your own processing pipeline.
    with open(pkl_path, 'rb') as f:
        data_pkl = pickle.load(f)

    print(f"→ Processing {deployment_id}")
    metadata['deployment_ids'].append(deployment_id)

    # Event data (optional: skipped when the attribute is missing or None)
    event_df = getattr(data_pkl, 'event_data', None)
    if event_df is not None:
        df = event_df.copy()
        df['deployment_id'] = deployment_id
        all_event_data.append(df)

    # Sensor data and derived data. Both get the same missing/None guard as
    # event_data, so a deployment without one of them contributes nothing
    # instead of aborting the whole aggregation with an AttributeError.
    for attr_name, collected in (
        ('sensor_data', all_sensor_data),
        ('derived_data', all_derived_data),
    ):
        frames = getattr(data_pkl, attr_name, None) or {}
        for name, df in frames.items():
            # Copy before tagging so the unpickled object itself is left untouched.
            df_copy = df.copy()
            df_copy['deployment_id'] = deployment_id
            collected[name].append(df_copy)

    # Metadata, keyed by deployment id
    metadata['sensor_info'][deployment_id] = data_pkl.sensor_info
    metadata['derived_info'][deployment_id] = data_pkl.derived_info
    metadata['logger_info'][deployment_id] = data_pkl.logger_info
    metadata['animal_info'][deployment_id] = data_pkl.animal_info
    metadata['deployment_info'][deployment_id] = data_pkl.deployment_info

# --- Build MEGA PICKLE ---
# ignore_index=True discards each frame's original index; rows stay traceable
# to their source through the 'deployment_id' column added above.
megadata_pkl = {
    'event_data': pd.concat(all_event_data, ignore_index=True) if all_event_data else None,
    'sensor_data': {k: pd.concat(v, ignore_index=True) for k, v in all_sensor_data.items()},
    'derived_data': {k: pd.concat(v, ignore_index=True) for k, v in all_derived_data.items()},
    'metadata': metadata
}

# --- Save output ---
output_path = os.path.join(dataset_folder, 'megadata_pkl.pkl')
with open(output_path, 'wb') as f:
    pickle.dump(megadata_pkl, f)

print(f"✅ MEGA PICKLE saved: {output_path}")

In [None]:
# Inspect the concatenated event table (None when no deployment provided event data).
megadata_pkl['event_data']

In [None]:
# Inspect the concatenated 'prh' derived signal across all deployments.
# NOTE(review): presumably pitch/roll/heading, which later cells read from the merged frame — confirm.
megadata_pkl['derived_data']['prh']

In [None]:
from pyologger.process_data.sampling import calculate_sampling_frequency
import pandas as pd

def truncate_to_seconds(dt_series):
    """Floor each timestamp of a datetime Series to the start of its second.

    Parameters
    ----------
    dt_series : pandas.Series
        Series with a datetime dtype (it must support the ``.dt`` accessor).

    Returns
    -------
    pandas.Series
        New Series of the same length with sub-second parts removed.
    """
    floored = dt_series.dt.floor(freq='s')
    return floored

def downsample_and_merge_derived_data(megadata_pkl, keys, target_freq=1):
    """Downsample several derived signals to a common rate and merge them on time.

    For each name in ``keys`` the matching frame in
    ``megadata_pkl['derived_data']`` is cleaned with ``dropna()``, checked for
    a single sampling frequency across its deployments, thinned by taking
    every ``step``-th row, and its ``datetime`` column is floored to the
    target sampling period. The thinned frames are then inner-joined on
    ``datetime``.

    Parameters
    ----------
    megadata_pkl : dict
        Must contain ``'derived_data'``: a dict of DataFrames, each with
        ``'datetime'`` and ``'deployment_id'`` columns.
    keys : iterable of str
        Names of the derived signals to merge, in merge order.
    target_freq : int or float, default 1
        Target sampling frequency in Hz. Timestamps are floored to bins of
        ``1 / target_freq`` seconds, so the default reproduces
        whole-second truncation.

    Returns
    -------
    pandas.DataFrame or None
        The merged frame, or ``None`` (after printing the reason) when a
        signal has inconsistent sampling frequencies across deployments or
        merged rows mix different deployment ids.

    Raises
    ------
    ValueError
        If ``keys`` is empty or ``target_freq`` is not positive.
    KeyError
        If a key is not present in ``megadata_pkl['derived_data']``.

    Notes
    -----
    * Side effect: the merged frame is also stored in
      ``megadata_pkl['derived_data']['merged_<key1>_<key2>...']``.
    * Thinning strides over the whole concatenated frame, not per deployment.
    * ``calculate_sampling_frequency`` is assumed to return Hz — TODO confirm.
    """
    keys = list(keys)
    if not keys:
        raise ValueError("keys must name at least one derived signal")
    if target_freq <= 0:
        raise ValueError("target_freq must be a positive frequency in Hz")

    derived = megadata_pkl['derived_data']
    dfs = {}

    # Step 1: Drop NA and collect sampling frequencies
    sampling_frequencies = {}

    for key in keys:
        df = derived[key].dropna().copy()
        unique_deployments = df['deployment_id'].unique()

        freqs = [
            calculate_sampling_frequency(
                df.loc[df['deployment_id'] == deployment_id, 'datetime']
            )
            for deployment_id in unique_deployments
        ]

        # Also triggers when the frame is empty after dropna (no frequencies at all).
        if len(set(freqs)) != 1:
            print(f"❌ Inconsistent sampling frequency in '{key}':")
            for deployment_id, fs in zip(unique_deployments, freqs):
                print(f"   {deployment_id}: {fs:.2f} Hz")
            return None

        fs = freqs[0]
        print(f"✅ '{key}' has consistent sampling frequency: {fs:.2f} Hz")
        sampling_frequencies[key] = fs
        dfs[key] = df

    # Step 2: Downsample and floor datetime to the target sampling period.
    # Flooring to whole seconds regardless of target_freq would collapse several
    # samples onto one timestamp for target_freq > 1 Hz (turning the merge below
    # into a cross product) and can misalign signals for target_freq < 1 Hz.
    bin_width = pd.Timedelta(seconds=1.0 / target_freq)
    downsampled = {}
    for key, df in dfs.items():
        fs = sampling_frequencies[key]
        step = int(fs // target_freq)
        if step < 1:
            # Signal is already slower than the target: a zero slice step would
            # raise, and upsampling is out of scope, so keep every sample.
            print(f"⚠️ '{key}' is sampled at {fs:.2f} Hz, below the {target_freq} Hz target; keeping every sample")
            step = 1
        df = df.iloc[::step].reset_index(drop=True)
        df['datetime'] = df['datetime'].dt.floor(bin_width)
        downsampled[key] = df

    # Step 3: Merge on 'datetime'; colliding column names from later frames get
    # a '_<key>' suffix (this includes their 'deployment_id' column).
    merged_df = None
    for key, df in downsampled.items():
        if merged_df is None:
            merged_df = df
        else:
            merged_df = pd.merge(merged_df, df, on='datetime', how='inner', suffixes=('', f'_{key}'))

    # Step 4: Check deployment_id consistency row-wise
    deployment_cols = [col for col in merged_df.columns if col.startswith('deployment_id')]
    inconsistent_rows = merged_df[deployment_cols].nunique(axis=1) > 1
    if inconsistent_rows.any():
        print("❌ Inconsistent deployment IDs found in merged rows:")
        print(merged_df.loc[inconsistent_rows, ['datetime'] + deployment_cols])
        return None

    # Step 5: Collapse to one consistent deployment_id column (the suffixed
    # per-signal columns are kept alongside it).
    merged_df['deployment_id'] = merged_df[deployment_cols[0]]

    # Save result
    result_key = f"merged_{'_'.join(keys)}"
    megadata_pkl['derived_data'][result_key] = merged_df
    print(f"✅ Merged DataFrame saved as megadata_pkl['derived_data']['{result_key}']")
    return merged_df


In [None]:
merged_df = downsample_and_merge_derived_data(
    megadata_pkl,
    keys=['heart_rate', 'stroke_rate', 'prh', 'depth'],
    target_freq=1  # Hz
)

# The merge helper reports problems by printing and returning None; stop here
# with a clear message instead of failing on the next line with a TypeError.
if merged_df is None:
    raise RuntimeError("Merging derived data failed; see the messages printed above.")

# Use the magnitude of roll. This assigns in place on purpose: merged_df is the
# same object stored in megadata_pkl['derived_data'], which later cells read.
merged_df['roll'] = merged_df['roll'].abs()

# Keep the display as the last expression; a bare name in the middle of a cell
# is not rendered.
merged_df

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Make sure the category dtype is correct
merged_df['deployment_id'] = merged_df['deployment_id'].astype('category')

# Scatter plot with one discrete colour per deployment.
# The x axis shows stroke rate, so it is labelled as such (it was previously
# labelled 'depth (m)', copied from the depth plot).
# NOTE(review): add the stroke-rate unit to the label once confirmed.
fig = px.scatter(
    merged_df,
    x='stroke_rate',
    y='heart_rate',
    color='deployment_id',
    color_discrete_sequence=px.colors.qualitative.Set2,  # optional, helps make sure colors are distinct
    title='Heart Rate vs Stroke Rate',
    labels={'stroke_rate': 'Stroke Rate', 'heart_rate': 'Heart Rate (bpm)', 'deployment_id': 'Deployment ID'},
    opacity=0.3
)

fig.update_traces(marker=dict(size=5))
fig.update_layout(
    xaxis_title='Stroke Rate',
    yaxis_title='Heart Rate (bpm)',
    template='plotly_white'
)

# Add a single regression line (not per deployment): fit on an uncoloured
# helper figure so the OLS trendline pools all deployments.
trend_fig = px.scatter(
    merged_df,
    x='stroke_rate',
    y='heart_rate',
    trendline='ols'
)

# Extract just the trendline: the only trace drawn in 'lines' mode.
trendline_trace = [trace for trace in trend_fig.data if trace.mode == 'lines'][0]

# Add the regression line to the original colourful scatter plot
fig.add_trace(trendline_trace)

fig.show()


In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Treat deployment_id as categorical so each deployment gets its own discrete colour.
merged_df['deployment_id'] = merged_df['deployment_id'].astype('category')

# Human-readable names for the plotted columns.
axis_labels = {'depth': 'depth (m)', 'heart_rate': 'Heart Rate (bpm)', 'deployment_id': 'Deployment ID'}

# Scatter of heart rate against depth, coloured per deployment.
fig = px.scatter(
    merged_df,
    x='depth',
    y='heart_rate',
    color='deployment_id',
    color_discrete_sequence=px.colors.qualitative.Set2,
    title='Heart Rate vs Depth',
    labels=axis_labels,
    opacity=0.3,
)
fig.update_traces(marker={'size': 5})
fig.update_layout(
    xaxis_title='depth (m)',
    yaxis_title='Heart Rate (bpm)',
    template='plotly_white',
)

# Fit one pooled OLS trendline on an uncoloured helper figure (no per-deployment split).
trend_fig = px.scatter(merged_df, x='depth', y='heart_rate', trendline='ols')

# The helper figure holds the markers and the fitted line; keep only the line trace.
line_traces = [trace for trace in trend_fig.data if trace.mode == 'lines']
trendline_trace = line_traces[0]

# Overlay the pooled regression line on the coloured scatter and render it.
fig.add_trace(trendline_trace)
fig.show()


In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.express as px

def plot_pca_plotly(
    merged_df,
    variables=None,
    color_by='deployment_id',  # can be continuous (e.g., heart_rate) or categorical
    n_components=2,
    opacity=0.7
):
    """Run a PCA on standardized variables and show PC1 vs PC2 as a Plotly scatter.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Frame containing ``variables``, ``color_by`` and a ``'datetime'`` column.
    variables : sequence of str, optional
        Columns fed to the PCA. Defaults to
        ``['heart_rate', 'stroke_rate', 'pitch', 'roll', 'heading', 'depth']``.
    color_by : str, default 'deployment_id'
        Column used to colour the points. Numeric columns get the continuous
        'Viridis' scale, anything else the discrete 'Set2' palette.
    n_components : int, default 2
        Number of principal components to compute. Must be at least 2 because
        the figure always plots PC1 against PC2; extra components are only
        returned in the result frame.
    opacity : float, default 0.7
        Marker opacity passed to Plotly.

    Returns
    -------
    tuple of (pandas.DataFrame, sklearn.decomposition.PCA)
        The per-row component scores (plus ``datetime``, ``color_by`` and the
        original variables) and the fitted PCA object.

    Raises
    ------
    ValueError
        If ``n_components`` is below 2, or no row has values for every
        required column.
    KeyError
        If a required column is missing from ``merged_df``.
    """
    # Validate up front: the axis labels below index explained_var[1] and the
    # scatter needs a 'PC2' column, so fewer than two components cannot be plotted.
    if n_components < 2:
        raise ValueError("n_components must be at least 2 to plot PC1 against PC2")

    if variables is None:
        variables = ['heart_rate', 'stroke_rate', 'pitch', 'roll', 'heading', 'depth']
    variables = list(variables)  # accept tuples / other sequences as well

    # Ensure all required columns are present. dict.fromkeys de-duplicates while
    # keeping a deterministic order (color_by is often one of the variables).
    cols_needed = list(dict.fromkeys(variables + [color_by, 'datetime']))
    data = merged_df[cols_needed].dropna().copy()
    if data.empty:
        raise ValueError("No rows with complete values for: " + ", ".join(cols_needed))

    # Standardize input variables
    X = data[variables].values
    X_scaled = StandardScaler().fit_transform(X)

    # Apply PCA
    pca = PCA(n_components=n_components)
    components = pca.fit_transform(X_scaled)
    explained_var = pca.explained_variance_ratio_

    # Build DataFrame for Plotly
    pca_df = pd.DataFrame(components, columns=[f'PC{i+1}' for i in range(n_components)])
    pca_df['datetime'] = data['datetime'].values
    pca_df[color_by] = data[color_by].values

    # Include original variables for hover tooltips
    for var in variables:
        pca_df[var] = data[var].values

    # Choose color scale depending on type
    color_args = {'color': color_by}
    if pd.api.types.is_numeric_dtype(data[color_by]):
        color_args['color_continuous_scale'] = 'Viridis'
    else:
        color_args['color_discrete_sequence'] = px.colors.qualitative.Set2

    # Plot
    fig = px.scatter(
        pca_df,
        x='PC1', y='PC2',
        title=f'PCA of Biologging Data colored by {color_by}',
        labels={
            'PC1': f'PC1 ({explained_var[0]*100:.1f}% variance)',
            'PC2': f'PC2 ({explained_var[1]*100:.1f}% variance)',
            color_by: color_by
        },
        # De-duplicated so a column is not listed twice when color_by is a variable.
        hover_data=list(dict.fromkeys(['datetime'] + variables + [color_by])),
        opacity=opacity,
        **color_args
    )

    fig.update_layout(template='plotly_white')
    fig.show()

    return pca_df, pca


In [None]:
# PCA of the merged 1 Hz frame stored by downsample_and_merge_derived_data,
# with points coloured by heart rate (numeric, so a continuous colour scale).
plot_pca_plotly(
    megadata_pkl['derived_data']['merged_heart_rate_stroke_rate_prh_depth'],
    color_by='heart_rate'
)


In [None]:
# Same PCA, coloured by stroke rate.
plot_pca_plotly(
    megadata_pkl['derived_data']['merged_heart_rate_stroke_rate_prh_depth'],
    color_by='stroke_rate'
)

In [None]:
# Same PCA, coloured by roll (stored as an absolute value by the merge cell).
plot_pca_plotly(
    megadata_pkl['derived_data']['merged_heart_rate_stroke_rate_prh_depth'],
    color_by='roll'
)

In [None]:
# Same PCA, coloured by depth.
plot_pca_plotly(
    megadata_pkl['derived_data']['merged_heart_rate_stroke_rate_prh_depth'],
    color_by='depth'
)

In [None]:
# Import pyologger utilities
from pyologger.utils.folder_manager import *
from pyologger.plot_data.plotter import *

# Pick one deployment of the dataset to load on its own.
dataset_id = "oror-adult-orca_hr-sr-vid_sw_JKB-PP"
deployment_id = "2024-01-24_oror-001"

# Load important file paths and configurations
config, data_dir, color_mapping_path, montage_path = load_configuration()
# Load the selected deployment. This rebinds dataset_id, deployment_id,
# dataset_folder and data_pkl, which earlier cells also define.
# NOTE(review): the original comment here read "Streamlit load data"; no
# Streamlit usage is visible in this notebook — confirm what the helper does.
animal_id, dataset_id, deployment_id, dataset_folder, deployment_folder, data_pkl, param_manager = select_and_load_deployment(
    data_dir, dataset_id=dataset_id, deployment_id=deployment_id
    )
# Path of this deployment's processed pickle (relies on `os` imported in the first cell).
pkl_path = os.path.join(deployment_folder, 'outputs', 'data.pkl')

In [None]:
import xarray as xr

# Define the path to the NetCDF file: outputs/<deployment_id>_output.nc inside
# the deployment folder loaded in the previous cell.
netcdf_path = os.path.join(deployment_folder, 'outputs', f'{deployment_id}_output.nc')

# Open the NetCDF file
# NOTE(review): the dataset is never closed in this cell; call data.close()
# (or use a `with` block) once it is no longer needed.
data = xr.open_dataset(netcdf_path)

# Display the contents of the NetCDF file (display is provided by IPython/Jupyter)
display(data)