# PPG Dataset Analysis and Harmonization

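This notebook demonstrates how to use the PPG dataset harmonization tools. It walks through five steps: reading the data dictionaries, mapping features between sources, exploring demographics, standardizing units and sampling rates, and saving the processed dataset.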

In [None]:
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
from data_reader import DataDictionary, DataLoader
from feature_mapper import FeatureMapper
from eda import DataAnalyzer
from standardizer import DataStandardizer
from utils import save_dataset, create_metadata

## 1. Load and Read Data Dictionaries

In [None]:
# Initialize data dictionary reader
# The path is relative, so it resolves against the notebook's working directory.
# NOTE(review): presumably this is the directory holding each source's data
# dictionary — confirm against data_reader.DataDictionary.
data_dict = DataDictionary('../data/raw')

# Load data dictionaries for each source
# Example (uncomment and substitute the real source names for the
# 'source1' / 'source2' placeholders):
# source1_dict = data_dict.read_data_dictionary('source1')
# source2_dict = data_dict.read_data_dictionary('source2')

## 2. Feature Mapping

In [None]:
# Initialize feature mapper
mapper = FeatureMapper()

# Add feature mappings
# Example (uncomment to use; the arguments presumably read as
# source, feature name, other source, other feature name — confirm against
# feature_mapper.FeatureMapper.add_feature_mapping):
# mapper.add_feature_mapping('source1', 'heart_rate', 'source2', 'hr')

# Visualize mappings
# NOTE(review): while the example above stays commented out, this cell adds no
# mappings before the call below — confirm visualize_mappings copes with an
# empty mapper.
# NOTE(review): the argument looks like an output HTML path relative to the
# notebook's working directory; presumably '../data/processed' must already
# exist — verify.
mapper.visualize_mappings('../data/processed/feature_mappings.html')

## 3. Demographics EDA

In [None]:
# Load a dataset
# The loader is given the same relative '../data/raw' path as the data
# dictionary reader in section 1.
loader = DataLoader('../data/raw')
# Uncomment to bind `data`; the commented-out analyzer below and the
# standardization call in section 4 both read it, so it must be defined first.
# data = loader.load_dataset('source1')

# Initialize analyzer
# analyzer = DataAnalyzer(data)

# Generate demographics summary
# demographics = analyzer.generate_demographics_summary()
# print(demographics)

## 4. Data Standardization

In [None]:
# Initialize standardizer
standardizer = DataStandardizer()

# Define unit mappings
# Each entry pairs a name with a 'from' unit and a 'to' unit.
# NOTE(review): the top-level keys are presumably column names in the loaded
# dataset — confirm against standardizer.DataStandardizer.
# Note that 'weight' maps kg -> kg, i.e. the source and target units are the same.
unit_mappings = {
    'height': {'from': 'cm', 'to': 'm'},
    'weight': {'from': 'kg', 'to': 'kg'}
}

# Define sampling rates
# One rate per signal, in Hz.
sampling_rates = {
    'ppg_signal': 100,  # Hz
    'ecg_signal': 250   # Hz
}

# Standardize dataset
# Uncomment once `data` has been loaded in section 3. The example passes
# target_rate=100, which equals the 'ppg_signal' rate defined above.
# standardized_data = standardizer.standardize_dataset(
#     data,
#     unit_mappings=unit_mappings,
#     sampling_rates=sampling_rates,
#     target_rate=100
# )

## 5. Save Processed Dataset

In [None]:
# Save standardized dataset
# Everything in this cell is commented out. It reads `standardized_data`,
# which is only defined once the standardize_dataset call in section 4 is
# uncommented and run.
# save_dataset(
#     standardized_data,
#     'harmonized_dataset.parquet',
#     '../data/final'
# )

# Create and save metadata
# NOTE(review): only create_metadata and print are called here; no explicit
# save step is visible — confirm whether create_metadata persists anything.
# metadata = create_metadata(standardized_data, 'combined_sources')
# print(metadata)