# Replicate the dataset of this paper

1. Download all data using PowerShell (there is a .ps1 file in each folder)
2. Read the dataset specification
3. Map the fields
4. Create the script, execute it, and save the data

Conclusion:
dataset Dalia: 
- 15 participants 
- aged 30.6 +- 9.59 years (21–55)
- gender: 7 male 8 female
dataset PTT:
- 22 participants
- aged 28.52 years (range 20–53; standard deviation not recorded here)
- gender: 16 male 6 female
- healthy subjects performing 3 physical activities.

<!-- TODO: add statistics on height and weight -->

The participants in both datasets are predominantly healthy and young. The demographics are not well balanced, and because all subjects are healthy, no illness conditions are represented.

# Import datasets

In [1]:
import os
import pandas as pd
import glob
import numpy as np
from datetime import datetime


In [2]:

def load_ptt_ppg_dataset(base_path):
    """
    Load the Pulse Transit Time PPG Dataset v1.1.0

    The loader looks for sub-folders of ``base_path`` whose names match
    ``S*``. Inside each folder, ``<folder>_info.csv`` (if present) is read as
    demographics and every other ``*.csv`` file is read as a signal recording.
    The activity label is the second ``_``-separated token of the file name
    (``S1_walk.csv`` -> ``walk``), or ``'unknown'`` when the name contains no
    underscore.

    Subject folders and signal files are processed in sorted order so that the
    row order of the returned frames is reproducible across runs and
    filesystems.

    Parameters:
    base_path (str): Path to the PTT-PPG dataset folder

    Returns:
    dict: Dictionary containing dataframes and metadata:
        'signals' / 'demographics' - lists of the per-file dataframes,
        'subject_ids' / 'activity_labels' - one entry per loaded signal file,
        'signals_df' / 'demographics_df' - the concatenated dataframes
        (empty dataframes when nothing was loaded).
    """
    print("Loading PTT-PPG Dataset...")

    # Initialize containers
    data = {
        'signals': [],
        'demographics': [],
        'subject_ids': [],
        'activity_labels': []
    }

    # glob yields entries in arbitrary, filesystem-dependent order; sort them
    # and keep directories only so stray files matching 'S*' are ignored.
    subject_folders = sorted(
        path for path in glob.glob(os.path.join(base_path, 'S*'))
        if os.path.isdir(path)
    )

    for subject_folder in subject_folders:
        subject_id = os.path.basename(subject_folder)

        # Load demographics if available
        demo_file = os.path.join(subject_folder, f"{subject_id}_info.csv")
        if os.path.exists(demo_file):
            demo_df = pd.read_csv(demo_file)
            demo_df['subject_id'] = subject_id
            data['demographics'].append(demo_df)

        # Find all signal files (sorted for a stable row order)
        signal_files = sorted(glob.glob(os.path.join(subject_folder, "*.csv")))
        for file in signal_files:
            filename = os.path.basename(file)

            # Test the file name only: a parent directory that happens to
            # contain "_info.csv" must not cause every file to be skipped.
            if "_info.csv" in filename:
                continue

            # Extract activity label from filename
            if '_' in filename:
                activity = filename.split('_')[1].split('.')[0]
            else:
                activity = 'unknown'

            # Load signal data; a file that cannot be read is reported and
            # skipped so one bad file does not abort the whole load.
            try:
                signal_df = pd.read_csv(file)
            except Exception as e:
                print(f"Error loading {file}: {e}")
                continue

            signal_df['subject_id'] = subject_id
            signal_df['activity'] = activity
            signal_df['dataset'] = 'PTT-PPG'
            data['signals'].append(signal_df)
            data['subject_ids'].append(subject_id)
            data['activity_labels'].append(activity)

    # Combine all demographics
    if data['demographics']:
        data['demographics_df'] = pd.concat(data['demographics'], ignore_index=True)
    else:
        data['demographics_df'] = pd.DataFrame()

    # Combine all signals
    if data['signals']:
        data['signals_df'] = pd.concat(data['signals'], ignore_index=True)
    else:
        data['signals_df'] = pd.DataFrame()

    return data

def load_ppg_dalia_dataset(base_path):
    """
    Load the PPG-DaLiA dataset

    The loader looks for sub-folders of ``base_path`` whose names match
    ``S*``. ``<folder>_info.csv`` (if present) is read as demographics. Every
    other ``*.csv`` found anywhere below the subject folder is read as a
    signal recording, except files whose name contains ``info`` or
    ``demographic`` (case-insensitive).

    The activity label is taken from the path *relative to the subject
    folder*: the text following the first ``activity_`` marker up to the next
    path separator, or ``'unknown'`` when there is no such marker. The folder
    layout is an assumption - adjust to the actual file structure.

    Subject folders and signal files are processed in sorted order so that the
    row order of the returned frames is reproducible.

    Parameters:
    base_path (str): Path to the PPG-DaLiA dataset folder

    Returns:
    dict: Dictionary containing dataframes and metadata:
        'signals' / 'demographics' - lists of the per-file dataframes,
        'subject_ids' / 'activity_labels' - one entry per loaded signal file,
        'signals_df' / 'demographics_df' - the concatenated dataframes
        (empty dataframes when nothing was loaded).
    """
    print("Loading PPG-DaLiA Dataset...")

    # Initialize containers
    data = {
        'signals': [],
        'demographics': [],
        'subject_ids': [],
        'activity_labels': []
    }

    # glob order is filesystem-dependent; sort and keep directories only.
    subject_folders = sorted(
        path for path in glob.glob(os.path.join(base_path, 'S*'))
        if os.path.isdir(path)
    )

    for subject_folder in subject_folders:
        subject_id = os.path.basename(subject_folder)

        # Load demographics if available (assumed structure, adjust as needed)
        demo_file = os.path.join(subject_folder, f"{subject_id}_info.csv")
        if os.path.exists(demo_file):
            demo_df = pd.read_csv(demo_file)
            demo_df['subject_id'] = subject_id
            data['demographics'].append(demo_df)

        # Find signal files at any depth below the subject folder
        signal_files = sorted(
            glob.glob(os.path.join(subject_folder, "**/*.csv"), recursive=True)
        )

        for file in signal_files:
            filename_lower = os.path.basename(file).lower()

            # Skip demographic files
            if "info" in filename_lower or "demographic" in filename_lower:
                continue

            # Work on the path relative to the subject folder so an
            # "activity_" fragment in base_path cannot leak into the label,
            # and normalise separators so the split also works on Windows,
            # where glob returns backslash-separated paths.
            relative = os.path.relpath(file, subject_folder).replace(os.sep, '/')
            if 'activity_' in relative:
                activity = relative.split('activity_')[1].split('/')[0]
            else:
                activity = 'unknown'

            # Load signal data; unreadable files are reported and skipped.
            try:
                signal_df = pd.read_csv(file)
            except Exception as e:
                print(f"Error loading {file}: {e}")
                continue

            signal_df['subject_id'] = subject_id
            signal_df['activity'] = activity
            signal_df['dataset'] = 'PPG-DaLiA'
            data['signals'].append(signal_df)
            data['subject_ids'].append(subject_id)
            data['activity_labels'].append(activity)

    # Combine all demographics
    if data['demographics']:
        data['demographics_df'] = pd.concat(data['demographics'], ignore_index=True)
    else:
        data['demographics_df'] = pd.DataFrame()

    # Combine all signals
    if data['signals']:
        data['signals_df'] = pd.concat(data['signals'], ignore_index=True)
    else:
        data['signals_df'] = pd.DataFrame()

    return data


# Standardize data content

In [3]:

def standardize_column_names(df, mapping):
    """
    Standardize column names based on mapping

    A column is renamed to ``mapping[key]`` when ``key`` occurs in the
    lower-cased column name. Keys are therefore expected to be lower-case
    strings. Matching is resolved so that the result never gains duplicate
    column names:

    1. Columns already carrying a standardized name (a value of ``mapping``)
       keep it and reserve it, which makes the function idempotent.
    2. Columns whose lower-cased name equals a key exactly are renamed next,
       so ``'ppg'`` wins over ``'ppg_2'`` for the ``'ppg'`` target.
    3. Remaining columns take the first key (in mapping order) that is a
       substring of their name and whose target is still unused; if every
       candidate target is taken, the column keeps its original name.

    Note that step 3 is a plain substring test, so short keys can match
    unrelated names (e.g. ``'age'`` inside ``'average'``).

    Parameters:
    df (DataFrame): Input dataframe
    mapping (dict): Dictionary mapping original column names to standardized ones

    Returns:
    DataFrame: DataFrame with standardized column names. The input is not
    modified; it is returned as-is when no column needs renaming.
    """
    targets = set(mapping.values())

    # Step 1: names that are already standardized are reserved up front.
    claimed = {col for col in df.columns if col in targets}

    renames = {}
    undecided = []

    # Step 2: exact (case-insensitive) key matches take priority.
    for col in df.columns:
        if col in claimed:
            continue
        target = mapping.get(str(col).lower())
        if target is not None and target not in claimed:
            renames[col] = target
            claimed.add(target)
        else:
            undecided.append(col)

    # Step 3: substring matches, each target used at most once.
    for col in undecided:
        lowered = str(col).lower()  # str() tolerates non-string labels
        for key, target in mapping.items():
            if key in lowered and target not in claimed:
                renames[col] = target
                claimed.add(target)
                break

    if renames:
        df = df.rename(columns=renames)
    return df


# Mapping and merge

In [4]:

def _standardized_frame(data, key, mapping):
    """Return data[key] with standardized column names, or None when the key is missing or the frame is empty."""
    if key not in data or data[key].empty:
        return None
    return standardize_column_names(data[key], mapping)


def merge_datasets(ptt_ppg_data, ppg_dalia_data):
    """
    Merge the two datasets into a standardized format

    Column names of the 'signals_df' and 'demographics_df' frames of both
    inputs are standardized, the frames are stacked per kind, and demographics
    are left-joined onto the signals by 'subject_id'. A 'record_id' column
    ("record_0", "record_1", ...) is added to the joined dataset only.

    The input dictionaries are not modified.

    Parameters:
    ptt_ppg_data (dict): Dictionary containing PTT-PPG data
    ppg_dalia_data (dict): Dictionary containing PPG-DaLiA data

    Returns:
    dict: Dictionary containing merged data:
        'merged_dataset' - signals joined with demographics, plus 'record_id',
        'merged_demographics' - stacked demographics of both datasets,
        'merged_signals' - stacked signals of both datasets,
        'metadata' - record/subject counts, dataset names, merge date, columns.
    """
    print("Merging datasets...")

    # Define column name mappings for standardization
    signal_mapping = {
        'ppg': 'ppg_signal',
        'red': 'ppg_red',
        'ir': 'ppg_ir',
        'green': 'ppg_green',
        'ecg': 'ecg_signal',
        'acc_x': 'accelerometer_x',
        'acc_y': 'accelerometer_y',
        'acc_z': 'accelerometer_z',
        'gyro_x': 'gyroscope_x',
        'gyro_y': 'gyroscope_y',
        'gyro_z': 'gyroscope_z',
        'time': 'timestamp'
    }

    demographic_mapping = {
        'age': 'age',
        'gender': 'gender',
        'height': 'height_cm',
        'weight': 'weight_kg',
        'bmi': 'bmi'
    }

    sources = (ptt_ppg_data, ppg_dalia_data)

    # Standardize and collect the non-empty frames of each kind, keeping the
    # PTT-PPG rows ahead of the PPG-DaLiA rows.
    signals_dfs = [
        frame
        for frame in (_standardized_frame(src, 'signals_df', signal_mapping) for src in sources)
        if frame is not None
    ]
    demographics_dfs = [
        frame
        for frame in (_standardized_frame(src, 'demographics_df', demographic_mapping) for src in sources)
        if frame is not None
    ]

    merged_signals = pd.concat(signals_dfs, ignore_index=True) if signals_dfs else pd.DataFrame()
    merged_demographics = pd.concat(demographics_dfs, ignore_index=True) if demographics_dfs else pd.DataFrame()

    # Create a comprehensive dataset with demographics joined to signals
    if not merged_signals.empty and not merged_demographics.empty:
        # Ensure both have subject_id columns for joining
        if 'subject_id' in merged_signals.columns and 'subject_id' in merged_demographics.columns:
            complete_dataset = pd.merge(
                merged_signals,
                merged_demographics,
                on='subject_id',
                how='left'
            )
        else:
            # Copy so that adding 'record_id' below does not also alter the
            # 'merged_signals' frame returned to the caller.
            complete_dataset = merged_signals.copy()
            print("Warning: Could not merge demographics due to missing subject_id columns")
    else:
        complete_dataset = merged_signals.copy()

    # Add a unique identifier column
    complete_dataset['record_id'] = [f"record_{i}" for i in range(len(complete_dataset))]

    # Gather metadata about the merged dataset.
    # 'datasets_included' names the datasets this function merges; it does not
    # reflect whether each one actually contributed rows.
    metadata = {
        'total_records': len(complete_dataset),
        'unique_subjects': complete_dataset['subject_id'].nunique() if 'subject_id' in complete_dataset.columns else 0,
        'datasets_included': ['PTT-PPG', 'PPG-DaLiA'],
        'merge_date': datetime.now().strftime('%Y-%m-%d'),
        'columns': list(complete_dataset.columns)
    }

    return {
        'merged_dataset': complete_dataset,
        'merged_demographics': merged_demographics,
        'merged_signals': merged_signals,
        'metadata': metadata
    }


# Save data

In [5]:

def save_merged_dataset(merged_data, output_path):
    """
    Write the merged frames and their metadata as CSV files.

    Each dataframe entry of ``merged_data`` is written only when its key is
    present and the frame is non-empty; the metadata dictionary is written as
    a one-row CSV whenever the 'metadata' key is present.

    Parameters:
    merged_data (dict): Dictionary containing merged data
    output_path (str): Directory to save the output files (created if needed)
    """
    print("Saving merged dataset...")

    # Make sure the destination directory exists before writing anything
    os.makedirs(output_path, exist_ok=True)

    # (key in merged_data, output file name), in write order
    frame_outputs = (
        ('merged_dataset', 'merged_ppg_complete.csv'),
        ('merged_signals', 'merged_ppg_signals.csv'),
        ('merged_demographics', 'merged_ppg_demographics.csv'),
    )
    for key, csv_name in frame_outputs:
        if key not in merged_data:
            continue
        frame = merged_data[key]
        if frame.empty:
            continue
        frame.to_csv(os.path.join(output_path, csv_name), index=False)

    # Metadata becomes a single-row table, one column per metadata field
    if 'metadata' in merged_data:
        metadata_file = os.path.join(output_path, 'merged_ppg_metadata.csv')
        pd.DataFrame([merged_data['metadata']]).to_csv(metadata_file, index=False)

    print(f"Merged dataset saved to {output_path}")

def main(
    ptt_ppg_path='/path/to/ptt_ppg_dataset',
    ppg_dalia_path='/path/to/ppg_dalia_dataset',
    output_path='/path/to/output',
):
    """
    Run the full pipeline: load both datasets, merge them, save the result
    and print a short summary.

    The defaults are placeholders; pass the real locations (or edit the
    defaults) before running.

    Parameters:
    ptt_ppg_path (str): Path to the PTT-PPG dataset folder
    ppg_dalia_path (str): Path to the PPG-DaLiA dataset folder
    output_path (str): Directory where the merged CSV files are written
    """
    # Load datasets
    ptt_ppg_data = load_ptt_ppg_dataset(ptt_ppg_path)
    ppg_dalia_data = load_ppg_dalia_dataset(ppg_dalia_path)

    # Merge datasets
    merged_data = merge_datasets(ptt_ppg_data, ppg_dalia_data)

    # Save merged dataset
    save_merged_dataset(merged_data, output_path)

    print("Dataset merging complete!")

    # Print summary statistics
    if 'metadata' in merged_data:
        metadata = merged_data['metadata']
        print("\nMerged Dataset Summary:")
        print(f"Total Records: {metadata['total_records']}")
        print(f"Unique Subjects: {metadata['unique_subjects']}")
        print(f"Datasets Included: {', '.join(metadata['datasets_included'])}")
        print(f"Merge Date: {metadata['merge_date']}")
        print(f"Number of Columns: {len(metadata['columns'])}")


In [6]:
# Execute the load -> merge -> save pipeline and print its summary.
main()

Loading PTT-PPG Dataset...
Loading PPG-DaLiA Dataset...
Merging datasets...
Saving merged dataset...
Merged dataset saved to /path/to/output
Dataset merging complete!

Merged Dataset Summary:
Total Records: 0
Unique Subjects: 0
Datasets Included: PTT-PPG, PPG-DaLiA
Merge Date: 2025-03-30
Number of Columns: 1
