In [None]:
import pandas as pd
import os

# Dataset locations. Only base_dir is machine-specific; everything else is
# derived from it. (Passing an absolute path as a later argument to
# os.path.join discards the earlier components, so the sub-paths are given
# as relative names here.)
base_dir = r"C:\Users\Lenovo\Downloads\Project1_VT_Dataset-20250603T114935Z-1-001\Project1_VT_Dataset"
ppg_dir = os.path.join(base_dir, "synthetic_ppg_data")
master_path = os.path.join(base_dir, "master_data.csv")

# Load master CSV file
master_df = pd.read_csv(master_path)

# Load every per-subject CSV and tag its rows with the subject ID
subject_dfs = []

for filename in sorted(os.listdir(ppg_dir)):
    if not filename.endswith(".csv"):
        continue

    file_path = os.path.join(ppg_dir, filename)
    df = pd.read_csv(file_path)

    # Extract subject ID from the filename (e.g., subject_3.csv -> 3)
    subject_id = int(filename.split('_')[1].split('.')[0])
    df['subject_id'] = subject_id

    subject_dfs.append(df)

# pd.concat raises a generic ValueError on an empty list; fail with a message
# that points at the actual cause instead.
if not subject_dfs:
    raise ValueError(f"No subject CSV files found in {ppg_dir!r}")

# Combine all subject data into a single DataFrame
all_ppg_df = pd.concat(subject_dfs, ignore_index=True)

# Preview
print("Master CSV:")
print(master_df.head())

print("\nCombined PPG data (first few rows):")
print(all_ppg_df.head())


Master CSV:
   subject_id         hr        hrv
0           1  95.579696  66.052442
1           2  60.898322  90.738742
2           3  82.398083  91.914974
3           4  76.918197  64.998944
4           5  86.345628  89.120312

Combined PPG data (first few rows):
       time  ppg_signal  subject_id
0  0.000000   -0.005162           1
1  0.010002    0.115503           1
2  0.020003    0.239821           1
3  0.030005    0.339529           1
4  0.040007    0.472772           1


In [3]:
import pandas as pd
import numpy as np
from scipy.signal import find_peaks, welch
from scipy.stats import skew, kurtosis

def extract_features_from_ppg(df, peak_distance=50, nperseg=256):
    """Summarise one PPG recording as a flat dict of scalar features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'time' column and a 'ppg_signal' column. Time is
        treated as seconds: peak-to-peak intervals are multiplied by 1000 and
        reported as milliseconds, and the sampling frequency is taken as
        1 / mean(time step).
    peak_distance : int, default 50
        Minimum spacing, in samples, between detected peaks (passed to
        scipy.signal.find_peaks as ``distance``). Because it is expressed in
        samples, the time it corresponds to depends on the sampling rate.
    nperseg : int, default 256
        Welch segment length in samples; capped at the signal length. The
        spectrum's frequency resolution is fs / nperseg.

    Returns
    -------
    dict
        Keys: mean_signal, std_signal, var_signal, max_signal, min_signal,
        num_peaks, mean_rr, std_rr, median_rr, mean_diff, std_diff, max_diff,
        min_diff, total_power, dominant_freq, low_freq_power,
        high_freq_power, signal_skew, signal_kurtosis.
        The three *_rr values are 0 when fewer than two peaks are found.

    Raises
    ------
    ValueError
        If the recording has fewer than two samples (no first difference and
        no sampling frequency can be computed).
    """
    time = df['time'].values
    signal = df['ppg_signal'].values

    # Fail early with a clear message; otherwise the reductions below raise
    # an opaque "zero-size array" ValueError from numpy.
    if len(signal) < 2:
        raise ValueError(
            "extract_features_from_ppg needs at least 2 samples, "
            f"got {len(signal)}"
        )

    # --- Time-Domain Features ---
    mean_val = np.mean(signal)
    std_val = np.std(signal)
    var_val = np.var(signal)
    max_val = np.max(signal)
    min_val = np.min(signal)
    peak_indices, _ = find_peaks(signal, distance=peak_distance)
    num_peaks = len(peak_indices)

    # Peak-to-peak intervals (in milliseconds); need at least two peaks
    if num_peaks > 1:
        rr_intervals = np.diff(time[peak_indices]) * 1000
        mean_rr = np.mean(rr_intervals)
        std_rr = np.std(rr_intervals)
        median_rr = np.median(rr_intervals)
    else:
        mean_rr = std_rr = median_rr = 0

    # --- Derivative-Based Features (first difference, per sample) ---
    diff_signal = np.diff(signal)
    mean_diff = np.mean(diff_signal)
    std_diff = np.std(diff_signal)
    max_diff = np.max(diff_signal)
    min_diff = np.min(diff_signal)

    # --- Frequency-Domain Features ---
    fs = 1 / np.mean(np.diff(time))  # Sampling frequency from the mean time step
    freqs, power = welch(signal, fs=fs, nperseg=min(nperseg, len(signal)))
    total_power = np.sum(power)
    dominant_freq = freqs[np.argmax(power)]
    # NOTE(review): each band sum only includes Welch bins that fall inside
    # the band, and bins are fs / nperseg apart. With a short segment the
    # 0.04-0.15 Hz band can contain no bin at all and sum to 0; pass a larger
    # nperseg if these band powers are meant to be informative.
    low_freq_power = np.sum(power[(freqs >= 0.04) & (freqs < 0.15)])
    high_freq_power = np.sum(power[(freqs >= 0.15) & (freqs < 0.4)])

    # --- Statistical Features ---
    signal_skew = skew(signal)
    signal_kurtosis = kurtosis(signal)

    return {
        'mean_signal': mean_val,
        'std_signal': std_val,
        'var_signal': var_val,
        'max_signal': max_val,
        'min_signal': min_val,
        'num_peaks': num_peaks,
        'mean_rr': mean_rr,
        'std_rr': std_rr,
        'median_rr': median_rr,
        'mean_diff': mean_diff,
        'std_diff': std_diff,
        'max_diff': max_diff,
        'min_diff': min_diff,
        'total_power': total_power,
        'dominant_freq': dominant_freq,
        'low_freq_power': low_freq_power,
        'high_freq_power': high_freq_power,
        'signal_skew': signal_skew,
        'signal_kurtosis': signal_kurtosis
    }










import os

# Build one feature row per subject file.
# The directory is named once here instead of being repeated inside the loop.
ppg_data_dir = r"C:\Users\Lenovo\Downloads\Project1_VT_Dataset-20250603T114935Z-1-001\Project1_VT_Dataset\synthetic_ppg_data"

features_list = []
# os.listdir returns entries in arbitrary order; sort so the row order of
# features_df is reproducible from run to run.
for filename in sorted(os.listdir(ppg_data_dir)):
    if not filename.endswith(".csv"):
        continue

    subject_df = pd.read_csv(os.path.join(ppg_data_dir, filename))
    # Subject ID comes from the filename (e.g., subject_3.csv -> 3)
    subject_id = int(filename.split('_')[1].split('.')[0])
    features = extract_features_from_ppg(subject_df)
    features['subject_id'] = subject_id
    features_list.append(features)

# One DataFrame built from the list of dicts (one row per subject)
features_df = pd.DataFrame(features_list)
print(features_df)


    mean_signal  std_signal  var_signal  max_signal  min_signal  num_peaks  \
0      0.001781    0.474244    0.224908    0.802330   -0.804611         96   
1      0.001545    0.474971    0.225597    0.800360   -0.802970         80   
2      0.002319    0.474912    0.225542    0.803607   -0.808394         63   
3      0.001247    0.474743    0.225381    0.807112   -0.800432         83   
4      0.002024    0.474221    0.224885    0.804935   -0.803378         96   
5      0.001964    0.474980    0.225606    0.799971   -0.805559         88   
6      0.001979    0.474759    0.225397    0.802821   -0.811989         93   
7      0.000352    0.474487    0.225138    0.800007   -0.802691         84   
8      0.002227    0.474040    0.224714    0.802454   -0.802217         79   
9      0.000080    0.474619    0.225263    0.802485   -0.811815         62   
10     0.001738    0.473607    0.224303    0.805459   -0.804447         87   
11     0.000480    0.474579    0.225225    0.807301   -0.800685 