In [43]:
# ----LOADING PROJECT DATASETS ----
import pandas as pd
import glob
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
folder_path = r"C:\Users\Lenovo\Downloads\Project1_VT_Dataset-20250603T114935Z-1-001\Project1_VT_Dataset\synthetic_ppg_data"
csv_files = glob.glob(f"{folder_path}/*.csv")
df = pd.concat([pd.read_csv(file) for file in csv_files], )

print(df.shape)  

(120000, 3)


In [47]:
#------LOADING MASTER DATASET-------

y=df.index
master_df = pd.read_csv(r"C:\Users\Lenovo\Downloads\Project1_VT_Dataset-20250603T114935Z-1-001\Project1_VT_Dataset\master_data.csv")
master_df.head()

Unnamed: 0,subject_id,hr,hrv
0,1,95.579696,66.052442
1,2,60.898322,90.738742
2,3,82.398083,91.914974
3,4,76.918197,64.998944
4,5,86.345628,89.120312


In [45]:
# ---MERGING MASTER DATA AND 20 DATAs OF SUBJECTS----

df = df.merge(master_df, on='subject_id', how='left')
df

Unnamed: 0,time,ppg_signal,subject_id,hr,hrv
0,0.000000,-0.005162,1,95.579696,66.052442
1,0.010002,0.115503,1,95.579696,66.052442
2,0.020003,0.239821,1,95.579696,66.052442
3,0.030005,0.339529,1,95.579696,66.052442
4,0.040007,0.472772,1,95.579696,66.052442
...,...,...,...,...,...
119995,59.959993,0.267373,9,70.082284,93.794391
119996,59.969995,0.335932,9,70.082284,93.794391
119997,59.979997,0.422308,9,70.082284,93.794391
119998,59.989998,0.493671,9,70.082284,93.794391


In [48]:

# ----TAKING VALUE ONLY OF SUBJECT 1 ---------

sub1_ppg = df[df['subject_id']==1]
sub1_ppg

Unnamed: 0,time,ppg_signal,subject_id,hr,hrv
0,0.000000,-0.005162,1,95.579696,66.052442
1,0.010002,0.115503,1,95.579696,66.052442
2,0.020003,0.239821,1,95.579696,66.052442
3,0.030005,0.339529,1,95.579696,66.052442
4,0.040007,0.472772,1,95.579696,66.052442
...,...,...,...,...,...
5995,59.959993,-0.002160,1,95.579696,66.052442
5996,59.969995,-0.001314,1,95.579696,66.052442
5997,59.979997,0.000871,1,95.579696,66.052442
5998,59.989998,-0.014353,1,95.579696,66.052442


In [49]:
# ------------PLOTING PPG SIGNALS OVER THE TIME FOR THE SUBJECT 1


px.line(sub1_ppg,x='time' ,y='ppg_signal', title="PPG Signal Over Time For subject 1", markers=True)

In [50]:
# --------- PLOTING PPG SIGNALS OVER TIME ------------

px.line(df,x='time' ,y='ppg_signal', title="PPG Signal Over Time", markers=True)

In [51]:
# -----------PLOTING PEAKS ON PPG SIGNALS-----

import pandas as pd
import plotly.graph_objects as go
from scipy.signal import find_peaks

# Prepare PPG data
df['ppg_signal'] = df['ppg_signal'].apply(lambda x: eval(x) if isinstance(x, str) else x)
merged_df = df.explode('ppg_signal').reset_index(drop=True)
merged_df['ppg_signal'] = pd.to_numeric(merged_df['ppg_signal'], errors='coerce')
ppg = merged_df['ppg_signal'].values

# Find peaks
peaks, _ = find_peaks(ppg, distance=30, prominence=0.1)

# Plot
fig = go.Figure()
fig.add_trace(go.Scatter(y=ppg, mode='lines', name='PPG'))
fig.add_trace(go.Scatter(x=peaks, y=ppg[peaks], mode='markers', name='Peaks',
                         marker=dict(color='red', size=6)))
fig.update_layout(title='PPG Signal with Peaks', xaxis_title='Sample Index',
                  yaxis_title='PPG Signal', template='plotly_white')
fig.show()

In [52]:
# ----------------- EXTRACTING FEATURES-------------------



from scipy.signal import find_peaks, welch
from scipy.stats import skew, kurtosis

# Assuming df_combined is your full dataset
all_features = []

# Group by subject_id
for subject_id, group in df.groupby('subject_id'):
    signal = group['ppg_signal'].values
    time = group['time'].values

    # Sampling frequency
    fs = 1 / np.mean(np.diff(time))

    # Time-domain features
    mean_val = np.mean(signal)
    std_val = np.std(signal)
    var_val = np.var(signal)
    max_val = np.max(signal)
    min_val = np.min(signal)
    skew_val = skew(signal)
    kurt_val = kurtosis(signal)

    # Find peaks
    peaks, _ = find_peaks(signal, distance=50)
    num_peaks = len(peaks)

    if len(peaks) > 1:
        rr_intervals = np.diff(time[peaks]) * 1000
        mean_rr = np.mean(rr_intervals)
        std_rr = np.std(rr_intervals)
        median_rr = np.median(rr_intervals)
    else:
        mean_rr = std_rr = median_rr = 0

    # Derivative-based features
    diff_signal = np.diff(signal)
    mean_diff = np.mean(diff_signal)
    std_diff = np.std(diff_signal)
    max_diff = np.max(diff_signal)
    min_diff = np.min(diff_signal)

    # Frequency domain
    freqs, power = welch(signal, fs=fs, nperseg=min(256, len(signal)))
    total_power = np.sum(power)
    dominant_freq = freqs[np.argmax(power)]
    lf_power = np.sum(power[(freqs >= 0.04) & (freqs < 0.15)])
    hf_power = np.sum(power[(freqs >= 0.15) & (freqs < 0.4)])

 # Ground truth values (take first row — same for all within subject)
    hr = group['hr'].iloc[0]
    hrv = group['hrv'].iloc[0]

    # Append feature dictionary
    features = {
        'subject_id': subject_id,
        'mean_signal': mean_val,
        'std_signal': std_val,
        'var_signal': var_val,
        'max_signal': max_val,
        'min_signal': min_val,
        'skewness': skew_val,
        'kurtosis': kurt_val,
        'num_peaks': num_peaks,
        'mean_rr': mean_rr,
        'std_rr': std_rr,
        'median_rr': median_rr,
        'mean_diff': mean_diff,
        'std_diff': std_diff,
        'max_diff': max_diff,
        'min_diff': min_diff,
        'total_power': total_power,
        'dominant_freq': dominant_freq,
        'lf_power': lf_power,
        'hf_power': hf_power,
        'hr': hr,
        'hrv': hrv
    }

    all_features.append(features)

# Final DataFrame with all features
features_df = pd.DataFrame(all_features)

# Preview
print(features_df)


    subject_id  mean_signal  std_signal  var_signal  max_signal  min_signal  \
0            1     0.001781    0.474244    0.224908    0.802330   -0.804611   
1            2     0.000480    0.474579    0.225225    0.807301   -0.800685   
2            3     0.002182    0.474611    0.225256    0.801908   -0.806918   
3            4     0.000243    0.474527    0.225176    0.804416   -0.810989   
4            5     0.002223    0.474949    0.225577    0.802566   -0.802512   
5            6     0.001933    0.474697    0.225337    0.802001   -0.811811   
6            7     0.002645    0.474779    0.225415    0.805565   -0.810147   
7            8     0.002025    0.474718    0.225357    0.802440   -0.800681   
8            9     0.000402    0.474059    0.224732    0.801556   -0.818189   
9           10     0.001545    0.474971    0.225597    0.800360   -0.802970   
10          11     0.002319    0.474912    0.225542    0.803607   -0.808394   
11          12     0.001247    0.474743    0.225381 

In [53]:

# -------- APPLYING LINEAR REGRESSION MODEL ON EXTRACTED FEATURES-----



from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Drop non-feature and target columns
X = features_df.drop(columns=['subject_id', 'hr', 'hrv'])
y_hr = features_df['hr']
y_hrv = features_df['hrv']

# Split for HR prediction
X_train_hr, X_test_hr, y_train_hr, y_test_hr = train_test_split(X, y_hr, test_size=0.2, random_state=42)

# Train Linear Regression for HR
model_hr = LinearRegression()
model_hr.fit(X_train_hr, y_train_hr)

# Predict and Evaluate HR
y_pred_hr = model_hr.predict(X_test_hr)

mae_hr = mean_absolute_error(y_test_hr, y_pred_hr)
rmse_hr = np.sqrt(mean_squared_error(y_test_hr, y_pred_hr))
r2_hr = r2_score(y_test_hr, y_pred_hr)

print("📊 Heart Rate (HR) Prediction Results:")
print(f"MAE:  {mae_hr:.2f}")
print(f"RMSE: {rmse_hr:.2f}")
print(f"R²:   {r2_hr:.2f}\n")

# Split for HRV prediction
X_train_hrv, X_test_hrv, y_train_hrv, y_test_hrv = train_test_split(X, y_hrv, test_size=0.2, random_state=42)

# Train Linear Regression for HRV
model_hrv = LinearRegression()
model_hrv.fit(X_train_hrv, y_train_hrv)

# Predict and Evaluate HRV
y_pred_hrv = model_hrv.predict(X_test_hrv)

mae_hrv = mean_absolute_error(y_test_hrv, y_pred_hrv)
rmse_hrv = np.sqrt(mean_squared_error(y_test_hrv, y_pred_hrv))
r2_hrv = r2_score(y_test_hrv, y_pred_hrv)

print("📊 Heart Rate Variability (HRV) Prediction Results:")
print(f"MAE:  {mae_hrv:.2f}")
print(f"RMSE: {rmse_hrv:.2f}")
print(f"R²:   {r2_hrv:.2f}")


📊 Heart Rate (HR) Prediction Results:
MAE:  0.86
RMSE: 1.03
R²:   1.00

📊 Heart Rate Variability (HRV) Prediction Results:
MAE:  50.85
RMSE: 56.23
R²:   -16.20


In [54]:
# -----------------APPLING RANDOM FOREST MODEL ON EXTRACTED DATA-------------




from sklearn.ensemble import RandomForestRegressor
# Split features and targets
X = features_df.drop(columns=['subject_id', 'hr', 'hrv'])
y_hr = features_df['hr']
y_hrv = features_df['hrv']

# ---------------- HR Prediction ----------------
X_train_hr, X_test_hr, y_train_hr, y_test_hr = train_test_split(X, y_hr, test_size=0.2, random_state=42)

# Train Random Forest for HR
rf_hr = RandomForestRegressor(n_estimators=100, random_state=42)
rf_hr.fit(X_train_hr, y_train_hr)

# Predict and evaluate HR
y_pred_hr = rf_hr.predict(X_test_hr)
mae_hr = mean_absolute_error(y_test_hr, y_pred_hr)
rmse_hr = np.sqrt(mean_squared_error(y_test_hr, y_pred_hr))
r2_hr = r2_score(y_test_hr, y_pred_hr)

print("📊 Random Forest: Heart Rate (HR) Prediction")
print(f"MAE:  {mae_hr:.2f}")
print(f"RMSE: {rmse_hr:.2f}")
print(f"R²:   {r2_hr:.2f}\n")

# ---------------- HRV Prediction ----------------
X_train_hrv, X_test_hrv, y_train_hrv, y_test_hrv = train_test_split(X, y_hrv, test_size=0.2, random_state=42)

# Train Random Forest for HRV
rf_hrv = RandomForestRegressor(n_estimators=100, random_state=42)
rf_hrv.fit(X_train_hrv, y_train_hrv)

# Predict and evaluate HRV
y_pred_hrv = rf_hrv.predict(X_test_hrv)
mae_hrv = mean_absolute_error(y_test_hrv, y_pred_hrv)
rmse_hrv = np.sqrt(mean_squared_error(y_test_hrv, y_pred_hrv))
r2_hrv = r2_score(y_test_hrv, y_pred_hrv)

print("📊 Random Forest: Heart Rate Variability (HRV) Prediction")
print(f"MAE:  {mae_hrv:.2f}")
print(f"RMSE: {rmse_hrv:.2f}")
print(f"R²:   {r2_hrv:.2f}")


📊 Random Forest: Heart Rate (HR) Prediction
MAE:  4.49
RMSE: 5.36
R²:   0.87

📊 Random Forest: Heart Rate Variability (HRV) Prediction
MAE:  12.23
RMSE: 15.89
R²:   -0.37
