Dataset generation

In [3]:
# ===============================
# 1. Imports
# ===============================

!pip install -q scikit-fuzzy

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import skfuzzy as fuzz
from skfuzzy import control as ctrl

np.random.seed(42)

# ===============================
# 2. Synthetic Dataset Generation (Research-Grade)
# ===============================

N_PATIENTS = 2000
WINDOWS = 30
rows = []

for pid in range(1, N_PATIENTS + 1):   # Start IDs from 1
    base_hr = np.random.uniform(55, 105)
    base_noise = np.random.uniform(1, 5)

    for t in range(WINDOWS):
        hr = base_hr + np.random.normal(0, base_noise)
        hr = round(hr, 1)  # Realistic precision

        anomaly = 0
        if np.random.rand() < 0.12:
            hr += np.random.choice([-40, +40])
            anomaly = 1

        motion = np.random.choice([0,1], p=[0.85, 0.15])
        battery = round(100 - (t * 0.3), 1)
        rows.append([pid, t, hr, base_noise, anomaly, motion, battery])

synthetic = pd.DataFrame(rows, columns=[
    'patient_id','time_step','heart_rate',
    'sensor_noise','anomaly_flag',
    'motion_artifact','battery_level'
])

# Real timestamps
start_time = pd.Timestamp("2025-01-01 00:00:00")
synthetic['timestamp'] = start_time + pd.to_timedelta(synthetic['time_step'], unit='min')

# Clinical labeling (independent logic)
def clinical_label(hr):
    if hr < 40: return 'BLACK'
    elif hr < 60: return 'RED'
    elif hr <= 100: return 'GREEN'
    elif hr <= 130: return 'YELLOW'
    else: return 'RED'

synthetic['label'] = synthetic['heart_rate'].apply(clinical_label)

print(synthetic['label'].value_counts())

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/920.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m911.4/920.8 kB[0m [31m44.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m920.8/920.8 kB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25hlabel
GREEN     43562
RED        7407
YELLOW     7111
BLACK      1920
Name: count, dtype: int64


Real-Time Windowing

In [4]:
windowed = synthetic.groupby(
    ['patient_id', 'time_step']
).agg({
    'heart_rate': ['mean','std','min','max']
}).reset_index()

windowed.columns = [
    'patient_id','time_step',
    'hr_mean','hr_std','hr_min','hr_max'
]

# Apply LABELS AFTER windowing
import random

def noisy_clinical_label(hr):
    label = clinical_label(hr)

    # 8% realistic mislabeling
    if random.random() < 0.08:
        label = random.choice(['GREEN','YELLOW','RED','BLACK'])
    return label

windowed['label'] = windowed['hr_mean'].apply(noisy_clinical_label)

print(windowed.head())

   patient_id  time_step  hr_mean  hr_std  hr_min  hr_max   label
0           1          0     76.8     NaN    76.8    76.8   GREEN
1           1          1    121.0     NaN   121.0   121.0  YELLOW
2           1          2    115.6     NaN   115.6   115.6  YELLOW
3           1          3     69.3     NaN    69.3    69.3   GREEN
4           1          4     71.0     NaN    71.0    71.0   GREEN


Table-Based Model

In [5]:
def table_model(hr):
    return clinical_label(hr)

windowed['table_pred'] = windowed['hr_mean'].apply(table_model)

Aggregation Model

In [6]:
windowed['CHRE'] = (
    windowed['hr_mean'] - windowed['hr_mean'].min()
) / (
    windowed['hr_mean'].max() - windowed['hr_mean'].min()
)

windowed['agg_pred'] = pd.qcut(
    windowed['CHRE'],
    q=4,
    labels=['GREEN','YELLOW','RED','BLACK']
)

Fuzzy Logic Model

In [7]:
hr = ctrl.Antecedent(np.arange(30,201,1), 'hr')
triage = ctrl.Consequent(np.arange(0,4,1), 'triage')

hr['brady']  = fuzz.trapmf(hr.universe, [30,30,45,60])
hr['normal'] = fuzz.trapmf(hr.universe, [55,70,90,110])
hr['tachy']  = fuzz.trapmf(hr.universe, [100,130,200,200])

triage['green'] = fuzz.trimf(triage.universe,[0,0,1])
triage['yellow'] = fuzz.trimf(triage.universe,[1,1,2])
triage['red'] = fuzz.trimf(triage.universe,[2,2,3])
triage['black'] = fuzz.trimf(triage.universe,[3,3,3])

rules = [
    ctrl.Rule(hr['normal'], triage['green']),
    ctrl.Rule(hr['brady'], triage['red']),
    ctrl.Rule(hr['tachy'], triage['yellow'])
]

system = ctrl.ControlSystem(rules)
sim = ctrl.ControlSystemSimulation(system)

def fuzzy_model(x):
    sim.input['hr'] = x
    sim.compute()
    o = sim.output['triage']
    if o < 0.5: return 'GREEN'
    elif o < 1.5: return 'YELLOW'
    elif o < 2.5: return 'RED'
    else: return 'BLACK'

windowed['fuzzy_pred'] = windowed['hr_mean'].apply(fuzzy_model)

ML Model (No Label Leakage)

In [8]:
from sklearn.model_selection import GroupShuffleSplit

X = windowed[['hr_mean','hr_std','hr_min','hr_max']].copy()
X += np.random.normal(0, 0.8, X.shape)
y = windowed['label']
groups = windowed['patient_id']

gss = GroupShuffleSplit(test_size=0.2, random_state=42, n_splits=1)
train_idx, test_idx = next(gss.split(X, y, groups))

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

Evaluation

In [9]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("=== TABLE MODEL ===")
print(accuracy_score(y, windowed['table_pred']))

print("=== AGGREGATION MODEL ===")
print(accuracy_score(y, windowed['agg_pred']))

print("=== FUZZY MODEL ===")
print(accuracy_score(y, windowed['fuzzy_pred']))

print("=== ML MODEL (REALISTIC) ===")
print(accuracy_score(y_test, y_pred))

print("\nConfusion Matrix - ML")
print(confusion_matrix(y_test, y_pred))

print("\n=== CLASSIFICATION REPORT - ML (Precision / Recall / F1-score) ===")
print(classification_report(y_test, y_pred))


=== TABLE MODEL ===
0.9390166666666667
=== AGGREGATION MODEL ===
0.12376666666666666
=== FUZZY MODEL ===
0.8442
=== ML MODEL (REALISTIC) ===
0.9263333333333333

Confusion Matrix - ML
[[ 336  162   32   31]
 [   4 7989   59   74]
 [   4  208 1353   37]
 [   5  230   38 1438]]

=== CLASSIFICATION REPORT - ML (Precision / Recall / F1-score) ===
              precision    recall  f1-score   support

       BLACK       0.96      0.60      0.74       561
       GREEN       0.93      0.98      0.96      8126
         RED       0.91      0.84      0.88      1602
      YELLOW       0.91      0.84      0.87      1711

    accuracy                           0.93     12000
   macro avg       0.93      0.82      0.86     12000
weighted avg       0.93      0.93      0.92     12000



Save Dataset

In [10]:
synthetic.to_csv("/content/synthetic_HR_dataset.csv", index=False)
print("✅ Dataset saved")

✅ Dataset saved


In [11]:
windowed.to_csv("/content/windowed_HR_features.csv", index=False)
print("✅ Windowed features saved")

✅ Windowed features saved
