In [2]:
import pandas as pd
import numpy as np
import os

def combine_data(data_dir='downsampled_data'):
    """Load every participant CSV in ``data_dir`` and stack them into one DataFrame.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing the participant CSV files. Defaults to
        'downsampled_data', the location this function previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Rows of all '.csv' files concatenated in sorted-filename order, with a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``data_dir`` contains no '.csv' files.
    """
    # os.listdir returns entries in arbitrary order; sort so the row order (and
    # therefore any seeded downstream split) is reproducible across machines.
    csv_files = sorted(
        name for name in os.listdir(data_dir) if name.endswith('.csv')
    )
    if not csv_files:
        raise ValueError(f"No CSV files found in {data_dir!r}")

    # Build each path from data_dir so we read from the same directory we listed.
    frames = [pd.read_csv(os.path.join(data_dir, name)) for name in csv_files]

    # Combine all data into a single DataFrame
    return pd.concat(frames, ignore_index=True)

In [6]:
from sklearn.model_selection import train_test_split

# Load all participant CSV data into a single frame.
combined_data = combine_data()

# Optional: keep only the original columns by uncommenting the two lines below.
# orig_cols = ['sleep_stage', 'BVP_mean', 'IBI_mean', 'EDA_mean', 'TEMP_mean', 'HR_mean', 'mag_mean' ]
# combined_data = combined_data[orig_cols]

# The sleep-stage label is the target; every remaining column is a feature.
y = combined_data['sleep_stage']
X = combined_data.drop(columns='sleep_stage')

# 80% train / 20% test, stratified on the label so both parts preserve the
# class distribution; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [7]:
from sklearn.dummy import DummyClassifier

# Majority-class baseline: always predicts the most frequent label seen in y_train.
# The features are ignored; fitting only records that majority class.
baseline_model = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)

# Accuracy of that constant prediction on the held-out test split
baseline_accuracy = baseline_model.score(X_test, y_test)
print(f"Baseline Accuracy: {baseline_accuracy:.2%}")

Baseline Accuracy: 50.23%


In [10]:
# Share of rows per sleep-stage label, as fractions that sum to 1
(
    combined_data['sleep_stage']
    .value_counts(normalize=True)
)

sleep_stage
N2         0.502300
W          0.245164
N1         0.112338
R          0.107053
N3         0.032612
Missing    0.000533
Name: proportion, dtype: float64

In [11]:
# Both baselines below draw their predictions at random, so each is seeded;
# without a seed the reported accuracies change on every run.

# Stratified baseline (predictions sampled to match the training class distribution)
stratified_model = DummyClassifier(strategy="stratified", random_state=42)
stratified_model.fit(X_train, y_train)
stratified_accuracy = stratified_model.score(X_test, y_test)

# Uniform random baseline (every class equally likely)
uniform_model = DummyClassifier(strategy="uniform", random_state=42)
uniform_model.fit(X_train, y_train)
uniform_accuracy = uniform_model.score(X_test, y_test)

print(f"Stratified Accuracy: {stratified_accuracy:.2%}")
print(f"Uniform Random Accuracy: {uniform_accuracy:.2%}")

Stratified Accuracy: 33.95%
Uniform Random Accuracy: 16.73%
