# Load all the data and libraries

In [1]:
path = '/nesi/nobackup/aut03802/dataset_sleep/sleep_classifiers/'
import os
import time
os.chdir(path)

In [2]:
from source.analysis.figures.data_plot_builder import DataPlotBuilder
from source.analysis.setup.subject_builder import SubjectBuilder
from source.constants import Constants
from source.preprocessing.activity_count.activity_count_service import ActivityCountService
from source.preprocessing.feature_builder import FeatureBuilder
from source.preprocessing.raw_data_processor import RawDataProcessor
from source.preprocessing.time.circadian_service import CircadianService

define the function

In [3]:
def run_preprocessing(subject_set):
    start_time = time.time()

    for subject in subject_set:
        print("Cropping data from subject " + str(subject) + "...")
        RawDataProcessor.crop_all(str(subject))

    if Constants.INCLUDE_CIRCADIAN:
        ActivityCountService.build_activity_counts()  # This uses MATLAB, but has been replaced with a python implementation
        CircadianService.build_circadian_model()      # Both of the circadian lines require MATLAB to run
        CircadianService.build_circadian_mesa()       # INCLUDE_CIRCADIAN = False by default because most people don't have MATLAB

    for subject in subject_set:
        FeatureBuilder.build(str(subject))

    end_time = time.time()
    print("Execution took " + str((end_time - start_time) / 60) + " minutes")

# getting subject ids

In [4]:
import os

# Define the paths
main_path = '/nesi/project/aut03802/Data/Data_uncompressed/Big Ideas Lab/big-ideas-lab-glycemic-variability-and-wearable-device-data-1.1.1'
processed_dir = os.path.join(main_path, 'processed')
path_to_motion = os.path.join(processed_dir, 'motion')

# List all files in the motion directory
motion_files = os.listdir(path_to_motion)

# Function to extract subject ID from the filename
def extract_subject_id(filename):
    return filename.split('_acceleration')[0]

# Set to hold unique subject IDs
subject_ids = set()

# Extract subject IDs
for filename in motion_files:
    subject_id = extract_subject_id(filename)
    subject_ids.add(subject_id)

subject_ids = list(subject_ids)

Changed the delimiter

In [5]:
import os

# Directory containing the acceleration files
directory = '/nesi/nobackup/aut03802/dataset_sleep/sleep_classifiers/data/motion'

# Iterate over all files in the directory
for filename in os.listdir(directory):
    filepath = os.path.join(directory, filename)
    if os.path.isfile(filepath):
        # Read the content of the file
        with open(filepath, 'r') as file:
            content = file.read()

        # Replace tabs ('\t') with spaces (' ')
        content = content.replace('\t', ' ')

        # Write the modified content back to the file
        with open(filepath, 'w') as file:
            file.write(content)

print("Delimiter changed successfully.")

Delimiter changed successfully.


In [6]:
path_to_heart_rate= os.path.join(processed_dir,'heart_rate')

In [7]:
run_preprocessing(subject_ids)

Cropping data from subject participant_016_day_1...
type <class 'list'>
type <class 'list'>
type <class 'list'>
Cropping data from subject participant_007_day_3...
type <class 'list'>
type <class 'list'>
type <class 'list'>
Cropping data from subject participant_010_day_1...
type <class 'list'>
type <class 'list'>
type <class 'list'>
Cropping data from subject participant_004_day_4...
type <class 'list'>
type <class 'list'>
type <class 'list'>
Cropping data from subject participant_006_day_1...
type <class 'list'>
type <class 'list'>
type <class 'list'>
Cropping data from subject participant_008_day_6...
type <class 'list'>
type <class 'list'>
type <class 'list'>
Cropping data from subject participant_011_day_9...
type <class 'list'>
type <class 'list'>
type <class 'list'>
Cropping data from subject participant_012_day_2...
type <class 'list'>
type <class 'list'>
type <class 'list'>
Cropping data from subject participant_011_day_6...
type <class 'list'>
type <class 'list'>
type <class 

In [25]:
import os
import pandas as pd

# Path to the directory containing the feature files
directory = '/nesi/nobackup/aut03802/dataset_sleep/sleep_classifiers/outputs/features'

# Initialize a dictionary to hold dataframes for each participant and day
participants_data = {}

# List all files in the directory
for filename in os.listdir(directory):
    if filename.startswith('participant') and filename.endswith('.out'):
        # Extract participant number and other relevant information from filename
        parts = filename.split('_')
        participant = parts[1]
        day = parts[3]
        feature = '_'.join(parts[4:]).replace('.out', '')

        # Read the file into a dataframe
        file_path = os.path.join(directory, filename)
        df = pd.read_csv(file_path, header=None, names=[feature])

        # Group data by participant and day
        key = (participant, day)
        if key in participants_data:
            participants_data[key].append(df)
        else:
            participants_data[key] = [df]

# Concatenate all features for each participant and day
for key, dfs in participants_data.items():
    participant, day = key
    combined_df = pd.concat(dfs, axis=1)
    # Optionally, you can save each combined DataFrame to a separate file
    output_path = f'/nesi/nobackup/aut03802/dataset_sleep/sleep_classifiers/outputs/features/{participant}_day_{day}_combined.csv'
    combined_df.to_csv(output_path, index=False)


In [12]:
combined_df.columns

Index(['day_7_hr_feature', 'participant', 'day', 'day_2_psg_labels',
       'day_5_count_feature', 'day_7_count_feature', 'day_5_psg_labels',
       'day_2_hr_feature', 'day_4_psg_labels', 'day_8_cosine_feature',
       'day_5_cosine_feature', 'day_2_cosine_feature', 'day_1_hr_feature',
       'day_1_psg_labels', 'day_1_cosine_feature', 'day_1_count_feature',
       'day_8_count_feature', 'day_1_time_feature', 'day_5_hr_feature',
       'day_6_time_feature', 'day_4_time_feature', 'day_4_cosine_feature',
       'day_3_hr_feature', 'day_6_hr_feature', 'day_9_cosine_feature',
       'day_3_count_feature', 'day_6_psg_labels', 'day_4_count_feature',
       'day_6_count_feature', 'day_2_count_feature', 'day_2_time_feature',
       'day_3_psg_labels', 'day_9_hr_feature', 'day_8_hr_feature',
       'day_3_cosine_feature', 'day_6_cosine_feature', 'day_4_hr_feature',
       'day_3_time_feature', 'day_7_psg_labels', 'day_9_count_feature',
       'day_7_cosine_feature', 'day_8_psg_labels', 'day_9_

In [22]:
participants_data

{'009':      7_hr_feature  participant  day  5_count_feature  participant  day  \
 0        0.957666          009    7        35.880550          009    5   
 1        0.994761          009    7       555.895277          009    5   
 2        1.007722          009    7       828.579450          009    5   
 3        0.999412          009    7      1009.517335          009    5   
 4        0.980481          009    7      1038.074183          009    5   
 ..            ...          ...  ...              ...          ...  ...   
 934           NaN          NaN  NaN              NaN          NaN  NaN   
 935           NaN          NaN  NaN              NaN          NaN  NaN   
 936           NaN          NaN  NaN              NaN          NaN  NaN   
 937           NaN          NaN  NaN              NaN          NaN  NaN   
 938           NaN          NaN  NaN              NaN          NaN  NaN   
 
      5_psg_labels  participant  day  5_hr_feature  ...  day 4_count_feature  \
 0         

In [1]:
import os 


In [2]:
import os
import pandas as pd

# Path to the directory containing the feature files
directory = '/nesi/nobackup/aut03802/dataset_sleep/sleep_classifiers/outputs/features'

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from yellowbrick.classifier import ClassificationReport
from sklearn.preprocessing import StandardScaler

# Path to the combined CSV file
combined_csv_path = '/nesi/nobackup/aut03802/dataset_sleep/sleep_classifiers/outputs/features/combined_features.csv'  # change this to your actual file path

# Read the combined CSV file into a DataFrame
combined_df = pd.read_csv(combined_csv_path)
combined_df[combined_df['psg_labels']==4.0]=3.0

# Define the feature columns and the target column
feature_cols = ['hr_feature', 'cosine_feature', 'count_feature', 'time_feature']
target_col = 'psg_labels'
scaler = StandardScaler()

In [19]:
for file in os.listdir(directory):
    if not file.startswith('participant') and not file.endswith('.out') and 'features' not in file and file.endswith('.csv'):
        path_file=os.path.join(directory,file)
        data=pd.read_csv(path_file)
        data[data['psg_labels']==4.0]=3.0
        X = data[feature_cols]
        y = data[target_col]
        
        X_scaled = scaler.fit_transform(X)
        y_pred = model.predict(X_scaled)
        data['psg_predicted']=y_pred
        data.to_csv(path_file)
        

In [12]:
import pickle
path_to_model='/nesi/nobackup/aut03802/dataset_sleep/sleep_classifiers/outputs/model_files/best_classifier.pkl'

# Replace 'model.pkl' with the path to your .pkl file
with open(path_to_model, 'rb') as file:
    model = pickle.load(file)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [18]:
y_pred

array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 3., 1., 0., 1., 1., 0.,
       1., 0., 0., 0., 2., 5., 5., 2., 5., 5., 5., 2., 2., 1., 3., 2., 2.,
       3., 3., 3., 2., 2., 1., 2., 2., 3., 3., 1., 2., 2., 1., 1., 1., 0.,
       0., 1., 1., 1., 1.