In [None]:
import pandas as pd
import numpy as np
import os
from collections import defaultdict

# Directory containing user folders
user_data_dir = "/Users/audrey/Desktop/dsc106/SleepTracker/data/user_data"

# Activity categories as per dataset documentation
activity_categories = {
    1: 'sleeping',
    2: 'laying down',
    3: 'sitting',
    4: 'light movement',
    5: 'medium movement',
    6: 'heavy activity',
    7: 'eating',
    8: 'small screen',
    9: 'large screen',
    10: 'caffeinated drink',
    11: 'smoking',
    12: 'alcohol'
}

# Create a dictionary for storing the result
user_activity_profiles = defaultdict(lambda: np.zeros(12))

# Loop through each user folder and calculate activity time
for user_folder in os.listdir(user_data_dir):
    activity_file = os.path.join(user_data_dir, user_folder, "Activity.csv")
    if os.path.exists(activity_file):
        try:
            df = pd.read_csv(activity_file)
            for i in range(1, 13):  # for each activity type
                total_time = df[df['Activity'] == i].apply(
                    lambda row: pd.Timedelta(
                        pd.to_datetime(row['End'].replace("24:00", "00:00"), format='%H:%M') -
                        pd.to_datetime(row['Start'].replace("24:00", "00:00"), format='%H:%M')
                    ).seconds / 60.0,
                    axis=1
                ).sum()
                user_activity_profiles[user_folder][i - 1] = total_time
        except Exception as e:
            print(f"Error reading {activity_file}: {e}")

# Convert to DataFrame for analysis
activity_profiles_df = pd.DataFrame.from_dict(user_activity_profiles, orient='index', columns=list(activity_categories.values()))

# Display (optional, in Jupyter)
activity_profiles_df.head()


Error reading /Users/audrey/Desktop/dsc106/SleepTracker/data/user_data/user_3/Activity.csv: time data "24:00" doesn't match format "%H:%M", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.
Error reading /Users/audrey/Desktop/dsc106/SleepTracker/data/user_data/user_4/Activity.csv: time data "24:00" doesn't match format "%H:%M", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.


Unnamed: 0,sleeping,laying down,sitting,light movement,medium movement,heavy activity,eating,small screen,large screen,caffeinated drink,smoking,alcohol
user_5,0.0,300.0,50.0,0.0,0.0,120.0,135.0,0.0,20.0,0.0,0.0,0.0
user_2,12.0,44.0,101.0,17.0,0.0,52.0,160.0,30.0,2.0,0.0,0.0,20.0
user_4,1.0,175.0,0.0,0.0,0.0,110.0,235.0,0.0,0.0,0.0,0.0,0.0
user_22,0.0,20.0,490.0,0.0,0.0,80.0,120.0,0.0,0.0,0.0,10.0,0.0
user_14,46.0,105.0,92.0,0.0,0.0,58.0,589.0,118.0,0.0,0.0,0.0,5.0


In [4]:
import os

# Start at your current notebook's directory
base_path = os.getcwd()

# See what's inside
for root, dirs, files in os.walk(base_path):
    for dir_name in dirs:
        print(os.path.join(root, dir_name))


/Users/audrey/Desktop/dsc106/SleepTracker/.git
/Users/audrey/Desktop/dsc106/SleepTracker/data
/Users/audrey/Desktop/dsc106/SleepTracker/.git/objects
/Users/audrey/Desktop/dsc106/SleepTracker/.git/info
/Users/audrey/Desktop/dsc106/SleepTracker/.git/logs
/Users/audrey/Desktop/dsc106/SleepTracker/.git/hooks
/Users/audrey/Desktop/dsc106/SleepTracker/.git/refs
/Users/audrey/Desktop/dsc106/SleepTracker/.git/branches
/Users/audrey/Desktop/dsc106/SleepTracker/.git/objects/pack
/Users/audrey/Desktop/dsc106/SleepTracker/.git/objects/info
/Users/audrey/Desktop/dsc106/SleepTracker/.git/logs/refs
/Users/audrey/Desktop/dsc106/SleepTracker/.git/logs/refs/heads
/Users/audrey/Desktop/dsc106/SleepTracker/.git/logs/refs/remotes
/Users/audrey/Desktop/dsc106/SleepTracker/.git/logs/refs/remotes/origin
/Users/audrey/Desktop/dsc106/SleepTracker/.git/refs/heads
/Users/audrey/Desktop/dsc106/SleepTracker/.git/refs/tags
/Users/audrey/Desktop/dsc106/SleepTracker/.git/refs/remotes
/Users/audrey/Desktop/dsc106/Sleep

Efficiency Chart Data

In [11]:

base_path = "data/user_data"
merged_users = []

for folder in os.listdir(base_path):
    folder_path = os.path.join(base_path, folder)

    if os.path.isdir(folder_path) and folder.startswith("user_"):
        user_id = int(folder.split("_")[1])

        sleep_path = os.path.join(folder_path, "sleep.csv")
        info_path = os.path.join(folder_path, "user_info.csv")

        if os.path.exists(sleep_path) and os.path.exists(info_path):
            sleep_df = pd.read_csv(sleep_path)
            info_df = pd.read_csv(info_path)

            sleep_df["user_id"] = user_id
            info_df["user_id"] = user_id

            merged = sleep_df.merge(info_df, on="user_id")
            merged_users.append(merged)

# Filter out empty DataFrames
non_empty_merged = [df for df in merged_users if not df.empty]

# Concatenate
all_users_df = pd.concat(non_empty_merged, ignore_index=True)

# Drop unwanted index columns
all_users_df.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y"], errors="ignore", inplace=True)

# Set index to user_id, sort by user_id
all_users_df.set_index("user_id", inplace=True)
all_users_df.sort_index(inplace=True)


# Preview
all_users_df.head()


Unnamed: 0_level_0,In Bed Date,In Bed Time,Out Bed Date,Out Bed Time,Onset Date,Onset Time,Latency,Efficiency,Total Minutes in Bed,Total Sleep Time (TST),Wake After Sleep Onset (WASO),Number of Awakenings,Average Awakening Length,Movement Index,Fragmentation Index,Sleep Fragmentation Index,Gender,Weight,Height,Age
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,2,03:57,2,07:30,2,03:57,0,92.02,213,196,17,9,1.89,8.92,0.0,8.92,M,65,169,29
1,2,00:46,2,03:31,2,00:46,0,87.27,165,144,21,9,2.33,9.091,10.0,19.091,M,65,169,29
2,2,00:50,2,06:22,2,00:54,4,73.49,332,244,84,18,4.67,15.06,5.556,20.616,M,95,183,27
3,1,22:29,1,05:52,1,22:32,3,79.23,443,351,89,16,5.56,18.962,0.0,18.962,M,70,174,34
4,2,00:57,2,07:10,2,01:01,4,85.52,373,319,50,28,1.79,8.847,14.286,23.133,M,76,180,27


Write to csv

In [12]:
#all_users_df.to_csv("data/clean_data/user_sleep_data.csv", index=True)