#### Goal
- Integrate cleaned Fitbit daily datasets (activity, sleep, resting HR)
- Produce one row per user per day
- Preserve missingness across modalities

#### Inputs
- fitbit_daily_activity_clean.csv
- fitbit_sleep_daily_clean.csv
- fitbit_restingHR_clean.csv

#### Output
- fitbit_user_day_clean.csv

#### Primary Key
- (user_id, date)

#### Joining
- Activity as base table
- LEFT JOIN sleep
- LEFT JOIN resting HR

In [1]:
import pandas as pd
import duckdb

In [2]:
con = duckdb.connect()

In [3]:
fitbit_daily_activity = pd.read_csv("data/clean/fitbit_daily_activity_clean.csv")

In [4]:
fitbit_sleep_daily = pd.read_csv("data/clean/fitbit_sleep_daily_clean.csv")

In [5]:
fitbit_restingHR = pd.read_csv("data/clean/fitbit_restingHR_clean.csv")

In [6]:
con.register("fitbit_daily_activity", fitbit_daily_activity)
con.register("fitbit_sleep_daily", fitbit_sleep_daily)
con.register("fitbit_restingHR", fitbit_restingHR)

<_duckdb.DuckDBPyConnection at 0x103d34930>

In [7]:
fitbit_daily_activity.columns

Index(['Id', 'ActivityDate', 'TotalSteps', 'TrackerDistance',
       'VeryActiveMinutes', 'FairlyActiveMinutes', 'LightlyActiveMinutes',
       'SedentaryMinutes', 'Calories', 'source', 'activity_date',
       'steps_clean', 'steps_flag_outlier', 'minutes_flag_over_1440'],
      dtype='object')

In [8]:
fitbit_sleep_daily.columns

Index(['user_id', 'sleep_date', 'total_minutes_asleep', 'total_minutes_awake',
       'avg_sleep_efficiency', 'segment_count', 'has_unknown_main_sleep'],
      dtype='object')

In [9]:
fitbit_restingHR.columns

Index(['user_id', 'rhr_date', 'rhr_clean', 'rhr_flag_invalid'], dtype='object')

In [10]:
con.execute("""CREATE OR REPLACE VIEW fitbit_user_day_step1 AS
SELECT* EXCLUDE (Id, ActivityDate, activity_date),
Id AS user_id,
activity_date AS date
FROM fitbit_daily_activity""")

<_duckdb.DuckDBPyConnection at 0x103d34930>

In [11]:
con.execute("SELECT COUNT(*) AS rows FROM fitbit_user_day_step1").df()

Unnamed: 0,rows
0,14299


In [12]:
con.execute("DESCRIBE fitbit_user_day_step1").df()

Unnamed: 0,column_name,column_type,null,key,default,extra
0,TotalSteps,DOUBLE,YES,,,
1,TrackerDistance,DOUBLE,YES,,,
2,VeryActiveMinutes,DOUBLE,YES,,,
3,FairlyActiveMinutes,DOUBLE,YES,,,
4,LightlyActiveMinutes,DOUBLE,YES,,,
5,SedentaryMinutes,DOUBLE,YES,,,
6,Calories,DOUBLE,YES,,,
7,source,VARCHAR,YES,,,
8,steps_clean,DOUBLE,YES,,,
9,steps_flag_outlier,BIGINT,YES,,,


In [13]:
con.execute("DESCRIBE fitbit_sleep_daily").df()

Unnamed: 0,column_name,column_type,null,key,default,extra
0,user_id,VARCHAR,YES,,,
1,sleep_date,VARCHAR,YES,,,
2,total_minutes_asleep,DOUBLE,YES,,,
3,total_minutes_awake,DOUBLE,YES,,,
4,avg_sleep_efficiency,DOUBLE,YES,,,
5,segment_count,BIGINT,YES,,,
6,has_unknown_main_sleep,BIGINT,YES,,,


In [14]:
con.execute("""CREATE OR REPLACE VIEW fitbit_user_day_step2 AS
SELECT
a.*,
s.total_minutes_asleep,
s.total_minutes_awake,
s.avg_sleep_efficiency,
s.segment_count,
s.has_unknown_main_sleep
FROM fitbit_user_day_step1 a
LEFT JOIN fitbit_sleep_daily s
ON a.user_id = s.user_id
AND a.date = s.sleep_date""")

<_duckdb.DuckDBPyConnection at 0x103d34930>

In [15]:
con.execute("SELECT COUNT(*) AS rows FROM fitbit_user_day_step2").df()

Unnamed: 0,rows
0,14299


In [16]:
con.execute("""SELECT COUNT(*) AS rows, 
SUM(CASE WHEN total_minutes_asleep IS NOT NULL THEN 1 ELSE 0 END) AS sleep_matched
FROM fitbit_user_day_step2""").df()

Unnamed: 0,rows,sleep_matched
0,14299,12005.0


In [17]:
con.execute("""SELECT user_id, rhr_date,
COUNT(*) AS n_rows
FROM fitbit_restingHR
GROUP BY user_id, rhr_date
HAVING COUNT(*) > 1
ORDER BY n_rows DESC
LIMIT 10""").df()

Unnamed: 0,user_id,rhr_date,n_rows
0,U_5NQ4FMYZYCWT,2024-09-13,2
1,U_5NQ4FMYZYCWT,2024-10-03,2
2,U_5NQ4FMYZYCWT,2024-11-15,2
3,U_W2PCN9T84AZY,2024-04-03,2
4,U_W2PCN9T84AZY,2024-06-09,2
5,U_W2PCN9T84AZY,2024-11-01,2
6,U_W2PCN9T84AZY,2024-12-01,2
7,U_1ERTJ5PHT0HL,2024-03-23,2
8,U_1ERTJ5PHT0HL,2024-08-25,2
9,U_F2GOG72OY0IF,2024-03-11,2


In [18]:
con.execute("""CREATE OR REPLACE VIEW fitbit_restingHR_dedup_daily AS
SELECT 
user_id, 
rhr_date, 
avg(rhr_clean) AS rhr_clean_daily, 
max(rhr_flag_invalid) AS rhr_flag_invalid_daily 
FROM fitbit_restingHR
GROUP BY user_id, rhr_date""")

<_duckdb.DuckDBPyConnection at 0x103d34930>

In [19]:
con.execute("""SELECT user_id, rhr_date,
COUNT(*) AS n_rows
FROM fitbit_restingHR_dedup_daily
GROUP BY user_id, rhr_date
HAVING COUNT(*) > 1
ORDER BY n_rows DESC
LIMIT 10""").df()

Unnamed: 0,user_id,rhr_date,n_rows


In [20]:
con.execute("""CREATE OR REPLACE VIEW fitbit_user_day_step3 AS
SELECT
a.*,
r.rhr_clean_daily,
r.rhr_flag_invalid_daily
FROM fitbit_user_day_step2 a
LEFT JOIN fitbit_restingHR_dedup_daily r
ON a.user_id = r.user_id
AND a.date = r.rhr_date""")

<_duckdb.DuckDBPyConnection at 0x103d34930>

In [21]:
con.execute("""SELECT COUNT(*) AS rows,
SUM(CASE WHEN rhr_clean_daily IS NOT NULL THEN 1 ELSE 0 END) AS rhr_matched
FROM fitbit_user_day_step3""").df()

Unnamed: 0,rows,rhr_matched
0,14299,12483.0


In [22]:
con.execute("""SELECT user_id, date, COUNT(*) AS rows FROM fitbit_user_day_step3
GROUP BY user_id, date
HAVING COUNT(*) > 1
""").df()

Unnamed: 0,user_id,date,rows


In [23]:
con.execute("""SELECT COUNT(*) AS total_rows,
SUM(CASE WHEN total_minutes_asleep IS NULL THEN 1 ELSE 0 END) AS missing_sleep,
SUM(CASE WHEN rhr_clean_daily IS NULL THEN 1 ELSE 0 END) AS missing_RHR
FROM fitbit_user_day_step3""").df()

Unnamed: 0,total_rows,missing_sleep,missing_RHR
0,14299,2294.0,1816.0


In [24]:
con.execute("""COPY fitbit_user_day_step3
TO 'data/clean/fitbit_user_day_clean.csv'
(HEADER, DELIMITER ',')""")

<_duckdb.DuckDBPyConnection at 0x103d34930>

- Integrated Fitbit daily activity + sleep + resting HR at user-day grain using LEFT JOINs from activity.
- Missingness preserved (sleep missing 16.0%, RHR missing 12.7%).