In [1]:
!pip install duckdb



In [2]:
import duckdb
import pandas as pd
import matplotlib.pyplot as plt

### User devices
- No time series metrics
- not for modeling, can be used later to explain coverage

In [3]:
# Read the raw device registry as-is (path relative to the working directory);
# no dtypes or date parsing are requested, so nothing is cleaned at load time.
devices_registry = pd.read_csv("wearables_synthetic_raw/devices_registry.csv")

In [4]:
devices_registry.head()

Unnamed: 0,user_id,vendor,model,device_id,paired_at,firmware,active_flag
0,U_XAJI0Y6DPBHS,Oura,Oura Ring Gen3,D_6WIXAW63,2024-02-19,3.4,TRUE
1,U_5NQ4FMYZYCWT,Fitbit,Fitbit Versa 4,D_3DR2H2P3,2025-01-12,,0
2,U_W2PCN9T84AZY,Fitbit,Fitbit Versa 4,D_UE421VME,2024-03-13,3.4,1
3,U_W2PCN9T84AZY,Samsung,Samsung Galaxy Watch6,D_ZTP7BD4W,2025-02-02,2.1.0,1
4,U_1ERTJ5PHT0HL,Fitbit,Fitbit Versa 4,D_A7ZVKY2R,2024-03-05,2.1.0,1


In [5]:
devices_registry.describe()

Unnamed: 0,user_id,vendor,model,device_id,paired_at,firmware,active_flag
count,221,221,221,221,221,185,190
unique,120,5,8,221,169,5,4
top,U_XEK47O3LQI7E,Apple,Oura Ring Gen3,D_6WIXAW63,2024-08-05,2.1.0,1
freq,3,55,33,1,3,48,95


In [6]:
devices_registry.shape

(221, 7)

In [7]:
devices_registry.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      221 non-null    object
 1   vendor       221 non-null    object
 2   model        221 non-null    object
 3   device_id    221 non-null    object
 4   paired_at    221 non-null    object
 5   firmware     185 non-null    object
 6   active_flag  190 non-null    object
dtypes: object(7)
memory usage: 12.2+ KB


### Users
- mostly clean, but sex is inconsistently coded (e.g. `M`, `F`, `female`, `Unknown`) and partly missing
- baseline_weight has no unit and is inconsistent
- home_tz is present but users may travel
- not ground truth

In [8]:
# Read the raw users table with default type inference; nothing is normalised at load time.
users = pd.read_csv("wearables_synthetic_raw/users.csv")

In [9]:
users.head()

Unnamed: 0,user_id,email_hash,age,sex,height_cm,baseline_weight,home_tz
0,U_XAJI0Y6DPBHS,5501e76c0ed6b0ffea1b8be48fdb4697,34,Unknown,165,91.7,Europe/Berlin
1,U_5NQ4FMYZYCWT,0942476dafed98f4081ed2e380d92bab,43,M,164,75.8,Asia/Tokyo
2,U_W2PCN9T84AZY,b0198e2c87e43c6262afa887ea89b1b6,43,female,173,71.5,Asia/Karachi
3,U_1ERTJ5PHT0HL,12bdf3ddb4d87bd4dc83166db001549b,35,,162,71.6,Asia/Karachi
4,U_OM5IGQPKI7P5,42ab6d0178ca086e55641dda5afa182a,32,F,149,49.0,America/Los_Angeles


In [10]:
users.describe()

Unnamed: 0,age,height_cm,baseline_weight
count,120.0,120.0,120.0
mean,31.208333,164.833333,82.0275
std,6.393137,8.283154,18.788558
min,18.0,145.0,45.0
25%,27.0,159.0,68.175
50%,32.0,164.0,82.25
75%,35.0,171.25,93.55
max,48.0,187.0,145.0


In [11]:
users.shape

(120, 7)

In [12]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   user_id          120 non-null    object 
 1   email_hash       120 non-null    object 
 2   age              120 non-null    int64  
 3   sex              99 non-null     object 
 4   height_cm        120 non-null    int64  
 5   baseline_weight  120 non-null    float64
 6   home_tz          120 non-null    object 
dtypes: float64(1), int64(2), object(4)
memory usage: 6.7+ KB


### Apple Health
- event-based, not daily
- everything is in one column (value)
- meaning depends on type column
- units differ
- sleep is not a number
- multiple rows per user per day

In [13]:
# Read the Apple Health export-like file; startDate/endDate are loaded without
# date parsing (dtype object in the info() output below).
apple_health = pd.read_csv("wearables_synthetic_raw/apple_health_export_like.csv")

In [14]:
apple_health.head()

Unnamed: 0,user,type,startDate,endDate,value,unit,sourceName,device,metadata
0,U_OM5IGQPKI7P5,HKCategoryTypeIdentifierSleepAnalysis,2024-01-01T00:00:00,2024-01-02T00:00:00,13242.0,min,,Apple Watch Series 8,"{alg:v2,confidence:0.8}"
1,U_OM5IGQPKI7P5,HKQuantityTypeIdentifierStepCount,2024-01-02T00:00:00,2024-01-03T00:00:00,48.3,count,Health,Apple Watch Ultra,"{alg:v2,confidence:0.8}"
2,U_OM5IGQPKI7P5,HKCategoryTypeIdentifierSleepAnalysis,2024-01-03T00:00:00,2024-01-04T00:00:00,10489.0,min,iPhone,Apple Watch Ultra,{workout:run}
3,U_OM5IGQPKI7P5,HKQuantityTypeIdentifierRestingHeartRate,2024-01-04T00:00:00,2024-01-05T00:00:00,11030.0,bpm,Health,Apple Watch Series 8,"{alg:v2,confidence:0.8}"
4,U_OM5IGQPKI7P5,HKQuantityTypeIdentifierStepCount,2024-01-05 00:00:00,2024-01-06T00:00:00,2243.6,count,Apple Watch,Apple Watch Series 8,"{alg:v2,confidence:0.8}"


In [15]:
apple_health.describe()

Unnamed: 0,value
count,16373.0
mean,3544.474458
std,3253.663708
min,0.0
25%,1818.8
50%,2282.0
75%,5700.0
max,15441.0


In [16]:
apple_health.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17399 entries, 0 to 17398
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   user        17399 non-null  object 
 1   type        17399 non-null  object 
 2   startDate   17399 non-null  object 
 3   endDate     17399 non-null  object 
 4   value       16373 non-null  float64
 5   unit        14893 non-null  object 
 6   sourceName  13144 non-null  object 
 7   device      11590 non-null  object 
 8   metadata    12962 non-null  object 
dtypes: float64(1), object(8)
memory usage: 1.2+ MB


In [17]:
apple_health.shape

(17399, 9)

### Fitbit Daily Activity
- missing Id values
- multiple activity intensity columns
- calories inconsistent
- sedentaryMinutes stable?
- provides the richest activity breakdown but still requires completeness checks.

In [18]:
# Read the Fitbit daily activity file; ActivityDate is loaded without date parsing
# (dtype object in the info() output below) because the preview shows mixed formats.
fitbit_daily_activity = pd.read_csv("wearables_synthetic_raw/fitbit_dailyActivity_2024.csv")

In [19]:
fitbit_daily_activity.head()

Unnamed: 0,Id,ActivityDate,TotalSteps,TrackerDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories,source
0,U_5NQ4FMYZYCWT,01-Jan-2024,10233.0,7.31,,22.0,38.0,1225.0,2617.0,Fitbit
1,U_5NQ4FMYZYCWT,2024-01-02,7380.0,5.27,26.0,24.0,33.0,,1841.0,Fitbit
2,U_5NQ4FMYZYCWT,03-Jan-2024,11229.0,8.02,77.0,31.0,73.0,1234.0,2107.0,Fitbit
3,U_5NQ4FMYZYCWT,2024-01-04,6515.0,4.65,19.0,38.0,104.0,1306.0,2216.0,Fitbit
4,U_5NQ4FMYZYCWT,01/05/2024,10808.0,7.72,58.0,38.0,40.0,,2367.0,Fitbit


In [20]:
fitbit_daily_activity.describe()

Unnamed: 0,TotalSteps,TrackerDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories
count,15095.0,14329.0,13722.0,13768.0,13701.0,13978.0,14809.0
mean,6798.779,4.845919,39.390759,19.92025,110.406175,1293.857777,2163.29212
std,2575.168771,1.844863,23.887781,11.832044,63.446038,57.365587,278.818776
min,0.0,0.0,0.0,0.0,0.0,1137.0,1200.0
25%,5075.0,3.61,22.0,10.0,55.0,1248.0,1975.0
50%,6771.0,4.83,39.0,20.0,111.0,1294.0,2163.0
75%,8523.5,6.07,56.0,30.0,165.0,1340.0,2349.0
max,15982.0,11.42,128.0,40.0,220.0,1439.0,3233.0


In [21]:
fitbit_daily_activity.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15595 entries, 0 to 15594
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Id                    14299 non-null  object 
 1   ActivityDate          15595 non-null  object 
 2   TotalSteps            15095 non-null  float64
 3   TrackerDistance       14329 non-null  float64
 4   VeryActiveMinutes     13722 non-null  float64
 5   FairlyActiveMinutes   13768 non-null  float64
 6   LightlyActiveMinutes  13701 non-null  float64
 7   SedentaryMinutes      13978 non-null  float64
 8   Calories              14809 non-null  float64
 9   source                15595 non-null  object 
dtypes: float64(7), object(3)
memory usage: 1.2+ MB


In [22]:
fitbit_daily_activity.shape

(15595, 10)

### Fitbit Resting HR
- timezones sometimes missing
- rhr_confidence is categorical
- one value per user per day mostly
- cleaner physiological signals

In [23]:
# Read the Fitbit resting heart rate file; dateTime is loaded without date parsing
# (dtype object in the info() output below).
fitbit_resting_HR = pd.read_csv("wearables_synthetic_raw/fitbit_restingHR_2024.csv")

In [24]:
fitbit_resting_HR.head()

Unnamed: 0,id,dateTime,restingHeartRate,timezone,device,rhr_confidence
0,U_5NQ4FMYZYCWT,2024/01/01 00:00,66.3,,Charge 5,low
1,U_5NQ4FMYZYCWT,2024-01-02 00:00:00 +0400,68.0,Asia/Tokyo,,high
2,U_5NQ4FMYZYCWT,2024/01/03 00:00,65.3,Asia/Tokyo,Sense 2,
3,U_5NQ4FMYZYCWT,2024/01/04 00:00,67.9,Asia/Tokyo,Charge 5,low
4,U_5NQ4FMYZYCWT,2024/01/05 00:00,68.6,,Sense 2,low


In [25]:
fitbit_resting_HR.describe()

Unnamed: 0,restingHeartRate
count,14380.0
mean,70.387636
std,13.122674
min,45.0
25%,61.2
50%,70.0
75%,79.9
max,95.0


In [26]:
fitbit_resting_HR.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15595 entries, 0 to 15594
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                15595 non-null  object 
 1   dateTime          15595 non-null  object 
 2   restingHeartRate  14380 non-null  float64
 3   timezone          14069 non-null  object 
 4   device            11767 non-null  object 
 5   rhr_confidence    13355 non-null  object 
dtypes: float64(1), object(5)
memory usage: 731.1+ KB


In [27]:
fitbit_resting_HR.shape

(15595, 6)

### Fitbit Sleep
- multiple rows per night
- isMainSleep mixes boolean and integer encodings (e.g. `TRUE`/`FALSE` and `0`)
- sleep segments cross midnight
- some segments missing minutesAsleep

In [28]:
# Read the Fitbit sleep segments file; sleep_start/sleep_end are loaded without
# date parsing (dtype object in the info() output below).
fitbit_sleep = pd.read_csv("wearables_synthetic_raw/fitbit_sleep_2024.csv")

In [29]:
fitbit_sleep.head()

Unnamed: 0,id,sleep_start,sleep_end,minutesAsleep,minutesAwake,sleep_efficiency,isMainSleep
0,U_5NQ4FMYZYCWT,2024/01/01 20:51,2024-01-01 23:44:00,173.0,22.0,79.0,TRUE
1,U_5NQ4FMYZYCWT,2024-01-01 22:25:00,2024-01-02 01:20:00,,50.0,83.0,FALSE
2,U_5NQ4FMYZYCWT,2024-01-02 22:17:00,2024-01-03 04:17:00,360.0,12.0,91.0,0
3,U_5NQ4FMYZYCWT,2024-01-03 23:24:00,2024-01-04 05:24:00,360.0,56.0,79.0,FALSE
4,U_5NQ4FMYZYCWT,2024-01-04 22:51:00,2024-01-05 01:43:00,172.0,42.0,82.0,0


In [30]:
fitbit_sleep.describe()

Unnamed: 0,minutesAsleep,minutesAwake,sleep_efficiency
count,23535.0,21418.0,22275.0
mean,220.472148,34.854655,84.396543
std,93.61844,19.216191,6.8569
min,40.0,0.0,58.0
25%,145.0,21.0,80.0
50%,192.0,34.0,84.0
75%,339.0,48.0,89.0
max,360.0,114.0,98.0


In [31]:
fitbit_sleep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25264 entries, 0 to 25263
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                25264 non-null  object 
 1   sleep_start       25264 non-null  object 
 2   sleep_end         25264 non-null  object 
 3   minutesAsleep     23535 non-null  float64
 4   minutesAwake      21418 non-null  float64
 5   sleep_efficiency  22275 non-null  float64
 6   isMainSleep       21542 non-null  object 
dtypes: float64(3), object(4)
memory usage: 1.3+ MB


In [32]:
fitbit_sleep.shape

(25264, 7)

### Oura
- Sleep durations are in seconds
- multiple date formats
- missing deep/REM sleep
- Readiness present even though sleep components missing sometimes
- temperature deviation is subtle
- sleep first, not activity

In [33]:
# Read the Oura sleep file; date, bedtime_start and bedtime_end are loaded without
# date parsing (dtype object in the info() output below).
oura_sleep = pd.read_csv("wearables_synthetic_raw/oura_sleep.csv")

In [34]:
oura_sleep.head()

Unnamed: 0,UserId,date,total_sleep_duration,deep_sleep_duration,rem_sleep_duration,restless,readiness_score,temperature_deviation,bedtime_start,bedtime_end,source
0,U_XAJI0Y6DPBHS,2024/01/01 00:00,29828.0,8774.0,5059.0,32.0,61.0,0.15,2024-01-01T23:59:00,2024-01-02T05:56:00,Oura
1,U_XAJI0Y6DPBHS,02-Jan-2024,29699.0,6517.0,7863.0,36.0,54.0,-0.14,2024-01-02T20:45:00,2024-01-03T05:24:00,Oura
2,U_XAJI0Y6DPBHS,03-Jan-2024,28117.0,,3761.0,19.0,61.0,0.31,2024-01-04T00:03:00,2024-01-04T05:33:00,Oura
3,U_XAJI0Y6DPBHS,2024/01/04 00:00,27670.0,5275.0,3884.0,,68.0,0.23,2024-01-04T20:33:00,2024-01-05T07:25:00,Oura
4,U_XAJI0Y6DPBHS,05-Jan-2024,24997.0,5604.0,5465.0,14.0,61.0,0.1,2024-01-05T23:43:00,2024-01-06T06:29:00,Oura


In [35]:
oura_sleep.describe()

Unnamed: 0,total_sleep_duration,deep_sleep_duration,rem_sleep_duration,restless,readiness_score,temperature_deviation
count,10048.0,9668.0,9633.0,8943.0,9864.0,9263.0
mean,26367.856986,5023.108295,5772.095609,27.533266,66.260341,0.02475
std,3947.149387,1763.910395,2017.579749,13.659394,10.810618,0.181057
min,12659.0,0.0,0.0,0.0,25.0,-0.58
25%,23593.75,3793.0,4387.0,18.0,59.0,-0.1
50%,26397.0,4947.5,5649.0,27.0,66.0,0.02
75%,29221.25,6176.0,7031.0,37.0,74.0,0.15
max,38115.0,12992.0,14360.0,80.0,100.0,0.71


In [36]:
oura_sleep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10945 entries, 0 to 10944
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   UserId                 10945 non-null  object 
 1   date                   10945 non-null  object 
 2   total_sleep_duration   10048 non-null  float64
 3   deep_sleep_duration    9668 non-null   float64
 4   rem_sleep_duration     9633 non-null   float64
 5   restless               8943 non-null   float64
 6   readiness_score        9864 non-null   float64
 7   temperature_deviation  9263 non-null   float64
 8   bedtime_start          10945 non-null  object 
 9   bedtime_end            10945 non-null  object 
 10  source                 10945 non-null  object 
dtypes: float64(6), object(5)
memory usage: 940.7+ KB


In [37]:
oura_sleep.shape

(10945, 11)

### Samsung
- steps has negative values (min is -431), which are not valid step counts
- uid sometimes missing
- mixed date formats
- tz is inconsistent
- sleep is in hours


In [38]:
# Read the Samsung Health daily file; date and tz are loaded as plain strings
# (dtype object in the info() output below), with no parsing or validation here.
samsung = pd.read_csv("wearables_synthetic_raw/samsung_health_daily.csv")

In [39]:
samsung.head()

Unnamed: 0,uid,date,steps,calories_burned,sleep_duration,rhr,workout_type,workout_min,tz
0,U_W2PCN9T84AZY,2024/01/01 00:00,10014.0,2236.0,,,Gym,52.0,
1,U_W2PCN9T84AZY,01/02/2024,,2160.0,7.23,62.3,none,56.0,Asia/Karachi
2,U_W2PCN9T84AZY,2024/01/03 00:00,7295.0,2111.0,5.02,64.8,yoga,42.0,+04:00
3,,01/04/2024,6942.0,2503.0,6.43,70.9,yoga,57.0,UTC
4,U_W2PCN9T84AZY,2024/01/05 00:00,7975.0,2017.0,7.14,70.0,none,53.0,+04:00


In [40]:
samsung.describe()

Unnamed: 0,steps,calories_burned,sleep_duration,rhr,workout_min
count,14561.0,14108.0,13648.0,13647.0,12485.0
mean,6921.889843,2170.980862,7.038381,71.211255,38.095795
std,2350.326282,295.745559,1.028254,15.09471,22.676615
min,-431.0,1001.0,3.5,41.1,0.0
25%,5385.0,1972.0,6.3,59.2,21.0
50%,7062.0,2175.0,7.03,69.6,38.0
75%,8586.0,2370.0,7.75,84.0,54.0
max,14511.0,3281.0,10.5,99.3,127.0


In [41]:
samsung.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15155 entries, 0 to 15154
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   uid              14387 non-null  object 
 1   date             15155 non-null  object 
 2   steps            14561 non-null  float64
 3   calories_burned  14108 non-null  float64
 4   sleep_duration   13648 non-null  float64
 5   rhr              13647 non-null  float64
 6   workout_type     10909 non-null  object 
 7   workout_min      12485 non-null  float64
 8   tz               11338 non-null  object 
dtypes: float64(5), object(4)
memory usage: 1.0+ MB


In [42]:
samsung.shape

(15155, 9)

### Whoop
- member_id != user_id format (no `U_` prefix)
- recovery missing more than strain
- whoop = physiology + recovery model
- does not fully align with fitbit/samsung like oura

In [43]:
# Read the Whoop cycles file; cycle_date is loaded without date parsing
# (dtype object in the info() output below).
whoop_cycles = pd.read_csv("wearables_synthetic_raw/whoop_cycles.csv")

In [44]:
whoop_cycles.head()

Unnamed: 0,member_id,cycle_date,strain,recovery_score,hrv_rmssd,sleep_performance,notes
0,JZA7TZ0YNCXL,2024-01-01 00:00:00 -0800,9.04,73.0,60.0,76.0,
1,JZA7TZ0YNCXL,2024-01-02,8.41,,66.0,88.0,
2,JZA7TZ0YNCXL,2024-01-03,7.81,70.0,58.0,64.0,
3,JZA7TZ0YNCXL,2024-01-04T00:00:00,3.3,61.0,51.0,84.0,late meal
4,JZA7TZ0YNCXL,2024-01-05,6.94,58.0,79.0,79.0,


In [45]:
whoop_cycles.describe()

Unnamed: 0,strain,recovery_score,hrv_rmssd,sleep_performance
count,9889.0,9759.0,9598.0,9365.0
mean,9.241093,58.982785,51.696812,69.915643
std,4.48472,12.779257,16.526605,11.790204
min,0.0,16.0,15.0,29.0
25%,6.03,50.0,40.0,62.0
50%,9.22,59.0,51.0,70.0
75%,12.39,67.0,63.0,78.0
max,21.0,99.0,120.0,100.0


In [46]:
whoop_cycles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10620 entries, 0 to 10619
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   member_id          10620 non-null  object 
 1   cycle_date         10620 non-null  object 
 2   strain             9889 non-null   float64
 3   recovery_score     9759 non-null   float64
 4   hrv_rmssd          9598 non-null   float64
 5   sleep_performance  9365 non-null   float64
 6   notes              7101 non-null   object 
dtypes: float64(4), object(3)
memory usage: 580.9+ KB


In [47]:
whoop_cycles.shape

(10620, 7)

### Grain Definitions

- users: 1 row = 1 user (static user attributes)
- devices_registry: 1 row = 1 user–device (device ownership metadata)

- apple_health: 1 row = 1 measurement event (event-level, heterogeneous metrics)

- fitbit_daily_activity: 1 row = 1 user–day (daily activity summary)
- fitbit_sleep: 1 row = 1 sleep segment (multiple rows per night)
- fitbit_resting_HR: 1 row = 1 user–day (daily resting heart rate summary)

- samsung_health_daily: 1 row = 1 user–day (daily summary; messy dates/units/invalid values)

- whoop_cycles: 1 row = 1 user–day (strain/recovery-focused daily cycle)
- oura_sleep: 1 row = 1 user–day (sleep-first daily summary)

## Initial Canonical Table Design
1. Primary Key:
 - (user_id, date)   
2. Table Density:
 - Sparse table
3. Metrics:
 - Required:
   - user_id
   - date
 - Optional:
   - steps
   - sleep_hours
   - resting_hr
   - calories_burned
   - strain
   - readiness
 - Derived:
   - rolling averages
   - consistency metrics
   - volatility measures