#### Goal
To convert Apple Health event-level data into daily user metrics for a small, curated set of signals.

#### Input Grain (Raw)
1 row = 1 measurement event (event-level, heterogeneous metrics)

#### Output Grain (Clean)
1 row = 1 user-day

#### Metrics to Include
- steps
- active energy (calories)
- resting HR
- sleep duration

#### Metrics to Exclude
- workouts
- stand hours
- heart rate variability
- ECG / oxygen / niche metrics

#### Outcome
- apple_user_day_clean.csv

In [1]:
import pandas as pd
import duckdb

In [2]:
con = duckdb.connect()

In [3]:
apple_health = pd.read_csv("wearables_synthetic_raw/apple_health_export_like.csv")

In [4]:
con.register("apple_health", apple_health)

<_duckdb.DuckDBPyConnection at 0x105c7c1b0>

In [5]:
apple_health.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17399 entries, 0 to 17398
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   user        17399 non-null  object 
 1   type        17399 non-null  object 
 2   startDate   17399 non-null  object 
 3   endDate     17399 non-null  object 
 4   value       16373 non-null  float64
 5   unit        14893 non-null  object 
 6   sourceName  13144 non-null  object 
 7   device      11590 non-null  object 
 8   metadata    12962 non-null  object 
dtypes: float64(1), object(8)
memory usage: 1.2+ MB


In [6]:
apple_health.head()

Unnamed: 0,user,type,startDate,endDate,value,unit,sourceName,device,metadata
0,U_OM5IGQPKI7P5,HKCategoryTypeIdentifierSleepAnalysis,2024-01-01T00:00:00,2024-01-02T00:00:00,13242.0,min,,Apple Watch Series 8,"{alg:v2,confidence:0.8}"
1,U_OM5IGQPKI7P5,HKQuantityTypeIdentifierStepCount,2024-01-02T00:00:00,2024-01-03T00:00:00,48.3,count,Health,Apple Watch Ultra,"{alg:v2,confidence:0.8}"
2,U_OM5IGQPKI7P5,HKCategoryTypeIdentifierSleepAnalysis,2024-01-03T00:00:00,2024-01-04T00:00:00,10489.0,min,iPhone,Apple Watch Ultra,{workout:run}
3,U_OM5IGQPKI7P5,HKQuantityTypeIdentifierRestingHeartRate,2024-01-04T00:00:00,2024-01-05T00:00:00,11030.0,bpm,Health,Apple Watch Series 8,"{alg:v2,confidence:0.8}"
4,U_OM5IGQPKI7P5,HKQuantityTypeIdentifierStepCount,2024-01-05 00:00:00,2024-01-06T00:00:00,2243.6,count,Apple Watch,Apple Watch Series 8,"{alg:v2,confidence:0.8}"


In [7]:
con.execute("""SELECT type, count(*) as n
FROM apple_health
GROUP  BY type
ORDER BY n DESC""").df()

Unnamed: 0,type,n
0,HKQuantityTypeIdentifierActiveEnergyBurned,4401
1,HKQuantityTypeIdentifierRestingHeartRate,4347
2,HKCategoryTypeIdentifierSleepAnalysis,4343
3,HKQuantityTypeIdentifierStepCount,4308


In [8]:
con.execute("SELECT DISTINCT startDate FROM apple_health LIMIT 10").df()

Unnamed: 0,startDate
0,2024-01-03T00:00:00
1,2024-01-10T00:00:00
2,2024-01-16 00:00:00
3,2024-02-15 00:00:00
4,2024-03-01 00:00:00
5,2024-03-08T00:00:00
6,2024-03-29 00:00:00
7,2024-04-15 00:00:00
8,2024-04-19 00:00:00
9,2024-04-21 00:00:00


In [9]:
con.execute("SELECT DISTINCT endDate FROM apple_health LIMIT 10").df()

Unnamed: 0,endDate
0,2024-01-06T00:00:00
1,2024-01-12T00:00:00
2,2024-01-18T00:00:00
3,2024-01-21T00:00:00
4,2024-02-29T00:00:00
5,2024-03-05T00:00:00
6,2024-03-09T00:00:00
7,2024-03-11T00:00:00
8,2024-04-03T00:00:00
9,2024-04-11T00:00:00


In [10]:
con.execute("""CREATE OR REPLACE VIEW apple_health_step1 AS
SELECT *,
COALESCE(
try_strptime(startDate, '%Y-%m-%dT%H:%M:%S'),
try_strptime(startDate, '%Y-%m-%d %H:%M:%S')) AS start_ts,
COALESCE(
try_strptime(endDate, '%Y-%m-%dT%H:%M:%S'),
try_strptime(endDate, '%Y-%m-%d %H:%M:%S')) AS end_ts
FROM apple_health""")

<_duckdb.DuckDBPyConnection at 0x105c7c1b0>

In [11]:
con.execute("""CREATE OR REPLACE VIEW apple_health_step2 AS
SELECT *,
CAST(end_ts AS DATE) AS event_date
FROM apple_health_step1""")

<_duckdb.DuckDBPyConnection at 0x105c7c1b0>

In [12]:
con.execute("SELECT endDate, end_ts, event_date FROM apple_health_step2 LIMIT 5").df()

Unnamed: 0,endDate,end_ts,event_date
0,2024-01-02T00:00:00,2024-01-02,2024-01-02
1,2024-01-03T00:00:00,2024-01-03,2024-01-03
2,2024-01-04T00:00:00,2024-01-04,2024-01-04
3,2024-01-05T00:00:00,2024-01-05,2024-01-05
4,2024-01-06T00:00:00,2024-01-06,2024-01-06


In [13]:
con.execute("""SELECT COUNT(*) AS total_rows,
SUM(CASE WHEN start_ts IS NULL THEN 1 ELSE 0 END) AS start_null,
SUM(CASE WHEN end_ts IS NULL THEN 1 ELSE 0 END) AS end_null,
SUM(CASE WHEN event_date IS NULL THEN 1 ELSE 0 END) AS event_date_null
FROM apple_health_step2""").df()

Unnamed: 0,total_rows,start_null,end_null,event_date_null
0,17399,0.0,0.0,0.0


In [14]:
con.execute("""SELECT COUNT(*) AS total_rows,
SUM(CASE 
WHEN user IS NULL OR TRIM(user) = '' THEN 1 
ELSE 0 END) as bad_id
FROM apple_health_step2""").df()

Unnamed: 0,total_rows,bad_id
0,17399,0.0


In [15]:
con.execute("SELECT COUNT(DISTINCT user) FROM apple_health_step2").df()

Unnamed: 0,"count(DISTINCT ""user"")"
0,51


In [16]:
con.execute("""CREATE OR REPLACE VIEW apple_steps_daily AS
SELECT user AS user_id, event_date AS date,
SUM(value) AS apple_steps
FROM apple_health_step2
WHERE type IN ('HKQuantityTypeIdentifierStepCount')
GROUP BY user, event_date""")

<_duckdb.DuckDBPyConnection at 0x105c7c1b0>

In [17]:
con.execute("""SELECT COUNT(*) AS total_rows,
MIN(apple_steps) AS min_steps,
MAX(apple_steps) AS max_steps,
avg(apple_steps) AS avg_steps
FROM apple_steps_daily""").df()

Unnamed: 0,total_rows,min_steps,max_steps,avg_steps
0,4208,0.0,26680.0,3605.530494


In [18]:
con.execute("""CREATE OR REPLACE VIEW apple_energy_daily AS
SELECT user AS user_id, event_date AS date,
SUM(value) AS apple_active_energy
FROM apple_health_step2
WHERE type IN ('HKQuantityTypeIdentifierActiveEnergyBurned')
GROUP BY user, event_date""")

<_duckdb.DuckDBPyConnection at 0x105c7c1b0>

In [19]:
con.execute("""SELECT COUNT(*) AS total_rows,
MIN(apple_active_energy) AS min_energy,
MAX(apple_active_energy) AS max_energy,
AVG(apple_active_energy) AS avg_energy
FROM apple_energy_daily""").df()

Unnamed: 0,total_rows,min_energy,max_energy,avg_energy
0,4272,0.0,29054.0,3630.871144


In [20]:
con.execute("""CREATE OR REPLACE VIEW apple_rhr_daily AS
SELECT user AS user_id, event_date AS date,
AVG(CASE WHEN value BETWEEN 35 AND 120 THEN value ELSE NULL END) AS apple_resting_hr
FROM apple_health_step2
WHERE type IN ('HKQuantityTypeIdentifierRestingHeartRate')
GROUP BY user, event_date""")

<_duckdb.DuckDBPyConnection at 0x105c7c1b0>

In [21]:
con.execute("""SELECT COUNT(*) AS total_rows,
MIN(apple_resting_hr) AS min_rhr,
MAX(apple_resting_hr) AS max_rhr,
AVG(apple_resting_hr) AS avg_rhr
FROM apple_rhr_daily""").df()

Unnamed: 0,total_rows,min_rhr,max_rhr,avg_rhr
0,4225,45.0,95.0,71.217917


In [22]:
con.execute("""CREATE OR REPLACE VIEW apple_sleep_daily AS
SELECT user AS user_id, event_date AS date, 
SUM(DATEDIFF('minute', start_ts, end_ts)) AS apple_sleep_minutes
FROM apple_health_step2
WHERE type IN ('HKCategoryTypeIdentifierSleepAnalysis')
GROUP BY user, event_date""")

<_duckdb.DuckDBPyConnection at 0x105c7c1b0>

In [23]:
con.execute("""SELECT COUNT(*) AS total_rows,
MIN(apple_sleep_minutes) AS min_sleep,
MAX(apple_sleep_minutes) AS max_sleep,
AVG(apple_sleep_minutes) AS avg_sleep
FROM apple_sleep_daily""").df()

Unnamed: 0,total_rows,min_sleep,max_sleep,avg_sleep
0,4191,1440.0,2880.0,1492.226199


In [24]:
#Cap sleep per day
con.execute("""CREATE OR REPLACE VIEW apple_sleep_daily AS
SELECT user AS user_id, event_date AS date, 
(CASE
WHEN SUM(DATEDIFF('minute', start_ts, end_ts)) > 960 THEN 960
ELSE SUM(DATEDIFF('minute', start_ts, end_ts)) 
END)
AS apple_sleep_minutes
FROM apple_health_step2
WHERE type IN ('HKCategoryTypeIdentifierSleepAnalysis')
GROUP BY user, event_date""")

<_duckdb.DuckDBPyConnection at 0x105c7c1b0>

In [25]:
con.execute("""SELECT COUNT(*) AS total_rows,
MIN(apple_sleep_minutes) AS min_sleep,
MAX(apple_sleep_minutes) AS max_sleep,
AVG(apple_sleep_minutes) AS avg_sleep
FROM apple_sleep_daily""").df()

Unnamed: 0,total_rows,min_sleep,max_sleep,avg_sleep
0,4191,960.0,960.0,960.0


In [26]:
con.execute("""SELECT user, start_ts, end_ts,
DATEDIFF('minute', start_ts, end_ts) AS sleep_minutes
FROM apple_health_step2
ORDER BY sleep_minutes DESC
LIMIT 10""").df()

Unnamed: 0,user,start_ts,end_ts,sleep_minutes
0,U_OM5IGQPKI7P5,2024-01-01,2024-01-02,1440
1,U_OM5IGQPKI7P5,2024-01-02,2024-01-03,1440
2,U_OM5IGQPKI7P5,2024-01-03,2024-01-04,1440
3,U_OM5IGQPKI7P5,2024-01-04,2024-01-05,1440
4,U_OM5IGQPKI7P5,2024-01-05,2024-01-06,1440
5,U_OM5IGQPKI7P5,2024-01-06,2024-01-07,1440
6,U_OM5IGQPKI7P5,2024-01-07,2024-01-08,1440
7,U_OM5IGQPKI7P5,2024-01-08,2024-01-09,1440
8,U_OM5IGQPKI7P5,2024-01-09,2024-01-10,1440
9,U_OM5IGQPKI7P5,2024-01-10,2024-01-11,1440


In [27]:
#Cap sleep per day
con.execute("""CREATE OR REPLACE VIEW apple_sleep_daily AS
SELECT user AS user_id, event_date AS date, 
SUM(CASE
WHEN DATEDIFF('minute', start_ts, end_ts) >= 1440 THEN NULL
ELSE DATEDIFF('minute', start_ts, end_ts)
END)
AS apple_sleep_minutes
FROM apple_health_step2
WHERE type IN ('HKCategoryTypeIdentifierSleepAnalysis')
GROUP BY user, event_date""")

<_duckdb.DuckDBPyConnection at 0x105c7c1b0>

In [28]:
con.execute("""SELECT COUNT(*) AS total_rows,
MIN(apple_sleep_minutes) AS min_sleep,
MAX(apple_sleep_minutes) AS max_sleep,
AVG(apple_sleep_minutes) AS avg_sleep
FROM apple_sleep_daily""").df()

Unnamed: 0,total_rows,min_sleep,max_sleep,avg_sleep
0,4191,,,


- Apple Health sleep records in this dataset represent full-day coverage intervals rather than true sleep sessions.
- As a result, sleep duration could not be reliably inferred and was excluded from downstream analysis.

In [29]:
con.execute("""CREATE OR REPLACE VIEW apple_user_day_clean AS
SELECT
s.*,
e.apple_active_energy,
r.apple_resting_hr,
m.apple_sleep_minutes
FROM apple_steps_daily s
LEFT JOIN apple_energy_daily e
ON s.user_id = e.user_id
AND s.date = e.date
LEFT JOIN apple_rhr_daily r
ON s.user_id = r.user_id
AND s.date = r.date
LEFT JOIN apple_sleep_daily m
ON s.user_id = m.user_id
AND s.date = m.date""")

<_duckdb.DuckDBPyConnection at 0x105c7c1b0>

In [30]:
con.execute("""SELECT COUNT(*) AS total_rows,
SUM(CASE WHEN apple_active_energy IS NULL THEN 1 ELSE 0 END) AS energy_null,
SUM(CASE WHEN apple_resting_hr IS NULL THEN 1 ELSE 0 END) AS rhr_null,
SUM(CASE WHEN apple_sleep_minutes IS NULL THEN 1 ELSE 0 END) AS sleep_null
FROM apple_user_day_clean""").df()

Unnamed: 0,total_rows,energy_null,rhr_null,sleep_null
0,4208,4208.0,4208.0,4208.0


In [31]:
con.execute("SELECT user_id, date FROM apple_steps_daily LIMIT 5").df()

Unnamed: 0,user_id,date
0,U_OM5IGQPKI7P5,2024-01-03
1,U_OM5IGQPKI7P5,2024-01-31
2,U_OM5IGQPKI7P5,2024-02-01
3,U_OM5IGQPKI7P5,2024-05-03
4,U_OM5IGQPKI7P5,2024-05-23


In [32]:
con.execute("SELECT user_id, date FROM apple_energy_daily LIMIT 5").df()

Unnamed: 0,user_id,date
0,U_OM5IGQPKI7P5,2024-01-23
1,U_OM5IGQPKI7P5,2024-04-14
2,U_OM5IGQPKI7P5,2024-05-31
3,U_OM5IGQPKI7P5,2024-06-12
4,U_OM5IGQPKI7P5,2024-06-29


In [33]:
con.execute("SELECT user_id, date FROM apple_rhr_daily LIMIT 5").df()

Unnamed: 0,user_id,date
0,U_OM5IGQPKI7P5,2024-01-05
1,U_OM5IGQPKI7P5,2024-04-23
2,U_OM5IGQPKI7P5,2024-06-02
3,U_OM5IGQPKI7P5,2024-06-09
4,U_OM5IGQPKI7P5,2024-06-24


In [34]:
con.execute("""
  SELECT COUNT(*) AS matched_rows
  FROM apple_steps_daily s
  INNER JOIN apple_energy_daily e
    ON s.user_id = e.user_id
   AND s.date = e.date
""").df()

Unnamed: 0,matched_rows
0,0


In [35]:
con.execute("""
  SELECT COUNT(*) AS matched_rows
  FROM apple_steps_daily s
  INNER JOIN apple_rhr_daily r
    ON s.user_id = r.user_id
   AND s.date = r.date
""").df()

Unnamed: 0,matched_rows
0,0


In [36]:
con.execute("""
  CREATE OR REPLACE VIEW apple_user_day_spine AS
  SELECT user_id, date FROM apple_steps_daily
  UNION
  SELECT user_id, date FROM apple_energy_daily
  UNION
  SELECT user_id, date FROM apple_rhr_daily
  UNION
  SELECT user_id, date FROM apple_sleep_daily
""")

<_duckdb.DuckDBPyConnection at 0x105c7c1b0>

In [37]:
con.execute("""
  CREATE OR REPLACE VIEW apple_user_day_clean AS
  SELECT
    p.user_id,
    p.date,
    s.apple_steps,
    e.apple_active_energy,
    r.apple_resting_hr,
    sl.apple_sleep_minutes
  FROM apple_user_day_spine p
  LEFT JOIN apple_steps_daily s
    ON p.user_id = s.user_id AND p.date = s.date
  LEFT JOIN apple_energy_daily e
    ON p.user_id = e.user_id AND p.date = e.date
  LEFT JOIN apple_rhr_daily r
    ON p.user_id = r.user_id AND p.date = r.date
  LEFT JOIN apple_sleep_daily sl
    ON p.user_id = sl.user_id AND p.date = sl.date
""")

<_duckdb.DuckDBPyConnection at 0x105c7c1b0>

In [38]:
con.execute("""
  SELECT
    COUNT(*) AS total_rows,
    SUM(CASE WHEN apple_steps IS NULL THEN 1 ELSE 0 END) AS steps_null,
    SUM(CASE WHEN apple_active_energy IS NULL THEN 1 ELSE 0 END) AS energy_null,
    SUM(CASE WHEN apple_resting_hr IS NULL THEN 1 ELSE 0 END) AS rhr_null,
    SUM(CASE WHEN apple_sleep_minutes IS NULL THEN 1 ELSE 0 END) AS sleep_null
  FROM apple_user_day_clean
""").df()

Unnamed: 0,total_rows,steps_null,energy_null,rhr_null,sleep_null
0,16896,12951.0,12876.0,16243.0,16896.0


In [39]:
con.execute("""
  SELECT COUNT(*) AS dup_keys
  FROM (
    SELECT user_id, date, COUNT(*) AS n
    FROM apple_user_day_clean
    GROUP BY user_id, date
    HAVING COUNT(*) > 1
  )
""").df()

Unnamed: 0,dup_keys
0,0


Apple metrics occur on mostly non-overlapping dates in this dataset; therefore we used a union date spine to preserve all user-days where any Apple metric exists. Apple sleep records were full-day intervals (1440 min) and treated as unusable for sleep duration.

In [40]:
con.execute("""
  COPY apple_user_day_clean
  TO 'data/clean/apple_user_day_clean.csv'
  (HEADER, DELIMITER ',')
""")

<_duckdb.DuckDBPyConnection at 0x105c7c1b0>