In [1]:
import pandas as pd
import duckdb

In [2]:
whoop_cycles = pd.read_csv("wearables_synthetic_raw/whoop_cycles.csv")

In [3]:
con = duckdb.connect()

In [4]:
con.register("whoop_cycles", whoop_cycles)

<_duckdb.DuckDBPyConnection at 0x132bd48b0>

In [5]:
con.execute("""SELECT COUNT(*) AS total_rows,
member_id,
MIN(cycle_date),
MAX(cycle_date)
FROM whoop_cycles
GROUP BY member_id
ORDER BY total_rows DESC""").df()

Unnamed: 0,total_rows,member_id,min(cycle_date),max(cycle_date)
0,324,PP3XVKVZRGOL,2024-01-01T00:00:00,2024-12-29 00:00:00 +0530
1,323,AGS6E1ZHWY9H,2024-01-01,2024-12-31T00:00:00
2,322,7E1047WWBSLU,2024-01-01 00:00:00 +0000,2024-12-31T00:00:00
3,319,GVWRDMLY4LYK,2024-01-02T00:00:00,2024-12-31
4,316,NV0H0GQ9C80E,2024-01-01T00:00:00,2024-12-31T00:00:00
...,...,...,...,...
59,21,U_PP3XVKVZRGOL,2024-01-17 00:00:00 +0400,2024-12-31 00:00:00 +0000
60,20,U_3RZSJ49ITN7S,2024-01-11T00:00:00,2024-12-24T00:00:00
61,20,U_U84XYJOA21A6,2024-01-12 00:00:00 +0530,2024-12-26
62,19,U_AGS6E1ZHWY9H,2024-02-07,2024-11-26T00:00:00


In [6]:
con.execute("""SELECT count(*) AS total_rows,
SUM(CASE WHEN member_id IS NULL THEN 1 ELSE 0 END) AS null_ids,
SUM(CASE WHEN TRIM(member_id) = '' THEN 1 ELSE 0 END) AS blank_ids
FROM whoop_cycles""").df()

Unnamed: 0,total_rows,null_ids,blank_ids
0,10620,0.0,0.0


In [7]:
con.execute("""SELECT DISTINCT cycle_date 
FROM whoop_cycles
LIMIT 10""").df()

Unnamed: 0,cycle_date
0,2024-01-11 00:00:00 +0000
1,2024-02-21T00:00:00
2,2024-02-23T00:00:00
3,2024-03-13 00:00:00 +0400
4,2024-04-08
5,2024-04-09 00:00:00 -0800
6,2024-04-19T00:00:00
7,2024-04-22 00:00:00 -0800
8,2024-05-03 00:00:00 +0530
9,2024-05-06 00:00:00 +0400


#### Goal
Create a clean date column we can join on later

In [8]:
con.execute("""CREATE OR REPLACE VIEW whoop_cycles_step1 AS
SELECT *,
COALESCE(
try_strptime(CAST(cycle_date AS VARCHAR), '%Y-%m-%d'),
try_strptime(CAST(cycle_date AS VARCHAR), '%Y-%m-%dT%H:%M:%S'),
try_strptime(CAST(cycle_date AS VARCHAR), '%Y-%m-%d %H:%M:%S %z')) AS cycle_ts
FROM whoop_cycles""")

<_duckdb.DuckDBPyConnection at 0x132bd48b0>

In [9]:
con.execute("""SELECT COUNT(*) AS total_rows,
SUM(CASE WHEN cycle_ts IS NULL THEN 1 ELSE 0 END) AS cycle_ts_null
FROM whoop_cycles_step1""").df()

Unnamed: 0,total_rows,cycle_ts_null
0,10620,0.0


In [10]:
con.execute("""CREATE OR REPLACE VIEW whoop_cycles_step2 AS
SELECT *,
DATE(cycle_ts) AS whoop_date
FROM whoop_cycles_step1""")

<_duckdb.DuckDBPyConnection at 0x132bd48b0>

In [11]:
con.execute("""SELECT member_id,
cycle_date,
cycle_ts,
whoop_date
FROM whoop_cycles_step2
LIMIT 5""").df()

Unnamed: 0,member_id,cycle_date,cycle_ts,whoop_date
0,JZA7TZ0YNCXL,2024-01-01 00:00:00 -0800,2024-01-01 12:00:00+04:00,2024-01-01
1,JZA7TZ0YNCXL,2024-01-02,2024-01-02 00:00:00+04:00,2024-01-02
2,JZA7TZ0YNCXL,2024-01-03,2024-01-03 00:00:00+04:00,2024-01-03
3,JZA7TZ0YNCXL,2024-01-04T00:00:00,2024-01-04 00:00:00+04:00,2024-01-04
4,JZA7TZ0YNCXL,2024-01-05,2024-01-05 00:00:00+04:00,2024-01-05


#### Goal
Add a clean user_id 

In [12]:
con.execute("""CREATE OR REPLACE VIEW whoop_cycles_step3 AS
SELECT *,
(CASE WHEN member_id LIKE 'U_%' THEN SUBSTR(member_id, 3) ELSE member_id END) AS user_id
FROM whoop_cycles_step2""")

<_duckdb.DuckDBPyConnection at 0x132bd48b0>

In [13]:
con.execute("""SELECT COUNT(DISTINCT member_id) AS distinct_raw_ids,
COUNT(DISTINCT user_id) AS distinct_norm_ids
FROM whoop_cycles_step3""").df()

Unnamed: 0,distinct_raw_ids,distinct_norm_ids
0,64,33


#### Goal
Metric sanity + cleaning rules

In [14]:
con.execute("DESCRIBE whoop_cycles_step3").df()

Unnamed: 0,column_name,column_type,null,key,default,extra
0,member_id,VARCHAR,YES,,,
1,cycle_date,VARCHAR,YES,,,
2,strain,DOUBLE,YES,,,
3,recovery_score,DOUBLE,YES,,,
4,hrv_rmssd,DOUBLE,YES,,,
5,sleep_performance,DOUBLE,YES,,,
6,notes,VARCHAR,YES,,,
7,cycle_ts,TIMESTAMP WITH TIME ZONE,YES,,,
8,whoop_date,DATE,YES,,,
9,user_id,VARCHAR,YES,,,


In [15]:
con.execute("""SELECT COUNT(*) AS total_rows,
MIN(strain),
MAX(strain),
MIN(recovery_score),
MAX(recovery_score),
MIN(hrv_rmssd),
MAX(hrv_rmssd),
MIN(sleep_performance),
MAX(sleep_performance)
FROM whoop_cycles_step3""").df()

Unnamed: 0,total_rows,min(strain),max(strain),min(recovery_score),max(recovery_score),min(hrv_rmssd),max(hrv_rmssd),min(sleep_performance),max(sleep_performance)
0,10620,0.0,21.0,16.0,99.0,15.0,120.0,29.0,100.0


#### Cleaning
- Preserve missingness
- Do not impute
- Do not cap valid physiological ranges
- Only normalize names + add flags

In [16]:
con.execute("""CREATE OR REPLACE VIEW whoop_user_day_clean AS
SELECT
user_id,
whoop_date AS date,
recovery_score AS whoop_recovery,
strain AS whoop_strain,
hrv_rmssd AS whoop_hrv,
sleep_performance AS whoop_sleep_score,
notes,
(CASE WHEN strain < 0 OR strain > 21 THEN 1 ELSE 0 END) AS strain_flag_invalid,
(CASE WHEN recovery_score <0 OR recovery_score > 100 THEN 1 ELSE 0 END) AS recovery_flag_invalid,
(CASE WHEN sleep_performance <0 OR sleep_performance > 100 THEN 1 ELSE 0 END) AS sleep_flag_invalid,
(CASE WHEN hrv_rmssd <=0 THEN 1 ELSE 0 END) AS hrv_flag_invalid
FROM whoop_cycles_step3
""")

<_duckdb.DuckDBPyConnection at 0x132bd48b0>

In [17]:
con.execute("""SELECT COUNT(*) AS total_rows,
SUM(strain_flag_invalid) AS strain_flags,
SUM(recovery_flag_invalid) AS recovery_flags,
SUM(sleep_flag_invalid) AS sleep_flags,
SUM(hrv_flag_invalid) AS hrv_flags
FROM whoop_user_day_clean""").df()

Unnamed: 0,total_rows,strain_flags,recovery_flags,sleep_flags,hrv_flags
0,10620,0.0,0.0,0.0,0.0


In [18]:
con.execute("""COPY whoop_user_day_clean
TO 'data/clean/whoop_user_day_clean.csv'
(HEADER, DELIMITER ',')""")

<_duckdb.DuckDBPyConnection at 0x132bd48b0>