In [1]:
import pandas as pd
import duckdb

In [2]:
fitbit_restingHR = pd.read_csv("wearables_synthetic_raw/fitbit_restingHR_2024.csv")

In [3]:
fitbit_restingHR.head()

Unnamed: 0,id,dateTime,restingHeartRate,timezone,device,rhr_confidence
0,U_5NQ4FMYZYCWT,2024/01/01 00:00,66.3,,Charge 5,low
1,U_5NQ4FMYZYCWT,2024-01-02 00:00:00 +0400,68.0,Asia/Tokyo,,high
2,U_5NQ4FMYZYCWT,2024/01/03 00:00,65.3,Asia/Tokyo,Sense 2,
3,U_5NQ4FMYZYCWT,2024/01/04 00:00,67.9,Asia/Tokyo,Charge 5,low
4,U_5NQ4FMYZYCWT,2024/01/05 00:00,68.6,,Sense 2,low


In [4]:
con = duckdb.connect()

In [5]:
con.register("fitbit_restingHR", fitbit_restingHR)

<_duckdb.DuckDBPyConnection at 0x13186ae30>

In [6]:
#Audit Counts
con.execute("""SELECT COUNT(*) AS total_rows,
SUM(CASE WHEN id IS NULL OR TRIM(CAST(id AS VARCHAR)) = ''
THEN 1 ELSE 0 
END) AS bad_id_rows,
SUM(CASE WHEN dateTime IS NULL OR TRIM(CAST(dateTime AS VARCHAR)) = ''
THEN 1 ELSE 0
END) AS bad_dateTime_rows,
SUM(CASE WHEN restingHeartRate IS NULL
THEN 1 ELSE 0
END) AS missing_restingHR_rows
FROM fitbit_restingHR""").df()

Unnamed: 0,total_rows,bad_id_rows,bad_dateTime_rows,missing_restingHR_rows
0,15595,0.0,0.0,1215.0


- Resting Heart Rate data has complete identity and timestamps
- only around 7.8 rows have missing resting HR values

In [7]:
#dateTime Formats
con.execute("""SELECT dateTime, COUNT(*) AS n
FROM fitbit_restingHR
GROUP BY dateTime
ORDER BY n DESC
LIMIT 10""").df()

Unnamed: 0,dateTime,n
0,2024/04/29 00:00,19
1,2024/03/16 00:00,19
2,2024/12/14 00:00,18
3,2024-07-18,18
4,2024/05/18 00:00,18
5,2024-07-01,18
6,2024-03-31,18
7,2024-01-01,18
8,2024/07/20 00:00,17
9,2024/04/01 00:00,17


In [8]:
#Parse dateTime
con.execute("""CREATE OR REPLACE VIEW fitbit_restingHR_clean_step1 AS
SELECT *,
COALESCE(
try_strptime(CAST(dateTime AS VARCHAR), '%Y-%m-%d'),
try_strptime(CAST(dateTime AS VARCHAR), '%Y/%m/%d %H:%M'),
try_strptime(CAST(dateTime AS VARCHAR), '%Y-%m-%dT%H:%M:%S'),
try_strptime(CAST(dateTime AS VARCHAR), '%Y-%m-%d %H:%M:%S %z')) AS rhr_ts
FROM fitbit_restingHR""")

<_duckdb.DuckDBPyConnection at 0x13186ae30>

In [9]:
con.execute("""SELECT dateTime, id
FROM fitbit_restingHR_clean_step1
WHERE rhr_ts IS NULL
LIMIT 15""").df()

Unnamed: 0,dateTime,id


In [10]:
con.execute("""SELECT COUNT(*) AS n,
SUM(CASE WHEN rhr_ts IS NULL THEN 1 ELSE 0 END) AS rhr_ts_null
FROM fitbit_restingHR_clean_step1""").df()

Unnamed: 0,n,rhr_ts_null
0,15595,0.0


In [11]:
con.execute("""SELECT * from fitbit_restingHR_clean_step1
LIMIT 10""").df()

Unnamed: 0,id,dateTime,restingHeartRate,timezone,device,rhr_confidence,rhr_ts
0,U_5NQ4FMYZYCWT,2024/01/01 00:00,66.3,,Charge 5,low,2024-01-01 00:00:00+04:00
1,U_5NQ4FMYZYCWT,2024-01-02 00:00:00 +0400,68.0,Asia/Tokyo,,high,2024-01-02 00:00:00+04:00
2,U_5NQ4FMYZYCWT,2024/01/03 00:00,65.3,Asia/Tokyo,Sense 2,,2024-01-03 00:00:00+04:00
3,U_5NQ4FMYZYCWT,2024/01/04 00:00,67.9,Asia/Tokyo,Charge 5,low,2024-01-04 00:00:00+04:00
4,U_5NQ4FMYZYCWT,2024/01/05 00:00,68.6,,Sense 2,low,2024-01-05 00:00:00+04:00
5,U_5NQ4FMYZYCWT,2024-01-06T00:00:00,66.7,Asia/Tokyo,Versa 4,0.8,2024-01-06 00:00:00+04:00
6,U_5NQ4FMYZYCWT,2024-01-07 00:00:00 -0800,75.8,Asia/Tokyo,Sense 2,1.0,2024-01-07 12:00:00+04:00
7,U_5NQ4FMYZYCWT,2024-01-08T00:00:00,69.9,Asia/Tokyo,,1.0,2024-01-08 00:00:00+04:00
8,U_5NQ4FMYZYCWT,2024-01-09T00:00:00,69.4,Asia/Tokyo,Charge 5,1.0,2024-01-09 00:00:00+04:00
9,U_5NQ4FMYZYCWT,2024-01-10 00:00:00 +0000,66.1,Asia/Tokyo,,1.0,2024-01-10 04:00:00+04:00


In [12]:
con.execute("""CREATE OR REPLACE VIEW fitbit_restingHR_clean_step2 AS
SELECT *,
DATE(rhr_ts) AS rhr_date
FROM fitbit_restingHR_clean_step1""")

<_duckdb.DuckDBPyConnection at 0x13186ae30>

In [13]:
con.execute("""SELECT dateTime, rhr_ts, rhr_date
FROM fitbit_restingHR_clean_step2
LIMIT 10""").df()

Unnamed: 0,dateTime,rhr_ts,rhr_date
0,2024/01/01 00:00,2024-01-01 00:00:00+04:00,2024-01-01
1,2024-01-02 00:00:00 +0400,2024-01-02 00:00:00+04:00,2024-01-02
2,2024/01/03 00:00,2024-01-03 00:00:00+04:00,2024-01-03
3,2024/01/04 00:00,2024-01-04 00:00:00+04:00,2024-01-04
4,2024/01/05 00:00,2024-01-05 00:00:00+04:00,2024-01-05
5,2024-01-06T00:00:00,2024-01-06 00:00:00+04:00,2024-01-06
6,2024-01-07 00:00:00 -0800,2024-01-07 12:00:00+04:00,2024-01-07
7,2024-01-08T00:00:00,2024-01-08 00:00:00+04:00,2024-01-08
8,2024-01-09T00:00:00,2024-01-09 00:00:00+04:00,2024-01-09
9,2024-01-10 00:00:00 +0000,2024-01-10 04:00:00+04:00,2024-01-10


In [14]:
con.execute("""SELECT COUNT(*) AS total_rows,
SUM(CASE WHEN rhr_date IS NULL THEN 1 ELSE 0 END) AS rhr_date_null
FROM fitbit_restingHR_clean_step2""").df()

Unnamed: 0,total_rows,rhr_date_null
0,15595,0.0


Parsed mixed-format dateTime into timezone-aware rhr_ts and derived rhr_date from it; 0 unparsed timestamps and 0 null rhr_date.

In [15]:
#keep missing HR, flag invalid HR
con.execute("""CREATE OR REPLACE VIEW fitbit_restingHR_clean_step3 AS
SELECT id AS user_id, rhr_date,
(CASE
WHEN restingHeartRate <35 OR restingHeartRate >120 THEN NULL
WHEN restingHeartRate IS NULL THEN NULL 
ELSE restingHeartRate 
END)
AS rhr_clean,
(CASE
WHEN restingHeartRate <35 OR restingHeartRate >120 THEN 1 
ELSE 0 
END) AS rhr_flag_invalid
FROM fitbit_restingHR_clean_step2""")

<_duckdb.DuckDBPyConnection at 0x13186ae30>

In [16]:
con.execute("""SELECT COUNT(*) AS total_rows,
SUM(CASE WHEN rhr_clean IS NULL THEN 1 ELSE 0 END) as missing_HR_rows,
SUM(CASE WHEN rhr_flag_invalid = 1 THEN 1 ELSE 0 END) as flagged_invalid_rows
FROM fitbit_restingHR_clean_step3""").df()

Unnamed: 0,total_rows,missing_HR_rows,flagged_invalid_rows
0,15595,1215.0,0.0


- Resting HR cleaned with range rule (35â€“120 bpm)
- missing values preserved
- 0 invalid values flagged

In [17]:
con.execute("""SELECT * FROM fitbit_restingHR_clean_step3
LIMIT 8""").df()

Unnamed: 0,user_id,rhr_date,rhr_clean,rhr_flag_invalid
0,U_5NQ4FMYZYCWT,2024-01-01,66.3,0
1,U_5NQ4FMYZYCWT,2024-01-02,68.0,0
2,U_5NQ4FMYZYCWT,2024-01-03,65.3,0
3,U_5NQ4FMYZYCWT,2024-01-04,67.9,0
4,U_5NQ4FMYZYCWT,2024-01-05,68.6,0
5,U_5NQ4FMYZYCWT,2024-01-06,66.7,0
6,U_5NQ4FMYZYCWT,2024-01-07,75.8,0
7,U_5NQ4FMYZYCWT,2024-01-08,69.9,0


In [18]:
con.execute("""COPY fitbit_restingHR_clean_step3
TO 'data/clean/fitbit_restingHR_clean.csv'
(HEADER, DELIMITER ',')""")

<_duckdb.DuckDBPyConnection at 0x13186ae30>