#### Cleaning Fitbit Daily Activity

Goal:
- Produce a reliable Fitbit-only daily activity table
- Preserve missingness
- Flag invalid values
- Do not infer or fill using other devices

Identity:
- Drop rows where Id is missing or blank, as user-level analysis is not possible.

Dates:
- Parse ActivityDate into a date object representing the user’s local calendar day.

Steps:
- Steps < 0 are invalid and set to missing.
- Extremely high step counts will be flagged via an indicator column, not removed.

Calories:
- Keep calories as-is; allow missing values.

Activity minutes:
- Retain all activity minute columns (VeryActive, FairlyActive, LightlyActive, Sedentary) for flexibility.

Columns:
- Keep all metric and metadata columns at this stage.
- Do not drop columns unless they are unusable for analysis.


In [1]:
import pandas as pd
import duckdb

In [2]:
con = duckdb.connect()

In [3]:
fitbit_daily_activity = pd.read_csv("wearables_synthetic_raw/fitbit_dailyActivity_2024.csv")

In [4]:
con.register("fitbit_daily_activity", fitbit_daily_activity)

<_duckdb.DuckDBPyConnection at 0x123ce17b0>

In [5]:
con.execute("SELECT COUNT(*) FROM fitbit_daily_activity").fetchall()

[(15595,)]

In [6]:
con.execute("""SELECT COUNT(*) AS total_rows,
SUM(CASE WHEN Id IS NULL THEN 1 ELSE 0 END) AS null_id_rows,
SUM(CASE WHEN Id is NOT NULL AND TRIM(Id) = '' THEN 1 ELSE 0 END) AS blank_id_rows
FROM fitbit_daily_activity""").df()

Unnamed: 0,total_rows,null_id_rows,blank_id_rows
0,15595,1296.0,0.0


In [7]:
#Cleaned view
con.execute("""CREATE OR REPLACE VIEW fitbit_daily_activity_clean_step1 AS
SELECT * FROM fitbit_daily_activity
WHERE Id IS NOT NULL AND TRIM(Id) <> ''""")

<_duckdb.DuckDBPyConnection at 0x123ce17b0>

In [8]:
con.execute("""SELECT COUNT(*) AS clean_rows 
FROM fitbit_daily_activity_clean_step1""").df()

Unnamed: 0,clean_rows
0,14299


Dropped 1296 rows (8.31%) due to missing Id values

In [9]:
#How many different formats exist?
con.execute("""SELECT ActivityDate, COUNT(*) AS n
FROM fitbit_daily_activity_clean_step1
GROUP BY ActivityDate
ORDER BY n DESC
LIMIT 15""").df()

Unnamed: 0,ActivityDate,n
0,25-Jan-2024,18
1,05-Dec-2024,18
2,02/25/2024,18
3,16-Aug-2024,18
4,20-Aug-2024,18
5,2024/07/16 00:00,18
6,2024-05-07,17
7,17-May-2024,17
8,05/09/2024,17
9,2024/11/06 00:00,17


In [10]:
#weird dates
con.execute("""SELECT ActivityDate FROM fitbit_daily_activity_clean_step1
WHERE ActivityDate LIKE '%/%' OR ActivityDate LIKE '%-%-%-%' OR ActivityDate LIKE '% %'
LIMIT 15""").df()

Unnamed: 0,ActivityDate
0,01/05/2024
1,2024/01/08 00:00
2,01/10/2024
3,01/11/2024
4,2024/01/13 00:00
5,01/15/2024
6,2024/01/18 00:00
7,01/19/2024
8,01/26/2024
9,01/28/2024


In [11]:
con.execute("""CREATE OR REPLACE VIEW fitbit_daily_activity_clean_step2 AS
SELECT *,
COALESCE(
try_strptime(ActivityDate, '%Y-%m-%d'),
try_strptime(ActivityDate, '%m/%d/%Y'),
try_strptime(ActivityDate, '%d-%b-%Y'),
try_strptime(ActivityDate, '%Y/%m/%d %H:%M')
)::DATE AS activity_date
FROM fitbit_daily_activity_clean_step1""")

<_duckdb.DuckDBPyConnection at 0x123ce17b0>

In [12]:
con.execute("""SELECT COUNT(*) AS rows,
SUM(CASE WHEN activity_date IS NULL THEN 1 ELSE 0 END) AS unparsed_rows
FROM fitbit_daily_activity_clean_step2""").df()

Unnamed: 0,rows,unparsed_rows
0,14299,0.0


In [13]:
#steps distribution & validity
con.execute("""SELECT
MIN(TotalSteps) AS min_steps,
MAX(TotalSteps) AS max_steps,
AVG(TotalSteps) AS avg_steps,
approx_quantile(TotalSteps, 0.5) AS p50_steps,
approx_quantile(TotalSteps, 0.95) AS p95_steps,
approx_quantile(TotalSteps, 0.99) AS p99_steps
FROM fitbit_daily_activity_clean_step2""").df()

Unnamed: 0,min_steps,max_steps,avg_steps,p50_steps,p95_steps,p99_steps
0,0.0,15982.0,6803.975273,6781.850305,11107.772251,12790.998455


In [14]:
con.execute("""CREATE OR REPLACE VIEW fitbit_daily_activity_clean_step3 AS
SELECT *,
CASE
WHEN TotalSteps IS NULL THEN NULL
WHEN TotalSteps < 0 THEN NULL
ELSE TotalSteps
END AS steps_clean,
CASE
WHEN TotalSteps IS NULL THEN 0
WHEN TotalSteps < 0 THEN 1
WHEN TotalSteps > 100000 THEN 1
ELSE 0
END AS steps_flag_outlier
FROM fitbit_daily_activity_clean_step2""")

<_duckdb.DuckDBPyConnection at 0x123ce17b0>

In [15]:
con.execute("""SELECT COUNT(*) AS rows,
SUM(steps_flag_outlier) AS flagged_rows
FROM fitbit_daily_activity_clean_step3""").df()

Unnamed: 0,rows,flagged_rows
0,14299,0.0


In [16]:
#Inspecting Ranges
con.execute(""" SELECT
MIN(VeryActiveMinutes) AS min_very,
MAX(VeryActiveMinutes) AS max_very,
MIN(FairlyActiveMinutes) AS min_fairly,
MAX(FairlyActiveMinutes) AS max_fairly,
MIN(LightlyActiveMinutes) AS min_light,
MAX(LightlyActiveMinutes) AS max_light,
MIN(SedentaryMinutes) AS min_sedentary,
MAX(SedentaryMinutes) AS max_sedentary
FROM fitbit_daily_activity_clean_step3""").df()

Unnamed: 0,min_very,max_very,min_fairly,max_fairly,min_light,max_light,min_sedentary,max_sedentary
0,0.0,128.0,0.0,40.0,0.0,220.0,1137.0,1439.0


In [17]:
#Impossible Totals
con.execute("""SELECT COUNT(*) AS rows,
SUM(CASE 
WHEN 
COALESCE(VeryActiveMinutes,0) +
COALESCE(FairlyActiveMinutes,0) +
COALESCE(LightlyActiveMinutes,0) +
COALESCE(SedentaryMinutes,0) > 1440
THEN 1 ELSE 0
END) AS over_1440_rows
FROM fitbit_daily_activity_clean_step3""").df()

Unnamed: 0,rows,over_1440_rows
0,14299,6513.0


In [18]:
#flag
con.execute("""CREATE OR REPLACE VIEW fitbit_daily_activity_clean_step4 AS
SELECT*,
CASE
WHEN 
COALESCE(VeryActiveMinutes,0) +
COALESCE(FairlyActiveMinutes,0) +
COALESCE(LightlyActiveMinutes,0) +
COALESCE(SedentaryMinutes,0) > 1440
THEN 1
ELSE 0
END AS minutes_flag_over_1440 FROM fitbit_daily_activity_clean_step3""")

<_duckdb.DuckDBPyConnection at 0x123ce17b0>

In [19]:
con.execute("""SELECT COUNT(*) AS rows, SUM(minutes_flag_over_1440) AS flagged_rows
FROM fitbit_daily_activity_clean_step4""").df()

Unnamed: 0,rows,flagged_rows
0,14299,6513.0


6,513 rows (45.5%) have total activity minutes exceeding 1,440.
This is expected due to overlapping Fitbit activity categories.
Rows flagged but not modified.

In [20]:
con.execute("SELECT * FROM fitbit_daily_activity_clean_step4").df()

Unnamed: 0,Id,ActivityDate,TotalSteps,TrackerDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories,source,activity_date,steps_clean,steps_flag_outlier,minutes_flag_over_1440
0,U_5NQ4FMYZYCWT,01-Jan-2024,10233.0,7.31,,22.0,38.0,1225.0,2617.0,Fitbit,2024-01-01,10233.0,0,0
1,U_5NQ4FMYZYCWT,2024-01-02,7380.0,5.27,26.0,24.0,33.0,,1841.0,Fitbit,2024-01-02,7380.0,0,0
2,U_5NQ4FMYZYCWT,03-Jan-2024,11229.0,8.02,77.0,31.0,73.0,1234.0,2107.0,Fitbit,2024-01-03,11229.0,0,0
3,U_5NQ4FMYZYCWT,2024-01-04,6515.0,4.65,19.0,38.0,104.0,1306.0,2216.0,Fitbit,2024-01-04,6515.0,0,1
4,U_5NQ4FMYZYCWT,01/05/2024,10808.0,7.72,58.0,38.0,40.0,,2367.0,Fitbit,2024-01-05,10808.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14294,U_3GC9HOXDTPWD,27-Dec-2024,2918.0,2.08,0.0,18.0,,1278.0,2174.0,Fitbit,2024-12-27,2918.0,0,0
14295,U_3GC9HOXDTPWD,2024/12/28 00:00,4151.0,2.96,25.0,21.0,94.0,1293.0,1748.0,Fitbit,2024-12-28,4151.0,0,0
14296,U_3GC9HOXDTPWD,2024/12/29 00:00,,1.22,4.0,16.0,,1272.0,2085.0,Fitbit,2024-12-29,,0,0
14297,U_3GC9HOXDTPWD,2024/12/30 00:00,6421.0,4.59,28.0,2.0,158.0,1226.0,1946.0,Fitbit,2024-12-30,6421.0,0,0


In [21]:
con.execute("""COPY fitbit_daily_activity_clean_step4
TO 'data/clean/fitbit_daily_activity_clean.csv'
(HEADER, DELIMITER ',')
""")

<_duckdb.DuckDBPyConnection at 0x123ce17b0>

Saved cleaned Fitbit activity data to data/clean/fitbit_daily_activity_clean.csv.