In [120]:
from pymongo import MongoClient
import json
import pandas as pd
from dataprep.eda import create_report
import warnings
warnings.filterwarnings("ignore")

In [121]:
# Read the MongoDB login from a JSON file kept outside the notebook so the
# credentials never appear in a cell.
with open('../credentials.json') as f:
    data = json.load(f)

# The file handle is closed by now; only the parsed dict is needed.
username = data['username']
password = data['password']

In [122]:
# Connect to the local MongoDB instance and select the `rais` database.
# The credentials are passed as keyword arguments rather than interpolated
# into the URI: inside a URI they would have to be percent-escaped, and a
# password containing '@', ':', '/' or '%' would otherwise corrupt the
# connection string.
# NOTE(review): assumes credentials.json stores the raw (un-escaped) values.
client = MongoClient('mongodb://127.0.0.1', username=username, password=password)
db = client.rais

# Read steps and aggregate them to hourly totals

In [123]:
# Build one hourly step-count frame per participant and stack them into `df`.
users = db.fitbit.distinct('id')

# Collect the per-user frames and concatenate once at the end; calling
# pd.concat inside the loop re-copies everything gathered so far each time.
hourly_frames = []

for user in users:
    user_data = list(db.fitbit.find({"id": user, "type": "steps"}))
    if not user_data:
        # Nothing to aggregate for this participant.
        continue

    # Construct the dataframe (one row per step record)
    user_df = pd.DataFrame({
        'date': [obj["data"]["dateTime"] for obj in user_data],
        'steps': [obj["data"]["value"] for obj in user_data],
    })

    # Preprocessing
    user_df['date'] = pd.to_datetime(user_df['date'])
    user_df['steps'] = pd.to_numeric(user_df['steps'])

    # Sum only the numeric column per hour; the id is attached afterwards so
    # it never goes through the aggregation.
    user_df = user_df.resample('H', on='date')[['steps']].sum()

    # Every document in user_data was selected with {"id": user}, so the loop
    # variable is the id for all rows (no reliance on a leaked inner variable).
    user_df['id'] = user

    hourly_frames.append(user_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(hourly_frames) if hourly_frames else pd.DataFrame()

In [124]:
# Persist the hourly steps frame; `df` is rebound to the exercise data in a
# later cell, and this file is read back from disk for the merge.
df.to_pickle('../data/daily_hourly_fitbit_types/steps_hourly') 

# Read exercise sessions and build an hourly exercise flag

In [125]:
# Build one frame of exercise sessions per participant and stack them into `df`
# with columns date (session start), id, exercise (activity name).
users = db.fitbit.distinct('id')

# Collect the per-user frames and concatenate once at the end; calling
# pd.concat inside the loop re-copies everything gathered so far each time.
session_frames = []

for user in users:
    user_data = list(db.fitbit.find({"id": user, "type": "exercise"}))
    if not user_data:
        # Participant logged no exercise; skip instead of concatenating an
        # empty frame.
        continue

    # Construct the dataframe (one row per exercise session). Every document
    # was selected with {"id": user}, so the loop variable is the id.
    user_df = pd.DataFrame({
        'date': [obj["data"]["startTime"] for obj in user_data],
        'id': [user] * len(user_data),
        'exercise': [obj["data"]["activityName"] for obj in user_data],
    })

    # Preprocessing
    user_df['date'] = pd.to_datetime(user_df['date'])

    session_frames.append(user_df)

# pd.concat raises on an empty list; keep the expected columns in that case so
# the cells below still find `date`, `id` and `exercise`.
if session_frames:
    df = pd.concat(session_frames)
else:
    df = pd.DataFrame(columns=['date', 'id', 'exercise'])

In [126]:
# Inspect the stacked exercise sessions (date, id, exercise).
df

Unnamed: 0,date,id,exercise
0,2021-05-24 10:40:03,621e2e8e67b776a24055b564,Walk
1,2021-05-26 09:46:21,621e2e8e67b776a24055b564,Walk
2,2021-05-28 10:25:42,621e2e8e67b776a24055b564,Walk
3,2021-05-29 11:10:36,621e2e8e67b776a24055b564,Walk
4,2021-06-01 10:17:39,621e2e8e67b776a24055b564,Walk
...,...,...,...
84,2021-07-20 22:37:36,621e375b67b776a240290cdc,Walk
85,2021-07-23 13:56:01,621e375b67b776a240290cdc,Workout
86,2021-07-25 12:22:45,621e375b67b776a240290cdc,Walk
87,2021-07-28 15:00:43,621e375b67b776a240290cdc,Walk


In [127]:
# Number of sessions per activity name.
df['exercise'].value_counts()

Walk                2900
Bike                 255
Workout              178
Sport                170
Aerobic Workout      141
Run                  127
Swim                  65
Yoga/Pilates          52
Weights               44
Treadmill             37
Circuit Training      37
Elliptical            14
Hike                  13
Martial Arts           6
Interval Workout       5
Spinning               3
Bootcamp               2
Tennis                 2
Name: exercise, dtype: int64

In [128]:
# Number of sessions with a missing activity name.
df['exercise'].isna().sum()

0

In [129]:
# Optional: open a dataprep EDA report for `df` in the browser (left disabled).
#create_report(df).show_browser()

In [130]:
# Collapse the activity name into a binary "exercise happened" flag and snap
# each session start to the nearest full hour.
# NOTE(review): the steps frame is built with resample('H'), which labels each
# bin by its start (floor), while this rounds to the NEAREST hour, so a session
# starting at 10:40 is flagged at 11:00. Confirm that this is intended.
df['exercise'] = 1
df['date'] = df['date'].dt.round('H')

# Several sessions of one participant can land on the same hour. Keep a single
# row per (date, id); duplicate keys would duplicate the matching hourly rows
# in the merge with the steps frame.
df = df.drop_duplicates(subset=['date', 'id'])
df

Unnamed: 0,date,id,exercise
0,2021-05-24 11:00:00,621e2e8e67b776a24055b564,1
1,2021-05-26 10:00:00,621e2e8e67b776a24055b564,1
2,2021-05-28 10:00:00,621e2e8e67b776a24055b564,1
3,2021-05-29 11:00:00,621e2e8e67b776a24055b564,1
4,2021-06-01 10:00:00,621e2e8e67b776a24055b564,1
...,...,...,...
84,2021-07-20 23:00:00,621e375b67b776a240290cdc,1
85,2021-07-23 14:00:00,621e375b67b776a240290cdc,1
86,2021-07-25 12:00:00,621e375b67b776a240290cdc,1
87,2021-07-28 15:00:00,621e375b67b776a240290cdc,1


In [131]:
# Reload the hourly steps frame saved earlier in this notebook; `df` has since
# been rebound to the exercise data.
steps = pd.read_pickle('../data/daily_hourly_fitbit_types/steps_hourly')
steps

Unnamed: 0_level_0,steps,id
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-05-24 00:00:00,134,621e2e8e67b776a24055b564
2021-05-24 01:00:00,0,621e2e8e67b776a24055b564
2021-05-24 02:00:00,0,621e2e8e67b776a24055b564
2021-05-24 03:00:00,15,621e2e8e67b776a24055b564
2021-05-24 04:00:00,0,621e2e8e67b776a24055b564
...,...,...
2021-08-01 20:00:00,243,621e375b67b776a240290cdc
2021-08-01 21:00:00,704,621e375b67b776a240290cdc
2021-08-01 22:00:00,436,621e375b67b776a240290cdc
2021-08-01 23:00:00,205,621e375b67b776a240290cdc


In [132]:
# Outer-join the hourly steps with the exercise flags on (date, id): hours
# without an exercise entry get NaN in `exercise`, and exercise hours without
# step data get NaN in `steps`.
df_new = pd.merge(steps, df, how='outer', on=['date', 'id'])
df_new

Unnamed: 0,date,steps,id,exercise
0,2021-05-24 00:00:00,134.0,621e2e8e67b776a24055b564,
1,2021-05-24 01:00:00,0.0,621e2e8e67b776a24055b564,
2,2021-05-24 02:00:00,0.0,621e2e8e67b776a24055b564,
3,2021-05-24 03:00:00,15.0,621e2e8e67b776a24055b564,
4,2021-05-24 04:00:00,0.0,621e2e8e67b776a24055b564,
...,...,...,...,...
128764,2021-07-26 10:00:00,,621e32af67b776a24045b4cf,1.0
128765,2021-07-26 08:00:00,,621e32af67b776a24045b4cf,1.0
128766,2021-10-11 18:00:00,,621e339967b776a240e502de,1.0
128767,2021-10-09 12:00:00,,621e339967b776a240e502de,1.0


In [133]:
# Count of flagged exercise rows after the merge.
df_new['exercise'].value_counts()

1.0    4051
Name: exercise, dtype: int64

In [134]:
# Same count on the pre-merge exercise frame, for comparison with the cell above.
df['exercise'].value_counts()

1    4051
Name: exercise, dtype: int64

In [135]:
# Rows of the merged frame that have no exercise entry.
df_new['exercise'].isna().sum()

124718

In [136]:
# Rows without an exercise entry came out of the merge as NaN: mark them 0,
# then keep only the key columns and the flag (the steps column is dropped).
df_new = df_new.assign(exercise=df_new['exercise'].fillna(0))[['date', 'id', 'exercise']]
df_new

Unnamed: 0,date,id,exercise
0,2021-05-24 00:00:00,621e2e8e67b776a24055b564,0.0
1,2021-05-24 01:00:00,621e2e8e67b776a24055b564,0.0
2,2021-05-24 02:00:00,621e2e8e67b776a24055b564,0.0
3,2021-05-24 03:00:00,621e2e8e67b776a24055b564,0.0
4,2021-05-24 04:00:00,621e2e8e67b776a24055b564,0.0
...,...,...,...
128764,2021-07-26 10:00:00,621e32af67b776a24045b4cf,1.0
128765,2021-07-26 08:00:00,621e32af67b776a24045b4cf,1.0
128766,2021-10-11 18:00:00,621e339967b776a240e502de,1.0
128767,2021-10-09 12:00:00,621e339967b776a240e502de,1.0


In [137]:
# Persist the hourly exercise flag frame (date, id, exercise).
df_new.to_pickle('../data/daily_hourly_fitbit_types/exercise_hourly') 

In [138]:
# Earliest timestamp in the hourly exercise frame.
df_new['date'].min()

Timestamp('2021-05-24 00:00:00')

In [139]:
# Latest timestamp in the hourly exercise frame.
df_new['date'].max()

Timestamp('2022-01-22 00:00:00')