In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Following along with Jaun Martinez II on YT

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')

In [None]:
# Prepare the data: read the dailyActivity_merged.csv export into the working DataFrame.
DAILY_ACTIVITY_CSV = '/kaggle/input/fitbit/mturkfitbit_export_4.12.16-5.12.16/Fitabase Data 4.12.16-5.12.16/dailyActivity_merged.csv'
df = pd.read_csv(DAILY_ACTIVITY_CSV)

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Column names exactly as they were read from the CSV.
df.columns

In [None]:
# Preview the first 10 rows.
df.head(10)

In [None]:
# Dtype pandas inferred for each column on load.
df.dtypes

In [None]:
# Cleaning phase: store Id as text and parse ActivityDate (month/day/year strings)
# into datetimes, then display the resulting dtypes.
df = df.assign(
    Id=df['Id'].astype(str),
    ActivityDate=pd.to_datetime(df['ActivityDate'], format='%m/%d/%Y'),
)
df.dtypes

In [None]:
# Gap between the total distance and the tracker-reported distance for each row.
df['distance_diff'] = df['TotalDistance'].sub(df['TrackerDistance'])

In [None]:
# How often each distinct distance_diff value occurs.
df['distance_diff'].value_counts()

In [None]:
# Rows where TotalDistance exceeds TrackerDistance (positive distance_diff).
df.query('distance_diff > 0.0')

In [None]:
# Normalise every column label to lower case, then show the result.
df.columns = df.columns.map(str.lower)
df.columns

In [None]:
# Convert the lower-cased column labels to snake_case.
# Fixes the misspelled target 'tacker_distance' -> 'tracker_distance'.
# The result is reassigned rather than using inplace=True, so the cell produces a
# new frame instead of mutating the existing one in place.
df = df.rename(columns={
    'activitydate': 'activity_date',
    'totalsteps': 'total_steps',
    'totaldistance': 'total_distance',
    'trackerdistance': 'tracker_distance',
    'loggedactivitiesdistance': 'logged_activities_distance',
    'veryactivedistance': 'very_active_distance',
    'moderatelyactivedistance': 'moderately_active_distance',
    'lightactivedistance': 'light_active_distance',
    'sedentaryactivedistance': 'sedentary_active_distance',
    'veryactiveminutes': 'very_active_minutes',
    'fairlyactiveminutes': 'fairly_active_minutes',
    'lightlyactiveminutes': 'lightly_active_minutes',
    'sedentaryminutes': 'sedentary_minutes',
})
df.columns

In [None]:
# Check for null values: number of missing entries in each column.
df.isnull().sum()

In [None]:
# Create columns derived from the activity date:
#   day_of_week   - weekday name
#   n_day_of_week - weekday number, 0 = Monday ... 6 = Sunday
day_of_week = df['activity_date'].dt.day_name()
df = df.assign(
    day_of_week=day_of_week,
    n_day_of_week=df['activity_date'].dt.weekday,
)

In [None]:
# A notebook cell only renders its final expression, so the preview has to be
# displayed explicitly; otherwise its output is silently discarded.
# `display` is provided by the IPython/Jupyter kernel.
display(df.head(4))

# Re-check missing values now that the derived columns exist.
df.isnull().sum()

In [None]:
# Check for duplicates: number of rows that repeat an earlier row in every column.
df.duplicated().sum()

In [None]:
# List the current column names.
df.columns

In [None]:
# Subset the data: keep only the columns used in the analysis.
# .copy() makes the subset an independent frame rather than a view of the original.
analysis_columns = [
    'id', 'activity_date', 'total_steps', 'total_distance',
    'very_active_minutes', 'fairly_active_minutes',
    'lightly_active_minutes', 'sedentary_minutes', 'calories',
    'distance_diff', 'day_of_week', 'n_day_of_week',
]
df = df.loc[:, analysis_columns].copy()

In [None]:
# Preview the first 3 rows.
df.head(3)

In [None]:
# Distinct user ids present in the data.
df['id'].unique()

In [None]:
# Number of records for each activity date.
df['activity_date'].value_counts()

In [None]:
# Analysis phase

# Category (based on each user's average daily steps)
# sedentary: 6000 or fewer on average
# active: more than 6000 and fewer than 12000 on average
# very active: 12000 or more on average

In [None]:
# Label every user by their average daily step count and attach that label to
# each of the user's rows in df:
#   sedentary   : average <= 6000
#   active      : 6000 < average < 12000
#   very_active : average >= 12000
SEDENTARY_MAX_STEPS = 6000
VERY_ACTIVE_MIN_STEPS = 12000

id_grp = df.groupby(['id'])
avg_steps_by_id = id_grp['total_steps'].mean().sort_values(ascending=False)

# Build the masks on the Series (1-D) rather than on a one-column DataFrame, so
# np.select returns a flat array that lines up with the index.
conditions = [
    avg_steps_by_id <= SEDENTARY_MAX_STEPS,
    (avg_steps_by_id > SEDENTARY_MAX_STEPS) & (avg_steps_by_id < VERY_ACTIVE_MIN_STEPS),
    avg_steps_by_id >= VERY_ACTIVE_MIN_STEPS,
]
values = ['sedentary', 'active', 'very_active']

id_avg_step = avg_steps_by_id.to_frame()
# np.select's implicit default is the integer 0, which cannot be combined with
# string choices on current NumPy (TypeError), so a string default is passed.
# It is only used when no condition matches, e.g. a NaN average.
id_avg_step['activity_level'] = np.select(conditions, values, default='unknown')

id_activity_level = id_avg_step['activity_level']

# Vectorised lookup of each row's id in the per-user labels.
df['activity_level'] = df['id'].map(id_activity_level)

In [None]:
# Number of records for each user id.
df['id'].value_counts()

In [None]:
# Summary statistics produced by DataFrame.describe().
df.describe()

In [None]:
# Share phase
# Scatter of total_steps against calories, coloured by activity_level.
ax = sns.scatterplot(data=df, x='total_steps', y='calories', hue='activity_level')
ax.set_title('Correlation Calories vs. Steps')
plt.show()

In [None]:
# Average steps per day of the week, with the overall average as a reference line.

day_of_week = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
fig, ax = plt.subplots(1, 1, figsize=(8, 5))

day_grp = df.groupby(['day_of_week'])
# groupby sorts its keys alphabetically (Friday, Monday, Saturday, ...). Reindex
# into calendar order so every bar sits under the day it actually represents,
# instead of relabelling alphabetically ordered bars with Sunday..Saturday.
avg_daily_steps = day_grp['total_steps'].mean().reindex(day_of_week)
# Overall mean of total_steps across all records, drawn as the reference line.
avg_steps = df['total_steps'].mean()

# The day names are used directly as bar positions, so matplotlib labels the
# ticks itself and no manual tick override is needed.
ax.bar(avg_daily_steps.index, avg_daily_steps)

ax.axhline(y=avg_steps, color='blue', label='Avg Daily Steps')

ax.set_ylabel('Steps')
ax.set_xlabel('Day of Week')
ax.set_title('Average Number of Steps Per Day')
ax.legend()
plt.show()

In [None]:
# Percentage of total recorded minutes spent at each activity intensity.

very_active_mins = df['very_active_minutes'].sum()
fairly_active_mins = df['fairly_active_minutes'].sum()
lightly_active_mins = df['lightly_active_minutes'].sum()
sedentary_mins = df['sedentary_minutes'].sum()

slices = [very_active_mins, fairly_active_mins, lightly_active_mins, sedentary_mins]

# Name each wedge by its category, in the same order as `slices`; previously the
# raw minute totals were used as labels, so the chart did not say which wedge
# was which.
labels = ['Very Active', 'Fairly Active', 'Lightly Active', 'Sedentary']

# Pull the last wedge (sedentary) slightly out of the pie.
explode = [0, 0, 0, 0.1]

plt.pie(slices, labels=labels, explode=explode, autopct='%1.1f%%')

plt.title('% of Activity Level in Minutes')
plt.show()

In [None]:
# Correlation between activity level in minutes and calories:
# a 2x2 grid with one scatter panel per minutes column, coloured by activity_level.
n_day_of_week = [0, 1, 2, 3, 4, 5, 6]

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(8, 9), dpi=70)

# Panels are filled row by row in this order.
minute_columns = [
    'sedentary_minutes',
    'lightly_active_minutes',
    'fairly_active_minutes',
    'very_active_minutes',
]
last_panel = len(minute_columns) - 1
for panel, (panel_ax, minutes_col) in enumerate(zip(axes.flat, minute_columns)):
    # Only the final panel draws a legend; it is repositioned below.
    sns.scatterplot(
        data=df,
        x='calories',
        y=minutes_col,
        hue='activity_level',
        ax=panel_ax,
        legend=(panel == last_panel),
    )

plt.legend(title='Activity Level', fontsize=12, title_fontsize=14, bbox_to_anchor=(1.8, 2.2))

fig.suptitle('Correlation Between Activity Level Minutes and Calories', x=0.5, y=0.92, fontsize=24)
plt.show()