Click [here](https://nbviewer.org/github/ismailmo1/fit-analytics/blob/main/gym_eda.ipynb) to view the notebook - Github doesn't render the Plotly charts since it performs a static render

In [1]:
import pandas as pd
import plotly.express as px
import plotly.io as pio

In [2]:
# renders plotly in nbviewer
pio.renderers.default = 'notebook_connected'

In [3]:
gym_df = pd.read_csv("fitnotes/FitNotes_Export_2022_02_09_18_51_35.csv")
print(gym_df.info())
gym_df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41783 entries, 0 to 41782
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           41783 non-null  object 
 1   Exercise       41783 non-null  object 
 2   Category       41783 non-null  object 
 3   Weight (kgs)   41770 non-null  float64
 4   Reps           41770 non-null  float64
 5   Distance       13 non-null     float64
 6   Distance Unit  13 non-null     object 
 7   Time           13 non-null     object 
 8   Comment        1001 non-null   object 
dtypes: float64(3), object(6)
memory usage: 2.9+ MB
None


Unnamed: 0,Date,Exercise,Category,Weight (kgs),Reps,Distance,Distance Unit,Time,Comment
0,2015-10-19,Deadlift,Back,105.0,5.0,,,,
1,2015-10-19,Cable Face Pull,Shoulders,25.0,14.0,,,,
2,2015-10-19,Cable Face Pull,Shoulders,25.0,14.0,,,,
3,2015-10-19,Cable Face Pull,Shoulders,25.0,14.0,,,,
4,2015-10-19,Lat Pulldown,Back,41.0,7.0,,,,


In [4]:
# I don't track any distance/cardio activities so I'm surprised to see 13 values - (I do train cardio I promise!)
cardio_mask = ~gym_df['Distance'].isna()
gym_df[cardio_mask]

Unnamed: 0,Date,Exercise,Category,Weight (kgs),Reps,Distance,Distance Unit,Time,Comment
7612,2016-09-14,Rowing Machine,Cardio,,,2.0,km,0:08:28,
7694,2016-09-17,Rowing Machine,Cardio,,,2.0,km,0:08:05,
8050,2016-10-14,Rowing Machine,Cardio,,,0.5,km,0:02:00,
16226,2017-12-16,Elbow Plank (Time),Abs,,,0.0,m,0:01:00,
16227,2017-12-16,Elbow Plank (Time),Abs,,,0.0,m,0:01:00,
16228,2017-12-16,Elbow Plank (Time),Abs,,,0.0,m,0:01:00,
16229,2017-12-16,Elbow Plank (Time),Abs,,,0.0,m,0:01:00,
23831,2019-04-06,StairMaster,Cardio,,,0.0,m,0:10:00,
23914,2019-04-11,StairMaster,Cardio,,,0.0,m,0:30:00,
23936,2019-04-12,StairMaster,Cardio,,,0.0,m,0:10:00,


In [5]:
# drop cardio entries
gym_df.drop(gym_df[cardio_mask].index, inplace=True)

#drop cardio related columns
gym_df.dropna(axis=1, how='all', inplace=True)
gym_df['Date'] = pd.to_datetime(gym_df['Date'])
gym_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41770 entries, 0 to 41782
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Date          41770 non-null  datetime64[ns]
 1   Exercise      41770 non-null  object        
 2   Category      41770 non-null  object        
 3   Weight (kgs)  41770 non-null  float64       
 4   Reps          41770 non-null  float64       
 5   Comment       1001 non-null   object        
dtypes: datetime64[ns](1), float64(2), object(3)
memory usage: 2.2+ MB


The app used for tracking allows you to enter comments for a particular set - i usually put in notes that say how difficult the set was, sometimes on a scale of 1-10, but the data is quite unstructured

In [6]:
gym_df['Comment'].value_counts()

8                                     22
10                                    21
8.5                                   18
9.5                                   18
Straps                                18
                                      ..
Mid and upper still rounding a bit     1
Bad form                               1
Failed 7th                             1
Seconds                                1
Dodgy second lockout (grip)            1
Name: Comment, Length: 678, dtype: int64

#### How many days of workout data do we have?

In [7]:
num_workouts = len(gym_df.groupby('Date'))
max_date = gym_df['Date'].max().date()
min_date = gym_df['Date'].min().date()
day_delta = ( max_date-min_date).days

print(f"The data spans from {min_date} until {max_date}"
        f" and I worked out (or atleast tracked a workout) on {round((num_workouts/day_delta) *100)}% of those days ({num_workouts} days)")

The data spans from 2015-10-19 until 2022-02-09 and I worked out (or atleast tracked a workout) on 76% of those days (1745 days)


#### Let's have a look at how often I workout and when

In [8]:
gym_df_days = gym_df.groupby('Date').count()
gym_df_days.drop(gym_df.columns[1:], inplace=True, axis=1)
gym_df_days.reset_index(inplace=True)
gym_df_days['Year'] = gym_df_days['Date'].apply(lambda x:x.year)
gym_df_days['Month'] = gym_df_days['Date'].apply(lambda x:x.month)
gym_df_days['Weekday'] = gym_df_days['Date'].apply(lambda x:x.weekday())

In [9]:
gym_df_days.head()

Unnamed: 0,Date,Year,Month,Weekday
0,2015-10-19,2015,10,0
1,2015-10-20,2015,10,1
2,2015-10-21,2015,10,2
3,2015-10-22,2015,10,3
4,2015-10-23,2015,10,4


#### interesting that 2015 and 2022 are higher % and we have fewer data points - maybe the start and end of the year are better and then it drops off?

In [10]:
gym_df_counts = gym_df_days.groupby('Year').agg({'Date':['max','min', 'count']})
gym_df_counts = gym_df_counts.droplevel(level=0, axis=1)
gym_df_counts['num_days'] = (gym_df_counts['max'] - gym_df_counts['min'])
gym_df_counts['num_days'] = gym_df_counts['num_days'].apply(lambda x:x.days)
gym_df_counts['days_worked_out_%'] = gym_df_counts['count']/gym_df_counts['num_days']
gym_df_counts['days_worked_out_%'] = gym_df_counts['days_worked_out_%'].apply(lambda x:round(x*100, 2))
gym_df_counts

Unnamed: 0_level_0,max,min,count,num_days,days_worked_out_%
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015,2015-12-17,2015-10-19,51,59,86.44
2016,2016-12-31,2016-01-03,293,363,80.72
2017,2017-12-31,2017-01-02,268,363,73.83
2018,2018-12-31,2018-01-02,263,363,72.45
2019,2019-12-31,2019-01-01,277,364,76.1
2020,2020-12-30,2020-01-02,284,363,78.24
2021,2021-12-30,2021-01-01,277,363,76.31
2022,2022-02-09,2022-01-01,32,39,82.05


In [11]:
gym_df_counts.reset_index(inplace=True)
fig = px.line(data_frame = gym_df_counts, x='Year', y='days_worked_out_%', hover_data = ['num_days'], title='Percentage of Days Worked Out')
fig.show()

so going to the gym often is cool but have I made any actual progress? Let's shift our focus to performance improvements

In [21]:
gym_df['Exercise'].value_counts().head(15)

Pull Up                         3656
Flat Barbell Bench Press        1674
Barbell Squat                   1438
V-Bar Push Down                 1146
Incline Dumbbell Bench Press    1126
Cable Crunch                    1041
Overhead Press                   941
Deadlift                         859
Sumo Deadlift                    816
Hammer Cable Curl                809
Seated Cable Row                 742
High Cable Fly                   731
Lateral Machine Raise            729
Lateral Dumbbell Raise           694
Rope Push Down                   642
Name: Exercise, dtype: int64

In [23]:
gym_df['Category'].value_counts()

Back          9960
Chest         7618
Shoulders     6082
Legs          6052
Triceps       5122
Biceps        4343
Abs           1854
Bodyweight     403
Neck           206
Forearms       130
Name: Category, dtype: int64

top 2 most common exercises per category

In [43]:
most_common_ex = gym_df.groupby('Category').apply(lambda x:x['Exercise'].value_counts()[0:2])
most_common_ex

Category                                
Abs         Cable Crunch                    1041
            Hanging Knee Raise               326
Back        Pull Up                         3656
            Deadlift                         859
Biceps      Hammer Cable Curl                809
            EZ-Bar Curl                      558
Bodyweight  Push Ups                         208
            Chin Up                          106
Chest       Flat Barbell Bench Press        1674
            Incline Dumbbell Bench Press    1126
Forearms    Forearm Curl                     130
Legs        Barbell Squat                   1438
            Sumo Deadlift                    816
Neck        Lying Neck Curl                  121
            Lying Neck Extension              85
Shoulders   Overhead Press                   941
            Lateral Machine Raise            729
Triceps     V-Bar Push Down                 1146
            Rope Push Down                   642
Name: Exercise, dtype: int64

In [44]:
# only focus on major muscle groups
most_common_ex.drop(['Abs', 'Bodyweight', 'Forearms', 'Neck'], inplace=True)

In [45]:
compound_ex = ['Deadlift', 'Sumo Deadlift', 'Barbell Squat', 'Flat Barbell Bench Press']

In [74]:
def plot_progress(exercise):
    """Plot line plot of exercise weight separated by reps"""
    
    exercise_df = gym_df[gym_df['Exercise'] == f'{exercise}'][['Date', 'Weight (kgs)', 'Reps']]
    exercise_df.drop(exercise_df[exercise_df['Reps']==0].index, inplace=True)
    return px.line(exercise_df, x='Date', y='Weight (kgs)', color= 'Reps', 
    markers=True, 
    title=f'{exercise} Progress', 
    range_x = [gym_df['Date'].min() - pd.DateOffset(months=1), gym_df['Date'].max()+ pd.DateOffset(months=1)])

In [75]:
for exercise in compound_ex:
    plot_progress(exercise).show()