## Strava Data EDA
Exploratory data analysis (EDA) of my Strava run data, recorded while training for the BMO marathon.

In [2]:
import pandas as pd
import altair as alt
import streamlit as st

## Data cleaning

In [3]:
# Strava API output of 3 most recent pages of data
data1 = pd.read_json('../data/raw/response-1.json')
data2 = pd.read_json('../data/raw/response-2.json')
data3 = pd.read_json('../data/raw/response-3.json')

# ignore_index=True: the pages' row labels overlap, so a plain concat leaves
# duplicate index labels and makes label-based lookups ambiguous.
data = pd.concat([data1, data2, data3], ignore_index=True)

# Convert distance from m to km
data['distance'] = data['distance'] / 1000
# Convert time from seconds to minutes
data['moving_time'] = data['moving_time'] / 60
data['elapsed_time'] = data['elapsed_time'] / 60

# Convert speed from m/s to pace in mins/km
data['average_speed'] = (1 / data['average_speed']) / 60 * 1000
data['max_speed'] = (1 / data['max_speed']) / 60 * 1000

# Parse the local start timestamp once, then derive time, date and week from it
start_local = pd.to_datetime(data['start_date_local'])
data['start_time'] = start_local.dt.time
data['start_date'] = start_local.dt.date
# ISO weeks already start on Monday and the ISO day runs 1-7, so the former
# `- (day < 1)` adjustment always subtracted 0 and has been dropped.
# NOTE(review): ISO week numbers restart each year - confirm all activities
# fall within a single ISO year before comparing week numbers.
data['week'] = start_local.dt.isocalendar().week
data.head()

Unnamed: 0,resource_state,athlete,name,distance,moving_time,elapsed_time,total_elevation_gain,type,sport_type,workout_type,...,elev_low,upload_id,upload_id_str,external_id,from_accepted_tag,pr_count,total_photo_count,has_kudoed,start_time,week
0,2,"{'id': 86393046, 'resource_state': 1}",Morning Run,9.2971,54.033333,54.55,59.5,Run,Run,0.0,...,6.3,11963357393,11963357393,FC7B0D1D-7C70-409C-88D7-94A0D85AC8DA.fit,False,0,0,False,06:52:49,16
1,2,"{'id': 86393046, 'resource_state': 1}",Ella’s bdayyyyyyy run!,32.5679,180.05,199.833333,290.1,Run,Run,0.0,...,6.6,11951061901,11951061901,47CBE8DD-AE99-49D5-8635-095E61143DEE-activity.fit,False,5,2,False,09:23:42,15
2,2,"{'id': 86393046, 'resource_state': 1}",Morning Run,4.5291,25.9,41.566667,0.0,Run,Run,0.0,...,0.0,11941358135,11941358135,F57A2AD5-5660-4DC5-98FD-F687806F3771.fit,False,0,0,False,09:17:26,15
3,2,"{'id': 86393046, 'resource_state': 1}",Morning Run,5.3553,29.116667,29.583333,53.9,Run,Run,0.0,...,6.5,11931684858,11931684858,0734234E-D3E3-4E41-9B11-6D6BF7B85F18.fit,False,0,0,False,07:20:15,15
4,2,"{'id': 86393046, 'resource_state': 1}",Afternoon Run,6.0495,33.066667,34.25,53.4,Run,Run,0.0,...,7.9,11920501352,11920501352,59CF4654-19C8-4357-B9AB-8303F50F659F.fit,False,2,1,False,17:34:46,15


In [4]:
# Keep only the columns of interest for the analysis below.
# (The previous bare `data.columns.unique()` call was removed: its result was
# discarded because it was not the last expression in the cell.)
cols = ['name', 'start_date', 'start_time', 'distance', 'moving_time',
        'total_elevation_gain', 'type', 'kudos_count', 'comment_count',
        'max_speed', 'average_speed', 'average_heartrate', 'max_heartrate', 'week']
data_processed = data[cols]
data_processed.head()

Unnamed: 0,name,start_date,start_time,distance,moving_time,total_elevation_gain,type,kudos_count,comment_count,max_speed,average_speed,average_heartrate,max_heartrate,week
0,Morning Run,2024-04-16,06:52:49,9.2971,54.033333,59.5,Run,20,2,3.405531,5.811251,160.9,172.0,16
1,Ella’s bdayyyyyyy run!,2024-04-14,09:23:42,32.5679,180.05,290.1,Run,28,2,3.043584,5.527916,,,15
2,Morning Run,2024-04-13,09:17:26,4.5291,25.9,0.0,Run,9,0,2.214251,5.719515,159.4,191.0,15
3,Morning Run,2024-04-12,07:20:15,5.3553,29.116667,53.9,Run,15,0,4.113195,5.437738,165.7,190.0,15
4,Afternoon Run,2024-04-10,17:34:46,6.0495,33.066667,53.4,Run,17,3,3.507295,5.466273,161.2,186.0,15


## Analysis of run data

In [5]:
# Restrict the processed activities to those whose type is 'Run'
is_run = data_processed['type'].eq('Run')
runs = data_processed.loc[is_run]

run_count = len(runs)
print(f'{run_count} runs have been logged.')
runs.info()

# Persist the filtered runs; the row index is not written to the file
runs.to_csv('../data/processed/processed.csv', index=False)

73 runs have been logged.
<class 'pandas.core.frame.DataFrame'>
Index: 73 entries, 0 to 28
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   name                  73 non-null     object 
 1   start_date            73 non-null     object 
 2   start_time            73 non-null     object 
 3   distance              73 non-null     float64
 4   moving_time           73 non-null     float64
 5   total_elevation_gain  73 non-null     float64
 6   type                  73 non-null     object 
 7   kudos_count           73 non-null     int64  
 8   comment_count         73 non-null     int64  
 9   max_speed             73 non-null     float64
 10  average_speed         73 non-null     float64
 11  average_heartrate     61 non-null     float64
 12  max_heartrate         61 non-null     float64
 13  week                  73 non-null     UInt32 
dtypes: UInt32(1), float64(7), int64(2), object(4)
memory us

In [9]:
# Weekly roll-up of the runs. Named aggregation ties every output column to its
# source column and function, so labels cannot drift out of step with the spec
# (the previous version overwrote the flattened column names by position).
aggregate_dictionary = {
    'Total Distance': ('distance', 'sum'),
    'Longest Run': ('distance', 'max'),
    'moving_time': ('moving_time', 'sum'),
    'total_elevation_gain': ('total_elevation_gain', 'sum'),
    'average_speed': ('average_speed', 'mean'),
    'average_heartrate': ('average_heartrate', 'mean'),
    'kudos_count': ('kudos_count', 'sum'),
}
weekly_dash = runs.groupby(['week'], as_index=False).agg(**aggregate_dictionary)

# Only weeks strictly below this number are kept.
# NOTE(review): presumably the week after the training block ends - confirm.
week_cutoff = 18
weekly_dash = weekly_dash[weekly_dash['week'] < week_cutoff]
weekly_dash.head()

Unnamed: 0,week,Total Distance,Longest Run,moving_time,total_elevation_gain,average_speed,average_heartrate,kudos_count
0,1,23.2434,10.06,134.866667,127.7,5.905161,160.433333,33
1,2,7.0613,6.2625,38.75,61.8,5.888238,147.45,11
2,4,27.2144,13.0123,155.983333,258.8,5.687797,162.366667,29
3,5,16.0192,7.0258,98.233333,165.1,6.093802,155.166667,26
4,6,50.8628,19.0402,285.6,378.8,5.533376,163.75,39


In [13]:
# Line chart of the longest run per week.
# NOTE(review): selection_single / add_selection are deprecated in Altair 5
# (selection_point / add_params) - confirm the installed version before migrating.
single = alt.selection_single()

# Blue while the selection condition holds, light gray otherwise
highlight_color = alt.condition(single, alt.value('blue'), alt.value('lightgray'))

weekly_longest = (
    alt.Chart(weekly_dash)
    .mark_line(point=True, size=2)
    .encode(
        x=alt.X('week:Q', title='Week #'),
        y=alt.Y('Longest Run:Q', title='Longest Run [km]'),
        color=highlight_color,
        tooltip=['week', 'Longest Run'],
    )
    .add_selection(single)
)

weekly_longest



In [15]:
# Line chart of the total distance per week.
# NOTE(review): this rebinds `weekly_longest`, replacing the longest-run chart
# built earlier; consider a dedicated name once all users of it are known.
single = alt.selection_single()

# Blue while the selection condition holds, light gray otherwise
highlight_color = alt.condition(single, alt.value('blue'), alt.value('lightgray'))

weekly_longest = (
    alt.Chart(weekly_dash)
    .mark_line(point=True, size=2)
    .encode(
        x=alt.X('week:Q', title='Week #'),
        y=alt.Y('Total Distance:Q', title='Total Distance [km]'),
        color=highlight_color,
        tooltip=['week', 'Total Distance'],
    )
    .add_selection(single)
)

weekly_longest



In [16]:
# Reshape to long form so both distance metrics can be drawn on one chart:
# pd.melt puts the metric name in 'variable' and its number in 'value'.
distance_metrics = ['Longest Run', 'Total Distance']

# This cell rebinds weekly_dash, so on a second run the metric columns are
# already melted away and pd.melt would raise KeyError. Only melt while the
# wide columns are still present, which makes the cell safe to re-run.
if set(distance_metrics).issubset(weekly_dash.columns):
    weekly_dash = pd.melt(weekly_dash,
                          id_vars=['week', 'moving_time', 'total_elevation_gain',
                                   'average_speed', 'average_heartrate', 'kudos_count'],
                          value_vars=distance_metrics)

In [17]:
# Weekly summary view: one line per melted metric, both on a shared distance axis
single = alt.selection_single()

# Colour by the 'variable' column while the selection condition holds,
# light gray otherwise
metric_color = alt.condition(single, 'variable', alt.value('lightgray'))

weekly_summary = (
    alt.Chart(weekly_dash)
    .mark_line(point=True, size=2)
    .encode(
        x=alt.X('week:Q', title='Week #'),
        y=alt.Y('value:Q', title='Distance [km]'),
        color=metric_color,
        tooltip=['week', 'value'],
    )
    .add_selection(single)
)

weekly_summary



In [18]:
# Weekly view: every run logged in the selected week number
week_num = 10
in_selected_week = runs['week'].eq(week_num)
individual_week = runs.loc[in_selected_week]
individual_week

Unnamed: 0,name,start_date,start_time,distance,moving_time,total_elevation_gain,type,kudos_count,comment_count,max_speed,average_speed,average_heartrate,max_heartrate,week
21,Afternoon Run,2024-03-10,13:45:37,27.0244,144.833333,109.7,Run,17,2,1.948634,5.359057,171.2,187.0,10
22,Evening Run,2024-03-06,17:56:46,5.4203,28.35,47.9,Run,12,1,3.620041,5.229578,,,10
23,Morning Run,2024-03-05,07:09:46,10.0448,53.283333,49.2,Run,10,0,2.855837,5.304477,162.8,174.0,10
