## Strava Data EDA
EDA of my Strava run data during my training for the BMO marathon.

In [1]:
import pandas as pd
import altair as alt
import streamlit as st

## Data cleaning

In [2]:
# Strava API output of 3 most recent pages of data
data1 = pd.read_json('../data/raw/response_1.json')
data2 = pd.read_json('../data/raw/response_2.json')
data3 = pd.read_json('../data/raw/response_3.json')
data4 = pd.read_json('../data/raw/response_4.json')

data = pd.concat([data1,data2,data3,data4])

# Convert distance from m to km
data['distance'] = data['distance']/1000
# Convert time from seconds to minutes
data['moving_time'] = data['moving_time']/60
data['elapsed_time'] = data['elapsed_time']/60

# Convert speed from m/s to mins/km
data['average_speed'] = (1/data['average_speed'])/60*1000
data['max_speed'] = (1/data['max_speed'])/60*1000

# Separate date and time from date column
data['start_time'] = pd.to_datetime(data['start_date_local']).dt.time
data['start_date'] = pd.to_datetime(data['start_date_local']).dt.date
data['week'] = pd.to_datetime(data['start_date_local']).dt.isocalendar().week - (pd.to_datetime(data['start_date_local']).dt.isocalendar().day < 1)# start week on monday
data.head()

Unnamed: 0,resource_state,athlete,name,distance,moving_time,elapsed_time,total_elevation_gain,type,sport_type,workout_type,...,elev_low,upload_id,upload_id_str,external_id,from_accepted_tag,pr_count,total_photo_count,has_kudoed,start_time,week
0,2,"{'id': 86393046, 'resource_state': 1}",Afternoon Ride,19.2511,90.3,153.716667,137.3,Ride,Ride,,...,5.9,12176810000.0,12176810000.0,47B21E8B-D955-4E3D-B391-4FE21E469133.fit,False,6,0,False,17:37:08,19
1,2,"{'id': 86393046, 'resource_state': 1}",Lunch Run to Ella’s,5.2344,30.883333,31.833333,72.3,Run,Run,0.0,...,25.0,12170440000.0,12170440000.0,A6DF5D49-641C-40D0-A5E2-8A18594FEEB4.fit,False,0,0,False,11:48:58,19
2,2,"{'id': 86393046, 'resource_state': 1}",Morning Run,5.2954,30.433333,31.116667,53.8,Run,Run,,...,6.5,12170440000.0,12170440000.0,441E72E8-84E2-4E04-945E-F29F9E8A2C05.fit,False,0,0,False,10:54:13,19
3,2,"{'id': 86393046, 'resource_state': 1}",Morning Run,4.084,25.75,26.266667,35.0,Run,Run,0.0,...,6.7,12143610000.0,12143610000.0,E20A250F-247A-4E36-BBBC-4A2FD5D9B0BE.fit,False,0,0,False,07:56:39,19
4,2,"{'id': 86393046, 'resource_state': 1}",Vancouver BMO Marathon!,42.2,230.8,230.8,0.0,Run,Run,0.0,...,,,,,False,0,5,False,08:35:09,18


In [3]:
# Look at available cols and select those of interest
data.columns.unique()
cols = ['name', 'start_date', 'start_time', 'distance','moving_time',  
        'total_elevation_gain','type', 'kudos_count','comment_count',
         'max_speed', 'average_speed', 'average_heartrate', 'max_heartrate', 'week']
data_processed = data[cols]
data_processed.head()

Unnamed: 0,name,start_date,start_time,distance,moving_time,total_elevation_gain,type,kudos_count,comment_count,max_speed,average_speed,average_heartrate,max_heartrate,week
0,Afternoon Ride,2024-05-12,17:37:08,19.2511,90.3,137.3,Ride,13,0,2.221038,4.690872,150.7,198.0,19
1,Lunch Run to Ella’s,2024-05-12,11:48:58,5.2344,30.883333,72.3,Run,16,0,4.082966,5.899705,158.3,176.0,19
2,Morning Run,2024-05-11,10:54:13,5.2954,30.433333,53.8,Run,9,0,4.00641,5.747126,158.5,179.0,19
3,Morning Run,2024-05-09,07:56:39,4.084,25.75,35.0,Run,18,0,4.221547,6.305965,153.7,169.0,19
4,Vancouver BMO Marathon!,2024-05-05,08:35:09,42.2,230.8,0.0,Run,34,17,inf,5.469861,,,18


## Analysis of run data

In [4]:
runs = data_processed[data_processed['type']=='Run']
print(f'{runs.shape[0]} runs have been logged.')
runs.info()
runs.to_csv('../data/processed/processed.csv', index=False)

99 runs have been logged.
<class 'pandas.core.frame.DataFrame'>
Index: 99 entries, 1 to 29
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   name                  99 non-null     object 
 1   start_date            99 non-null     object 
 2   start_time            99 non-null     object 
 3   distance              99 non-null     float64
 4   moving_time           99 non-null     float64
 5   total_elevation_gain  99 non-null     float64
 6   type                  99 non-null     object 
 7   kudos_count           99 non-null     int64  
 8   comment_count         99 non-null     int64  
 9   max_speed             99 non-null     float64
 10  average_speed         99 non-null     float64
 11  average_heartrate     86 non-null     float64
 12  max_heartrate         86 non-null     float64
 13  week                  99 non-null     UInt32 
dtypes: UInt32(1), float64(7), int64(2), object(4)
memory us

In [5]:
aggregate_dictionary = {'distance':['sum','max'],'moving_time':'sum',
                       'total_elevation_gain':'sum', 'average_speed':'mean',
                       'average_heartrate': 'mean', 'kudos_count':'sum'}
weekly_dash = runs.groupby(['week'], as_index=False).aggregate(aggregate_dictionary)
weekly_dash.columns = ['week','Total Distance', 'Longest Run','moving_time','total_elevation_gain','average_speed',
                       'average_heartrate', 'kudos_count']
weekly_dash = weekly_dash[weekly_dash['week'] < 18]
weekly_dash.head()

Unnamed: 0,week,Total Distance,Longest Run,moving_time,total_elevation_gain,average_speed,average_heartrate,kudos_count
0,1,23.2434,10.06,134.866667,127.7,5.905161,160.433333,33
1,2,7.0613,6.2625,38.75,61.8,5.888238,147.45,11
2,4,27.2144,13.0123,155.983333,258.8,5.687797,162.366667,29
3,5,16.0192,7.0258,98.233333,165.1,6.093802,155.166667,26
4,6,50.8628,19.0402,285.6,378.8,5.533376,163.75,39


In [6]:
# Visualize weekly longest run only
single = alt.selection_single()
weekly_longest = alt.Chart(weekly_dash).mark_line(point=True, size=2).encode(
    alt.X('week:Q', title='Week #'),
    alt.Y('Longest Run:Q', title='Longest Run [km]'),
    color=alt.condition(single, alt.value('blue'),alt.value('lightgray')),
    tooltip = ['week', 'Longest Run']
).add_selection(
    single
)

weekly_longest



In [7]:
# Visualize weekly total distance only
single = alt.selection_single()
weekly_longest = alt.Chart(weekly_dash).mark_line(point=True, size=2).encode(
    alt.X('week:Q', title='Week #'),
    alt.Y('Total Distance:Q', title='Total Distance [km]'),
    color=alt.condition(single, alt.value('blue'),alt.value('lightgray')),
    tooltip = ['week', 'Total Distance']
).add_selection(
    single
)

weekly_longest



In [8]:
weekly_dash = pd.melt(weekly_dash, 
                      id_vars=['week', 'moving_time','total_elevation_gain','average_speed',
                       'average_heartrate', 'kudos_count'],
                      value_vars=['Longest Run', 'Total Distance'])

In [9]:
# Weekly summary view
single = alt.selection_single()
weekly_summary = alt.Chart(weekly_dash).mark_line(point=True, size=2).encode(
    alt.X('week:Q', title='Week #'),
    alt.Y('value:Q', title='Distance [km]'),
    color=alt.condition(single, 'variable',alt.value('lightgray')),
    tooltip = ['week', 'value']
).add_selection(
    single
)

weekly_summary



In [10]:
# Weekly view
week_num = 10
individual_week = runs[runs['week'] == week_num]
individual_week

Unnamed: 0,name,start_date,start_time,distance,moving_time,total_elevation_gain,type,kudos_count,comment_count,max_speed,average_speed,average_heartrate,max_heartrate,week
8,Afternoon Run,2024-03-10,13:45:37,27.0244,144.833333,109.7,Run,17,2,1.948634,5.359057,171.2,187.0,10
9,Evening Run,2024-03-06,17:56:46,5.4203,28.35,47.9,Run,12,1,3.620041,5.229578,,,10
10,Morning Run,2024-03-05,07:09:46,10.0448,53.283333,49.2,Run,10,0,2.855837,5.304477,162.8,174.0,10
