In [139]:
import json
import requests
import time
import os
import pandas as pd

### making data a bit easier to see
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

### should be outside the repo
### The defaults below can be overridden without editing the notebook by setting the
### STRAVA_CRED_LOCATION / STRAVA_DATA_LOCATION environment variables before starting the kernel.
### os.path.join(path, '') guarantees a trailing separator, which the rest of the
### notebook relies on because it builds file paths with plain string concatenation.
cred_location = os.path.join(os.environ.get('STRAVA_CRED_LOCATION', r'/Users/jj/code/strava_creds/'), '')
data_location = os.path.join(os.environ.get('STRAVA_DATA_LOCATION', r'/Users/jj/code/strava_data/'), '')

## Authorisation
This code makes sure you have the tokens needed to authenticate with the Strava API.

Full documentation can be found at https://developers.strava.com/

In [83]:
### You need to have created an app on strava - this is much easier than it sounds.
### Go through the first couple of steps here if unsure: 
###https://medium.com/@annthurium/getting-started-with-the-strava-api-a-tutorial-f3909496cd2d

### These client tokens don't change, so you should only have to enter these once. 
def get_client_tokens():
    """Prompt for the Strava app credentials and save them to client_tokens.json.

    Asks interactively for the client id, client secret and redirect uri, then
    writes them as strings to ``cred_location + 'client_tokens.json'``.

    Returns:
        None
    """
    # Local import: getpass is only needed here, and it keeps the secret from
    # being echoed into (and saved with) the notebook output.
    from getpass import getpass

    print('please go here https://www.strava.com/settings/api, and copy the token information as prompted')
    # strip() removes whitespace that is easily picked up when copy-pasting
    client_id = input('please enter your client id').strip()
    client_secret = getpass('please enter your client secret').strip()
    redirect_uri = input('please enter a redirect uri (use http://localhost/ if unsure)').strip()

    client_tokens = {
        'client_id': client_id,
        'client_secret': client_secret,
        # fall back to the value suggested in the prompt when nothing is entered
        'redirect_uri': redirect_uri or 'http://localhost/'
    }

    token_path = cred_location+'client_tokens.json'
    # make sure the credentials folder exists so the write below cannot fail on a fresh machine
    os.makedirs(os.path.dirname(token_path) or '.', exist_ok=True)
    with open(token_path, 'w') as outfile:
        json.dump(client_tokens, outfile, indent=4)

    print("client tokens saved")
    return None

### This only checks that the client tokens file exists and has the expected keys, not that the tokens are valid.
### client_id, client_secret and redirect_uri should all be held in a json file called client_tokens.json
def check_client_tokens():
    """Report whether client_tokens.json exists and holds exactly the expected keys.

    The values themselves are not validated.

    Returns:
        bool: True when the file can be read as a JSON object whose keys are
        exactly client_id, client_secret and redirect_uri; False otherwise
        (missing/unreadable file, invalid JSON, or different keys).
    """
    expected_keys = {'client_id', 'client_secret', 'redirect_uri'}

    try:
        with open(cred_location+'client_tokens.json') as infile:
            client_tokens = json.load(infile)
    except (OSError, ValueError):
        # OSError: file missing or unreadable; ValueError: content is not valid JSON
        return False

    # a JSON list or scalar has no keys to compare, so treat it as "not found"
    if isinstance(client_tokens, dict) and set(client_tokens) == expected_keys:
        print('Client Tokens Found')
        return True
    return False


def prompt_authorization():
    """Run the interactive authorization step and save the resulting access tokens.

    Reads the saved client tokens, prints the authorization URL, asks the user to
    paste back the code from the redirected URL, exchanges that code for tokens
    and writes the response (plus the requested scopes) to access_tokens.json.

    Raises:
        RuntimeError: if the token exchange is rejected. Nothing is written in
            that case, so an error payload can never be mistaken for a token file.
    """
    # local import keeps this block self-contained
    from urllib.parse import quote

    with open(cred_location+'client_tokens.json') as infile:
        client_tokens = json.load(infile)

    ### necessary to get athlete activities
    scopes = ','.join(['profile:read_all', 'activity:read_all', 'read_all'])

    # Authorization URL (https, with the redirect uri percent-encoded so that
    # characters such as '&' or '?' in it cannot break the query string)
    request_url = f'https://www.strava.com/oauth/authorize?client_id={client_tokens["client_id"]}' \
                  f'&response_type=code&redirect_uri={quote(client_tokens["redirect_uri"], safe="")}' \
                  f'&approval_prompt=force' \
                  f'&scope={scopes}'

    # This will prompt you and ask for the code in the url
    print('Click here:', request_url)
    print('Please authorize the app and copy&paste below the generated code!')
    print('P.S: you can find the code in the URL between "code=" and the nexrt "&"')
    code = input('Insert the code from the url: ').strip()

    # Get the access token
    token = requests.post(url='https://www.strava.com/api/v3/oauth/token',
                          data={'client_id': client_tokens["client_id"],
                                'client_secret': client_tokens["client_secret"],
                                'code': code,
                                'grant_type': 'authorization_code'},
                          timeout=30)

    if token.status_code >= 300:
        print('Authorisation Error')
        print(token.text)
        raise RuntimeError('token exchange failed, access_tokens.json was not written')

    access_tokens = token.json()

    # save the token with the applied for scope in the cred_location
    access_tokens.update({'scopes': scopes})

    with open(cred_location+'access_tokens.json', 'w') as outfile:
        json.dump(access_tokens, outfile, indent=4)
    print('Authorisation Complete')



### access tokens expire every 6 hours.
### the access token can be refreshed using this code.
### it will have the same scope as the original access code.
def refresh_authorization():
    """Exchange the saved refresh token for a new access token.

    Reads client_tokens.json and access_tokens.json, posts a refresh_token grant
    and, on success, overwrites access_tokens.json with the response plus the
    scopes copied from the previous token file.

    Returns:
        bool: True when the new token was saved, False when the request was
        rejected (the existing access_tokens.json is left untouched).
    """
    # get previously saved tokens
    with open(cred_location+'client_tokens.json') as infile:
        client_tokens = json.load(infile)
    with open(cred_location+'access_tokens.json') as infile:
        access_tokens = json.load(infile)

    refresh_url = "https://www.strava.com/oauth/token"

    payload = {
        'client_secret': client_tokens['client_secret'],
        'client_id': client_tokens['client_id'],
        'refresh_token': access_tokens['refresh_token'],
        'grant_type': 'refresh_token',
        'f': 'json'
    }

    print("Requesting Token...\n")
    # TLS certificate verification is left at its default (enabled): this request
    # carries the client secret and the refresh token.
    res = requests.post(refresh_url, data=payload, timeout=30)

    if res.status_code >= 300:
        print('Authorisation Error')
        # use the raw text: an error body is not guaranteed to be JSON
        print(res.text)
        return False

    access_token = res.json()

    # save the new token and copy the scope of the old token
    access_token.update({'scopes': access_tokens['scopes']})

    with open(cred_location+'access_tokens.json', 'w') as outfile:
        json.dump(access_token, outfile, indent=4)
    print('Authorisation Complete')

    return True





def authorization_flow(max_attempts=5):
    """Make sure a usable access token is saved, prompting or refreshing as needed.

    Each pass checks, in order: the client tokens file, the access tokens file,
    the keys in that file, and the expiry time. The first problem found is
    repaired (by asking for client tokens, prompting for authorization, or
    refreshing the token) and the checks are run again.

    Args:
        max_attempts: maximum number of check/repair passes before giving up, so
            a refresh or authorization that keeps failing cannot loop forever.

    Returns:
        None, once a token that has not yet expired is found.

    Raises:
        RuntimeError: if no valid token is available after ``max_attempts`` passes.
    """
    # 'expires_at' is required as well because it is read below
    required_keys = ('access_token', 'scopes', 'refresh_token', 'expires_at')
    access_path = cred_location+'access_tokens.json'
    refresh_attempted = False

    for _ in range(max_attempts):

        ### Check if we have client tokens
        if not check_client_tokens():
            print('client tokens not found')
            get_client_tokens()
            continue

        ### check we have an access token
        if not os.path.isfile(access_path):
            print('access_token.json file missing')
            prompt_authorization()
            continue

        try:
            with open(access_path) as infile:
                tokens = json.load(infile)
        except ValueError:
            # unreadable JSON is handled like a file with missing data
            tokens = {}

        ### check if we have the right keys in the access token file
        if not isinstance(tokens, dict) or not all(key in tokens for key in required_keys):
            print('data missing from access_tokens.json')
            prompt_authorization()
            continue

        ### check if the access token is still valid
        if time.time() < int(tokens['expires_at']):
            print((int(tokens['expires_at']) - int(time.time())) / 60, 'minutes until token expires')
            return None

        if not refresh_attempted:
            print('access_code_expired, requesting refresh')
            refresh_attempted = True
            if refresh_authorization() is not False:
                continue

        # the refresh was rejected, or the token is still expired after a refresh
        print('token refresh did not produce a valid token, requesting a new authorisation')
        prompt_authorization()

    raise RuntimeError(f'could not obtain a valid access token after {max_attempts} attempts')
                  

### run the flow; it may prompt for input if tokens are missing and refreshes an expired access token
authorization_flow()

Client Tokens Found
access_code_expired, requesting refresh
Requesting Token...





Authorisation Complete
Client Tokens Found
359.98333333333335 minutes until token expires


## Get Data
This code calls the `/athlete/activities` endpoint to get all activities for the authenticated user.

more here https://developers.strava.com/docs/reference/#api-Activities-getLoggedInAthleteActivities

In [84]:
### Makes one request to the API for n number of activities
def get_activities(n, page_no=1):
    """Make one request to the athlete activities endpoint.

    Args:
        n: number of activities to request, sent as ``per_page``.
        page_no: page to request, sent as ``page``.

    Returns:
        The ``requests`` response object; the caller is responsible for checking
        the status code and decoding the JSON body.
    """
    activities_url = "https://www.strava.com/api/v3/athlete/activities"

    # read the token fresh on every call so a refreshed token is picked up
    with open(cred_location+'access_tokens.json') as infile:
        access_tokens = json.load(infile)

    header = {'Authorization': 'Bearer ' + access_tokens['access_token']}
    param = {'per_page': n, 'page': page_no}

    # timeout so a stalled connection cannot hang the notebook indefinitely
    return requests.get(activities_url, headers=header, params=param, timeout=30)

### preview of what they look like
two_activities = get_activities(2)

### stop here with a readable message if the API rejected the request,
### instead of building a dataframe out of the error payload
if two_activities.status_code >= 300:
    raise RuntimeError(f'activities request failed ({two_activities.status_code}): {two_activities.text}')

trial_df = pd.DataFrame(two_activities.json())
trial_df


Unnamed: 0,resource_state,athlete,name,distance,moving_time,elapsed_time,total_elevation_gain,type,sport_type,workout_type,id,start_date,start_date_local,timezone,utc_offset,location_city,location_state,location_country,achievement_count,kudos_count,comment_count,athlete_count,photo_count,map,trainer,commute,manual,private,visibility,flagged,gear_id,start_latlng,end_latlng,average_speed,max_speed,average_temp,average_watts,kilojoules,device_watts,has_heartrate,average_heartrate,max_heartrate,heartrate_opt_out,display_hide_heartrate_option,elev_high,elev_low,upload_id,upload_id_str,external_id,from_accepted_tag,pr_count,total_photo_count,has_kudoed,suffer_score
0,2,"{'id': 23883723, 'resource_state': 1}",Afternoon Mountain Bike Ride,33168.0,11823,20633,1089.9,Ride,MountainBikeRide,10.0,8716585507,2023-03-15T00:14:37Z,2023-03-15T13:14:37Z,(GMT+12:00) Pacific/Auckland,46800.0,,,,19,2,0,1,0,"{'id': 'a8716585507', 'summary_polyline': 'v{_...",False,False,False,False,everyone,False,b11952969,"[-41.292270831763744, 174.77877404540777]","[-41.287265829741955, 174.76577061228454]",2.805,12.458,28,140.8,1664.1,False,True,144.5,183.0,False,True,183.1,7.7,9355786561,9355786561,garmin_ping_264429216355,False,8,0,False,159.0
1,2,"{'id': 23883723, 'resource_state': 1}",Morning Mountain Bike Ride,2300.3,449,525,2.0,Ride,MountainBikeRide,,8715133554,2023-03-14T20:49:41Z,2023-03-15T09:49:41Z,(GMT+12:00) Pacific/Auckland,46800.0,,,,0,0,0,1,0,"{'id': 'a8715133554', 'summary_polyline': 'rj_...",False,False,False,True,only_me,False,b4083088,"[-41.287781400606036, 174.76704055443406]","[-41.291415710002184, 174.77939656935632]",5.123,13.491,24,40.2,18.1,False,True,106.6,144.0,False,True,267.8,150.2,9354258948,9354258948,garmin_ping_264381823955,False,0,0,False,1.0


In [85]:
### this iterates through the get_activities() function to get all activities for an athlete
### You should check that the total number of activities matches what's on your strava profile
def get_all_activities(per_page=200):
    """Collect every activity by requesting successive pages until one comes back empty.

    Args:
        per_page: number of activities requested per page.

    Returns:
        list: the decoded activity records from all pages fetched. If a request
        fails, the error is printed and the records collected so far are
        returned, so the result may be incomplete.
    """
    output = []
    page_no = 1

    while True:
        response = get_activities(per_page, page_no)

        if response.status_code >= 300:
            # print the raw text: an error body is not guaranteed to be JSON
            print('request for page', page_no, 'failed with status', response.status_code)
            print(response.text)
            print('WARNING: stopping early,', len(output), 'rows collected so far')
            break

        # decode the body once per page
        page = response.json()
        if len(page) == 0:
            # an empty page marks the end of the data
            break

        output.extend(page)
        print(len(page), 'rows added from page', page_no)
        page_no += 1

    return output

### pull every activity and load the records into a dataframe
all_activities = get_all_activities()
df = pd.DataFrame(data=all_activities)

### save outside the repo
raw_csv_path = f'{data_location}all_activities_raw.csv'
df.to_csv(raw_csv_path)

### quick look at the first rows
df.head()

200 rows added from page 1
200 rows added from page 2
200 rows added from page 3
200 rows added from page 4
200 rows added from page 5
200 rows added from page 6
200 rows added from page 7
200 rows added from page 8
200 rows added from page 9
200 rows added from page 10
200 rows added from page 11
3 rows added from page 12
[]


Unnamed: 0,resource_state,athlete,name,distance,moving_time,elapsed_time,total_elevation_gain,type,sport_type,workout_type,id,start_date,start_date_local,timezone,utc_offset,location_city,location_state,location_country,achievement_count,kudos_count,comment_count,athlete_count,photo_count,map,trainer,commute,manual,private,visibility,flagged,gear_id,start_latlng,end_latlng,average_speed,max_speed,average_temp,average_watts,kilojoules,device_watts,has_heartrate,average_heartrate,max_heartrate,heartrate_opt_out,display_hide_heartrate_option,elev_high,elev_low,upload_id,upload_id_str,external_id,from_accepted_tag,pr_count,total_photo_count,has_kudoed,suffer_score,average_cadence,max_watts,weighted_average_watts
0,2,"{'id': 23883723, 'resource_state': 1}",Afternoon Mountain Bike Ride,33168.0,11823,20633,1089.9,Ride,MountainBikeRide,10.0,8716585507,2023-03-15T00:14:37Z,2023-03-15T13:14:37Z,(GMT+12:00) Pacific/Auckland,46800.0,,,,19,2,0,1,0,"{'id': 'a8716585507', 'summary_polyline': 'v{_...",False,False,False,False,everyone,False,b11952969,"[-41.292270831763744, 174.77877404540777]","[-41.287265829741955, 174.76577061228454]",2.805,12.458,28.0,140.8,1664.1,False,True,144.5,183.0,False,True,183.1,7.7,9355787000.0,9355786561,garmin_ping_264429216355,False,8,0,False,159.0,,,
1,2,"{'id': 23883723, 'resource_state': 1}",Morning Mountain Bike Ride,2300.3,449,525,2.0,Ride,MountainBikeRide,,8715133554,2023-03-14T20:49:41Z,2023-03-15T09:49:41Z,(GMT+12:00) Pacific/Auckland,46800.0,,,,0,0,0,1,0,"{'id': 'a8715133554', 'summary_polyline': 'rj_...",False,False,False,True,only_me,False,b4083088,"[-41.287781400606036, 174.76704055443406]","[-41.291415710002184, 174.77939656935632]",5.123,13.491,24.0,40.2,18.1,False,True,106.6,144.0,False,True,267.8,150.2,9354259000.0,9354258948,garmin_ping_264381823955,False,0,0,False,1.0,,,
2,2,"{'id': 23883723, 'resource_state': 1}",Evening Ride,8147.2,1247,1991,175.7,Ride,Ride,10.0,8710993986,2023-03-14T05:27:44Z,2023-03-14T18:27:44Z,(GMT+12:00) Pacific/Auckland,46800.0,,,,7,3,0,1,0,"{'id': 'a8710993986', 'summary_polyline': 'rh_...",False,False,False,False,everyone,False,b4083088,"[-41.28914144821465, 174.7641707584262]","[-41.28788600675762, 174.76564806886017]",6.533,14.914,22.0,184.3,229.8,False,True,118.6,140.0,False,True,247.9,130.0,9349758000.0,9349757958,garmin_ping_264280411416,False,6,1,False,4.0,,,
3,2,"{'id': 23883723, 'resource_state': 1}",Morning Mountain Bike Ride,12841.1,4215,5419,481.6,Ride,MountainBikeRide,10.0,8692779081,2023-03-10T19:39:37Z,2023-03-11T08:39:37Z,(GMT+12:00) Pacific/Auckland,46800.0,,,,19,5,0,1,0,"{'id': 'a8692779081', 'summary_polyline': 'rv`...",False,False,False,False,everyone,False,b11952969,"[-41.29657896235585, 174.72292349673808]","[-41.29738446325064, 174.72190467640758]",3.047,9.406,25.0,174.7,736.4,False,True,138.5,165.0,False,True,404.1,130.5,9330598000.0,9330598137,garmin_ping_263769427693,False,5,0,False,41.0,,,
4,2,"{'id': 23883723, 'resource_state': 1}",Afternoon Ride,4999.0,1170,1477,104.0,Ride,Ride,,8689017584,2023-03-10T01:05:02Z,2023-03-10T14:05:02Z,(GMT+12:00) Pacific/Auckland,46800.0,,,,1,0,0,1,0,"{'id': 'a8689017584', 'summary_polyline': 'xv_...",False,False,False,True,only_me,False,b4083088,"[-41.291484693065286, 174.77927134372294]","[-41.28720883280039, 174.76581294089556]",4.273,9.09,28.0,114.6,134.1,False,True,136.1,160.0,False,True,115.4,11.2,9326651000.0,9326651352,garmin_ping_263652567668,False,0,0,False,11.0,,,


## Test Data
A few tests to check the data from the API is as expected

In [108]:
### Test 1: Check that all your activities have been collected

### Before doing this, go to https://www.strava.com/dashboard and make a note of your total number of activities

### Print the number of rows in the dataframe so it can be compared with that total.
print(len(df))

### Test passed if the two numbers match (this is a manual comparison, nothing is asserted here)

2203


In [127]:
### Test 2: check that the columns are populated
columns_that_must_be_populated = ['id', 'name', 'type', 'start_date', 'private']
possible_empty_values = ['none', 'unknown', 'None', 'Unknown', '', None]

### errors is a list of {column: description} dicts, one per problem found
errors = []
for c in columns_that_must_be_populated:
    ### missing values (None / NaN) have to be counted with isna(): an == comparison never matches them
    n_missing = int(df[c].isna().sum())
    if n_missing > 0:
        errors.append({c: f'{n_missing} instances of missing values (None/NaN)'})

    for v in possible_empty_values:
        if v is None:
            continue  # already covered by the isna() count above
        ### compare against the column VALUES; `v in df[c]` would test the index labels instead
        n_found = int((df[c] == v).sum())
        if n_found > 0:
            errors.append({c: f'{n_found} instances of {v!r}'})

if errors == []:
    print('TEST PASSED')
else:
    print('TEST FAILED')
    print(errors)

TEST PASSED


In [131]:
### Test 3: Check that columns which should be unique are unique
columns_that_must_be_unique = ['id', 'start_date']

### check each listed column itself (not a variable left over from another cell)
non_unique_columns = [col for col in columns_that_must_be_unique if not df[col].is_unique]

if not non_unique_columns:
    print('TEST PASSED')
else:
    print('TEST_FAILED')
    print(non_unique_columns)

TEST PASSED


In [140]:
### Check that data formats are as expected
### (displays the dtype pandas assigned to each column of df, for manual review)
df.dtypes

resource_state                     int64
athlete                           object
name                              object
distance                         float64
moving_time                        int64
elapsed_time                       int64
total_elevation_gain             float64
type                              object
sport_type                        object
workout_type                     float64
id                                 int64
start_date                        object
start_date_local                  object
timezone                          object
utc_offset                       float64
location_city                     object
location_state                    object
location_country                  object
achievement_count                  int64
kudos_count                        int64
comment_count                      int64
athlete_count                      int64
photo_count                        int64
map                               object
trainer         