In [None]:
import json
import requests
import time
import os
import pandas as pd

### making data a bit easier to see
### (show up to 500 columns / rows when a dataframe is displayed)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

### should be outside the repo
### file names are appended to these with '+', so keep the trailing slash
cred_location = r'/Users/jj/code/strava_creds/'
data_location = r'/Users/jj/code/strava_data/'

## Authorisation
This code makes sure you have the necessary tokens to authenticate with the Strava API.

Full documentation can be found at https://developers.strava.com/

In [None]:
### You need to have created an app on strava - this is much easier than it sounds.
### Go through the first couple of steps here if unsure: 
###https://medium.com/@annthurium/getting-started-with-the-strava-api-a-tutorial-f3909496cd2d

### These client tokens don't change, so you should only have to enter these once. 
def get_client_tokens():
    """Ask for the Strava app credentials and store them in client_tokens.json.

    The client id, client secret and redirect uri are read from stdin and
    written, as strings, to ``cred_location + 'client_tokens.json'``.
    """
    print('please go here https://www.strava.com/settings/api, and copy the token information as prompted')

    questions = {
        'client_id': 'please enter your client id',
        'client_secret': 'please enter your client secret',
        'redirect_uri': 'please enter a redirect uri (use http://localhost/ if unsure)',
    }
    # dicts keep insertion order, so the prompts appear in the order listed above
    answers = {key: str(input(question)) for key, question in questions.items()}

    with open(cred_location+'client_tokens.json', 'w') as token_file:
        json.dump(answers, token_file, indent=4)

    print("client tokens saved")
    return None

### This only checks that the client tokens file exists and has the expected keys,
### not that the tokens themselves are valid.
### client_id, client_secret and redirect_uri should all be held in a json file called client_tokens.json
def check_client_tokens():
    """Return True when client_tokens.json holds exactly the expected keys.

    Returns False when the file is missing or unreadable, is not valid JSON,
    is not a JSON object, or has a different set of keys.
    """
    expected_keys = {'client_id', 'client_secret', 'redirect_uri'}
    try:
        # context manager so the file handle is always closed
        with open(cred_location+'client_tokens.json') as infile:
            client_tokens = json.load(infile)
    except (OSError, ValueError):
        # OSError: missing/unreadable file; ValueError: malformed JSON
        # (json.JSONDecodeError is a ValueError subclass)
        return False

    if isinstance(client_tokens, dict) and set(client_tokens) == expected_keys:
        print('Client Tokens Found')
        return True
    return False


def prompt_authorization():
    """Walk the user through Strava's OAuth authorisation and save the tokens.

    Reads the client tokens from ``client_tokens.json``, prints the
    authorisation URL, asks for the ``code`` from the redirect URL and
    exchanges it for access tokens. On success the tokens, plus the requested
    scopes, are written to ``access_tokens.json``. When the exchange fails the
    error is printed and nothing is written, so an error payload never
    replaces the token file.
    """
    with open(cred_location+'client_tokens.json') as infile:
        client_tokens = json.load(infile)

    ### necessary to get athlete activities
    scopes = ','.join(['profile:read_all', 'activity:read_all'])

    # Authorization URL (https, so the request is not sent in clear text)
    request_url = f'https://www.strava.com/oauth/authorize?client_id={client_tokens["client_id"]}' \
                  f'&response_type=code&redirect_uri={client_tokens["redirect_uri"]}' \
                  f'&approval_prompt=force' \
                  f'&scope={scopes}'

    # This will prompt you and ask for the code in the url
    print('Click here:', request_url)
    print('Please authorize the app and copy&paste below the generated code!')
    print('P.S: you can find the code in the URL between "code=" and the nexrt "&"')
    # strip stray whitespace picked up when copy-pasting from the address bar
    code = input('Insert the code from the url: ').strip()

    # Get the access token
    token = requests.post(url='https://www.strava.com/api/v3/oauth/token',
                          data={'client_id': client_tokens["client_id"],
                                'client_secret': client_tokens["client_secret"],
                                'code': code,
                                'grant_type': 'authorization_code'})

    if token.status_code >= 300:
        # do not save an error response as if it were a set of tokens
        print('Authorisation Error')
        print(token.text)
        return

    access_tokens = token.json()

    # save the token with the applied for scope in the cred_location
    access_tokens.update({'scopes': scopes})

    with open(cred_location+'access_tokens.json', 'w') as outfile:
        json.dump(access_tokens, outfile, indent=4)
    print('Authorisation Complete')



### access tokens expire every 6 hours.
### the access token can be refreshed using this code.
### it will have the same scope as the original access code.
def refresh_authorization():
    """Exchange the saved refresh token for a new access token.

    Reads ``client_tokens.json`` and ``access_tokens.json``, posts a
    ``refresh_token`` grant and, on success, overwrites ``access_tokens.json``
    with the new tokens plus the scopes recorded for the old token.

    Returns:
        True when the new tokens were saved, False when the request failed
        (the existing token file is then left untouched).

    Raises:
        KeyError: if the saved tokens lack ``refresh_token`` or ``scopes``.
    """
    # get previously saved tokens
    with open(cred_location+'client_tokens.json') as infile:
        client_tokens = json.load(infile)
    with open(cred_location+'access_tokens.json') as infile:
        access_tokens = json.load(infile)

    # read everything needed from the old file before going to the network
    previous_scopes = access_tokens['scopes']

    refresh_url = "https://www.strava.com/oauth/token"

    payload = {
        'client_secret': client_tokens['client_secret'],
        'client_id': client_tokens['client_id'],
        'refresh_token': access_tokens['refresh_token'],
        'grant_type': 'refresh_token',
        'f': 'json',
    }

    print("Requesting Token...\n")
    # certificate verification stays on (the requests default): the payload
    # carries the client secret and must not go to an unverified host
    res = requests.post(refresh_url, data=payload)

    if res.status_code >= 300:
        print('Authorisation Error')
        # .text rather than .json(): an error body is not guaranteed to be JSON
        print(res.text)
        return False

    # save the new token and copy the scope of the old token
    access_token = res.json()
    access_token.update({'scopes': previous_scopes})

    with open(cred_location+'access_tokens.json', 'w') as outfile:
        json.dump(access_token, outfile, indent=4)
    print('Authorisation Complete')

    return True





def authorization_flow():
    """Make sure a valid, unexpired access token is saved on disk.

    Repeats the following checks until the saved access token is usable:

    1. client tokens exist (otherwise ask for them),
    2. ``access_tokens.json`` exists and has all required keys
       (otherwise run the interactive authorisation),
    3. the access token has not expired (otherwise refresh it; if it is still
       expired after a refresh, fall back to the interactive authorisation
       instead of refreshing forever).
    """
    # 'expires_at' is read below, so it has to be part of the completeness check
    required_keys = ('access_token', 'scopes', 'refresh_token', 'expires_at')
    access_path = cred_location+'access_tokens.json'
    refresh_attempted = False

    while True:
        ### Check if we have client tokens
        if not check_client_tokens():
            print('client tokens not found')
            get_client_tokens()
            continue

        ### check we have an access token
        if not os.path.isfile(access_path):
            print('access_token.json file missing')
            prompt_authorization()
            continue

        try:
            with open(access_path) as infile:
                tokens = json.load(infile)
        except ValueError:
            # unreadable JSON is handled like a file with missing data
            tokens = {}

        ### check if we have the right keys in the access token file
        if not isinstance(tokens, dict) or not all(key in tokens for key in required_keys):
            print('data missing from access_tokens.json')
            prompt_authorization()
            continue

        ### check if the access token is still valid
        if time.time() < int(tokens['expires_at']):
            print((int(tokens['expires_at']) - int(time.time())) / 60, 'minutes until token expires')
            print('current scopes:', tokens['scopes'])
            return

        if refresh_attempted:
            # the refresh did not yield a valid token; ask the user to authorise again
            print('refresh did not produce a valid token, requesting new authorisation')
            prompt_authorization()
            refresh_attempted = False
            continue

        print('access_code_expired, requesting refresh')
        refresh_authorization()
        refresh_attempted = True

### run the flow now so the cells below have a valid access token
### (call prompt_authorization() here instead to force a fresh authorisation)
authorization_flow()


## Get Activity Data
This code hits the athlete activities endpoint (`/athlete/activities`) to get all activities for the authenticated user.

More here: https://developers.strava.com/docs/reference/#api-Activities-getLoggedInAthleteActivities

In [None]:
### Makes one request to the API for n number of activities
def get_activities(n, page_no=1):
    """Request one page of the authenticated athlete's activities.

    Args:
        n: number of activities per page (sent as ``per_page``).
        page_no: page number to request (sent as ``page``), default 1.

    Returns:
        The unparsed ``requests`` response, so callers can check
        ``status_code`` before reading ``.json()``.
    """
    activities_url = "https://www.strava.com/api/v3/athlete/activities"

    # context manager so the token file handle is closed straight away
    with open(cred_location+'access_tokens.json') as infile:
        access_tokens = json.load(infile)

    header = {'Authorization': 'Bearer ' + access_tokens['access_token']}
    param = {'per_page': n, 'page': page_no}

    return requests.get(activities_url, headers=header, params=param)

### preview of what they look like
### (requests just two activities and displays them as a dataframe)
two_activities = get_activities(2)
trial_df = pd.DataFrame(two_activities.json())
trial_df


In [None]:
### this iterates through the get_activities() function to get all activities for an athlete
### You should check that the total number of activities matches what's on your strava profile
def get_all_activities():
    """Fetch every page of activities and return them as one list.

    Pages of 200 activities are requested, starting at page 1, until a page
    comes back empty or a request fails. On a failed request the status and
    response body are printed and the activities collected so far are
    returned.
    """
    output = []
    page = 1
    while True:
        response = get_activities(200, page)

        if response.status_code >= 300:
            # .text rather than .json(): an error body is not guaranteed to be JSON
            print('request for page', page, 'failed with status', response.status_code)
            print(response.text)
            break

        # parse the body once instead of once per use
        rows = response.json()
        if not rows:
            # an empty page means everything has been collected
            break

        output.extend(rows)
        print(len(rows), 'rows added from page', page)
        page += 1

    return output

### fetch everything and load it into a dataframe (one row per activity)
all_activities = get_all_activities()
all_activities_df = pd.DataFrame(all_activities)

### save outside the repo
all_activities_df.to_csv(data_location+'all_activities_raw.csv')
### preview the first rows
all_activities_df.head()

## Test Activity Data
A few tests to check the data from the API is as expected

In [None]:
### Test 1: Check that all your activities have been collected
### (manual check: nothing is asserted here, you compare the numbers yourself)

### before doing this, go to https://www.strava.com/dashboard and make a note of your total number of activities

### check that the dataframe is of the same length.
print(len(all_activities_df))

### test passed if the printed number matches the total on your dashboard

In [None]:
### Test 2: check that the columns are populated
columns_that_must_be_populated = ['id', 'name', 'type', 'start_date', 'private']
possible_empty_values = ['none', 'unknown', 'None', 'Unknown', '', None]

errors = []
for v in possible_empty_values:
    for c in columns_that_must_be_populated:
        column = all_activities_df[c]
        # `v in column` would test the index labels, not the values, so the
        # matching values are counted explicitly; None/NaN need isna() because
        # `== None` never matches
        if v is None:
            count = int(column.isna().sum())
        else:
            count = int((column == v).sum())
        if count:
            errors.append({c: f'{count} instances of {v!r}'})
if errors == []:
    print('TEST PASSED')
else:
    print('TEST FAILED')
    print(errors)

In [None]:
### Test 3: Check that columns which should be unique are unique
columns_that_must_be_unique = ['id', 'start_date']
# collect the offending columns once, so the check and the report cannot disagree
non_unique_columns = [c for c in columns_that_must_be_unique if not all_activities_df[c].is_unique]
if not non_unique_columns:
    print('TEST PASSED')
else:
    print('TEST_FAILED')
    print(non_unique_columns)

## Get User Data

This code will hit the 'Athlete' endpoint.

https://developers.strava.com/docs/reference/#api-Athletes-getLoggedInAthlete

In [None]:
def get_athlete_data():
    """Request the authenticated athlete's profile.

    Returns:
        The parsed JSON body of the response. The status code is not checked,
        so a failed request returns the API's error payload instead.
    """
    athlete_url = "https://www.strava.com/api/v3/athlete"

    # context manager so the token file handle is closed straight away
    with open(cred_location+'access_tokens.json') as infile:
        access_tokens = json.load(infile)

    # same header spelling as get_activities (HTTP header names are case-insensitive)
    header = {'Authorization': 'Bearer ' + access_tokens['access_token']}

    response = requests.get(athlete_url, headers=header)

    return response.json()

### fetch the athlete profile and keep a raw copy outside the repo
athlete_data = get_athlete_data()
with open(data_location+'athlete_data_raw.json', 'w') as outfile:
    json.dump(athlete_data, outfile, indent=4)

### display the profile
athlete_data

## Test User Data

In [None]:
### Test 1: Check that some fields we expect to be populated are populated:
fields_which_should_be_populated = ['firstname', 'id', 'created_at', 'follower_count']

# A field counts as populated when the key exists and its value is neither None nor ''.
# 0 and False are real values, so a follower_count of 0 still counts as populated.
missing_fields = [x for x in fields_which_should_be_populated if athlete_data.get(x) in (None, '')]

if not missing_fields:
    print('TEST PASSED')
else:
    print('TEST FAILED')
    print('values missing: ', missing_fields)

In [None]:
### Test 2: Check that athlete gear matches activity gear
activity_gear = set(all_activities_df['gear_id'].dropna().unique())
# an activity's gear_id can point at shoes as well as bikes, so collect both lists;
# `or []` covers a gear list that is absent or null on the athlete record
athlete_gear = {
    gear['id']
    for gear_type in ('bikes', 'shoes')
    for gear in athlete_data.get(gear_type) or []
}

if athlete_gear == activity_gear:
    print('TEST PASSED')
else:
    print('TEST FAILED')
    # show which side each mismatch comes from
    print('gear only on activities: ', sorted(activity_gear - athlete_gear))
    print('gear only on athlete: ', sorted(athlete_gear - activity_gear))

## Data Modification

Converting some data types, removing some columns, and making the data easier to visualise.

In [35]:
### Flattening the json columns

def flatten(s_name, df):
    """Expand a column of dicts into one column per key.

    Args:
        s_name: name of the column in ``df`` that holds the dicts.
        df: dataframe containing that column.

    Returns:
        A new dataframe without ``s_name`` and with the flattened columns,
        each named ``<s_name>_<key>``, appended on the right. ``df`` itself is
        not modified.
    """
    flat_s = pd.json_normalize(df[s_name].tolist())
    flat_s.columns = [s_name+'_'+x for x in flat_s.columns]
    # the flattened rows are in the same order as df's rows; give them df's
    # index so concat pairs them up even when df has a non-default index
    flat_s.index = df.index
    return pd.concat([df.drop(columns=s_name), flat_s], axis=1)


### Joining on gear data

def get_gear_mapping(gear_type, df):
    """Left-join the athlete's gear details onto the activities.

    ``gear_type`` is a key of the module-level ``athlete_data`` (e.g.
    'bikes'). Its last character is dropped to build the column prefix
    ('bike'), every gear column is renamed to ``<prefix>_<column>``, and the
    result is merged onto ``df`` where ``gear_id`` equals ``<prefix>_id``.
    """
    prefix = gear_type[:-1]
    gear_details = pd.json_normalize(athlete_data[gear_type]).add_prefix(prefix+'_')
    return df.merge(gear_details, how='left', left_on='gear_id', right_on=prefix+'_id')

### converting datetime data
def convert_datetime(df):
    """Parse the local date columns and derive 'year' and 'month' periods.

    Every column whose name contains both 'date' and 'local' is converted to
    datetimes with any timezone information removed. 'year' and 'month' are
    then (re)assigned from that column, so with several matching columns the
    last one wins. ``df`` is modified in place and also returned.
    """
    local_date_columns = [name for name in df.columns if 'date' in name and 'local' in name]
    for name in local_date_columns:
        naive_dates = pd.to_datetime(df[name]).dt.tz_localize(None)
        df[name] = naive_dates
        df['year'] = naive_dates.dt.to_period('Y')
        df['month'] = naive_dates.dt.to_period('M')
    return df

### converting speed
def convert_speed(df):
    """Multiply every speed column by 3.6 and add a '_kph' suffix to its name.

    A column is a speed column when its name contains 'speed'. The factor 3.6
    turns metres/second into kilometres/hour (presumably the API reports
    m/s; confirm against the Strava reference).

    Returns:
        A new dataframe; ``df`` itself is left untouched.
    """
    speed_columns = [col for col in df.columns if 'speed' in col]
    # work on a copy so the caller's frame is never partially modified
    df = df.copy()
    for col in speed_columns:
        df[col] = df[col] * 3.6
    return df.rename(columns={col: col+'_kph' for col in speed_columns})

### converting time
def convert_duration(df):
    """Add minute and hour versions of every duration column.

    For each column whose name contains '_time', ``<col>_minutes`` (value / 60)
    and ``<col>_hours`` (value / 3600) are appended, and the column itself is
    renamed to ``<col>_seconds``.

    Returns:
        A new dataframe; ``df`` itself is left untouched.
    """
    time_columns = [col for col in df.columns if '_time' in col]
    # work on a copy so the caller's frame is never partially modified
    df = df.copy()
    for col in time_columns:
        df[col+'_minutes'] = df[col] / 60
        df[col+'_hours'] = df[col] / 3600
    return df.rename(columns={col: col+'_seconds' for col in time_columns})


### Won't need these columns
def drop_useless_columns(df):
    """Return ``df`` without the columns that are not needed for analysis.

    Listed columns that are not present in ``df`` are skipped rather than
    raising, so the function still works when some of them are absent (for
    example when no ``bike_*`` columns were joined on).
    """
    useless_columns = [
        'resource_state',
        'start_date',
        'timezone',
        'utc_offset',
        'location_city',
        'location_state',
        'location_country',
        'upload_id',
        'upload_id_str',
        'external_id',
        'from_accepted_tag',
        'map_id',
        'map_resource_state',
        'bike_resource_state',
        'bike_distance',
        'bike_converted_distance',
    ]
    # errors='ignore': a missing column should not abort the whole clean-up
    return df.drop(columns=useless_columns, errors='ignore')


### run the clean-up: flatten the nested json columns, join the bike details,
### convert dates / speeds / durations, then drop the unused columns
x_df = flatten('athlete', all_activities_df)
x_df = flatten('map', x_df)

x_df = get_gear_mapping('bikes', x_df)

x_df = convert_datetime(x_df)
x_df = convert_speed(x_df)
x_df = convert_duration(x_df)

### kilojoules -> kilocalories (1 kcal = 4.184 kJ), rounded to a whole number
x_df['calories'] = (x_df['kilojoules'] / 4.184).round()
### divide by 1000 - presumably metres -> kilometres; confirm against the API reference
x_df['distance'] = x_df['distance'] / 1000

x_df = drop_useless_columns(x_df)

### save outside the repo
x_df.to_csv(data_location+'all_activities_cleaned.csv')

In [36]:
x_df

Unnamed: 0,name,distance,moving_time_seconds,elapsed_time_seconds,total_elevation_gain,type,sport_type,workout_type,id,start_date_local,achievement_count,kudos_count,comment_count,athlete_count,photo_count,trainer,commute,manual,private,visibility,flagged,gear_id,start_latlng,end_latlng,average_speed_kph,max_speed_kph,average_temp,average_watts,kilojoules,device_watts,has_heartrate,average_heartrate,max_heartrate,heartrate_opt_out,display_hide_heartrate_option,elev_high,elev_low,pr_count,total_photo_count,has_kudoed,suffer_score,average_cadence,max_watts,weighted_average_watts,athlete_id,athlete_resource_state,map_summary_polyline,bike_id,bike_primary,bike_name,bike_nickname,bike_retired,year,month,moving_time_minutes,moving_time_hours,elapsed_time_minutes,elapsed_time_hours,calories
0,Morning Mountain Bike Ride,24.9629,9598,13548,930.4,Ride,MountainBikeRide,10.0,8744397493,2023-03-20 09:31:46,33,1,0,1,0,False,False,False,False,everyone,False,b11952969,"[-41.29796063527465, 174.72158012911677]","[-41.297213891521096, 174.7220081090927]",9.3636,35.0208,24.0,130.6,1253.9,False,True,144.1,178.0,False,True,404.5,126.1,19,0,False,123.0,,,,23883723,1,h_a{F{hli`@rE`Bx@jBjAZ`CuEdF|ATlAm@xD`@}DaA`DD...,b11952969,False,Canyon Spectral,Canyon Spectral,False,2023,2023-03,159.966667,2.666111,225.800000,3.763333,300.0
1,Afternoon Mountain Bike Ride,11.4628,5136,7891,389.0,Ride,MountainBikeRide,10.0,8744607827,2023-03-18 14:43:29,0,0,0,2,0,False,False,False,False,everyone,False,b11952969,"[-41.29956400953233, 174.78885202668607]","[-41.29894039593637, 174.78847458958626]",8.0352,33.9912,26.0,117.4,602.9,False,False,,,False,False,169.8,14.6,0,0,False,,,,,23883723,1,hia{Fimyi`@SMBHm@Qs@AQSFNCBIIBMGWWOCiAm@[F?DET...,b11952969,False,Canyon Spectral,Canyon Spectral,False,2023,2023-03,85.600000,1.426667,131.516667,2.191944,144.0
2,Lunch Ride,2.0372,278,775,0.0,Ride,Ride,,8726797885,2023-03-17 11:49:16,0,0,0,1,0,False,False,False,True,only_me,False,b4083088,"[-41.28829252906144, 174.7669970523566]","[-41.2935836892575, 174.77822947315872]",26.3808,56.9376,27.0,26.4,7.3,False,True,113.5,136.0,False,True,111.6,-14.0,0,0,False,0.0,,,,23883723,1,zj_{Fggui`@DK@GS]KIyCiAeAe@}@QkA]kCeAc@UKOBg@D...,b4083088,True,Avanti Giro 2,Avanti Giro 2,False,2023,2023-03,4.633333,0.077222,12.916667,0.215278,2.0
3,Lunch Ride,2.1672,727,773,132.0,Ride,Ride,,8721532095,2023-03-16 12:29:03,0,0,0,1,0,False,False,False,True,only_me,False,b4083088,"[-41.293124444782734, 174.77520930580795]","[-41.28730338066816, 174.76581109687686]",10.7316,19.8360,28.0,146.0,106.1,False,True,125.4,145.0,False,True,134.4,32.6,0,0,False,3.0,,,,23883723,1,`a`{F_xvi`@?NQv@?d@I^Wn@]lAi@xBGb@i@xAKl@M`A[x...,b4083088,True,Avanti Giro 2,Avanti Giro 2,False,2023,2023-03,12.116667,0.201944,12.883333,0.214722,25.0
4,Lunch Ride,2.1091,267,302,3.0,Ride,Ride,,8721256990,2023-03-16 11:56:29,0,0,0,1,0,False,False,False,True,only_me,False,b4083088,"[-41.28733104094863, 174.76634301245213]","[-41.29523408599198, 174.77632116526365]",28.4364,52.9488,28.0,33.5,8.9,False,True,105.8,121.0,False,True,114.0,2.0,0,0,False,0.0,,,,23883723,1,nj_{Fggui`@BGIUOQkB_AqAs@_HeCq@]ES?QFg@JIf@ErA...,b4083088,True,Avanti Giro 2,Avanti Giro 2,False,2023,2023-03,4.450000,0.074167,5.033333,0.083889,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2203,Radio dome ridge,19.8536,4766,4928,707.1,Ride,Ride,10.0,1121238293,2017-08-07 15:21:49,9,0,0,1,0,False,False,False,False,everyone,False,b4083088,"[-41.294885, 174.781158]","[-41.294907, 174.781201]",14.9976,71.2800,,161.6,770.2,False,False,,,False,False,483.2,12.0,1,1,False,,,,,23883723,1,j{`{F_pwi`@|C~A`ADh@n@UlDeBdIdAp@Hf@cEnQUzB}@d...,b4083088,True,Avanti Giro 2,Avanti Giro 2,False,2017,2017-08,79.433333,1.323889,82.133333,1.368889,184.0
2204,Morning Ride,37.3965,5493,5586,670.4,Ride,Ride,10.0,1117419963,2017-08-05 07:26:27,5,0,0,1,0,False,False,False,False,everyone,False,b4083088,"[-41.294838, 174.781546]","[-41.294958, 174.781226]",24.5088,56.8800,,166.1,912.1,False,False,,,False,False,144.0,-6.4,3,1,False,,,,,23883723,1,vn`{Fyhxi`@}EgBqAvBcGoA{B{@rB}JmCiEg@kMoDoLNsA...,b4083088,True,Avanti Giro 2,Avanti Giro 2,False,2017,2017-08,91.550000,1.525833,93.100000,1.551667,218.0
2205,Afternoon Ride,5.6325,1214,1397,238.9,Ride,Ride,10.0,1116331496,2017-08-04 14:34:58,0,0,0,1,0,False,False,False,False,everyone,False,b4083088,"[-41.294282, 174.781417]","[-41.287453, 174.77021]",16.7040,55.8000,,201.9,245.1,False,False,,,False,False,163.8,12.0,0,0,False,,,,,23883723,1,ly`{Fcqwi`@FFj@Nt@d@HBJHJ@VNRPd@LbAf@RPJ@HFFTI...,b4083088,True,Avanti Giro 2,Avanti Giro 2,False,2017,2017-08,20.233333,0.337222,23.283333,0.388056,59.0
2206,Morning Ride,12.6509,2588,3971,365.0,Ride,Ride,10.0,1116106078,2017-08-04 09:41:27,4,0,0,1,0,False,False,False,False,everyone,False,,"[-41.294817, 174.781001]","[-41.306831, 174.77904]",17.5968,56.8800,,167.5,433.4,False,False,,,False,False,184.9,5.1,2,1,False,,,,,23883723,1,rn`{Fiexi`@Nc@F_@OQaAe@kBk@OQWK[EBMs@c@sDuAE@F...,,,,,,2017,2017-08,43.133333,0.718889,66.183333,1.103056,104.0
