# Running Data Exploration

## Importing libaries

In [3]:
import numpy as np
import pandas as pd
import requests
import json

## Storing API credentials

In [4]:
api_credentials = json.load(open('.secret/strava_api_credentials.json', 'r'))
client_id = api_credentials['client_id']
client_secret = api_credentials['client_secret']
access_token = api_credentials['access_token']
refresh_token = api_credentials['refresh_token']

## Storing request fundementals

In [3]:
strava_base_url = "https://www.strava.com/api/v3"
headers = {"Authorization": "Bearer {}".format(access_token)}

## Client authentication

In [None]:
authorisation_code = "ad9c4b97c1bcb1e9ea035e95aa2081691bf53f73"
req = requests.post("https://www.strava.com/oauth/token?client_id={}&client_secret={}&code={}&grant_type=authororization_code".format(client_id, client_secret, refresh_token))

## Refresh token exchange

In [5]:
req = requests.post("https://www.strava.com/oauth/token?client_id={}&client_secret={}&refresh_token={}&grant_type=refresh_token".format(client_id, client_secret, refresh_token))
req.json()

{'token_type': 'Bearer',
 'access_token': '51a38b6eba1c36cf43e8736e5d4d161e5876b0d2',
 'expires_at': 1584061823,
 'expires_in': 21600,
 'refresh_token': 'f437acf70f96a2e3cadbc2416b63f2855fa9dfee'}

# Getting basic activity information

## Example response

In [4]:
req = requests.get("{}/athlete/activities".format(strava_base_url), headers = headers)
req.json()[0]

{'resource_state': 2,
 'athlete': {'id': 34272639, 'resource_state': 1},
 'name': 'Evening Run - Welwyn ',
 'distance': 9476.9,
 'moving_time': 2776,
 'elapsed_time': 2776,
 'total_elevation_gain': 144.4,
 'type': 'Run',
 'workout_type': 3,
 'id': 3134814172,
 'external_id': 'garmin_push_4594646451',
 'upload_id': 3349071297,
 'start_date': '2020-02-26T20:09:31Z',
 'start_date_local': '2020-02-26T20:09:31Z',
 'timezone': '(GMT+00:00) Europe/London',
 'utc_offset': 0.0,
 'start_latlng': [51.821439, -0.205277],
 'end_latlng': [51.821475, -0.20266],
 'location_city': None,
 'location_state': None,
 'location_country': None,
 'start_latitude': 51.821439,
 'start_longitude': -0.205277,
 'achievement_count': 0,
 'kudos_count': 8,
 'comment_count': 0,
 'athlete_count': 1,
 'photo_count': 0,
 'map': {'id': 'a3134814172',
  'summary_polyline': '}jh{H~ag@Ek@Uc@WWw@_@SQSWSs@Gq@BmANw@Tc@\\Wb@Kz@CRBDNLfANx@Vh@Z`@JVHZOnAGnBAv@D`A?t@H`@LXHHJ?FID[AYIc@Ym@IYA{@Hq@@yAB[FQJCXBXZp@lANj@R~@HvAAz@QfBS`EIf@C

## Looping over activities

In [20]:
activities = []
i = 1
# first request
req = requests.get("{}/athlete/activities".format(strava_base_url), headers = headers, params = {'page': i})
# looping over user activities in batches of 30 until request is empty
while len(req.json()) > 0:
    # storing attributes from each activity in a dictionary and adding to a list
    for activity in req.json():
        # extracting attributes from request, padding missing values with empty strings
        activity_name, activity_id, activity_type = activity['name'], activity['id'], activity['type']
        distance, time = activity.get('distance', np.nan), activity.get('elapsed_time', np.nan)
        elevation_gain = activity.get('total_elevation_gain', np.nan)
        kudos = activity.get('kudos_count', np.nan)
        start_date = activity.get('start_date', np.nan)
        average_speed, max_speed, average_cadence = activity.get('average_speed', np.nan), activity.get('max_speed', np.nan), activity.get('average_cadence', np.nan)
        average_hr, max_hr = activity.get('average_heartrate', np.nan), activity.get('max_heartrate', np.nan)
        suffer_score = activity.get('suffer_score', np.nan)
        # storing attributes in a dictionary
        activity_attributes = {'activity_name': activity_name, 'activity_id': activity_id, 'activity_type': activity_type, 'distance': distance, 'time': time, 'elevation_gain': elevation_gain, 'kudos': kudos, 'start_date': start_date, 'average_speed': average_speed, 'max_speed': max_speed,
'average_cadence': average_cadence, 'average_hr': average_hr, 'max_hr': max_hr, 'suffer_score': suffer_score}
        # appending dictionary to list
        activities.append(activity_attributes)
    # iterating to next page
    i += 1
    req = requests.get("{}/athlete/activities".format(strava_base_url), headers = headers, params = {'page': i})

In [21]:
activities_df = pd.DataFrame(activities)
activities_df = activities_df.loc[activities_df['activity_type'] == 'Run']

In [9]:
activities_df.to_csv('activities.csv')

In [2]:
activities_df = pd.read_csv('activities.csv').drop(columns = ['Unnamed: 0', 'activity_type'])
activities_df.head()

Unnamed: 0,activity_id,activity_name,average_cadence,average_hr,average_speed,distance,elevation_gain,kudos,max_hr,max_speed,start_date,suffer_score,time
0,3134814172,Evening Run - Welwyn,79.4,137.2,3.414,9476.9,144.4,8,163.0,4.9,2020-02-26T20:09:31Z,25.0,2776
1,3131802476,Tuesday intervals (including WU/WD),84.5,153.3,3.507,11244.2,56.2,15,186.0,7.4,2020-02-25T18:34:37Z,69.0,4038
2,3124325868,Morning Run - Welwyn,81.9,140.3,3.514,9944.3,118.4,10,170.0,5.2,2020-02-23T08:34:33Z,34.0,2830
3,3121033650,Panshanger PR (18:11 - 1st),85.7,173.4,4.49,5006.7,36.2,16,195.0,7.1,2020-02-22T09:02:55Z,58.0,1238
4,3121032425,PR WU,81.0,141.0,3.347,2604.2,2.1,1,158.0,5.3,2020-02-22T08:42:49Z,9.0,778


# Getting activity weather information

## Storing API credentials

In [40]:
ds_key = json.load(open('.secret/ds_api_credentials.json', 'r'))['key']

## Storing request fundementals

In [41]:
ds_base_url = "https://api.darksky.net/forecast"
exclude_blocks = "minutely,hourly,daily,alerts"

## Storing activity ids

In [11]:
activity_ids = list(activities_df['activity_id'])

## Example response

In [43]:
strava_req = requests.get("{}/activities/{}".format(strava_base_url, activity_ids[0]), headers = headers).json()
time, lat_lon = strava_req['start_date'], strava_req.get('start_latlng')
ds_req = requests.get("{}/{}/{},{},{}?exclude={}".format(ds_base_url, ds_key, lat_lon[0], lat_lon[1], time, exclude_blocks), params= {'units': 'si'})
ds_req.json()

{'latitude': 51.821439,
 'longitude': -0.205277,
 'timezone': 'Europe/London',
 'currently': {'time': 1582747771,
  'summary': 'Mostly Cloudy',
  'icon': 'partly-cloudy-night',
  'precipIntensity': 0,
  'precipProbability': 0,
  'temperature': 4.03,
  'apparentTemperature': 0.57,
  'dewPoint': 1.56,
  'humidity': 0.84,
  'pressure': 1007.4,
  'windSpeed': 4.15,
  'windGust': 8.71,
  'windBearing': 199,
  'cloudCover': 0.77,
  'uvIndex': 0,
  'visibility': 16.093,
  'ozone': 366.2},
 'flags': {'sources': ['cmc', 'gfs', 'icon', 'isd', 'madis'],
  'nearest-station': 8.682,
  'units': 'si'},
 'offset': 0}

## Looping over activities

In [46]:
activity_weather = []
for activity_id in activity_ids:
    strava_req = requests.get("{}/activities/{}".format(strava_base_url, activity_id), headers = headers).json()
    time, lat_lon = strava_req['start_date'], strava_req.get('start_latlng')
    if lat_lon is None:
        continue
    else:
        ds_req = requests.get("{}/{}/{},{},{}?exclude={}".format(ds_base_url, ds_key, lat_lon[0], lat_lon[1], time, exclude_blocks), params= {'units': 'si'}).json()
        ds_current = ds_req.get('currently')
        if ds_current is None:
            continue
        else:
            temp, wind_speed, weather = ds_current.get('temperature', np.nan), ds_current.get('windSpeed', np.nan), ds_current.get('icon', np.nan)
            weather_attributes = {'activity_id': activity_id, 'temp': temp, 'wind_speed': wind_speed, 'weather': weather}
            activity_weather.append(weather_attributes)

In [47]:
activity_weather = pd.DataFrame(activity_weather)

In [48]:
activity_weather.to_csv('activity_weather.csv')

In [3]:
activity_weather = pd.read_csv('activity_weather.csv').drop(columns = ['Unnamed: 0'])
activity_weather.head()

Unnamed: 0,activity_id,temp,weather,wind_speed
0,3134814172,4.03,partly-cloudy-night,4.15
1,3131802476,2.94,rain,4.53
2,3124325868,12.3,rain,12.13
3,3121033650,11.72,wind,10.66
4,3121032425,11.61,wind,10.59


# Getting activity split times

## Example response

In [29]:
req_2 = requests.get("{}/activities/{}/laps".format(strava_base_url, activity_ids[0]), headers = headers)
req_2.json()

[{'id': 10457399657,
  'resource_state': 2,
  'name': 'Lap 1',
  'activity': {'id': 3134814172, 'resource_state': 1},
  'athlete': {'id': 34272639, 'resource_state': 1},
  'elapsed_time': 296,
  'moving_time': 296,
  'start_date': '2020-02-26T20:09:31Z',
  'start_date_local': '2020-02-26T20:09:31Z',
  'distance': 1000.0,
  'start_index': 0,
  'end_index': 296,
  'total_elevation_gain': 20.0,
  'average_speed': 3.38,
  'max_speed': 4.0,
  'average_cadence': 79.0,
  'average_heartrate': 131.0,
  'max_heartrate': 151.0,
  'lap_index': 1,
  'split': 1,
  'pace_zone': 2},
 {'id': 10457399660,
  'resource_state': 2,
  'name': 'Lap 2',
  'activity': {'id': 3134814172, 'resource_state': 1},
  'athlete': {'id': 34272639, 'resource_state': 1},
  'elapsed_time': 288,
  'moving_time': 288,
  'start_date': '2020-02-26T20:14:28Z',
  'start_date_local': '2020-02-26T20:14:28Z',
  'distance': 1000.0,
  'start_index': 297,
  'end_index': 584,
  'total_elevation_gain': 58.0,
  'average_speed': 3.47,
  'm

## Looping over activities

In [30]:
activity_splits = []
for activity_id in activity_ids:
    req = requests.get("{}/activities/{}/laps".format(strava_base_url, activity_id), headers = headers)
    for split in req.json():
        split_index = split['split']
        split_time = split.get('elapsed_time', np.nan)
        split_distance, split_elevation_gain = split.get('distance', np.nan), split.get('total_elevation_gain', np.nan)
        split_average_speed, split_max_speed = split.get('average_speed', np.nan), split.get('max_speed', np.nan)
        split_average_hr, split_max_hr = split.get('average_heartrate', np.nan), split.get('max_heartrate', np.nan)
        split_average_cadence = split.get('average_cadence', np.nan)
        split_attributes = {'activity_id': activity_id, 'split_index': split_index, 'split_time': split_time, 'split_distance': split_distance, 'split_elevation_gain': split_elevation_gain, 'split_average_speed': split_average_speed, 'split_max_speed': split_max_speed, 'split_average_hr': split_average_hr, 'split_max_hr': split_max_hr, 'split_average_cadence': split_average_cadence}
        activity_splits.append(split_attributes)

In [31]:
activity_splits = pd.DataFrame(activity_splits)

In [32]:
activity_splits.to_csv('activity_splits.csv')

In [4]:
activity_splits = pd.read_csv('activity_splits.csv').drop(columns = ['Unnamed: 0'])
activity_splits.head()

Unnamed: 0,activity_id,split_average_cadence,split_average_hr,split_average_speed,split_distance,split_elevation_gain,split_index,split_max_hr,split_max_speed,split_time
0,3134814172,79.0,131.0,3.38,1000.0,20.0,1,151.0,4.0,296
1,3134814172,79.5,141.1,3.47,1000.0,58.0,2,149.0,4.5,288
2,3134814172,79.6,138.9,3.45,1000.0,58.0,3,152.0,3.9,290
3,3134814172,79.6,147.3,3.36,1000.0,54.0,4,163.0,4.1,298
4,3134814172,78.5,132.6,3.41,1000.0,52.0,5,148.0,4.9,293


# Getting activity HR zones

## Example response

In [93]:
req_3 = requests.get("{}/activities/{}/zones".format(strava_base_url, activity_ids[0]), headers = headers)
req_3.json()

[{'score': 25,
  'distribution_buckets': [{'max': 123, 'min': 0, 'time': 135},
   {'max': 153, 'min': 123, 'time': 2546},
   {'max': 169, 'min': 153, 'time': 95},
   {'max': 184, 'min': 169, 'time': 0},
   {'max': -1, 'min': 184, 'time': 0}],
  'type': 'heartrate',
  'resource_state': 3,
  'sensor_based': True,
  'points': 0,
  'custom_zones': False},
 {'score': 5,
  'distribution_buckets': [{'max': 3.270811744386874, 'min': 0, 'time': 814},
   {'max': 3.799913644214162, 'min': 3.270811744386874, 'time': 1403},
   {'max': 4.232815198618307, 'min': 3.799913644214162, 'time': 455},
   {'max': 4.521416234887737, 'min': 4.232815198618307, 'time': 64},
   {'max': 4.810017271157167, 'min': 4.521416234887737, 'time': 40},
   {'max': -1, 'min': 4.810017271157167, 'time': 0}],
  'type': 'pace',
  'resource_state': 3,
  'sensor_based': True}]

## Looping over activities

In [92]:
activity_zones = []
for activity_id in activity_ids:
    req = requests.get("{}/activities/{}/zones".format(strava_base_url, activity_id), headers = headers)
    if len(req.json()) > 0:
        activity_attributes = {'activity_id': activity_id}
        bucket_times = list(map(lambda x: list(map(lambda x: x['time'], x['distribution_buckets'])), req.json()))
        bucket_types = list(map(lambda x: x['type'], req.json()))
        for bucket in buckets:
            i = 1
            for zone in bucket[1]:
                activity_attributes["{}_zone_{}_time".format(bucket[0], i)] = zone
                i += 1
        activity_zones.append(activity_attributes)
        if 'heartrate' not in bucket_types:
            for i in range(1,6):
                activity_attributes['heartrate_zone_{}_time'.format(i)] = 0
        elif 'pace' not in bucket_types:
            for i in range(1,7):
                activity_attributes['pace_zone_{}_time'.format(i)] = 0
        buckets = list(zip(bucket_types, bucket_times))
    else:
        continue

In [96]:
activity_zones = pd.DataFrame(activity_zones).fillna(0)

In [97]:
activity_zones.to_csv('activity_zones.csv')

In [5]:
activity_zones = pd.read_csv('activity_zones.csv').drop(columns = ['Unnamed: 0'])
activity_zones.head()

Unnamed: 0,activity_id,heartrate_zone_1_time,heartrate_zone_2_time,heartrate_zone_3_time,heartrate_zone_4_time,heartrate_zone_5_time,pace_zone_1_time,pace_zone_2_time,pace_zone_3_time,pace_zone_4_time,pace_zone_5_time,pace_zone_6_time
0,3134814172,0.0,50.0,332.0,730.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3131802476,135.0,2546.0,95.0,0.0,0.0,814.0,1403.0,455.0,64.0,40.0,0.0
2,3124325868,97.0,1362.0,730.0,609.0,18.0,753.0,695.0,299.0,36.0,106.0,927.0
3,3121033650,118.0,2528.0,183.0,1.0,0.0,458.0,1749.0,545.0,56.0,22.0,0.0
4,3121032425,19.0,240.0,109.0,239.0,505.0,0.0,0.0,94.0,342.0,448.0,228.0


In [28]:
import datetime
activities_df['start_date'] = pd.to_datetime(activities_df['start_date'])
activities_df['activity_date'] = activities_df['start_date'].map(lambda x: x.date)
activities_df['activity_time'] = activities_df['start_date'].map(lambda x: x.time)

# ETL pipeline

In [126]:
def token_exchange():
    api_credentials_file = open('.secret/strava_api_credentials.json', 'r')
    api_credentials = json.load(api_credentials_file)
    client_id = api_credentials['client_id']
    client_secret = api_credentials['client_secret']
    refresh_token = api_credentials['refresh_token']
    api_credentials_file.close()
    req = requests.post("https://www.strava.com/oauth/token?client_id={}&client_secret={}&refresh_token={}&grant_type=refresh_token".format(client_id, client_secret, refresh_token)).json()
    api_credentials['access_token'] = req['access_token']
    api_credentials['refresh_token'] = req['refresh_token']
    api_credentials_file = open('.secret/strava_api_credentials.json', 'w')
    json.dump(api_credentials, api_credentials_file)
    api_credentials_file.close()
    return api_credentials

In [127]:
token_exchange()

{'client_id': '41870',
 'client_secret': 'f1bd14658a33ea8b696ffb8f51ca247075e5197d',
 'access_token': '889054b9f7e1ce35faa45e0cc0a52b7e2f7ac354',
 'refresh_token': 'f437acf70f96a2e3cadbc2416b63f2855fa9dfee'}

In [128]:
def request_activities(access_token, start_date):
    url = "https://www.strava.com/api/v3/athlete/activities"
    headers = {"Authorization": "Bearer {}".format(access_token)}
    params = {'after': start_date}
    req = requests.get(url, headers = headers, params = params).json()
    activities = []
    for activity in req:
        activity_info = {}
        activity_info['activity_name'] = activity['name']
        activity_info['activity_id'] = activity['id']
        activity_info['activity_type'] = activity['type']
        activity_info['distance'] = activity.get('distance', np.nan)
        activity_info['time'] = activity.get('elapsed_time', np.nan)
        activity_info['elevation_gain'] = activity.get('total_elevation_gain', np.nan)
        activity_info['kudos'] = activity.get('kudos_count', np.nan)
        activity_info['start_date'] = activity.get('start_date', np.nan)
        activity_info['average_speed'] = activity.get('average_speed', np.nan)
        activity_info['max_speed'] = activity.get('max_speed', np.nan)
        activity_info['average_cadence'] = activity.get('average_cadence', np.nan)
        activity_info['average_hr'] = activity.get('average_heartrate', np.nan)
        activity_info['max_hr'] = activity.get('max_heartrate', np.nan)
        activity_info['suffer_score'] = activity.get('suffer_score', np.nan)
        activities.append(activity_info)
    return activities

In [129]:
access_token = token_exchange()['access_token']

In [130]:
from datetime import datetime
import time

def iso_8601_to_unix(date_string):
    iso_8601_date = datetime.strptime(date_string, '%Y-%m-%dT%H:%M:%SZ')
    date_tuple = iso_8601_date.timetuple()
    unix_date = int(time.mktime(date_tuple))
    return unix_date

In [131]:
test_date = iso_8601_to_unix('2020-02-27T00:00:00Z')
activities = request_activities(access_token, test_date)
activity_ids = list(map(lambda activity: activity['activity_id'], activities))

In [138]:
import csv

def append_activities(activities):
    n = len(activities)
    with open('activities.csv', 'r') as r, open('activities.csv', 'a', newline = '') as a:
        activity_fields = next(csv.reader(r))
        csv_writer = csv.DictWriter(a, fieldnames = activity_fields, restval = 0)
        csv_writer.writerows(activities)
    return print("{} activities appended".format(n))

In [139]:
append_activities(activities)

8 activities appended
