# ETL pipeline

In [174]:
import csv
import time
from datetime import datetime, timedelta

from ETL_pipeline_functions import last_date, iso_8601_to_unix, strava_token_exchange, request_activities, request_weather, request_splits, request_zones, append_requests

In [182]:
def ETL_pipeline():
    """Fetch activities newer than the last stored one and append them to CSV files.

    Steps, in order:
      1. obtain an access token via ``strava_token_exchange``;
      2. convert the date returned by ``last_date`` to unix time and request
         activities from that point;
      3. if any were returned, append them to ``activities.csv`` and then
         request/append weather, splits and zones for the same activity ids;
      4. write the run date and the request counter to ``request_log.csv``.

    Prints a status message and returns None. When no activities are
    returned, nothing is appended and no log row is written.
    """
    # storing access token
    access_token = strava_token_exchange()['access_token']

    # storing last activity date in unix format
    start_date = iso_8601_to_unix(last_date())

    # requesting activities recorded since the last stored activity
    activities = request_activities(start_date, access_token)

    # counter that ends up in the log; starts as the number of activities
    n = len(activities)

    # nothing new: report and stop before touching any file
    if n == 0:
        print("no activities to append")
        return None

    # appending activities to csv file
    append_requests(activities, 'activities.csv')

    # storing ids for activities
    activity_ids = [activity['activity_id'] for activity in activities]

    # requesting weather info for activities and appending to csv file
    # NOTE(review): the return value of append_requests is added to n, so it
    # is presumably a count of requests/rows -- confirm in ETL_pipeline_functions.
    weather_list = request_weather(activity_ids, access_token)
    n += append_requests(weather_list, 'activity_weather.csv')

    # requesting split metrics for activities and appending to csv file
    activity_splits = request_splits(activity_ids, access_token)
    n += append_requests(activity_splits, 'activity_splits.csv')

    # requesting hr and pace zones for activities and appending to csv file
    activity_zones = request_zones(activity_ids, access_token)
    n += append_requests(activity_zones, 'activity_zones.csv')

    # logging requests
    # NOTE(review): mode 'w' truncates the file, so only the most recent run
    # is kept; switch to 'a' if a run history is intended.
    date = datetime.now().strftime('%d/%m/%Y')
    with open('request_log.csv', 'w', newline='') as log_file:
        csv_writer = csv.writer(log_file)
        csv_writer.writerow([date, n])

    print("ETL pipeline complete")
    return None

In [4]:
# Run the pipeline; it prints its status and returns None.
ETL_pipeline()

activities.csv appended
activity_weather.csv appended
activity_splits.csv appended
activity_zones.csv appended
ETL pipeline complete


In [183]:
# Re-run the pipeline; when no activities are returned it only prints a notice.
ETL_pipeline()

no activities to append


# Feature engineering

In [225]:
import numpy as np
import pandas as pd
import re

In [226]:
# Load the activities file that ETL_pipeline appends to into a DataFrame.
activities = pd.read_csv('activities.csv')

In [228]:
def interval_pattern_matcher(activity_name):
    """Return True if the activity name contains an interval-session keyword.

    The test is a case-sensitive substring check against 'Intervals',
    'Track' and 'Yasso'; False is returned when none of them occur.
    """
    interval_markers = ('Intervals', 'Track', 'Yasso')
    return any(marker in activity_name for marker in interval_markers)

In [234]:
def _run_type_for(name, distance):
    """Label one activity: 'I' for interval sessions, otherwise by distance.

    Distances below 8000 give 'SR', below 16000 give 'MR', anything else 'LR'
    (thresholds are in the units of the 'distance' column).
    """
    if interval_pattern_matcher(name):
        return 'I'
    if distance < 8000:
        return 'SR'
    if distance < 16000:
        return 'MR'
    return 'LR'


# Build the whole column in one pass instead of writing cell by cell.
activities['run_type'] = [
    _run_type_for(name, distance)
    for name, distance in zip(activities['activity_name'], activities['distance'])
]

In [284]:
# Ordinal number (e.g. "2nd", "32nd") preceded by whitespace; the leading
# greedy ".*" makes the search settle on the last such ordinal in the line.
_POSITION_AFTER_SPACE = re.compile(r'.*\s(\d+)(?:st|nd|rd|th)\b')
# Same, but for an ordinal that directly follows an opening parenthesis.
_POSITION_AFTER_PAREN = re.compile(r'.*\((\d+)(?:st|nd|rd|th)\b')


def get_position(activity_name):
    """Extract a finishing position such as "2nd" from an activity name.

    Only true ordinal suffixes (st/nd/rd/th) are accepted, so tokens like
    "10km" are not mistaken for a position.

    Args:
        activity_name: the activity title, e.g. 'Panshanger PR (18:12 - 2nd)'.

    Returns:
        The digits of the position as a string (e.g. '2') when an ordinal is
        found, otherwise the integer 0. Callers are expected to wrap the
        result in ``int()``. Non-string input (e.g. NaN from a blank CSV
        cell) yields 0.
    """
    if not isinstance(activity_name, str):
        return 0

    # Whitespace-preceded ordinals take priority over parenthesised ones.
    for pattern in (_POSITION_AFTER_SPACE, _POSITION_AFTER_PAREN):
        found = pattern.search(activity_name)
        if found:
            return found.group(1)
    return 0

In [305]:
# Parse each activity name for a finishing position, then coerce to int
# (get_position yields 0 when no position is found).
raw_positions = activities['activity_name'].map(get_position)
activities['position'] = raw_positions.map(int)

In [306]:
def parkrun_pattern_matcher(activity_name):
    """Return True when the name contains 'PR' and does not contain 'WU'.

    Both checks are case-sensitive substring tests. 'PR' presumably marks a
    parkrun and 'WU' a warm-up -- confirm against the naming convention.
    """
    has_pr_tag = 'PR' in activity_name
    has_wu_tag = 'WU' in activity_name
    return has_pr_tag and not has_wu_tag

In [307]:
def _event_type_for(name, position, run_type):
    """Label one activity's event type.

    'PR' when parkrun_pattern_matcher accepts the name; otherwise, when a
    position was recorded, 'XCR' if the name contains 'XC' else 'RR';
    otherwise 'I' for interval runs and 'W' for everything else.
    """
    if parkrun_pattern_matcher(name):
        return 'PR'
    if position > 0:
        return 'XCR' if 'XC' in name else 'RR'
    return 'I' if run_type == 'I' else 'W'


# Build the whole column in one pass instead of writing cell by cell.
activities['event_type'] = [
    _event_type_for(name, position, run_type)
    for name, position, run_type in zip(
        activities['activity_name'], activities['position'], activities['run_type']
    )
]

In [308]:
# Preview the first rows, including the derived run_type, position and event_type columns.
activities.head()

Unnamed: 0,activity_name,activity_id,activity_type,distance,time,elevation_gain,kudos,start_date,average_speed,max_speed,average_cadence,average_hr,max_hr,suffer_score,run_type,event_type,position
0,Panshanger PR (18:12 - 2nd),1807281354,Run,5000.0,1092,0.0,1,2018-08-25T08:00:00Z,4.579,0.0,,,,,SR,PR,2
1,Evening Run - Panshanger,1817344422,Run,4903.3,1127,58.0,5,2018-09-03T19:14:06Z,4.339,7.0,,169.9,178.0,55.0,SR,W,0
2,Hatfield 5k Series - race 1 (17:55 - 32nd),1821995859,Run,5000.0,1075,0.0,10,2018-09-05T18:45:13Z,4.651,0.0,,,,,SR,RR,32
3,Panshanger PR (18:21 - 2nd),1826965116,Run,5000.0,1101,0.0,6,2018-09-08T08:00:56Z,4.541,0.0,,,,,SR,PR,2
4,Evening Run - Welwyn,1830984968,Run,8093.0,1998,97.0,6,2018-09-09T18:49:18Z,4.038,6.1,,167.9,190.0,85.0,MR,W,0
