# ETL pipeline

In [1]:
from datetime import datetime, timedelta
import time
from ETL_pipeline_functions import last_date, iso_8601_to_unix, strava_token_exchange, request_activities, request_weather, request_splits, request_zones, append_requests

In [2]:
def ETL_pipeline():
    """Fetch new Strava activities and append them, with related data, to CSV files.

    Steps:
      1. Exchange credentials for an access token.
      2. Convert the last recorded activity date to a unix timestamp and
         request activities using it as the start date.
      3. If nothing came back, print a message and stop (nothing is logged).
      4. Otherwise append the activities, then request and append weather,
         splits and zones for each activity id.
      5. Append one row ``[dd/mm/YYYY, n]`` to ``request_log.csv``.

    Returns:
        None. Progress is reported via ``print``.
    """
    # Imported locally: the logging step below needs ``csv`` and the
    # notebook's import cell does not bring it into scope.
    import csv

    # storing access token
    access_token = strava_token_exchange()['access_token']

    # storing last activity date in unix format
    start_date = iso_8601_to_unix(last_date())

    # requesting activities starting from the last recorded activity date
    activities = request_activities(start_date, access_token)

    # n starts as the number of activities fetched; the values returned by
    # append_requests for weather/splits/zones are added to it below
    # (presumably request or row counts -- confirm in ETL_pipeline_functions).
    n = len(activities)

    # nothing new to append: report and stop before any file is touched
    if n == 0:
        print("no activities to append")
        return None

    # appending activities to csv file
    append_requests(activities, 'activities.csv')

    # storing ids for activities
    activity_ids = [activity['activity_id'] for activity in activities]

    # requesting weather info for activities and appending to csv file
    weather_list = request_weather(activity_ids, access_token)
    n += append_requests(weather_list, 'activity_weather.csv')

    # requesting split metrics for activities and appending to csv file
    activity_splits = request_splits(activity_ids, access_token)
    n += append_requests(activity_splits, 'activity_splits.csv')

    # requesting hr and pace zones for activities and appending to csv file
    activity_zones = request_zones(activity_ids, access_token)
    n += append_requests(activity_zones, 'activity_zones.csv')

    # logging requests: one row per run with today's date and the total n
    date = datetime.now().strftime('%d/%m/%Y')
    with open('request_log.csv', 'a', newline='') as log_file:
        csv_writer = csv.writer(log_file)
        csv_writer.writerow([date, n])

    print("ETL pipeline complete")
    return None

In [4]:
# Run the pipeline; the captured output below shows all four CSV files being appended.
ETL_pipeline()

activities.csv appended
activity_weather.csv appended
activity_splits.csv appended
activity_zones.csv appended
ETL pipeline complete


In [183]:
# Run the pipeline again; this time it reports that there are no activities to append.
ETL_pipeline()

no activities to append


# Data Cleaning

In [3]:
import numpy as np
import pandas as pd
import re

In [4]:
# Load the activities CSV (the file ETL_pipeline appends to) into a DataFrame for cleaning.
activities = pd.read_csv('activities.csv')

In [5]:
def interval_pattern_matcher(activity_name):
    """Return True when the activity name contains an interval-session keyword.

    The check is a case-sensitive substring test against 'Intervals',
    'Track' and 'Yasso'.
    """
    interval_keywords = ('Intervals', 'Track', 'Yasso')
    return any(keyword in activity_name for keyword in interval_keywords)

In [6]:
# Classify every activity: 'I' for interval sessions (by name), otherwise by
# distance -- 'SR' below 8000, 'MR' below 16000, 'LR' for anything longer.
activities['run_type'] = ''
for index, row in activities.iterrows():
    if interval_pattern_matcher(row['activity_name']):
        run_label = 'I'
    elif row['distance'] < 8000:
        run_label = 'SR'
    elif row['distance'] < 16000:
        run_label = 'MR'
    else:
        run_label = 'LR'
    activities.at[index, 'run_type'] = run_label

In [7]:
# A finishing position is an ordinal such as "12th" or "3rd", preceded either
# by whitespace or by an opening parenthesis. Only real ordinal suffixes are
# accepted, so units such as "10km" are not read as a position. The leading
# greedy ".*" selects the last candidate on the line.
_POSITION_AFTER_SPACE = re.compile(r'.*\s(\d+)(?:st|nd|rd|th)\b')
_POSITION_AFTER_PAREN = re.compile(r'.*\((\d+)(?:st|nd|rd|th)\b')


def get_position(activity_name):
    """Extract a finishing position from an activity name.

    Args:
        activity_name: The activity title, e.g. ``"Hatfield 5 - 12th"`` or
            ``"County XC (3rd)"``.

    Returns:
        The digits of the position as a string (e.g. ``"12"``) when an
        ordinal is found, otherwise the integer ``0``. Callers are expected
        to pass the result through ``int()``.
    """
    # The whitespace form takes precedence over the parenthesised form.
    for pattern in (_POSITION_AFTER_SPACE, _POSITION_AFTER_PAREN):
        found = pattern.search(activity_name)
        if found:
            return found.group(1)
    return 0

In [8]:
# Store each activity's finishing position as an integer (0 when the name has none).
activities['position'] = activities['activity_name'].map(lambda name: int(get_position(name)))

In [9]:
def parkrun_pattern_matcher(activity_name):
    """Return True when the name contains 'PR' and does not contain 'WU'."""
    return 'PR' in activity_name and 'WU' not in activity_name

In [10]:
# Assign an event type to every activity, in priority order:
#   'PR'  - name matches parkrun_pattern_matcher
#   'XCR' - has a finishing position and 'XC' in the name
#   'RR'  - has a finishing position otherwise
#   'I'   - interval session (run_type == 'I')
#   'W'   - everything else
activities['event_type'] = ''
for index, row in activities.iterrows():
    if parkrun_pattern_matcher(row['activity_name']):
        event_label = 'PR'
    elif row['position'] > 0:
        event_label = 'XCR' if 'XC' in row['activity_name'] else 'RR'
    elif row['run_type'] == 'I':
        event_label = 'I'
    else:
        event_label = 'W'
    activities.at[index, 'event_type'] = event_label

In [86]:
# Patterns are raw strings (so "\w" and "\s" are not treated as invalid string
# escapes) and are compiled once, outside the per-row loop.
first_word_pattern = re.compile(r'(\w+)\s.*')        # first word followed by whitespace
dash_suffix_pattern = re.compile(r'.*\s-\s(\w+)')    # word after the last " - "
treadmill_pattern = re.compile(r'Treadmill.*')
track_pattern = re.compile(r'Track.*')
hatfield_5_pattern = re.compile(r'Hatfield\s5.*')

# Derive a location for every activity from its event type and name.
activities['location'] = ''
for index, row in activities.iterrows():
    activity_name = row['activity_name']
    event_type = row['event_type']

    if event_type == 'PR':
        # The first word of the name is used as the location. Fall back to
        # 'Other' rather than raising IndexError when the name has no word
        # followed by whitespace (e.g. a name that is just "PR").
        first_words = first_word_pattern.findall(activity_name)
        location = first_words[0] if first_words else 'Other'
    elif event_type == 'XCR':
        location = 'Other'
    elif event_type == 'W':
        # Names of the form "<description> - <Place>" carry the location
        # after the dash; treadmill runs are labelled as such.
        dash_suffixes = dash_suffix_pattern.findall(activity_name)
        if len(dash_suffixes) == 1:
            location = dash_suffixes[0]
        elif len(treadmill_pattern.findall(activity_name)) == 1:
            location = 'Treadmill'
        else:
            location = 'Other'
    elif event_type == 'I':
        # Interval sessions are on the track when the name says so,
        # otherwise they are assigned to 'Welwyn'.
        if len(track_pattern.findall(activity_name)) == 1:
            location = 'Track'
        else:
            location = 'Welwyn'
    else:
        # Remaining event types: only "Hatfield 5..." gets a named location.
        if len(hatfield_5_pattern.findall(activity_name)) == 1:
            location = 'Hatfield'
        else:
            location = 'Other'

    activities.at[index, 'location'] = location

In [89]:
# Fold 'Ellenbrook' into 'Hatfield', then collapse every location outside the
# known set into 'Other'.
known_locations = ['Welwyn', 'Panshanger', 'Hatfield', 'Treadmill', 'Track']
activities['location'] = activities['location'].replace('Ellenbrook', 'Hatfield')
activities['location'] = activities['location'].where(
    activities['location'].isin(known_locations), 'Other'
)