# ETL pipeline

In [1]:
import csv
import time
from datetime import datetime, timedelta

import pandas as pd

import ETL_pipeline_functions

In [2]:
def ETL_pipeline():
    """Request new Strava activities and append them to the local CSV files.

    Every step is delegated to ``ETL_pipeline_functions``:

    1. Obtain the Strava access token and the geocoding key from the
       credential files under ``.secret/``.
    2. Read the most recent date from ``data/activities.csv`` and convert it
       from ISO 8601 to unix time.
    3. Request activities using that start date. When any are returned,
       append them, their splits and their zones to the matching CSV files
       under ``data/``.
    4. Append a ``[date, number_of_activities]`` row to
       ``data/request_log.csv``.

    When no activities are returned nothing is written (no log row either)
    and "no activities to append" is printed.

    Returns:
        None. Progress is reported with ``print``.
    """
    # storing credentials for the Strava and Google Geocoding APIs
    strava_access_token = ETL_pipeline_functions.strava_token_exchange('.secret/strava_api_credentials.json')
    geocode_key = ETL_pipeline_functions.geocode_key_getter('.secret/geocode_api_credentials.json')

    # most recent date in the activities file, converted from ISO 8601 to unix time
    start_date_iso = ETL_pipeline_functions.last_date('data/activities.csv')
    start_date_unix = ETL_pipeline_functions.iso_8601_to_unix(start_date_iso)

    # requesting activities from the start date onwards
    activities = ETL_pipeline_functions.request_activities(strava_access_token, geocode_key, start_date_unix)
    n = len(activities)

    # nothing new: report it and stop before touching any file
    if not n:
        print("no activities to append")
        return None

    # appending activities to csv file
    ETL_pipeline_functions.append_requests(activities, 'data/activities.csv')

    # ids of the new activities, used to request their splits and zones
    activity_ids = [activity['activity_id'] for activity in activities]

    # requesting splits and appending them to csv file
    splits = ETL_pipeline_functions.request_splits(strava_access_token, activity_ids)
    ETL_pipeline_functions.append_requests(splits, 'data/activity_splits.csv')

    # requesting zones and appending them to csv file
    zones = ETL_pipeline_functions.request_zones(strava_access_token, activity_ids)
    ETL_pipeline_functions.append_requests(zones, 'data/activity_zones.csv')

    # logging the run date and the number of activities to a csv file;
    # the with-block closes the file, so no explicit close() is needed
    date = datetime.now().strftime('%d/%m/%Y')
    with open('data/request_log.csv', 'a', newline='') as log_file:
        csv.writer(log_file).writerow([date, n])

    print("ETL pipeline complete")
    return None

In [4]:
# run the pipeline once; it prints a status message when it completes or finds no new activities
ETL_pipeline()

no activities to append


In [17]:
# load the activities, weather and locations CSV files into DataFrames
activities = pd.read_csv('activities.csv')
weather = pd.read_csv('activity_weather.csv')
locations = pd.read_csv('activity_locations.csv')

In [25]:
# left join: keep every activity and attach its location row where one exists
activities_merged = activities.merge(
    locations,
    how='left',
    on='activity_id',
)

In [31]:
# tally of activities per location
activities_merged.loc[:, 'location'].value_counts()

Welwyn Garden City    143
Hertford               31
Hatfield               20
London                  3
Barnet                  2
Watford                 2
St Albans               1
Manchester              1
Royston                 1
Waltham Cross           1
Name: location, dtype: int64

In [29]:
# activities that have no location after the left merge
activities_merged[activities_merged['location'].isna()]

Unnamed: 0,activity_name,activity_id,activity_type,distance,time,elevation_gain,kudos,start_date,average_speed,max_speed,average_cadence,average_hr,max_hr,suffer_score,location
0,Panshanger PR (18:12 - 2nd),1807281354,Run,5000.0,1092,0.0,1,2018-08-25T08:00:00Z,4.579,0.0,0.0,0.0,0.0,0.0,
2,"Hatfield 5k Series, race 1 (17:55 - 32nd)",1821995859,Run,5000.0,1075,0.0,10,2018-09-05T18:45:13Z,4.651,0.0,0.0,0.0,0.0,0.0,
3,Panshanger PR (18:21 - 2nd),1826965116,Run,5000.0,1101,0.0,6,2018-09-08T08:00:56Z,4.541,0.0,0.0,0.0,0.0,0.0,
6,Panshanger PR (18:12 - 3rd),1842987085,Run,5000.0,1092,0.0,6,2018-09-15T08:00:00Z,4.579,0.0,0.0,0.0,0.0,0.0,
7,Morning Run - Welwyn,1847294196,Run,10000.0,2550,0.0,5,2018-09-17T08:24:27Z,3.922,0.0,0.0,0.0,0.0,0.0,
8,"Hatfield 5k Series, race 2 (18:16 - 35th)",1853006370,Run,5000.0,1096,0.0,9,2018-09-19T18:45:00Z,4.562,0.0,0.0,0.0,0.0,0.0,
10,Panshanger PR (18:19 - 2nd),1857902033,Run,5000.0,1099,0.0,7,2018-09-22T08:00:00Z,4.55,0.0,0.0,0.0,0.0,0.0,
11,Morning Run - Welwyn,1862331364,Run,10000.0,2622,0.0,6,2018-09-24T08:20:47Z,3.814,0.0,0.0,0.0,0.0,0.0,
13,Panshanger PR (18:05 - 1st),1872555370,Run,5000.0,1085,0.0,12,2018-09-29T08:00:00Z,4.608,0.0,0.0,0.0,0.0,0.0,
16,"Hatfield 5k Series, race 3 (17:54 - 28th)",1882361101,Run,5000.0,1074,0.0,10,2018-10-03T18:45:00Z,4.655,0.0,0.0,0.0,0.0,0.0,


# Data Cleaning

In [2]:
import numpy as np
import pandas as pd
import re

In [3]:
# load the activities CSV into the DataFrame that the following cells add columns to
activities = pd.read_csv('activities.csv')

In [5]:
def interval_pattern_matcher(activity_name):
    """Return True when the activity name contains an interval-session keyword.

    The keywords are 'Intervals', 'Track' and 'Yasso'; the check is a
    case-sensitive substring test.
    """
    interval_markers = ('Intervals', 'Track', 'Yasso')
    return any(marker in activity_name for marker in interval_markers)

In [6]:
# Upper distance bounds (exclusive, in the units of the 'distance' column)
# for a short run and a medium run; anything at or above the second is long.
SHORT_RUN_MAX_DISTANCE = 8000
MEDIUM_RUN_MAX_DISTANCE = 16000


def _classify_run_type(activity_name, distance):
    """Return the run-type code for a single activity.

    'I' when interval_pattern_matcher accepts the name; otherwise 'SR' for a
    distance below 8000, 'MR' for a distance below 16000 and 'LR' for
    everything else (a NaN distance fails both comparisons and is 'LR').
    """
    if interval_pattern_matcher(activity_name):
        return 'I'
    if distance < SHORT_RUN_MAX_DISTANCE:
        return 'SR'
    if distance < MEDIUM_RUN_MAX_DISTANCE:
        return 'MR'
    return 'LR'


# Build the whole column in one assignment instead of writing cell by cell
# inside iterrows(); re-running this cell always yields the same column.
activities['run_type'] = [
    _classify_run_type(name, distance)
    for name, distance in zip(activities['activity_name'], activities['distance'])
]

In [7]:
# Raw strings keep the regex escapes (\s, \d, \() from being read as invalid
# string escapes, and compiling here avoids recompiling on every call.
# First pattern: digits that follow whitespace and precede two lowercase letters.
_POSITION_AFTER_SPACE = re.compile(r'.*\s(\d+)[a-z]{2}.*')
# Second pattern: digits that follow an opening parenthesis and precede two lowercase letters.
_POSITION_AFTER_PAREN = re.compile(r'.*\((\d+)[a-z]{2}.*')


def get_position(activity_name):
    """Extract a finishing position such as '2nd' or '(32nd' from an activity name.

    The two patterns are tried in order; a pattern is accepted only when it
    yields exactly one match for the whole name.

    Returns:
        The matched digits as a string (e.g. '32'), or the integer 0 when
        neither pattern matches. Callers convert the result with int().
    """
    for pattern in (_POSITION_AFTER_SPACE, _POSITION_AFTER_PAREN):
        matches = pattern.findall(activity_name)
        if len(matches) == 1:
            return matches[0]
    return 0

In [8]:
# integer finishing position per activity; get_position yields 0 when the name has none
activities['position'] = activities['activity_name'].apply(
    lambda name: int(get_position(name))
)

In [9]:
def parkrun_pattern_matcher(activity_name):
    """Return True when the name contains 'PR' and does not contain 'WU'.

    Both checks are case-sensitive substring tests.
    """
    # logical `and` rather than bitwise `&`: the operands are plain booleans
    return 'PR' in activity_name and 'WU' not in activity_name

In [10]:
def _classify_event_type(activity_name, position, run_type):
    """Return the event-type code for a single activity.

    Checked in this order:
      'PR'  - parkrun_pattern_matcher accepts the name
      'XCR' - position > 0 and the name contains 'XC'
      'RR'  - position > 0 otherwise
      'I'   - run_type is 'I'
      'W'   - everything else
    """
    if parkrun_pattern_matcher(activity_name):
        return 'PR'
    if position > 0:
        return 'XCR' if 'XC' in activity_name else 'RR'
    if run_type == 'I':
        return 'I'
    return 'W'


# Build the whole column in one assignment instead of writing cell by cell
# inside iterrows(); re-running this cell always yields the same column.
activities['event_type'] = [
    _classify_event_type(name, position, run_type)
    for name, position, run_type in zip(
        activities['activity_name'], activities['position'], activities['run_type']
    )
]

In [86]:
# Raw strings keep \w and \s from being read as invalid string escapes, and
# compiling once here replaces recompiling the patterns on every row.
_FIRST_WORD = re.compile(r'(\w+)\s.*')          # first word that is followed by whitespace
_DASH_SUFFIX = re.compile(r'.*\s-\s(\w+)')      # word after the last ' - ' separator
_TREADMILL = re.compile(r'Treadmill.*')
_TRACK = re.compile(r'Track.*')
_HATFIELD_5 = re.compile(r'Hatfield\s5.*')


def _extract_location(activity_name, event_type):
    """Derive the location label for one activity from its name and event type.

    'PR'  - the first word followed by whitespace; 'Other' when the name has
            none (the original indexed an empty list and raised IndexError).
    'XCR' - always 'Other'.
    'W'   - the word after ' - ' if there is exactly one match, else
            'Treadmill' if the name contains it exactly once, else 'Other'.
    'I'   - 'Track' if the name contains it exactly once, else 'Welwyn'.
    any other event type - 'Hatfield' if the name matches 'Hatfield 5...'
            exactly once, else 'Other'.
    """
    if event_type == 'PR':
        first_words = _FIRST_WORD.findall(activity_name)
        return first_words[0] if first_words else 'Other'
    if event_type == 'XCR':
        return 'Other'
    if event_type == 'W':
        suffixes = _DASH_SUFFIX.findall(activity_name)
        if len(suffixes) == 1:
            return suffixes[0]
        if len(_TREADMILL.findall(activity_name)) == 1:
            return 'Treadmill'
        return 'Other'
    if event_type == 'I':
        return 'Track' if len(_TRACK.findall(activity_name)) == 1 else 'Welwyn'
    return 'Hatfield' if len(_HATFIELD_5.findall(activity_name)) == 1 else 'Other'


# Build the whole column in one assignment instead of writing cell by cell
# inside iterrows(); re-running this cell always yields the same column.
activities['location'] = [
    _extract_location(name, event_type)
    for name, event_type in zip(activities['activity_name'], activities['event_type'])
]

In [89]:
def _normalise_location(location):
    """Fold 'Ellenbrook' into 'Hatfield' and any unrecognised label into 'Other'."""
    if location == 'Ellenbrook':
        return 'Hatfield'
    if location in ['Welwyn', 'Panshanger', 'Hatfield', 'Treadmill', 'Track']:
        return location
    return 'Other'


activities['location'] = activities['location'].map(_normalise_location)