In [1]:
#Let's read in some athletes, check their races, and see if a fit model on their first race
#can be used to predict future races.

In [3]:
#Imports borrowed from pymc3_modeling.ipynb
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import sympy as sy
from scipy.integrate import odeint
size = (12, 9)

import json
import datetime as dt

import re
import os
import glob

import arviz as az
import theano
import theano.tensor as tt
import statsmodels.api as sm
import pymc3 as pm
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from pandas.plotting import register_matplotlib_converters
plt.style.use('seaborn')
register_matplotlib_converters()

In [4]:
#Read in the athlete dictionary:
with open("./data/athletes.json") as json_file: 
    athletes = json.load(json_file)

In [6]:
def analyze_file(filepath, sport, athlete_id): #Athlete id is a number no need to pass it the string:
    '''
    This function takes in a file, converts all the columns to numeric and date that we need,
    creates a speed column for running that converts pace to speed in ft/s, and prepares the 
    columns for the fit function that will later be used to predict the running pace. It will 
    create and output all the graphs we have here, for each of the races in the filepath.
    '''
    #Read in the dataframe:
    raw = pd.read_csv(filepath)
    sport = sport
    athlete_max_hr = athletes[str(athlete_id)]['max_hr']
    #Filter down just to the sport:
    df = raw[raw['activity_type'] == sport]
    
    
    #Setting average pace to a timedelta, we use a try in case the data does not have avg_pace column:
    try:
        df['avg_pace'] = pd.to_timedelta(df['avg_pace'])
    except:
        print("No avg_pace column!")
    
    try:
        df['distance'] = pd.to_numeric(df['distance'])
    except:
        print(f"Check athlete #{athlete_id} at filepath ###{filepath}###")
    
    try:
        df['elev_gain'] = pd.to_numeric(df['elev_gain'].str.replace(',', ''))
    except:
        try:
            df['elev_gain'] = pd.to_numeric(df['elev_gain'])
        except:
            print(f"Check athlete #{athlete_id} at filepath ###{filepath}###")
    
    #Create ft/mile column and intensity for analysis:
    df['ft_per_mile'] = (df['elev_gain']/df['distance'])
    df['intensity'] = df['avg_hr']/athlete_max_hr
    
    
    
    #Do the same for calculating speed in ft/s
    df['spd_ft_s'] = 0.0
    df['spd_mph'] = 0.0
    df.reset_index(inplace=True)
    
    for i in range(len(df['avg_pace'])):
        #.total_seconds returns the total seconds in a timedelta object.
        df['spd_mph'][i] = float(60/(df['avg_pace'][i].total_seconds()/60.0))
        #takes the speed in miles per hour and converts down to ft/s
        df['spd_ft_s'][i] = float(60.0/(df['avg_pace'][i].total_seconds()/60.0))*float(5280.0/3600.0)
        
    #The default is to be equal to the speed not taking into account elevation gain.
    df['GAP_ft_s'] = df['spd_ft_s']
    
    #Now we try to make a GAP column:
    for i in range(len(df)):
        #Try to make a GAP column:
        try:
            #using trig to calculate the change to the pace:
            factor = (np.sqrt((df['distance'][i]*5280)**2+df['elev_gain'][i]**2)/(df['distance'][i]*5280))
            df['GAP_ft_s'][i] = factor*df['spd_ft_s'][i]
        except:
            print("No GAP_ft_s was able to be calculated. Data is equal to spd_ft_s.")
        
    #Why divide by intensity? This brings average run pace up to the speed that we would expect if the athlete was at their highest average
    #sustainable heartrate for this speed. 
    df['GAP_by_intensity']  = df['GAP_ft_s']/df['intensity']
    
    #This creates a finalized dataframe with all of the columns I need for creating the pace curves.
    df.to_csv(filepath[:-4]+'_'+sport+'_extra_cols.csv')

In [7]:
files_to_convert = glob.glob('./data/clean/garmin_clean/*')

In [8]:
files_to_convert

['./data/clean/garmin_clean/clean_6.csv',
 './data/clean/garmin_clean/clean_7.csv',
 './data/clean/garmin_clean/clean_5.csv',
 './data/clean/garmin_clean/clean_4.csv',
 './data/clean/garmin_clean/clean_0.csv',
 './data/clean/garmin_clean/clean_1.csv',
 './data/clean/garmin_clean/clean_3.csv',
 './data/clean/garmin_clean/clean_2.csv',
 './data/clean/garmin_clean/clean_12.csv',
 './data/clean/garmin_clean/clean_11.csv',
 './data/clean/garmin_clean/clean_10.csv',
 './data/clean/garmin_clean/clean_3_Running_extra_cols.csv',
 './data/clean/garmin_clean/clean_9.csv',
 './data/clean/garmin_clean/clean_8.csv']

In [25]:
output_list = []
for i in range(len(files_to_convert)):
    athlete = i
    filepath = ''
    for j in range(len(files_to_convert)):
        print(int(files_to_convert[j][32:][:2].replace('.', '')))
        print(i)
        try:
            if int(files_to_convert[j][32:][:2]) == i:
                print(f"Matched athlete {i} to filepath {files_to_convert[j]}")
                filepath = files_to_convert[j]
            output_list.append((str(i), (filepath)))   
        except:
            print(f'No match for {files_to_convert[j]}')
output_list

6
0
No match for ./data/clean/garmin_clean/clean_6.csv
7
0
No match for ./data/clean/garmin_clean/clean_7.csv
5
0
No match for ./data/clean/garmin_clean/clean_5.csv
4
0
No match for ./data/clean/garmin_clean/clean_4.csv
0
0
No match for ./data/clean/garmin_clean/clean_0.csv
1
0
No match for ./data/clean/garmin_clean/clean_1.csv
3
0
No match for ./data/clean/garmin_clean/clean_3.csv
2
0
No match for ./data/clean/garmin_clean/clean_2.csv
12
0
11
0
10
0


ValueError: invalid literal for int() with base 10: '3_'