In [None]:
# Import libraries
import os
from datetime import datetime, date

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import tensorflow as tf
import torch
import urllib3
from meteostat import Stations, Daily, Hourly
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [None]:
# Retrieve Strava activity data as JSON response
auth_url = "https://www.strava.com/oauth/token"
activites_url = "https://www.strava.com/api/v3/athlete/activities"

# Credentials are read from the environment when available so that real secrets
# never have to be pasted into (and saved with) the notebook. The placeholders
# remain as a fallback for quick local experiments.
payload = {
    'client_id': os.environ.get("STRAVA_CLIENT_ID", "<CLIENT_ID_HERE>"),
    'client_secret': os.environ.get("STRAVA_CLIENT_SECRET", '<CLIENT_SECRET_HERE>'),
    'refresh_token': os.environ.get("STRAVA_REFRESH_TOKEN", '<REFRESH_TOKEN_HERE>'),
    'grant_type': "refresh_token",
    'f': 'json'
}

# First API call to request access token
# Access tokens expire, so this saves the manual work of regenerating them
print("Requesting Token...\n")
# TLS verification is left enabled: this request carries the client secret, and
# the activity requests below already talk to the same host with verification on.
res = requests.post(auth_url, data=payload, timeout=30)
res.raise_for_status()  # fail here rather than with a KeyError on 'access_token'
access_token = res.json()['access_token']
# The token itself is deliberately not printed: notebook outputs get saved and shared.
print("Access token received.\n")

# Make API requests to retrieve up to the last 800 activities (4 pages x 200)
header = {'Authorization': 'Bearer ' + access_token}
param1, param2, param3, param4 = (
    {'per_page': 200, 'page': page_number} for page_number in range(1, 5)
)


def _fetch_activities(params):
    """Return one page of the athlete's activities as a list of dicts."""
    response = requests.get(activites_url, headers=header, params=params, timeout=30)
    response.raise_for_status()
    return response.json()


my_dataset1 = _fetch_activities(param1)
my_dataset2 = _fetch_activities(param2)
my_dataset3 = _fetch_activities(param3)
my_dataset4 = _fetch_activities(param4)

# Sanity check: show the first activity of every page (a page may be empty
# when the account has fewer activities than were requested).
for page_activities in (my_dataset1, my_dataset2, my_dataset3, my_dataset4):
    if not page_activities:
        print("(no activities on this page)")
        continue
    print(page_activities[0]["name"])
    print(page_activities[0]["map"]["summary_polyline"])

In [None]:
# Flatten each page of the nested JSON response into its own Dataframe
activities1, activities2, activities3, activities4 = (
    pd.json_normalize(page)
    for page in (my_dataset1, my_dataset2, my_dataset3, my_dataset4)
)

# Stack the pages into one dataframe and renumber the rows 0..n-1
activities = pd.concat(
    [activities1, activities2, activities3, activities4]
).reset_index(drop=True)

print("Dataframe shape: ", activities.shape, "\n")
print("Dataframe columns: \n", activities.columns)

In [None]:
# Flag race activities
# only keep run activities (drop() returns a new frame, so the assignments
# below never write into a view of `activities`)
running_activities = activities.loc[activities["type"] == "Run"].drop(columns=["type"])

# manually mark race activities as "race"=True
# Races: 11-11-2023 , 2-19-2023, 1-22-2023, 2-27-2022, 1-23-2022, 5-6-2023, 11-24-2022, 12-4-2022
# Relays: 3-28-2021, 3-27-2021, 3-27-2021, 3-26-2023, 3-25-2023, 3-25-2023, 8-26-2023, 8-25-2023, 8-25-2023
# NOT in current data pull -  1-23-2022, 2-27-2022, 3-28-2021, 3-27-2021, 3-27-2021
race_activity_ids = [10270133740,10201538669,8589932304,8434045492,9022328776,8162552673,8207198428]
# Relay legs are listed for reference only; they are not flagged as races.
relay_activity_ids = [8779880223,8777939900,8776825449,9721567283,9719999180,9725419043]

# uncomment to locate race activities and retrieve ids
#display(running_activities.loc[(pd.to_datetime(running_activities["start_date_local"])).dt.date == date(2023,11,23)])
#display(running_activities.loc[running_activities.id == 10201538669])

# Vectorized membership test; also avoids a loop variable named `id`, which
# would shadow the builtin for every later cell in the kernel.
running_activities["race"] = running_activities["id"].isin(race_activity_ids)

race_activities = running_activities.loc[running_activities["race"]]
display(race_activities)

In [None]:
# Helper functions
def ft_to_meters(dist_ft):
    """Convert a distance in feet to meters using 0.3048 m per foot."""
    meters_per_foot = 0.3048
    return dist_ft * meters_per_foot

def miles_to_meters(dist_miles):
    """Convert a distance in miles to meters using 1609.34 m per mile."""
    meters_per_mile = 1609.34
    return dist_miles * meters_per_mile

def z_score(df, col_name):
    """Return a copy of `df` with `col_name` standardized to zero mean and unit std.

    The input frame is left untouched. When the column has no spread (a constant
    column, or a single row where the sample std is NaN) the values are only
    centered, giving zeros instead of the NaN/inf a division by the std would
    produce.
    """
    standardized = df.copy()
    column = standardized[col_name]
    centered = column - column.mean()
    spread = column.std()
    if pd.isna(spread) or spread == 0:
        # Nothing to scale by: keep the centered values (missing entries stay NaN).
        standardized[col_name] = centered.astype(float)
    else:
        standardized[col_name] = centered / spread
    return standardized

def normalize(df, col_name):
    """Return a copy of `df` with `col_name` min-max scaled into the range [0, 1].

    The input frame is left untouched. When every value in the column is the
    same (max == min) the column is set to zeros instead of the NaN a division
    by a zero range would produce.
    """
    scaled = df.copy()
    column = scaled[col_name]
    lowest = column.min()
    value_range = column.max() - lowest
    shifted = column - lowest
    if pd.isna(value_range) or value_range == 0:
        # Degenerate range: every present value equals the minimum.
        scaled[col_name] = shifted.astype(float)
    else:
        scaled[col_name] = shifted / value_range
    return scaled

def seconds_to_timedelta(seconds):
    """Format a duration given in seconds as an "H:MM:SS" string.

    Fractional parts are truncated by the integer format codes.
    """
    whole_minutes, secs = divmod(seconds, 60)
    hrs, mins = divmod(whole_minutes, 60)
    return "%d:%02d:%02d" % (hrs, mins, secs)

def min_per_mile(seconds, pace):
    """Placeholder: not implemented yet; both arguments are ignored.

    Always returns None.
    """
    return None
    
    
# Check distribution of numeric columns to decide proper encoding method
def _plot_distribution(df, column, title):
    """Draw a histogram of `df[column]` and print that column's mean and std."""
    series = df[column]
    series.plot(kind="hist", title=title)
    print(f"{column} mean:", series.mean(), f"\n{column} std distribution:", series.std())


def plot_elev_high(df):
    """Histogram plus mean/std printout for the `elev_high` column."""
    _plot_distribution(df, "elev_high", "Elevation High Distribution")


def plot_avg_temp(df):
    """Histogram plus mean/std printout for the `avg_temp` column."""
    _plot_distribution(df, "avg_temp", "Average Temperature Distribution")


def plot_avg_cadence(df):
    """Histogram plus mean/std printout for the `average_cadence` column."""
    _plot_distribution(df, "average_cadence", "Average Cadence Distribution")


def plot_max_hr(df):
    """Histogram plus mean/std printout for the `max_heartrate` column."""
    _plot_distribution(df, "max_heartrate", "Maximum Heartrate Distribution")


def plot_avg_hr(df):
    """Histogram plus mean/std printout for the `average_heartrate` column."""
    _plot_distribution(df, "average_heartrate", "Average Heartrate Distribution")


def plot_max_speed(df):
    """Histogram plus mean/std printout for the `max_speed` column."""
    _plot_distribution(df, "max_speed", "Maxiumum Speed Distribution")


def plot_elev_gain(df):
    """Histogram plus mean/std printout for the `total_elevation_gain` column."""
    _plot_distribution(df, "total_elevation_gain", "Total Elevation Gain Distribution")

In [None]:
def handle_nulls(df, method):
    """Return a copy of `df` with missing numeric values filled in.

    Parameters:
        df: DataFrame to clean; it is not modified.
        method: name of the strategy. Only "interpolation" is supported; it
            fills gaps in numeric columns with pandas' default interpolation.

    Returns:
        A new DataFrame. Non-numeric columns (dates, times, strings, booleans)
        are passed through unchanged, since interpolating them is meaningless.

    Raises:
        ValueError: if `method` is not a supported strategy. Previously an
            unknown method silently produced None.
    """
    if method != "interpolation":
        raise ValueError(f"Unsupported null-handling method: {method!r}")

    filled = df.copy()
    numeric_cols = filled.select_dtypes(include="number").columns
    if len(numeric_cols):
        filled[numeric_cols] = filled[numeric_cols].interpolate()
    return filled

def handle_nulls_race(input_df, training_df):
    """Work-in-progress helper: selects the race rows of `training_df` and displays `input_df`.

    The selected race rows are not used yet, and nothing is returned.
    """
    is_race = training_df["race"] == True
    race_activities = training_df.loc[is_race]  # selected but not used yet
    display(input_df)
    

In [None]:
def _location_key(lat, lng):
    """Build the string key that ties a (lat, lng) pair to its weather station."""
    return "[ " + str(lat) + " " + str(lng) + "]"


def _one_year_before(day):
    """Return the same calendar day one year earlier (Feb 29 falls back to Feb 28)."""
    try:
        return date(day.year - 1, day.month, day.day)
    except ValueError:
        # Only reachable for Feb 29: the previous year has no such day.
        return date(day.year - 1, day.month, 28)


def _lookup_avg_temp(weather_df, day):
    """Return the `tavg` reading for `day`, or NaN when no usable reading exists."""
    try:
        return float(weather_df.loc[str(day)]["tavg"])
    except (KeyError, IndexError, TypeError, ValueError):
        # Date or column missing, or the reading is not a plain number.
        return np.nan


# Retrieves weather data from a given set of locations (loc_df) across a date range (min_date -> max_date)
# Returns input timestamped dataframe (df) with an additional column, avg_temp
def get_weather(min_date, max_date, loc_df, df):
    """Attach the daily average temperature of the nearest weather station to each row.

    Parameters:
        min_date, max_date: `datetime.date` bounds of the activities in `df`.
        loc_df: iterable of (lat, lng) pairs, the unique start locations in `df`.
        df: DataFrame with `start_lat`, `start_lng` and `start_date_local`
            (a `datetime.date`) columns. It is not modified.

    Returns:
        A copy of `df` with a float `avg_temp` column. Rows dated in the future
        use the reading from the same day one year earlier. Rows with no
        matching station or no reading get NaN.
    """
    today = date.today()

    # Get unique list of weather stations from locations
    print("Retrieving list of weather stations...")
    loc_dict = {}
    station_frames = []
    for location in loc_df:
        lat, lng = location[0], location[1]
        if pd.isna(lat) or pd.isna(lng):
            # Activities without GPS data have no meaningful nearest station.
            continue
        nearby_station = Stations().nearby(lat, lng).fetch(1)
        if nearby_station.empty:
            continue
        loc_dict[_location_key(lat, lng)] = nearby_station.index[0]
        station_frames.append(nearby_station)

    if station_frames:
        weather_stations = pd.concat(station_frames)
        # Weather frames are keyed by station id, so de-duplicate on the id
        # rather than on the (not necessarily unique) station name.
        weather_stations = weather_stations[~weather_stations.index.duplicated(keep="first")]
    else:
        weather_stations = pd.DataFrame()
    print("Weather stations retrieved.")
    display(weather_stations.head())

    # Retrieving Weather Data from Nearby Stations
    # Dictionary of weather dataframes keyed by weather station id.
    print("Retrieving weather data...")
    weather_df_dict = {station_id: pd.DataFrame() for station_id in loc_dict.values()}

    # Retrieve at least a year of weather data
    a_year_ago_today = _one_year_before(today)
    if a_year_ago_today < min_date:
        min_date = a_year_ago_today

    range_start = datetime.combine(min_date, datetime.min.time())
    range_end = datetime.combine(max_date, datetime.min.time())
    for station_id in weather_stations.index:
        # Daily Weather
        weather_df_dict[station_id] = Daily(station_id, start=range_start, end=range_end).fetch()
    print("Weather data retrieved.")

    # Add weather data to the activities dataframe
    print("Adding weather data to df...")
    df = df.copy()  # never write into the caller's frame

    avg_temps = []
    for lat, lng, activity_date in zip(df["start_lat"], df["start_lng"], df["start_date_local"]):
        station_id = loc_dict.get(_location_key(lat, lng))
        if station_id is None:
            avg_temps.append(np.nan)
            continue
        # If the date is in the future (race prediction), use last year's weather.
        lookup_date = _one_year_before(activity_date) if today < activity_date else activity_date
        avg_temps.append(_lookup_avg_temp(weather_df_dict[station_id], lookup_date))

    # Assigning by position keeps this correct even if index labels repeat.
    df["avg_temp"] = avg_temps

    print("Weather data added.")
    display(df.head())

    return df

In [None]:
# Returns df with encoded categorical and numeric columns.
def data_encoding(df, skip_encoding):
    """Encode date/time cyclically, one-hot encode the year and scale numeric columns.

    Parameters:
        df: DataFrame with `start_date_local` and `start_time` columns and,
            unless `skip_encoding` is true, the numeric columns scaled below.
            It is not modified.
        skip_encoding: when true, the numeric columns are left unscaled.

    Returns:
        A new DataFrame in which `start_date_local` and `start_time` are
        replaced by `month_sin/cos`, `day_sin/cos`, `sec_sin/cos` and one
        indicator column per year (named by the year value).
    """
    df_encoded = df.copy()  # the caller's frame must not pick up helper columns

    # combine the separate date and time columns into one timestamp per row
    timestamps = pd.to_datetime(
        df_encoded.start_date_local.astype(str) + " " + df_encoded.start_time.astype(str)
    )
    month = timestamps.dt.month.to_numpy()
    day = timestamps.dt.day.to_numpy()
    total_seconds = (
        timestamps.dt.hour * 60 * 60 + timestamps.dt.minute * 60 + timestamps.dt.second
    ).to_numpy()

    # cyclically encode date and time with sin / cos functions
    seconds_in_day = 24*60*60

    df_encoded["month_sin"] = np.sin(2*np.pi*month/12)
    df_encoded["month_cos"] = np.cos(2*np.pi*month/12)

    df_encoded["day_sin"] = np.sin(2*np.pi*day/31)
    df_encoded["day_cos"] = np.cos(2*np.pi*day/31)

    df_encoded["sec_sin"] = np.sin(2*np.pi*total_seconds/seconds_in_day)
    df_encoded["sec_cos"] = np.cos(2*np.pi*total_seconds/seconds_in_day)

    # one-hot encode year; both frames share the same index, so a column-wise
    # concat keeps the row count even when index labels repeat
    one_hot_year = pd.get_dummies(timestamps.dt.year)
    df_encoded = pd.concat([df_encoded, one_hot_year], axis=1)

    if not skip_encoding:
        # normalize or standardize numeric columns based on distribution of values (normal or not)
        df_encoded = z_score(df_encoded, "average_cadence")
        df_encoded = z_score(df_encoded, "average_heartrate")
        df_encoded = z_score(df_encoded, "max_heartrate")
        df_encoded = normalize(df_encoded, "total_elevation_gain")
        df_encoded = normalize(df_encoded, "elev_high")
        df_encoded = z_score(df_encoded, "max_speed")
        df_encoded = z_score(df_encoded, "avg_temp")

    # drop the raw date/time columns now that they are encoded
    df_encoded = df_encoded.drop(columns=["start_time", "start_date_local"])

    return df_encoded

In [None]:
def data_preprocessing(df, skip_encoding=False):
    """Prepare raw activity rows for modelling.

    Splits the start timestamp into date and time, derives rounded start
    coordinates, fills missing values, attaches the average temperature and
    finally encodes the result with `data_encoding`.

    Parameters:
        df: DataFrame with `start_date_local` and `start_latlng` columns plus
            the numeric activity columns. It is not modified.
        skip_encoding: passed through to `data_encoding`; when true the numeric
            columns are left unscaled.

    Returns:
        The encoded DataFrame produced by `data_encoding`.
    """
    df = df.copy()  # all column edits below stay local to this function

    # modify date/time/location columns
    start_timestamps = pd.to_datetime(df['start_date_local'])
    df['start_time'] = start_timestamps.dt.time
    df['start_date_local'] = start_timestamps.dt.date
    df['start_lat'] = round(df['start_latlng'].str[0], 2)
    df['start_lng'] = round(df['start_latlng'].str[1], 2)
    df = df.drop(columns=['start_latlng'])

    # add weather data
    # if date is in future (i.e. for race prediction), get the weather from that location last year
    min_date, max_date = df['start_date_local'].min(), df['start_date_local'].max()
    locations = df.loc[:,['start_lat', 'start_lng']].drop_duplicates().values

    df = handle_nulls(df, "interpolation")
    df = get_weather(min_date, max_date, locations, df)

    # handle any null values in new avg_temp col; assign the result back instead
    # of calling fillna(inplace=True) on a column selection, which is not
    # guaranteed to update the frame
    avg_temp = pd.to_numeric(df["avg_temp"], errors="coerce")
    df["avg_temp"] = avg_temp.fillna(avg_temp.mean())

    df = df.drop(columns=['start_lat','start_lng'])
    print("How many nulls?") # verify there are no null values
    print(df.isnull().sum())
    df = data_encoding(df, skip_encoding)

    return df

In [None]:
# Keys that race_predictor reads from its input dictionary.
_REQUIRED_RACE_PARAMS = ("race_date", "start_latlng", "distance_miles", "elev_gain_ft", "elev_high_ft")
_SUPPORTED_MODELS = ("linear_reg", "nn")


def _predict_linear_reg(x_train, x_race):
    """Fit a multiple linear regression on `x_train`; return predicted seconds for `x_race`."""
    # `prediction` only marks the row to predict, so it is not a feature;
    # the cyclic month/day columns are left out of this model.
    excluded = ["moving_time", "prediction", "month_sin", "month_cos", "day_sin", "day_cos"]
    x = x_train.drop(columns=excluded).to_numpy(dtype=float)
    y = x_train["moving_time"].to_numpy(dtype=float)

    # Build Multiple Linear Regression model with scikit-learn
    model = LinearRegression().fit(x, y)

    # Print model results
    r_sq = model.score(x, y)
    print(f"coefficient of determination: {r_sq}")
    print(f"intercept: {model.intercept_}")
    print()
    print(f"coefficients: {model.coef_}")

    # summarize feature importance
    importance = model.coef_
    for position, coefficient in enumerate(importance):
        print('Feature: %0d, Score: %.5f' % (position, coefficient))

    # plot feature importance
    plt.bar(range(len(importance)), importance)
    plt.show()

    # Use model to predict moving_time for race
    x_pred = x_race.drop(columns=excluded).to_numpy(dtype=float)
    y_pred = model.predict(x_pred)
    print("prediciton:", y_pred)

    return float(y_pred[0])


def _predict_nn(x_train, x_race):
    """Train a small feed-forward network on `x_train`; return predicted seconds for `x_race`."""
    device = (
        "cuda"
        if torch.cuda.is_available()
        else "mps"
        if torch.backends.mps.is_available()
        else "cpu"
    )

    print("Using {} device for nn training".format(device))

    # Define Neural Network
    class NeuralNetwork(nn.Module):
        def __init__(self, input_size):
            super().__init__()
            self.linear_relu_stack = nn.Sequential(
                nn.Linear(input_size, 15),
                nn.ReLU(),
                nn.Linear(15,30),
                nn.ReLU(),
                nn.Linear(30,45),
                nn.ReLU(),
                nn.Linear(45,60),
                nn.ReLU(),
                nn.Linear(60,45),
                nn.ReLU(),
                nn.Linear(45,30),
                nn.ReLU(),
                nn.Linear(30,15),
                nn.ReLU(),
                nn.Linear(15,1),
            )

        def forward(self, x):
            x = x.to(torch.float32)
            logits = self.linear_relu_stack(x)
            return logits

    # `prediction` is always False while training and True for the race row;
    # feeding it to the network would push an untrained weight into the prediction.
    excluded = ["moving_time", "prediction"]
    # Cast to float32 before moving to the device, then keep the tensors on the
    # same device as the model.
    x = torch.from_numpy(x_train.drop(columns=excluded).to_numpy(dtype=np.float32)).to(device)
    y = torch.from_numpy(x_train["moving_time"].to_numpy(dtype=np.float32)).to(device)
    x_new = torch.from_numpy(x_race.drop(columns=excluded).to_numpy(dtype=np.float32)).to(device)

    #TODO: incorporate dataloader

    torch.manual_seed(0)  # fixed seed so repeated runs give the same prediction
    model = NeuralNetwork(x.shape[1]).to(device)
    opt = torch.optim.Adam(model.parameters())
    loss_func = nn.MSELoss()

    epochs = 10
    for epoch in range(epochs):
        print(f"Starting epoch {epoch}....")
        # one optimizer step per training activity
        for x_item, y_item in zip(x, y):
            y_item_pred = model(x_item)
            loss = loss_func(y_item_pred, y_item.reshape(1))
            opt.zero_grad()
            loss.backward()
            opt.step()

    model.eval()
    with torch.no_grad():
        y_race_pred = model(x_new[0])[0]
    return y_race_pred.item()


# Model predicts moving time / average pace during a race based on race's location, elevation, and distance.
def race_predictor(model_training_df, input_param_dict, model_name):
    """Predict the moving time of a race from past activities.

    Parameters:
        model_training_df: DataFrame of activities containing at least the
            columns listed in `cols` below, including the boolean `race` flag.
        input_param_dict: dict with keys race_date, start_latlng,
            distance_miles, elev_gain_ft and elev_high_ft.
        model_name: "linear_reg" or "nn".

    Returns:
        A sentence with the predicted time formatted as H:MM:SS, or None (after
        printing a message) when required input parameters are missing.

    Raises:
        ValueError: if `model_name` is not a supported model.
    """
    # Verify the input parameters by name, not just by count
    missing_params = [key for key in _REQUIRED_RACE_PARAMS if key not in input_param_dict]
    if missing_params:
        print("Missing input parameters. race_predictor is expecting a dictionary with keys: race_date, start_latlng, distance_miles, elev_gain_ft, elev_high_ft.")
        return

    # Reject unknown models before any (slow) weather retrieval happens
    if model_name not in _SUPPORTED_MODELS:
        raise ValueError(f"Unknown model_name {model_name!r}; expected one of {_SUPPORTED_MODELS}.")

    cols = ['start_date_local', 'start_latlng', 'distance', 'moving_time',  'total_elevation_gain', 'max_speed',
            'average_cadence', 'average_heartrate', 'max_heartrate', 'elev_high', 'race']
    # copy so that adding the marker column never writes into the caller's frame
    model_training_df = model_training_df[cols].copy()
    race_activities_df = model_training_df.loc[model_training_df["race"] == True]

    # populate input_df with avg values from other race activities
    input_df = pd.DataFrame({
        "start_date_local": [input_param_dict["race_date"]],
        "start_latlng": [input_param_dict["start_latlng"]],
        "distance": [miles_to_meters(input_param_dict["distance_miles"])],
        "moving_time": [None],
        "total_elevation_gain": [ft_to_meters(input_param_dict["elev_gain_ft"])],
        "max_speed": [race_activities_df["max_speed"].mean()],
        "average_cadence": [race_activities_df["average_cadence"].mean()],
        "average_heartrate": [race_activities_df["average_heartrate"].mean()],
        "max_heartrate": [race_activities_df["max_heartrate"].mean()],
        "elev_high": [ft_to_meters(input_param_dict["elev_high_ft"])],
        "race": [True],
        "prediction": [True],
    })
    model_training_df["prediction"] = False
    print("Model Training DF:")
    display(model_training_df)

    # Create single df before pre-processing; renumber rows so the appended
    # race row cannot share an index label with a training row
    model_df = pd.concat([model_training_df, input_df], ignore_index=True)
    training_data = data_preprocessing(model_df)

    x_race = training_data.loc[training_data["prediction"] == True]
    x_train = training_data.loc[training_data["prediction"] == False]

    if model_name == "linear_reg":
        predicted_seconds = _predict_linear_reg(x_train, x_race)
    else:
        predicted_seconds = _predict_nn(x_train, x_race)

    moving_time_prediction = seconds_to_timedelta(predicted_seconds)
    print(moving_time_prediction)

    return "Predicted race time for {distance} mile race is {pred}".format(distance=input_param_dict["distance_miles"], pred=moving_time_prediction)


In [None]:
# Model Training data sets
def _first_label_on(activities_df, day):
    """Return the index label of the first row of `activities_df` whose local start date is `day`."""
    local_dates = pd.to_datetime(activities_df["start_date_local"]).dt.date
    matching_labels = activities_df.index[local_dates == day]
    if len(matching_labels) == 0:
        raise ValueError(f"No activity found on {day}; choose a date with a recorded run.")
    return matching_labels[0]


marathon_test_stop = date(2023,11,11)
marathon_test_stop_idx = _first_label_on(running_activities, marathon_test_stop)
fifteenK_stop = date(2023,11,22)
fifteenK_stop_idx = _first_label_on(running_activities, fifteenK_stop)

# The stop values are index LABELS. running_activities is a filtered frame, so
# its labels no longer match row positions; slice by label with .loc instead of
# the positional `frame[a:b]` form.
# NOTE(review): the original slice ended at -1, which leaves out the last row
# of the frame; kept as-is here - confirm whether that is intentional.
marathon_test_activities = running_activities.loc[marathon_test_stop_idx:].iloc[:-1]
fifteenK_test_activities = running_activities.loc[fifteenK_stop_idx:].iloc[:-1]

display(marathon_test_activities.head())
display(fifteenK_test_activities.head())

In [None]:
# Race descriptions used to try out the models
richmond_marathon_input_dict = dict(
    race_date="2023-11-11T07:00:00Z",
    start_latlng=[37.5407, -77.4360],
    distance_miles=26.2,
    elev_gain_ft=490,
    elev_high_ft=272,
)

seattle_15k_input_dict = dict(
    race_date="2023-11-23T09:30:00Z",
    start_latlng=[47.6798, -122.2536],
    distance_miles=9.32,
    elev_gain_ft=342,
    elev_high_ft=67,
)

# Exactly one call is active; swap the comment markers to try another model/race
race_predictor(marathon_test_activities, richmond_marathon_input_dict, "linear_reg")
#race_predictor(fifteenK_test_activities, seattle_15k_input_dict, "linear_reg")
#race_predictor(marathon_test_activities, richmond_marathon_input_dict, "nn")
#race_predictor(fifteenK_test_activities, seattle_15k_input_dict, "nn")