In [1]:
import json
import pandas as pd
import tqdm
import numpy as np
import plotly.express as px
from marathon_analysis.helpers import time_to_seconds, seconds_to_time, pretty_print, seconds_to_time_short, save_figure, seconds_to_pace
import statsmodels.api as sm



# Load Runner Data

If you provide a file with this same format, you can run the analysis yourself

In [458]:
data = pd.read_csv("data/all_runners_with_splits.csv")
data.head(5)

Unnamed: 0,5km.time,10km.time,15km.time,20km.time,half.time,25km.time,30km.time,35km.time,40km.time,finish.time
0,00:14:43,00:29:15,00:44:21,00:59:13,01:02:29,01:14:42,01:30:06,01:45:01,01:59:44,02:06:12
1,00:14:43,00:29:25,00:44:23,00:59:24,01:02:40,01:14:44,01:30:07,01:45:02,01:59:53,02:06:35
2,00:14:43,00:29:17,00:44:21,00:59:13,01:02:29,01:14:42,01:30:06,01:45:01,02:00:05,02:06:51
3,00:14:44,00:29:16,00:44:22,00:59:15,01:02:30,01:14:44,01:30:07,01:45:30,02:01:48,02:08:50
4,00:14:36,00:29:15,00:44:06,00:59:10,01:02:29,01:14:43,01:30:08,01:45:59,02:02:16,02:09:39


In [473]:
df = data.copy()

df["0km.time"] = "00:00:00"

# convert to seconds to make easier to work with
time_cols = [c for c in df.columns if ".time" in c]
df[time_cols] = df[time_cols].applymap(time_to_seconds)


# drop row with missing splits for now
df = df.dropna(axis=0)

# calculate pace
df["finish.avg_pace"] = (df["finish.time"] / 26.2).astype(int)
df["half.avg_pace"] = (df["half.time"] / 13.1).astype(int)
df["2nd_half.time"] = df["finish.time"] - df["half.time"]
df["2nd_half.avg_pace"] = (df["2nd_half.time"] / 13.1).astype(int)
df["2nd_half.time_diff"] = df["2nd_half.time"] - df["half.time"]
df["2nd_half.time_diff_percent"] = df["2nd_half.time_diff"]/df["2nd_half.time"]*100
df["2nd_half_faster"] = df["2nd_half.time_diff"] < 0
df["below_four_hours"] = df["finish.time"] < time_to_seconds("04:00:00")


five_km_to_miles = 3.10686
km_to_miles = 0.621371
half_km = 21.0975
full_km = 42.195
splits = [5, 10, 15, 20, 25, 30, 35, 40]
def splits_list(df):
    l = []
    for s in splits:
        mile_pace = int((df[str(s) + "km.time"] - df[str(s-5) + "km.time"]) / five_km_to_miles)
        l.append(mile_pace)
    return l
df["5km_split_pace"] = df.apply(splits_list, axis=1)



for s in splits:
    # handle separate due to half and 40km times
    if s != 25:            
        df[str(s) + "km.pace"] = ((df[str(s) + "km.time"] - df[str(s-5) + "km.time"]) / five_km_to_miles).astype(int)
    
    df[str(s) + "km.avg_pace"] = df[str(s) + "km.time"] / (s*km_to_miles)
    
df["20km_to_half.pace"] = (df["half.time"] - df["20km.time"]) / (half_km - 20) / km_to_miles
df["half_to_25km.pace"] = (df["25km.time"] - df["half.time"]) / (25 - half_km) / km_to_miles
df["40km_to_finish.pace"] = (df["finish.time"] - df["40km.time"]) / (full_km - 40) / km_to_miles
    

df["finish.avg_pace_norm"] = df["finish.avg_pace"] / df["finish.time"]

df["std_split_pace"] = df["5km_split_pace"].apply(np.std).astype(int)
df["std_split_pace_norm"] = df["std_split_pace"]/df["finish.avg_pace"]


# max split difference 
all_pace_cols = ['5km.pace',
 '10km.pace',
 '15km.pace',
 '20km.pace',
 '20km_to_half.pace',                      
 'half_to_25km.pace',
 '30km.pace',
 '35km.pace',
 '40km.pace',
 '40km_to_finish.pace']
all_pace_names = ["0 - 5km", "5 - 10km", "10 - 15km", "15 - 20km", "20km - Half", "Half - 25km", "25 - 30km", "30 - 35km", "35 - 40km", "40km - Finish"]
# remove any rows that have a negative pace
# for some reason there are a few in the dataset.
df = df[~(df[all_pace_cols] < 0).any(axis=1)]
df["fastest_split"] = df[all_pace_cols].min(axis=1)
df["slowest_split"] = df[all_pace_cols].max(axis=1)
df["max_split_diff"] = df["slowest_split"] - df["fastest_split"]
df["max_split_diff_norm"] = df["max_split_diff"]/df["finish.avg_pace"]
df["slowest_split.name"] = df[all_pace_cols].idxmax(axis=1)
df["fastest_split.name"] = df[all_pace_cols].idxmin(axis=1)


def calculate_trend(row, first_half=False, second_half=False):
    Y = row.values
    X = []
    if first_half:
        X += [5, 10, 15, 20, half_km]
    if second_half:
        X += [25, 30, 35, 40, full_km]
    X = sm.add_constant(X)
    model = sm.OLS(Y,X)
    results = model.fit()
    return results.params[1] / km_to_miles
    

df["split_trend"] = df[all_pace_cols].apply(calculate_trend, axis=1, first_half=True, second_half=True)
df["1st_half.split_trend"] = df[all_pace_cols[:5]].apply(calculate_trend, axis=1, first_half=True)
df["2nd_half.split_trend"] = df[all_pace_cols[5:]].apply(calculate_trend, axis=1, second_half=True)

In [463]:
four_hours = time_to_seconds("04:00:00")
delta = 5*60
close_beat = df[df["finish.time"].between(four_hours - delta, four_hours, inclusive="left")]
close_miss = df[df["finish.time"].between(four_hours, four_hours + delta)]
on_pace = df[(df["finish.time"] > four_hours + delta) & df["half.time"].between(time_to_seconds("01:45:00"), time_to_seconds("01:55:00"))]
around_four = pd.concat([close_beat, close_miss])#, on_pace])

## Slowest Split Breakdown

In [488]:
r = dict(zip(all_pace_cols, all_pace_names))
x = similar_pace["slowest_split.name"].replace(r).value_counts() / len(similar_pace)
d = (x.round(3)*100).to_frame().reset_index()
d.columns = ["Split", "% Worst Split"]
d

Unnamed: 0,Split,% Worst Split
0,35 - 40km,60.5
1,40km - Finish,13.6
2,30 - 35km,11.6
3,25 - 30km,4.6
4,0 - 5km,3.6
5,20km - Half,2.3
6,5 - 10km,1.7
7,Half - 25km,1.4
8,15 - 20km,0.4
9,10 - 15km,0.3


## Finishing Times

In [472]:
fig = px.histogram(df, 
                   x="finish.time", 
                   title = "2021 Chicago Marathon Finishing Times", 
                   labels={"finish.time": "Finish Time (h:mm)"},
                   hover_name=df["finish.time"].apply(seconds_to_time_short),
                   template="seaborn")

fig.update_yaxes(title_text='Number of Runners')
bins = pd.Series(range(int(df["finish.time"].min()) - (600 - 228), int(df["finish.time"].max()), 60*15))
fig.update_layout(
    xaxis={
        "range": [
            bins.min(),
            bins.max(),
        ],
        "tickmode": "array",
        "tickvals": bins,
        "ticktext": bins.apply(seconds_to_time_short),
        "tickangle":35
    }
)

name = "finish_time_distribution"

save_figure(fig, name)

## Finishing Times with Goal Line

In [489]:
fig.add_vline(x=time_to_seconds("03:00:00"), line_width=3, line_dash="dash", line_color="green")
fig.add_vline(x=time_to_seconds("03:30:00"), line_width=3, line_dash="dash", line_color="yellow")
fig.add_vline(x=time_to_seconds("04:0:00"), line_width=3, line_dash="dash", line_color="blue")
fig.add_vline(x=time_to_seconds("04:30:00"), line_width=3, line_dash="dash", line_color="red")
name = "finish_time_distribution_with_lines"
save_figure(fig, name)

## Average Split 

In [542]:
similar_pace = df[df["finish.avg_pace"].between(time_to_seconds("00:08:40"), time_to_seconds("00:09:20"))]

graph_data = similar_pace[all_pace_cols]
graph_data.columns = all_pace_names 
graph_data = graph_data.mean()

fig = px.bar(graph_data , title="Average Pace Per Split<br><sub>8:40/mi - 9:20/mi Runners</sub>", template="seaborn")

bins = pd.Series(range(time_to_seconds("00:08:00"), time_to_seconds("00:10:30"), 15))
fig.update_layout(
    yaxis={
        "range": [
            bins.min(),
            bins.max(),
        ],
        "tickmode": "array",
        "tickvals": bins,
        "ticktext": bins.apply(seconds_to_pace),
        "title_text":"Pace"
    },
    xaxis={
#         "range": [-20, 60],
        "title_text":"Split",
        
    }
)
fig.update_layout(showlegend=False)
fig.add_hline(y=time_to_seconds("00:09:09"), line_width=3, line_dash="dash", line_color="black")



name = "average_split_pace"
save_figure(fig, name)

In [None]:
df[""]

## Max Split Diff
For each of these runners, I calculate the difference between their fastest and slowest split pace. 

This difference would be zero if people ran the same speed the entire race. But, as you can see above, very few runners run even splits. In fact, the median runner like me had a difference of 1:47 minutes between their fastest and slowest pace! Of the runners I looked at, only 6.1% percent had split paces that varied by less than 30 seconds. 

In [471]:
fig = px.histogram(
    similar_pace,
    x="max_split_diff",
    title="Difference Between Fastest and Slowest Split<br><sub>Runners with 08:45/mi- 09:15/mi pace",
    template="seaborn")

bins = pd.Series(range(time_to_seconds("00:00:00"), time_to_seconds("00:06:00"), 30))
fig.update_layout(
    xaxis={
        "range": [
            bins.min(),
            bins.max(),
        ],
        "tickmode": "array",
        "tickvals": bins,
        "ticktext": bins.apply(seconds_to_pace),
        "title_text":"Pace Slow Down"
    },
    yaxis={
#         "range": [-20, 60],
        "title_text":"Number Runners"
    }
)

mean_max_split_diff = similar_pace["max_split_diff"].median()
percent_less_60 = (similar_pace["max_split_diff"]<30).mean()*100

fig.add_annotation(text="Median Difference: %s" % seconds_to_pace(mean_max_split_diff),
                  xref="paper", yref="paper",
                  x=0.9, y=0.7, showarrow=False)

fig.add_annotation(text="Percent less than 30 seconds: %.1f" % percent_less_60 + "%",
                  xref="paper", yref="paper",
                  x=0.9, y=0.6, showarrow=False)

name = "max_split_diff_distribution"
save_figure(fig, name)

## Close Beat Runners

In [521]:
print("Number of Close Beat:", len(close_beat))
print("Number of Close Miss:", len(close_miss))
print("Close Beat Mean Finish Time:", seconds_to_time(close_beat["finish.time"].mean()))
print("Close Miss Mean Finish Time:", seconds_to_time(close_miss["finish.time"].mean()))
print("Close Beat Mean Half Time:", seconds_to_time(close_beat["half.time"].mean()))
print("Close Miss Mean Half Time:", seconds_to_time(close_miss["half.time"].mean()))
print("Close Beat Mean 2nd Half Time:", seconds_to_time(close_beat["2nd_half.time"].mean()))
print("Close Miss Mean 2nd Half Time:", seconds_to_time(close_miss["2nd_half.time"].mean()))
print("Close Beat Average Pace:", seconds_to_pace(close_beat["finish.avg_pace"].mean()))
print("Close Miss Average Pace:", seconds_to_pace(close_miss["finish.avg_pace"].mean()))

Number of Close Beat: 868
Number of Close Miss: 708
Close Beat Mean Finish Time: 03:57:34
Close Miss Mean Finish Time: 04:02:33
Close Beat Mean Half Time: 01:52:01
Close Miss Mean Half Time: 01:53:31
Close Beat Mean 2nd Half Time: 02:05:32
Close Miss Mean 2nd Half Time: 02:09:02
Close Beat Average Pace: 09:03
Close Miss Average Pace: 09:14


## First Half Paces, Close to Goal Finishers

In [533]:
four_hours = time_to_seconds("04:00:00")

around_four["Finish Time Group"] = around_four["below_four_hours"].map({False: "4:00-4:05 Runners", True: "3:55-4:00 Runners"})

# make 5 minutes bins, fudge so 4 hours is an edge
bins = pd.Series(range(time_to_seconds("01:40:00"), time_to_seconds("02:10:00"), 60))

fig = px.histogram(around_four, 
                   x=["half.time"],# "2nd_half.time"], 
                   color="Finish Time Group",
                   title = "First Half Paces <br><sub>2021 Chicago Marathon</sub>", 
                   labels={"half.time": "Half Marathon Time"},
#                    hover_name=age_gender_match["finish.time"].apply(seconds_to_time),
                   template="seaborn",
                   histnorm='percent',
                   barmode="overlay"
                  )

fig.update_yaxes(title_text='Percent of Runners')

fig.update_layout(
    xaxis={
        "range": [
            bins.min(),
            bins.max(),
        ],
        "tickmode": "array",
        "tickvals": bins,
        "ticktext": bins.apply(seconds_to_time_short),
        "title_text":"First Half Time",
#                 "tickangle":35

    }
)

fig.add_vline(x=time_to_seconds("02:00:00"), line_width=3, line_dash="dash", line_color="black")

fig.add_vline(x=close_beat["half.time"].mean(), line_width=3, line_dash="solid", line_color="blue")
fig.add_vline(x=close_miss["half.time"].mean(), line_width=3, line_dash="solid", line_color="orange")


name = "first_half_distribution"
save_figure(fig, name)

In [112]:
print("Close Beat Fastest Split:", seconds_to_time(close_beat["fastest_split"].mean()))
print("Close Miss Fastest Split:", seconds_to_time(close_miss["fastest_split"].mean()))
print("Close Beat Slowest Split:", seconds_to_time(close_beat["slowest_split"].mean()))
print("Close Miss Slowest Split:", seconds_to_time(close_miss["slowest_split"].mean()))

Close Beat Fastest Split: 00:08:16
Close Miss Fastest Split: 00:08:20
Close Beat Slowest Split: 00:10:21
Close Miss Slowest Split: 00:10:47


In [43]:
print("Percent 3:55-4:00 slower than 4:00-4:05 average:", (close_beat["half.time"] >= close_miss["half.time"].mean()).mean())


Percent 3:55-4:00 slower than 4:00-4:05 average: 0.43663594470046085


In [None]:
print("Percent Close Beat under 2hr:", (close_beat["half.time"] < time_to_seconds("02:00:00")).mean())
print("Percent Close Miss under 2hr:", (close_miss["half.time"] < time_to_seconds("02:00:00")).mean())

## Correlations with Finish Time

In [492]:
# avg pace same as time
drop =[c for c in around_four.columns if "avg_pace" in c] + ["below_four_hours"]


# interesting = 

rank = around_four.drop(columns=drop).corr()["finish.time"].abs().sort_values(ascending=False).iloc[1:]

normailized = rank / rank.max()
rank

40km.time                     0.782724
35km.time                     0.400930
2nd_half.time                 0.324425
30km.time                     0.241075
40km.pace                     0.235082
30km.pace                     0.212829
half_to_25km.pace             0.206690
35km.pace                     0.178026
20km.pace                     0.174403
25km.time                     0.171257
20km_to_half.pace             0.155659
std_split_pace                0.142492
half.time                     0.137375
20km.time                     0.129416
slowest_split                 0.126942
std_split_pace_norm           0.125103
15km.pace                     0.124171
10km.pace                     0.109104
15km.time                     0.104578
2nd_half.time_diff            0.101615
10km.time                     0.091247
split_trend                   0.087788
max_split_diff                0.086603
fastest_split                 0.082162
1st_half.split_trend          0.081071
2nd_half.time_diff_percen

In [497]:
(close_beat["2nd_half.time"] / close_beat["half.time"]).mean()

1.126641800399946

In [498]:
(close_miss["2nd_half.time"] / close_miss["half.time"]).mean()

1.1431514465366714

In [506]:
seconds_to_pace((4*60*60 / 2.12) / 13.1)

'08:38'

## Pace Difference

In [540]:
pace_per_split = close_beat[all_pace_cols].mean().to_frame(name="Close Beat")
pace_per_split["Close Miss"] = close_miss[all_pace_cols].mean()
pace_per_split["diff"] = pace_per_split["Close Miss"] - pace_per_split["Close Beat"]

pace_per_split.index = all_pace_names

fig = px.bar(pace_per_split,
             y="diff",
             title="Pace Difference, Close Beat and Near Miss Runners",
             labels={
                "diff": "Pace Difference (Seconds)",
                "index": "Split"
             },
             template="seaborn")

fig.add_hline(y=11, line_width=3, line_dash="dash", line_color="black")


save_figure(fig, "pace_difference")

In [532]:
around_four["Finish Time Group"] = around_four["below_four_hours"].map({False: "4:00-4:05 Runners", True: "3:55-4:00 Runners"})

# make 5 minutes bins, fudge so 4 hours is an edge
bins = pd.Series(range(time_to_seconds("01:40:00"), time_to_seconds("02:10:00"), 60))

fig = px.histogram(around_four, 
                   x=["2nd_half.time"],# "2nd_half.time"], 
                   color="Finish Time Group",
                   title = "Second Half Time <br><sub>2021 Chicago Marathon</sub>", 
                   labels={"half.time": "Half Marathon Time"},
#                    hover_name=age_gender_match["finish.time"].apply(seconds_to_time),
                   template="seaborn",
                   histnorm='percent',
                   barmode="overlay"
                  )

fig.update_yaxes(title_text='Percent of Runners')

fig.update_layout(
    xaxis={
        "range": [
            bins.min(),
            bins.max(),
        ],
        "tickmode": "array",
        "tickvals": bins,
        "ticktext": bins.apply(seconds_to_time),
        "title_text":"Second Half Time"
    }
)

fig.add_vline(x=time_to_seconds("02:00:00"), line_width=3, line_dash="dash", line_color="black")



    
save_figure(fig, "second_half_distribution")

In [106]:
print("3:55-4:00 Mean Half Time:", seconds_to_time(close_beat["2nd_half.time"].mean()))
print("3:55-4:00 Median Half Time:", seconds_to_time(close_beat["2nd_half.time"].median()))

print("4:00-4:05 Mean Half Time:", seconds_to_time(close_miss["2nd_half.time"].mean()))
print("4:00-4:05 Median Half Time:", seconds_to_time(close_miss["2nd_half.time"].median()))

3:55-4:00 Mean Half Time: 02:05:32
3:55-4:00 Median Half Time: 02:04:49
4:00-4:05 Mean Half Time: 02:09:02
4:00-4:05 Median Half Time: 02:08:25


In [None]:
print("3:55-4:00 2nd Half Diff", seconds_to_time(close_beat["2nd_half.time_diff"].mean()))
print("4:05-4:05 2nd Half Diff", seconds_to_time(close_miss["2nd_half.time_diff"].mean()))

In [None]:
print("Percent 3:55-4:00 slower than 4:00-4:05 average 2nd half:", (close_beat["2nd_half.time"] >= close_miss["2nd_half.time"].mean()).mean())

In [None]:
print("Percent of all who ran 2nd half faster:", df["2nd_half_faster"].mean())

In [None]:
print("Average Slow down percent", df[df["2nd_half.time_diff"]>0]["2nd_half.time_diff_percent"].mean())
print("Average Speed up percent", df[df["2nd_half.time_diff"]<0]["2nd_half.time_diff_percent"].mean())

In [361]:
# Distribtuon of percent change
fig = px.histogram(df, 
             x="2nd_half.time_diff_percent",
#              histnorm='percent', 
             title = "Percent Change, First and Second Half Times<br><sub>2021 Chicago Marathon</sub>", 
             labels={"2nd_half.time_diff_percent": "Percent Change<br><sub>(Positive means slowed down)</sub>"}, 
             template="seaborn")

fig.update_layout(
    xaxis={
        "range": [-40, 60],
    }
        
)
# fig.add_vline(x=df["2nd_half.time_diff_percent"].mean(), line_width=3, line_dash="dash", line_color="white")


name = "2nd_half_time_diff_percent"
save_figure(fig, name)

In [362]:

fig = px.scatter(df[df["half.time"]<time_to_seconds("03:00:00")], 
                 x="finish.time",
                 y="2nd_half.time_diff_percent",
                 trendline="ols",
                 opacity=0.2,
                 trendline_color_override="#222",
                 title = "Finish Time vs. Percent Slow Down<br><sub>2021 Chicago Marathon</sub>", 
                             template="seaborn")
bins = pd.Series(range(time_to_seconds("02:00:00"), time_to_seconds("07:30:00"), 60*30))
fig.update_layout(
    xaxis={
        "range": [
            bins.min(),
            bins.max(),
        ],
        "tickmode": "array",
        "tickvals": bins,
        "ticktext": bins.apply(seconds_to_time),
        "title_text":"Finish Time"
    },
    yaxis={
        "range": [-20, 60],
        "title_text":"Percent Slow Down"
    }
)


name = "finish_time_vs_percent_slowdown"
save_figure(fig, name)

In [364]:
fig = px.scatter(around_four, 
                 x="half.time",
                 y="35km.time",
                 color="below_four_hours",
                 trendline="ols",
                 trendline_scope="overall",
                 opacity=0.2,
                 trendline_color_override="#222",
                 title = "Finish Time vs. Percent Slow Down<br><sub>2021 Chicago Marathon</sub>", 
                             template="seaborn")
fig

In [365]:
below_four["2nd_half.time"].describe().apply(seconds_to_time)

NameError: name 'below_four' is not defined

In [366]:
above_four["half.time"].describe().apply(seconds_to_time)

NameError: name 'above_four' is not defined

In [None]:

finish_similar = df[df["finish.time"].between(four_hours - 10*60, four_hours + 10*60)]
"Number of similar runners:", finish_similar.shape[0], "or", "%.2f" % (finish_similar.shape[0] / df.shape[0] * 100) + "%"

In [None]:
close_beat["split_trend"].mean()

# px.scatter(df.sample(frac=.1), x="split_trend", y="max_split_diff", opacity=.2, color="below_four_hours")

In [None]:
fig, ax = plt.subplots()
below_four = df[df["finish.time"].between(four_hours - 1*60, four_hours)]
below_four["half.avg_pace"].plot.hist(ax=ax, alpha=.5)
below_four["2nd_half.avg_pace"].plot.hist(ax=ax, alpha=.5)
ax.xaxis.set_major_formatter(lambda a, b: seconds_to_time(a))
print(seconds_to_time(below_four["half.avg_pace"].mean()))
print(seconds_to_time(below_four["2nd_half.avg_pace"].mean()))

In [None]:
seconds_to_time(df[(df["finish.time"].between(time_to_seconds("03:50:00"),time_to_seconds("04:00:00")))]["half.time"].mean())

In [None]:
on_pace_but_above_four = df[(df["half.time"].between(time_to_seconds("01:45:00"),time_to_seconds("01:52:01")))]


print(seconds_to_time(on_pace_but_above_four["finish.time"].mean()))
print(on_pace_but_above_four.shape[0])

In [18]:
bins = pd.Series(range(time_to_seconds("02:00:00"), time_to_seconds("07:30:00"), 60*30))

fig = px.histogram(on_pace_but_above_four, x="finish.time")

fig.update_layout(
    xaxis={
        "range": [
            bins.min(),
            bins.max(),
        ],
        "tickmode": "array",
        "tickvals": bins,
        "ticktext": bins.apply(seconds_to_time),
        "title_text":"Finish Time"
    }
)

fig

NameError: name 'on_pace_but_above_four' is not defined

In [560]:
bins = pd.Series(range(time_to_seconds("00:08:00"), time_to_seconds("00:11:30"), 30))

import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Bar(x=pace_per_split.index, y=pace_per_split["diff"], name="Difference Between Groups"),
    secondary_y=False,
)
fig.add_trace(
    go.Scatter(x=pace_per_split.index, y=pace_per_split["Close Miss"], name="Above Four Hours"),
    secondary_y=True,
)
fig.add_trace(
    go.Scatter(x=pace_per_split.index, y=pace_per_split["Close Beat"], name="Below Four Hours"),
    secondary_y=True,
)



# Add figure title
fig.update_layout(
    title_text="Difference in Pace and Average Pace"
)

# Set x-axis title
fig.update_xaxes(title_text="Split")

# Set y-axes titles
fig.update_yaxes(title_text="Pace Difference (seconds)", secondary_y=False)
fig.update_yaxes(title_text="Average Pace", secondary_y=True)

fig.show()

# fig.update_layout(
#     yaxis={
#         "range": [
#             bins.min(),
#             bins.max(),
#         ],
#         "tickmode": "array",
#         "tickvals": bins,
#         "ticktext": bins.apply(seconds_to_time),
#         "title_text":"Split Pace"
#     }
# )
fig.data = fig.data[::-1]

In [None]:
pace_columns = ['5km.pace', '10km.pace', '15km.pace', '20km.pace', 'half.pace', '25km.pace',
       '30km.pace', '35km.pace', '40km.pace', 'finish.pace']

pace_times = pd.DataFrame({
    "Below Four": below_four[pace_columns].mean(),
    "Above Four": above_four[pace_columns].mean()
})

pace_times["Diff"] = pace_times["Below Four"] - pace_times["Above Four"]
pace_times.applymap(seconds_to_time)

In [563]:
px.scatter(similar_pace, y="max_split_diff", x="finish.avg_pace", trendline="ols", trendline_color_override="black")

In [564]:
finish_similar["half.time"].quantile([.8, .9, .95, .99]).apply(seconds_to_time)

NameError: name 'finish_similar' is not defined

I've also heard about the concept of "hitting the wall" which is a loss of energy at the end of a race caused by the depletion of glycogen in the muscles. There could also be many other things that go wrong for people like an injury or bad pacing. To include these runners. I added any runners who ran the first have the race in less 2:00 but finished over 4:10. 

In [None]:
pace_similar = df[df["half.time"] < time_to_seconds("02:00:00")]
pace_similar = pace_similar[pace_similar["finish.time"] > time_to_seconds("04:10:00")]
"Number of pace similar runners:", pace_similar.shape[0], "or", "%.2f" % (pace_similar.shape[0] / df.shape[0] * 100) + "%"

In [None]:
all_similar = pd.concat([finish_similar, pace_similar])
"Number of all similar runners:", all_similar.shape[0], "or", "%.2f" % (all_similar.shape[0] / df.shape[0] * 100) + "%"

In [None]:
df_half_finish = all_similar[["half_pace", "finish_pace"]].copy()
df_half_finish["diff"] = df_half_finish["finish_pace"] - df_half_finish["half_pace"]
df_half_finish.plot.scatter(x="half_pace", y="diff", figsize=(10,6))

In [None]:
# all_similar.groupby("age_class")["bib"].count().plot.bar()

Now I have a list of ~4000 runners that I would consider similar to me. 

In [None]:
all_similar["under_four_hours"].value_counts() / all_similar.shape[0]

In [None]:
all_similar

## Whats best strategy for running 4 hr marathon? 

My intuition and experience running races in high school cross country tells me the primary strategic consideration is what pace to run. 

In [None]:
df

In [520]:
step = 5 # seconds
pace_columns = ['5km.avg_pace', '10km.avg_pace', '15km.avg_pace', '20km.avg_pace',"half.avg_pace", '25km.avg_pace', '30km.avg_pace', '35km.avg_pace', '40km.avg_pace', "finish.avg_pace"]
# pace_columns = ['5km.pace']

dfx = df

data = []
bins = pd.Series(range(int(8*60), int(9.5*60), step))
for pace in bins:
    to_add = [pace]
    for c in pace_columns:
        hit_pace = dfx[(dfx[c].between(pace, pace+step))]
        percent = hit_pace["finish.avg_pace"].mean()
        to_add.append(percent)
    data.append(to_add)

# fig, ax = plt.subplots()
data = pd.DataFrame(data, columns=["pace"]+pace_columns)
fig = px.line(data, x="pace", y=pace_columns)
fig.add_hline(y=time_to_seconds("00:09:09"), line_width=3, line_dash="dash", line_color="grey")

fig.update_layout(
    xaxis={
        "range": [
            bins.min(),
            bins.max(),
        ],
        "tickmode": "array",
        "tickvals": bins,
        "ticktext": bins.apply(seconds_to_pace),
        "tickangle":35
    },
    yaxis={
        "range": [
            bins.min()*1.1,
            bins.max(),
        ],
        "tickmode": "array",
        "tickvals": bins,
        "ticktext": bins.apply(seconds_to_pace),
        "tickangle":35
    }
)
fig
# data

In [None]:
data

based on this, 

# What is a reasonable target time?

What is the typical time by age group? 

25-34 year olds all seem to run about the same, so I should compare myselfs to them

In [None]:
# average time and 
df_stats = df.groupby("age_class")["finish.time"].agg(["count", "mean", "median", "std"])
df_stats["mean_pace"] = (df_stats["mean"] / 26.2).apply(seconds_to_time)
df_stats["mean"] = df_stats["mean"]#.apply(seconds_to_time)
df_stats["median"] = df_stats["median"].apply(seconds_to_time)
df_stats["std"] = df_stats["std"].apply(seconds_to_time)
df_stats["mean_hit_wall_ratio"] = df.groupby("age_class")["hit_wall_ratio"].mean()
df_stats["mean_maximum_split_diff"] = df.groupby("age_class")["max_split_diff"].mean().apply(seconds_to_time)
df_stats

In [None]:
df_stats.plot.bar(y="mean")

In [None]:
#  break runners up into n groups based on time
# label runner by quantile
(df["finish.time"] / 26.2).quantile(np.linspace(0, 1, 11)).apply(seconds_to_time)

dir(df["finish.time"])

In [None]:
help(df.quantile)

In [None]:
# average difference in splits
df_similar["5km_split_pace"].apply(np.std).describe().apply(seconds_to_time)

In [None]:
df_similar[["finish_pace", "mean_split_pace", "mean_split_pace_norm",  "std_split_pace", "std_split_pace_norm", "hit_wall_ratio"]].corr()

In [None]:
df.iloc[1]

In [None]:
df.plot.scatter(x="finish_pace", y="mean_split_pace")

In [None]:
# easiest / hard split
splits_times = pd.DataFrame(df_similar["5km_split_pace"].array, columns=["5km.split", "10km.split", "15km.split", "20km.split", "25km.split", "30km.split", "35km.split", "40km.split"])
splits_times.describe().astype(int).applymap(seconds_to_time)

In [None]:
df.sort_values("hit_wall_ratio")

In [None]:
# todo look for people who were on pace, but failed
# todo look for people who weren't on pace, but succeeded
# did time of day when someone start matter?
# which was easiest / hardest split?
# do people who run even splits do better?
# open question = how could I generalize this for other runners / marathons?


# visualization
# x time, y total distane, plot everyone

In [None]:
splits

In [None]:
def make_feature(runner):
    features = {
     "age_class": runner["age_class"],   
     "country": runner["country"],
     "h"
    }

    print(runner)



            
    return features


data 
df = pd.DataFrame([make_feature(r) for i,r in tqdm.tqdm(all_similar.iterrows())])

In [None]:
df

In [None]:
df.groupby("age_class")["finish_elapsed"].mean().map(seconds_to_time)

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np

In [None]:
select = ["age_class", "5km_elapsed", "10km_elapsed", "10km_diff", "15km_elapsed", "15km_diff", "15km_std", "20km_elapsed", "20km_diff", "20km_std", "25km_elapsed", "25km_diff", "25km_std","finish_elapsed"]
select = [s for s in select if "diff" not in s]
df_select = df[select].fillna(np.nan)
df_select["age_class"] = df_select["age_class"].str[:2].astype(int)
df_select = df_select[['20km_elapsed', "finish_elapsed"]]



In [None]:
df_select

In [None]:
df.corr()["finish_elapsed"].sort_values(ascending=False)

In [None]:
X = df_select.copy()
y = X.pop("finish_elapsed")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
clf = RandomForestClassifier(max_depth=2, n_jobs=-2, random_state=0, verbose=True)

p = make_pipeline(imp, clf)
p.fit(X_train, y_train)

In [None]:
print("Predict mean time:", mean_absolute_percentage_error(y_test, [y_test.mean()]*len(y_test)))
print("Predict median time:", mean_absolute_percentage_error(y_test, [y_test.median()]*len(y_test)))

In [None]:
y_pred = p.predict(X_test)
mean_absolute_percentage_error(y_test, y_pred)

In [None]:
pd.DataFrame(zip(p.named_steps.randomforestclassifier.feature_importances_, p.feature_names_in_)).sort_values(0, ascending=False)

In [None]:
from sklearn.cluster import KMeans

In [None]:
p.fit(X_train, y_train)

In [None]:
# hit_wall_ratio
def hit_wall_ratio(s):
    return s[-1] / np.mean(s[:-1])
df['hit_wall_ratio'] = df["5km_split_pace"].apply(hit_wall_ratio)

In [None]:
fig.axes[0].axvline(time_to_seconds("03:30:00"), color='g', linestyle='dashed', linewidth=2)
fig.axes[0].axvline(time_to_seconds("03:00:00"), color='y', linestyle='dashed', linewidth=2)
fig.axes[0].axvline(time_to_seconds("04:00:00"), color='k', linestyle='dashed', linewidth=2)
fig.axes[0].axvline(time_to_seconds("04:30:00"), color='k', linestyle='dashed', linewidth=2)
fig

In [None]:
data.sample(100).dropna(axis=0).applymap(time_to_seconds).applymap(lambda x: x + (np.random.rand() - .5)*10).applymap(seconds_to_time).to_csv("sample_data.csv", index=None)