# Analyzing Marathon Data

In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from marathon_analysis.helpers import (
    time_to_seconds, seconds_to_time, 
    pretty_print, seconds_to_time_short, 
    save_figure, seconds_to_pace, prep_data, all_pace_cols, all_pace_names)

# Load Runner Data

If you provide a file in this same format, you can run the analysis yourself.

In [2]:
# Read the per-runner checkpoint times; the preview below shows the first five rows.
data = pd.read_csv(filepath_or_buffer="data/all_runners_with_splits.csv")
data.head()

Unnamed: 0,5km.time,10km.time,15km.time,20km.time,half.time,25km.time,30km.time,35km.time,40km.time,finish.time
0,00:14:43,00:29:15,00:44:21,00:59:13,01:02:29,01:14:42,01:30:06,01:45:01,01:59:44,02:06:12
1,00:14:43,00:29:25,00:44:23,00:59:24,01:02:40,01:14:44,01:30:07,01:45:02,01:59:53,02:06:35
2,00:14:43,00:29:17,00:44:21,00:59:13,01:02:29,01:14:42,01:30:06,01:45:01,02:00:05,02:06:51
3,00:14:44,00:29:16,00:44:22,00:59:15,01:02:30,01:14:44,01:30:07,01:45:30,02:01:48,02:08:50
4,00:14:36,00:29:15,00:44:06,00:59:10,01:02:29,01:14:43,01:30:08,01:45:59,02:02:16,02:09:39


## Prep Data

Add some statistics to our data

In [3]:
# Run the project's prep_data helper over the raw frame. The displayed result
# shows times as seconds plus derived columns (fastest/slowest split, trends, ...).
df = prep_data(data)
# Bare expression so the notebook renders the prepared frame.
df

Unnamed: 0,5km.time,10km.time,15km.time,20km.time,half.time,25km.time,30km.time,35km.time,40km.time,finish.time,...,std_split_pace_norm,fastest_split,slowest_split,max_split_diff,max_split_diff_norm,slowest_split.name,fastest_split.name,split_trend,1st_half.split_trend,2nd_half.split_trend
0,883.0,1755.0,2661.0,3553.0,3749.0,4482.0,5406.0,6301.0,7184.0,7572.0,...,0.020761,280.0,302.280466,22.280466,0.077095,half_to_25km.pace,10km.pace,0.120360,0.551771,-1.803267
1,883.0,1765.0,2663.0,3564.0,3760.0,4484.0,5407.0,6302.0,7193.0,7595.0,...,0.013841,283.0,298.568973,15.568973,0.053872,half_to_25km.pace,10km.pace,0.320522,0.579057,-0.792237
2,883.0,1757.0,2661.0,3553.0,3749.0,4482.0,5406.0,6301.0,7205.0,7611.0,...,0.017241,281.0,302.280466,21.280466,0.073381,half_to_25km.pace,10km.pace,0.492036,0.508070,-0.712189
3,884.0,1756.0,2662.0,3555.0,3750.0,4484.0,5407.0,6330.0,7308.0,7730.0,...,0.030508,280.0,314.000000,34.000000,0.115254,40km.pace,10km.pace,1.283587,0.463621,1.101437
4,876.0,1755.0,2646.0,3550.0,3749.0,4483.0,5408.0,6359.0,7336.0,7779.0,...,0.037162,281.0,324.801646,43.801646,0.147979,40km_to_finish.pace,5km.pace,1.758640,1.101659,2.102019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26073,5512.0,8247.0,11033.0,13807.0,14463.0,16752.0,19792.0,22959.0,26264.0,27575.0,...,0.264259,880.0,1774.000000,894.000000,0.849810,5km.pace,10km.pace,-12.983173,-66.396641,5.520979
26074,2842.0,5899.0,9124.0,12450.0,13170.0,15737.0,19176.0,22671.0,26433.0,28085.0,...,0.078431,914.0,1211.224196,297.224196,0.277520,40km_to_finish.pace,5km.pace,11.553071,14.706224,14.887796
26075,2741.0,5730.0,8688.0,11817.0,12516.0,15220.0,18796.0,22667.0,26564.0,28274.0,...,0.120482,882.0,1254.000000,372.000000,0.344764,40km.pace,5km.pace,17.420499,12.438511,14.159115
26076,2746.0,5670.0,8624.0,11855.0,12582.0,15829.0,19987.0,24146.0,28305.0,30130.0,...,0.165217,883.0,1339.024109,456.024109,0.396543,half_to_25km.pace,5km.pace,23.204654,17.203653,-0.073627


## Finishing Times

In [48]:
# X-axis ticks every 15 minutes across the observed finish times. The first
# tick is pulled back by 600 - 228 = 372 seconds from the fastest finish
# (presumably so ticks land on round clock times -- TODO confirm).
finish_seconds = df["finish.time"]
tick_start = int(finish_seconds.min()) - (600 - 228)
tick_stop = int(finish_seconds.max())
bins = pd.Series(range(tick_start, tick_stop, 15 * 60))

fig = px.histogram(
    df,
    x="finish.time",
    title="2021 Chicago Marathon Finishing Times",
    labels={"finish.time": "Finish Time (h:mm)"},
    hover_name=finish_seconds.apply(seconds_to_time_short),
    template="seaborn",
)
fig.update_yaxes(title_text="Number of Runners")
fig.update_layout(
    xaxis=dict(
        range=[bins.min(), bins.max()],
        tickmode="array",
        tickvals=bins,
        ticktext=bins.apply(seconds_to_time_short),
        tickangle=35,
    )
)

name = "finish_time_distribution"
save_figure(fig, name)

## Finishing Times with Goal Line

In [49]:
# Overlay dashed vertical markers at common goal times on the finish-time
# histogram held in `fig` from the previous cell.
# Note: re-running this cell without rebuilding `fig` adds the lines again.
goal_line_colors = {
    "03:00:00": "green",
    "03:30:00": "violet",
    "04:00:00": "blue",   # previously written as the malformed "04:0:00"
    "04:30:00": "orange",
}
for goal_time, line_color in goal_line_colors.items():
    fig.add_vline(
        x=time_to_seconds(goal_time),
        line_width=3,
        line_dash="dash",
        line_color=line_color,
    )

name = "finish_time_distribution_with_lines"
save_figure(fig, name)

## Average Split 

In [51]:
# Runners whose overall average pace lies between 8:40 and 9:20.
pace_floor = time_to_seconds("00:08:40")
pace_ceiling = time_to_seconds("00:09:20")
similar_pace = df[df["finish.avg_pace"].between(pace_floor, pace_ceiling)]

# Mean pace of that group for every split, relabelled with the display names.
graph_data = similar_pace[all_pace_cols]
graph_data.columns = all_pace_names
graph_data = graph_data.mean()

fig = px.bar(
    graph_data,
    title="Average Pace Per Split<br><sub>8:40/mi - 9:20/mi Runners</sub>",
    template="seaborn",
)

# Y-axis ticks every 15 seconds starting at 8:00, rendered as paces.
bins = pd.Series(range(time_to_seconds("00:08:00"), time_to_seconds("00:10:30"), 15))
fig.update_layout(
    yaxis=dict(
        range=[bins.min(), bins.max()],
        tickmode="array",
        tickvals=bins,
        ticktext=bins.apply(seconds_to_pace),
        title_text="Pace",
    ),
    xaxis=dict(title_text="Split"),
    showlegend=False,
)

# Dashed reference line at a 9:09 pace (presumably the 4-hour goal pace -- TODO confirm).
fig.add_hline(y=time_to_seconds("00:09:09"), line_width=3, line_dash="dash", line_color="black")

name = "average_split_pace"
save_figure(fig, name)

## Slowest Split Breakdown

In [17]:
# Share of the similar-pace runners whose slowest split was each segment.
split_display_names = dict(zip(all_pace_cols, all_pace_names))
slowest_counts = similar_pace["slowest_split.name"].replace(split_display_names).value_counts()
slowest_share = slowest_counts / len(similar_pace)

# Round the fraction to three places first, then scale it to a percentage.
worst_split_table = (slowest_share.round(3) * 100).to_frame().reset_index()
worst_split_table.columns = ["Split", "% Worst Split"]
worst_split_table

Unnamed: 0,Split,% Worst Split
0,35 - 40km,60.5
1,40km - Finish,13.6
2,30 - 35km,11.6
3,25 - 30km,4.6
4,0 - 5km,3.6
5,20km - Half,2.3
6,5 - 10km,1.7
7,Half - 25km,1.4
8,15 - 20km,0.4
9,10 - 15km,0.3


## Max Split Diff
For each of these runners, I calculate the difference between their fastest and slowest split pace. 

This difference would be zero if people ran the same speed for the entire race. But, as you can see above, very few runners run even splits. In fact, the median runner among those with a pace similar to mine had a difference of 1:47 (min:sec) between their fastest and slowest split pace! Of the runners I looked at, only 3% had split paces that varied by less than 30 seconds. 


In [50]:
# Histogram of each similar-pace runner's gap between fastest and slowest
# split pace. `similar_pace` is the 8:40-9:20 group selected earlier, so the
# subtitle states that range (it previously said 08:45-09:15 and left the
# <sub> tag unclosed).
fig = px.histogram(
    similar_pace,
    x="max_split_diff",
    title="Difference Between Fastest and Slowest Split<br><sub>Runners with 08:40/mi - 09:20/mi pace</sub>",
    template="seaborn")

# X-axis ticks every 30 seconds from 0:00 up to (not including) 6:00.
bins = pd.Series(range(time_to_seconds("00:00:00"), time_to_seconds("00:06:00"), 30))
fig.update_layout(
    xaxis={
        "range": [
            bins.min(),
            bins.max(),
        ],
        "tickmode": "array",
        "tickvals": bins,
        "ticktext": bins.apply(seconds_to_pace),
        "title_text":"Pace Slow Down"
    },
    yaxis={
        "title_text":"Number Runners"
    }
)

# Summary statistics shown as annotations. The names now match what is
# computed: a median, and the share below 30 seconds.
median_max_split_diff = similar_pace["max_split_diff"].median()
percent_under_30 = (similar_pace["max_split_diff"] < 30).mean() * 100

fig.add_annotation(text="Median Difference: %s" % seconds_to_pace(median_max_split_diff),
                  xref="paper", yref="paper",
                  x=0.9, y=0.7, showarrow=False)

fig.add_annotation(text="Percent less than 30 seconds: %.1f" % percent_under_30 + "%",
                  xref="paper", yref="paper",
                  x=0.9, y=0.6, showarrow=False)

name = "max_split_diff_distribution"
save_figure(fig, name)

## Identify Close Beat and Close Miss Runners

In [20]:
four_hours = time_to_seconds("04:00:00")
delta = 5 * 60  # five-minute window on either side of the goal, in seconds

finish_time_s = df["finish.time"]
half_time_s = df["half.time"]

# [3:55, 4:00): finished just under four hours.
close_beat = df[finish_time_s.between(four_hours - delta, four_hours, inclusive="left")]
# [4:00, 4:05]: finished at or just over four hours.
close_miss = df[finish_time_s.between(four_hours, four_hours + delta)]
# Finished beyond 4:05 despite a 1:45-1:55 first half (not included in around_four).
on_pace = df[
    (finish_time_s > four_hours + delta)
    & half_time_s.between(time_to_seconds("01:45:00"), time_to_seconds("01:55:00"))
]
around_four = pd.concat([close_beat, close_miss])

In [21]:
# Side-by-side summary of the two groups: counts first, then one line per
# group for each statistic.
finisher_groups = (("Close Beat", close_beat), ("Close Miss", close_miss))
summary_stats = (
    ("Mean Finish Time:", "finish.time", seconds_to_time),
    ("Mean Half Time:", "half.time", seconds_to_time),
    ("Mean 2nd Half Time:", "2nd_half.time", seconds_to_time),
    ("Average Pace:", "finish.avg_pace", seconds_to_pace),
)

for group_name, group in finisher_groups:
    print(f"Number of {group_name}:", len(group))

for stat_label, column, formatter in summary_stats:
    for group_name, group in finisher_groups:
        print(f"{group_name} {stat_label}", formatter(group[column].mean()))

Number of Close Beat: 868
Number of Close Miss: 708
Close Beat Mean Finish Time: 03:57:34
Close Miss Mean Finish Time: 04:02:33
Close Beat Mean Half Time: 01:52:01
Close Miss Mean Half Time: 01:53:31
Close Beat Mean 2nd Half Time: 02:05:32
Close Miss Mean 2nd Half Time: 02:09:02
Close Beat Average Pace: 09:03
Close Miss Average Pace: 09:14


## First Half Paces, Close to Goal Finishers

In [22]:
four_hours = time_to_seconds("04:00:00")

# Human-readable group label derived from the below_four_hours flag.
group_labels = {True: "3:55-4:00 Runners", False: "4:00-4:05 Runners"}
around_four["Finish Time Group"] = around_four["below_four_hours"].map(group_labels)

# One x-axis tick per minute from 1:40 up to (not including) 2:10.
bins = pd.Series(range(time_to_seconds("01:40:00"), time_to_seconds("02:10:00"), 60))

fig = px.histogram(
    around_four,
    x=["half.time"],
    color="Finish Time Group",
    title="First Half Paces <br><sub>2021 Chicago Marathon</sub>",
    labels={"half.time": "Half Marathon Time"},
    template="seaborn",
    histnorm="percent",
    barmode="overlay",
)
fig.update_yaxes(title_text="Percent of Runners")
fig.update_layout(
    xaxis=dict(
        range=[bins.min(), bins.max()],
        tickmode="array",
        tickvals=bins,
        ticktext=bins.apply(seconds_to_time_short),
        title_text="First Half Time",
    )
)

# Dashed line: a 2:00 first half. Solid lines: each group's mean first-half time.
fig.add_vline(x=time_to_seconds("02:00:00"), line_width=3, line_dash="dash", line_color="black")
for group, line_color in ((close_beat, "blue"), (close_miss, "orange")):
    fig.add_vline(x=group["half.time"].mean(), line_width=3, line_dash="solid", line_color=line_color)

name = "first_half_distribution"
save_figure(fig, name)

In [23]:
# Mean fastest and slowest split per group, printed Beat then Miss for each column.
for column, stat_label in (("fastest_split", "Fastest Split:"), ("slowest_split", "Slowest Split:")):
    for group_name, group in (("Close Beat", close_beat), ("Close Miss", close_miss)):
        print(f"{group_name} {stat_label}", seconds_to_time(group[column].mean()))

Close Beat Fastest Split: 00:08:13
Close Miss Fastest Split: 00:08:18
Close Beat Slowest Split: 00:10:29
Close Miss Slowest Split: 00:10:54


In [24]:
# Share of 3:55-4:00 finishers whose first half was at least as slow as the
# 4:00-4:05 group's mean first half. The value is a 0-1 fraction.
miss_mean_half = close_miss["half.time"].mean()
beat_slower_share = (close_beat["half.time"] >= miss_mean_half).mean()
print("Percent 3:55-4:00 slower than 4:00-4:05 average:", beat_slower_share)

Percent 3:55-4:00 slower than 4:00-4:05 average: 0.43663594470046085


In [25]:
# Fraction of each group that reached halfway in under two hours.
two_hour_half = time_to_seconds("02:00:00")
for group_name, group in (("Close Beat", close_beat), ("Close Miss", close_miss)):
    print(f"Percent {group_name} under 2hr:", (group["half.time"] < two_hour_half).mean())

Percent Close Beat under 2hr: 0.9539170506912442
Percent Close Miss under 2hr: 0.9011299435028248


In [46]:
two_hours = 2 * 60 * 60

# Runners who reached halfway between 2:00 and 2:05 (both ends included).
half_just_over_two = df["half.time"].between(two_hours, two_hours + 5 * 60)
two = df[half_just_over_two]

# The figure is built but not rendered: the cell's displayed value is the
# final expression, the share of these runners who still finished under four hours.
fig = px.histogram(two["finish.time"])
fig.add_vline(x=four_hours, line_width=3, line_dash="dash", line_color="black")
two["below_four_hours"].mean()

0.022904737116085372

## Correlations with Finish Time

In [27]:
# Rank every column by the absolute value of its correlation with finish time.
# The avg_pace columns are excluded (per the original note, average pace is
# equivalent to time), as is the below_four_hours flag.
drop = [c for c in df.columns if "avg_pace" in c] + ["below_four_hours"]

# Keep only numeric/bool columns explicitly: pandas >= 2.0 raises when corr()
# meets string columns such as slowest_split.name, whereas older versions
# silently skipped them. select_dtypes works the same on both.
numeric_cols = df.drop(columns=drop).select_dtypes(include=["number", "bool"])

rank = (
    numeric_cols.corr()["finish.time"]
    .abs()
    .drop("finish.time")  # remove the self-correlation by label, not by position
    .sort_values(ascending=False)
)

normalized = rank / rank.max()
rank

40km.time                     0.999097
35km.time                     0.992776
30km.time                     0.981670
2nd_half.time                 0.972934
25km.time                     0.967063
20km.pace                     0.954516
half.time                     0.954095
half_to_25km.pace             0.953732
20km.time                     0.950630
30km.pace                     0.947512
15km.pace                     0.940090
15km.time                     0.935079
35km.pace                     0.925485
10km.pace                     0.924153
10km.time                     0.919567
fastest_split                 0.908353
20km_to_half.pace             0.897358
5km.time                      0.890165
5km.pace                      0.890159
40km.pace                     0.873006
slowest_split                 0.841733
40km_to_finish.pace           0.794773
1st_half.split_trend          0.569194
std_split_pace                0.560306
max_split_diff                0.510587
2nd_half.time_diff       

In [None]:
# `Series.between()` requires both bounds; the original call passed none and
# raised TypeError.
# NOTE(review): the bound is inferred from the variable name (first half slower
# than two hours) -- confirm this is the intended selection.
over_two = df[df["half.time"] > time_to_seconds("02:00:00")]

In [None]:
# Mean ratio of second-half time to first-half time for the 3:55-4:00 finishers.
close_beat["2nd_half.time"].div(close_beat["half.time"]).mean()

In [None]:
# Mean ratio of second-half time to first-half time for the 4:00-4:05 finishers.
close_miss["2nd_half.time"].div(close_miss["half.time"]).mean()

In [None]:
# Four hours in seconds, divided by 2.12 and then by 13.1, shown as a pace.
# NOTE(review): 2.12 presumably is 1 + a second-half/first-half ratio of about
# 1.12, and 13.1 the half-marathon distance in miles -- confirm.
goal_seconds = 4*60*60
first_half_seconds = goal_seconds / 2.12
seconds_to_pace(first_half_seconds / 13.1)

## Pace Difference

In [None]:
# Mean pace per split for each group, plus how much slower the close-miss
# group ran each split.
beat_split_pace = close_beat[all_pace_cols].mean()
miss_split_pace = close_miss[all_pace_cols].mean()

pace_per_split = pd.DataFrame({"Close Beat": beat_split_pace, "Close Miss": miss_split_pace})
pace_per_split["diff"] = pace_per_split["Close Miss"] - pace_per_split["Close Beat"]
pace_per_split.index = all_pace_names

fig = px.bar(
    pace_per_split,
    y="diff",
    title="Pace Difference, Close Beat and Near Miss Runners",
    labels={"diff": "Pace Difference (Seconds)", "index": "Split"},
    template="seaborn",
)

# Reference line at 11 seconds (presumably the gap between the groups'
# overall average paces, 9:14 vs 9:03 -- TODO confirm).
fig.add_hline(y=11, line_width=3, line_dash="dash", line_color="black")

save_figure(fig, "pace_difference")

In [None]:
# Label each runner by finish-time group (repeated here so the cell does not
# depend on the first-half plot having been run).
around_four["Finish Time Group"] = around_four["below_four_hours"].map({False: "4:00-4:05 Runners", True: "3:55-4:00 Runners"})

# One x-axis tick per minute. The window is shifted later than the first-half
# plot's 1:40-2:10: the groups' mean second-half times are about 2:05 and 2:09,
# so the copied range clipped most of the distribution.
bins = pd.Series(range(time_to_seconds("01:50:00"), time_to_seconds("02:25:00"), 60))

fig = px.histogram(around_four, 
                   x=["2nd_half.time"],
                   color="Finish Time Group",
                   title = "Second Half Time <br><sub>2021 Chicago Marathon</sub>", 
                   # key matches the plotted column (was "half.time")
                   labels={"2nd_half.time": "Second Half Time"},
                   template="seaborn",
                   histnorm='percent',
                   barmode="overlay"
                  )

fig.update_yaxes(title_text='Percent of Runners')

fig.update_layout(
    xaxis={
        "range": [
            bins.min(),
            bins.max(),
        ],
        "tickmode": "array",
        "tickvals": bins,
        "ticktext": bins.apply(seconds_to_time),
        "title_text":"Second Half Time"
    }
)

# Dashed line at a 2:00 second half.
fig.add_vline(x=time_to_seconds("02:00:00"), line_width=3, line_dash="dash", line_color="black")

save_figure(fig, "second_half_distribution")

In [None]:
# Mean and median second-half time per group. The labels previously said
# "Half Time" although the column printed is 2nd_half.time.
print("3:55-4:00 Mean 2nd Half Time:", seconds_to_time(close_beat["2nd_half.time"].mean()))
print("3:55-4:00 Median 2nd Half Time:", seconds_to_time(close_beat["2nd_half.time"].median()))

print("4:00-4:05 Mean 2nd Half Time:", seconds_to_time(close_miss["2nd_half.time"].mean()))
print("4:00-4:05 Median 2nd Half Time:", seconds_to_time(close_miss["2nd_half.time"].median()))

In [None]:
# Mean second-half time difference per group. The second label previously
# read "4:05-4:05"; close_miss is the 4:00-4:05 group.
print("3:55-4:00 2nd Half Diff", seconds_to_time(close_beat["2nd_half.time_diff"].mean()))
print("4:00-4:05 2nd Half Diff", seconds_to_time(close_miss["2nd_half.time_diff"].mean()))

In [None]:
# Share of 3:55-4:00 finishers whose second half was at least as slow as the
# 4:00-4:05 group's mean second half. The value is a 0-1 fraction.
miss_mean_second_half = close_miss["2nd_half.time"].mean()
beat_slower_second_half = (close_beat["2nd_half.time"] >= miss_mean_second_half).mean()
print("Percent 3:55-4:00 slower than 4:00-4:05 average 2nd half:", beat_slower_second_half)

In [None]:
# Mean of the 2nd_half_faster flag over all runners.
faster_second_half_share = df["2nd_half_faster"].mean()
print("Percent of all who ran 2nd half faster:", faster_second_half_share)

In [None]:
# Mean percent change, separately for runners who slowed down (positive diff)
# and runners who sped up (negative diff) in the second half.
half_time_diff = df["2nd_half.time_diff"]
half_pct_change = df["2nd_half.time_diff_percent"]
print("Average Slow down percent", half_pct_change[half_time_diff > 0].mean())
print("Average Speed up percent", half_pct_change[half_time_diff < 0].mean())

In [None]:
# Distribution of the percent change between first- and second-half times.
fig = px.histogram(
    df,
    x="2nd_half.time_diff_percent",
    title="Percent Change, First and Second Half Times<br><sub>2021 Chicago Marathon</sub>",
    labels={"2nd_half.time_diff_percent": "Percent Change<br><sub>(Positive means slowed down)</sub>"},
    template="seaborn",
)

# Restrict the view to -40% .. +60%.
fig.update_layout(xaxis=dict(range=[-40, 60]))

name = "2nd_half_time_diff_percent"
save_figure(fig, name)

In [None]:

# Scatter of finish time against percent slow-down with an OLS trendline,
# limited to runners whose first half was under three hours.
sub_three_hour_half = df[df["half.time"] < time_to_seconds("03:00:00")]

fig = px.scatter(
    sub_three_hour_half,
    x="finish.time",
    y="2nd_half.time_diff_percent",
    trendline="ols",
    opacity=0.2,
    trendline_color_override="#222",
    title="Finish Time vs. Percent Slow Down<br><sub>2021 Chicago Marathon</sub>",
    template="seaborn",
)

# X-axis ticks every 30 minutes from 2:00 up to (not including) 7:30.
bins = pd.Series(range(time_to_seconds("02:00:00"), time_to_seconds("07:30:00"), 60*30))
fig.update_layout(
    xaxis=dict(
        range=[bins.min(), bins.max()],
        tickmode="array",
        tickvals=bins,
        ticktext=bins.apply(seconds_to_time),
        title_text="Finish Time",
    ),
    yaxis=dict(
        range=[-20, 60],
        title_text="Percent Slow Down",
    ),
)

name = "finish_time_vs_percent_slowdown"
save_figure(fig, name)

In [None]:
# First-half time against 35 km time for the runners around four hours,
# coloured by whether they finished under four hours, with one overall OLS
# trendline. The title previously repeated the "Finish Time vs. Percent Slow
# Down" heading from the preceding plot.
fig = px.scatter(around_four, 
                 x="half.time",
                 y="35km.time",
                 color="below_four_hours",
                 trendline="ols",
                 trendline_scope="overall",
                 opacity=0.2,
                 trendline_color_override="#222",
                 title = "Half Time vs. 35km Time<br><sub>2021 Chicago Marathon</sub>", 
                 template="seaborn")
fig

In [None]:
# `below_four` is not defined until a later cell, so this failed on a fresh
# kernel. Use the 3:55-4:00 group defined above instead.
# NOTE(review): substitution inferred from the later plot that labels
# close_beat "Below Four Hours" -- confirm the intended group.
# Note that describe()'s `count` row is also passed through seconds_to_time.
close_beat["2nd_half.time"].describe().apply(seconds_to_time)

In [None]:
# `above_four` is never defined in this notebook, so this raised NameError.
# Use the 4:00-4:05 group defined above instead.
# NOTE(review): substitution inferred from the later plot that labels
# close_miss "Above Four Hours" -- confirm the intended group.
# Note that describe()'s `count` row is also passed through seconds_to_time.
close_miss["half.time"].describe().apply(seconds_to_time)

In [None]:

# Runners who finished within ten minutes either side of four hours.
ten_minutes = 10*60
finish_similar = df[df["finish.time"].between(four_hours - ten_minutes, four_hours + ten_minutes)]

# Displayed as a tuple: count and share of the whole field.
finish_similar_count = finish_similar.shape[0]
finish_similar_pct = "%.2f" % (finish_similar_count / df.shape[0] * 100) + "%"
"Number of similar runners:", finish_similar_count, "or", finish_similar_pct

In [None]:
# Mean of the split_trend column for the 3:55-4:00 finishers (the cell's displayed value).
close_beat["split_trend"].mean()

# Disabled experiment: split trend against max split difference on a 10% sample.
# px.scatter(df.sample(frac=.1), x="split_trend", y="max_split_diff", opacity=.2, color="below_four_hours")

In [None]:
# Runners who finished in the last minute before four hours (both ends included).
below_four = df[df["finish.time"].between(four_hours - 1*60, four_hours)]

# Overlay the two pace distributions on one set of axes. `plt` was never
# imported in this notebook, so the axes come from the first pandas plot call
# rather than from plt.subplots().
ax = below_four["half.avg_pace"].plot.hist(alpha=.5, label="half.avg_pace")
below_four["2nd_half.avg_pace"].plot.hist(ax=ax, alpha=.5, label="2nd_half.avg_pace")
ax.xaxis.set_major_formatter(lambda a, b: seconds_to_time(a))
ax.legend()  # identify which histogram is which
fig = ax.get_figure()

print(seconds_to_time(below_four["half.avg_pace"].mean()))
print(seconds_to_time(below_four["2nd_half.avg_pace"].mean()))

In [None]:
# Mean first-half time of runners who finished between 3:50 and 4:00 (both ends included).
finished_350_to_400 = df["finish.time"].between(time_to_seconds("03:50:00"), time_to_seconds("04:00:00"))
seconds_to_time(df.loc[finished_350_to_400, "half.time"].mean())

In [None]:
# Runners whose first half was between 1:45:00 and 1:52:01 (both ends included).
# NOTE(review): despite the name, no finish-time condition is applied here --
# confirm whether an "over four hours" filter was intended.
on_pace_half = df["half.time"].between(time_to_seconds("01:45:00"), time_to_seconds("01:52:01"))
on_pace_but_above_four = df[on_pace_half]

print(seconds_to_time(on_pace_but_above_four["finish.time"].mean()))
print(on_pace_but_above_four.shape[0])

In [None]:
# Finish-time histogram for the on-pace group, with x-axis ticks every 30
# minutes from 2:00 up to (not including) 7:30.
bins = pd.Series(range(time_to_seconds("02:00:00"), time_to_seconds("07:30:00"), 60*30))

fig = px.histogram(on_pace_but_above_four, x="finish.time")
fig.update_layout(
    xaxis=dict(
        range=[bins.min(), bins.max()],
        tickmode="array",
        tickvals=bins,
        ticktext=bins.apply(seconds_to_time),
        title_text="Finish Time",
    )
)

fig

In [None]:
# Combined view of the per-split pace table: bars show the pace gap between
# the two groups (primary y-axis); lines show each group's mean pace
# (secondary y-axis). `go` and `make_subplots` are imported in the notebook's
# import cell.
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Bar(x=pace_per_split.index, y=pace_per_split["diff"], name="Difference Between Groups"),
    secondary_y=False,
)
fig.add_trace(
    go.Scatter(x=pace_per_split.index, y=pace_per_split["Close Miss"], name="Above Four Hours"),
    secondary_y=True,
)
fig.add_trace(
    go.Scatter(x=pace_per_split.index, y=pace_per_split["Close Beat"], name="Below Four Hours"),
    secondary_y=True,
)

fig.update_layout(
    title_text="Difference in Pace and Average Pace"
)
fig.update_xaxes(title_text="Split")
fig.update_yaxes(title_text="Pace Difference (seconds)", secondary_y=False)
fig.update_yaxes(title_text="Average Pace", secondary_y=True)

# Reverse the trace order before rendering. The original did this after
# fig.show(), so the displayed figure never reflected it.
fig.data = fig.data[::-1]

fig.show()

In [None]:
# Mean pace per split for the two groups around four hours.
# Fixes relative to the original cell:
#   * `below_four` / `above_four` were not defined at this point; the
#     close_beat / close_miss groups are used instead.
#     NOTE(review): substitution inferred from the plot that labels those
#     groups "Below Four Hours" / "Above Four Hours" -- confirm.
#   * the hand-written column list named columns that do not appear in the
#     frame ('half.pace', '25km.pace', 'finish.pace'); the shared all_pace_cols
#     list is used instead.
#   * Diff is computed as Above - Below, the same direction as the
#     pace_per_split "diff" column.
#   * DataFrame.applymap is deprecated in recent pandas; a per-column map is
#     equivalent on all versions.
pace_columns = list(all_pace_cols)

pace_times = pd.DataFrame({
    "Below Four": close_beat[pace_columns].mean(),
    "Above Four": close_miss[pace_columns].mean()
})

pace_times["Diff"] = pace_times["Above Four"] - pace_times["Below Four"]
pace_times.apply(lambda column: column.map(seconds_to_time))

In [None]:
# Max split difference against overall average pace for the similar-pace
# runners, with an OLS trendline.
px.scatter(
    similar_pace,
    x="finish.avg_pace",
    y="max_split_diff",
    trendline="ols",
    trendline_color_override="black",
)

In [None]:
# Upper-tail quantiles of first-half time among the finish-similar runners.
upper_quantiles = [.8, .9, .95, .99]
finish_similar["half.time"].quantile(upper_quantiles).apply(seconds_to_time)

I've also heard about the concept of "hitting the wall", which is a loss of energy at the end of a race caused by the depletion of glycogen in the muscles. There could also be many other things that go wrong for people, like an injury or bad pacing. To include these runners, I added any runners who ran the first half of the race in less than 2:00 but finished over 4:10. 

In [None]:
# Runners with a first half under 2:00 who nevertheless finished after 4:10.
fast_first_half = df["half.time"] < time_to_seconds("02:00:00")
slow_finish = df["finish.time"] > time_to_seconds("04:10:00")
pace_similar = df[fast_first_half & slow_finish]

pace_similar_count = pace_similar.shape[0]
"Number of pace similar runners:", pace_similar_count, "or", "%.2f" % (pace_similar_count / df.shape[0] * 100) + "%"

In [None]:
# Stack the finish-similar and pace-similar groups into one frame.
all_similar = pd.concat([finish_similar, pace_similar])

all_similar_count = all_similar.shape[0]
"Number of all similar runners:", all_similar_count, "or", "%.2f" % (all_similar_count / df.shape[0] * 100) + "%"

In [None]:
# Scatter of average pace at halfway against how much the average pace had
# changed by the finish. The original selected "half_pace" / "finish_pace";
# elsewhere this notebook reads these columns as "half.avg_pace" and
# "finish.avg_pace".
df_half_finish = all_similar[["half.avg_pace", "finish.avg_pace"]].copy()
df_half_finish["diff"] = df_half_finish["finish.avg_pace"] - df_half_finish["half.avg_pace"]
df_half_finish.plot.scatter(x="half.avg_pace", y="diff", figsize=(10,6))

In [None]:
# all_similar.groupby("age_class")["bib"].count().plot.bar()

Now I have a list of ~4000 runners that I would consider similar to me. 

In [None]:
# Share of the similar runners on each side of four hours. The flag column is
# named "below_four_hours" everywhere else in this notebook; the original read
# "under_four_hours" here.
all_similar["below_four_hours"].value_counts(normalize=True)

In [None]:
# Display the combined frame of finish-similar and pace-similar runners.
all_similar

## What's the best strategy for running a 4-hour marathon? 

My intuition and my experience running races in high school cross country tell me the primary strategic consideration is what pace to run. 

In [None]:
# Display the prepared runner frame again for reference.
df

In [None]:
# For each 5-second pace bucket between 8:00 and 9:30, and for each checkpoint's
# average-pace column, take the runners whose value falls in that bucket and
# record their mean *final* average pace.
step = 5  # bucket width in seconds
pace_columns = ['5km.avg_pace', '10km.avg_pace', '15km.avg_pace', '20km.avg_pace',"half.avg_pace", '25km.avg_pace', '30km.avg_pace', '35km.avg_pace', '40km.avg_pace', "finish.avg_pace"]

dfx = df

bins = pd.Series(range(int(8*60), int(9.5*60), step))


def _mean_finish_pace(column, bucket_start):
    """Mean final average pace of runners whose `column` pace is in [bucket_start, bucket_start + step)."""
    # Half-open buckets: with both ends included, a runner exactly on a boundary
    # was counted in two adjacent buckets.
    in_bucket = dfx[column].between(bucket_start, bucket_start + step, inclusive="left")
    return dfx.loc[in_bucket, "finish.avg_pace"].mean()


# NOTE: this rebinds `data`, which held the raw CSV frame at the top of the
# notebook; the name is kept because the following cell displays it.
data = pd.DataFrame(
    [[pace] + [_mean_finish_pace(c, pace) for c in pace_columns] for pace in bins],
    columns=["pace"] + pace_columns,
)

fig = px.line(data, x="pace", y=pace_columns)
# Dashed reference line at a 9:09 pace.
fig.add_hline(y=time_to_seconds("00:09:09"), line_width=3, line_dash="dash", line_color="grey")

fig.update_layout(
    xaxis={
        "range": [
            bins.min(),
            bins.max(),
        ],
        "tickmode": "array",
        "tickvals": bins,
        "ticktext": bins.apply(seconds_to_pace),
        "tickangle":35,
        "title_text": "Average pace at checkpoint",
    },
    yaxis={
        # NOTE(review): the lower bound is 1.1 x the smallest bucket (8:48), not
        # the smallest bucket itself -- confirm this zoom is intentional.
        "range": [
            bins.min()*1.1,
            bins.max(),
        ],
        "tickmode": "array",
        "tickvals": bins,
        "ticktext": bins.apply(seconds_to_pace),
        "tickangle":35,
        "title_text": "Mean final average pace",
    }
)
fig

In [None]:
# Display the per-bucket table built in the preceding cell (`data` was rebound
# there; it is no longer the raw CSV frame loaded at the top).
data

based on this, 

In [None]:
# todo look for people who were on pace, but failed
# todo look for people who weren't on pace, but succeeded
# did time of day when someone start matter?
# which was easiest / hardest split?
# do people who run even splits do better?
# open question = how could I generalize this for other runners / marathons?


# visualization
# x time, y total distance, plot everyone