In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for high-resolution ("retina") figure output.
%config InlineBackend.figure_format='retina'

In [None]:
import re
from datetime import datetime, timedelta
from loguru import logger
import matplotlib.pyplot as plt
import pandas as pd
import polars as pl
import seaborn as sns
import matplotlib.dates as mdates

from IPython.display import HTML, display

In [None]:
# Apply seaborn's "whitegrid" theme to every figure drawn below.
sns.set_theme(style="whitegrid")

## Data preparation

In [None]:
# Read the shoe catalogue, keeping only the identifier and the descriptive columns.
shoes_data = pl.read_csv("./data/run-tracker/Sheet 1-Shoes.csv").select(
    ["UUID", "brand", "model"]
)

# The order of the UUIDs in the file defines the Enum categories, so that any
# later sort on the shoe column follows the file order.
shoes_order = shoes_data.get_column("UUID").to_list()
shoes_data = shoes_data.with_columns(pl.col("UUID").cast(pl.Enum(shoes_order)))
shoes_data.head()

In [None]:
def _parse_duration(duration_str: str) -> timedelta:
    values = [x.strip() for x in duration_str.split(" ")]
    assert len(values) == 3, f"Unexpected duration format: {duration_str}"
    return timedelta(
        hours=int(values[0].split("h")[0]),
        minutes=int(values[1].split("m")[0]),
        seconds=int(values[2].split("s")[0]),
    )


def _parse_run_datetime(raw: str) -> datetime:
    """Parse a ``Datetime`` cell in ``month/day/year hour:minute AM/PM`` form.

    Narrow no-break spaces (U+202F) are replaced by plain spaces before
    parsing. The hour is read with ``%I`` (12-hour clock): ``strptime`` only
    applies the ``%p`` AM/PM marker when the hour is parsed with ``%I``; with
    ``%H`` the marker is ignored, so PM runs are placed 12 hours too early and
    12:xx AM is kept as 12:xx.

    Raises:
        ValueError: If the text does not match the expected format (this
            includes hours outside 1-12, which cannot carry an AM/PM marker).
    """
    return datetime.strptime(raw.replace("\u202f", " "), "%m/%d/%y %I:%M %p")


# Build the per-run table: drop rows without a distance, attach shoe details,
# parse the textual date/duration columns, then derive minutes, paces and the
# commuting flag.
runs_data = (
    pl.read_csv("./data/run-tracker/Sheet 1-Runs.csv")
    .filter(pl.col("Distance").is_not_null())
    # Same Enum as shoes_data["UUID"] so the join keys share a dtype.
    .with_columns(pl.col("Shoe").cast(pl.Enum(shoes_order)))
    .join(shoes_data, left_on="Shoe", right_on="UUID", how="left")
    .with_columns(
        pl.col("Notes").fill_null(""),
        pl.col("Datetime").map_elements(
            _parse_run_datetime, return_dtype=pl.Datetime
        ),
        pl.col("Total duration").map_elements(
            _parse_duration, return_dtype=pl.Duration
        ),
        pl.col("Time running").map_elements(_parse_duration, return_dtype=pl.Duration),
    )
    .with_columns(
        # Durations are built from whole hours/minutes/seconds, so the native
        # whole-second count divided by 60 equals timedelta.total_seconds() / 60
        # without a per-row Python call.
        (pl.col("Time running").dt.total_seconds() / 60).alias(
            "time_running_minutes"
        ),
        (pl.col("Total duration").dt.total_seconds() / 60).alias(
            "total_duration_minutes"
        ),
    )
    .with_columns(
        # Pace = minutes per unit of "Distance".
        (pl.col("total_duration_minutes") / pl.col("Distance")).alias(
            "total_duration_pace"
        ),
        (pl.col("time_running_minutes") / pl.col("Distance")).alias(
            "time_running_pace"
        ),
    )
    .with_columns(
        # A run counts as a commute when its note mentions "commuting"
        # (case-insensitive).
        pl.col("Notes")
        .str.to_lowercase()
        .str.contains("commuting")
        .alias("is_commuting_run")
    )
)
runs_data.head()

In [None]:
# Sanity check: no run may be dated later than the moment this cell executes.
TODAY = datetime.now()
first_future_datetime = next(
    (run_dt for run_dt in runs_data["Datetime"] if run_dt > TODAY), None
)
if first_future_datetime is not None:
    logger.error(f"Datetime in the future: {first_future_datetime}")
    raise ValueError("Datetime in the future found in data")

In [None]:
# The start of marathon training is taken from the first run whose note reads
# exactly "start marathon training" (case-insensitive).
_marathon_start_rows = runs_data.filter(
    pl.col("Notes").str.to_lowercase() == "start marathon training"
)
# Fail with a descriptive message rather than a bare out-of-bounds error when
# the marker note is missing from the data.
if _marathon_start_rows.is_empty():
    logger.error('No run with the note "start marathon training" found')
    raise ValueError('No run with the note "start marathon training" found in data')
MARATHON_TRAINING_STARTDATE = _marathon_start_rows["Datetime"][0]
logger.info(f"Starting marathon training: {MARATHON_TRAINING_STARTDATE}")

In [None]:
# Flag every run dated on or after the marathon-training start date.
runs_data = runs_data.with_columns(
    is_marathon_training=pl.col("Datetime").ge(MARATHON_TRAINING_STARTDATE)
)

## Analysis

In [None]:
# Date tick formatter that renders dates as zero-padded month/day.
myFmt = mdates.DateFormatter("%m/%d")

In [None]:
def display_df(df: pl.DataFrame | pd.DataFrame) -> None:
    """Render a data frame as an HTML table in the notebook output.

    Polars frames are converted to pandas first; pandas frames are rendered
    as they are.
    """
    frame = df.to_pandas() if isinstance(df, pl.DataFrame) else df
    table_html = frame.to_html()
    display(HTML(table_html))

In [None]:
# Total running time and distance per shoe, ordered by the shoe Enum.
plot_data = (
    runs_data.group_by("Shoe", "brand", "model")
    .agg(pl.col("total_duration_minutes").sum(), pl.col("Distance").sum())
    .sort("Shoe")
    .with_columns(total_duration_hours=pl.col("total_duration_minutes") / 60)
    .to_pandas()
)
# Drop categories with no rows so the bar charts only list shoes that were used.
plot_data["Shoe"] = plot_data["Shoe"].cat.remove_unused_categories()

display_df(plot_data)

# One horizontal bar chart per metric, sharing the shoe axis.
fig, axes = plt.subplots(ncols=2, figsize=(6, 2), squeeze=True, sharey=True)
for ax, (metric, xlabel) in zip(
    axes,
    (
        ("total_duration_hours", "total duration (hours)"),
        ("Distance", "total distance (miles)"),
    ),
):
    sns.barplot(data=plot_data, x=metric, y="Shoe", color="gray", linewidth=0, ax=ax)
    ax.set_xlabel(xlabel)
for ax in axes:
    sns.despine(ax=ax, left=False, bottom=True)
fig.tight_layout()
plt.show()

In [None]:
# Total hours and distance accumulated over the runs flagged as marathon training.
(
    runs_data.filter(pl.col("is_marathon_training"))
    .with_columns(total_duration_hours=pl.col("total_duration_minutes") / 60)
    .group_by("is_marathon_training")
    .agg(
        pl.col("total_duration_hours").sum().alias("hours run"),
        pl.col("Distance").sum().alias("distance"),
    )
    .drop("is_marathon_training")
)

In [None]:
# Distance (top) and average pace (bottom) over time for marathon-training runs.
plot_data = runs_data.filter(pl.col("is_marathon_training")).to_pandas()

fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(6, 5), squeeze=True, sharex=True)

# Each panel overlays a line with point markers for the same series.
for ax, series in zip(axes, ("Distance", "total_duration_pace")):
    sns.lineplot(data=plot_data, x="Datetime", y=series, ax=ax)
    sns.scatterplot(data=plot_data, x="Datetime", y=series, ax=ax, s=15, linewidth=0)

# Top panel: anchor the distance axis at zero.
axes[0].set_ylim(0, None)
axes[0].set_ylabel("distance")

# Bottom panel: carries the shared date axis.
axes[1].set_ylabel("avg. pace (min. / mile)")
axes[1].xaxis.set_major_formatter(myFmt)
axes[1].set_xlabel("date")

for ax in axes:
    sns.despine(ax=ax, left=True, bottom=True)
fig.tight_layout()
plt.show()

In [None]:
# Average pace against distance for marathon-training runs, coloured by whether
# the run was a commute.
plot_data = runs_data.filter(pl.col("is_marathon_training")).to_pandas()

fig, ax = plt.subplots(figsize=(5, 3))
sns.scatterplot(
    data=plot_data,
    x="Distance",
    y="total_duration_pace",
    hue="is_commuting_run",
    palette={True: "crimson", False: "darkslategray"},
    s=30,
    linewidth=1,
    alpha=0.8,
    ax=ax,
)
sns.despine(ax=ax, left=True, bottom=True)
# Anchor the legend's upper-left corner at the axes' top-right corner, which
# places it outside the plotting area.
sns.move_legend(
    ax,
    loc="upper left",
    bbox_to_anchor=(1, 1),
    ncols=1,
    title="commuting",
    edgecolor="white",
)
ax.set(ylabel="avg. pace (min. / mile)", xlabel="distance (miles)")
fig.tight_layout()
plt.show()

---

## Session information

In [None]:
# Load the watermark extension, then print a session report.
%load_ext watermark
# NOTE(review): the flags select which details are reported (presumably date,
# interpreter/package versions, git branch, host and machine info) - confirm
# against the watermark documentation.
%watermark -d -u -v -iv -b -h -m