In [None]:
import matplotlib.pyplot as plt
import transform
import request
import datetime
import seaborn as sns

In [None]:
# Matplotlib one-letter colour codes; the multi-year plots pick one per year
# offset (index 0 is the current year).
colors = [
    "c",  # cyan
    "g",  # green
    "r",  # red
    "y",  # yellow
    "m",  # magenta
    "b",  # blue
]

In [None]:
# Years covered by the analysis; `current_year` is the year the single-year
# cells and plots report on.
years = [2018, 2019, 2020, 2021]
current_year = 2021

# Per-year query range as ISO-8601 strings with a trailing "Z", handed to
# request.get_filtered_events.
start_dates = {year: datetime.datetime(year, 1, 1).isoformat() + 'Z' for year in years}
# End at the last second of 31 December. Midnight at the *start* of that day
# would leave every event later on the 31st outside the range.
# NOTE(review): assumes get_filtered_events compares full timestamps — confirm.
end_dates = {year: datetime.datetime(year, 12, 31, 23, 59, 59).isoformat() + 'Z' for year in years}

In [None]:
# One frame per analysed year, keyed by year. Events are fetched with the
# "summary" filter (presumably the one selecting running activities — confirm
# against request.get_filtered_events) and converted by transform.get_dataframe.
dfs = {}
for year in years:
    events = request.get_filtered_events(start_dates[year], end_dates[year], "summary")
    dfs[year] = transform.get_dataframe(events)

In [None]:
# Show the current year's frame as the cell output for a quick visual check.
dfs[current_year]

In [None]:
def run_scatterplot(ax):
    """Scatter every run of ``current_year`` by date and distance.

    Args:
        ax: Matplotlib axes that receives the scatter plot and the title.
    """
    current_runs = dfs[current_year]
    current_runs.plot.scatter(x="date", y="distance", ax=ax)
    ax.set_title("Distances of individual runs")

In [None]:
def run_histogram_with_percentiles(ax):
    """Draw a histogram of ``current_year`` run distances with percentile markers.

    A dotted yellow vertical line and a text label are added at the 5 %, 25 %,
    50 %, 75 % and 95 % quantiles of the "distance" column.

    Args:
        ax: Matplotlib axes that receives the histogram, markers and labels.
    """
    runs = dfs[current_year]
    runs.hist("distance", ax=ax)

    for level in (.05, .25, .5, .75, .95):
        cutoff = runs["distance"].quantile(level)
        ax.axvline(cutoff, linestyle=":", color="yellow")
        # Label sits slightly to the right of its line, at y = 1.
        label = f"{int(level*100)}%"
        ax.text(cutoff + .1, 1, label, size=12, alpha=0.8, color="yellow")

    ax.grid(False)
    ax.set_xlabel("distance [km]")
    ax.set_ylabel("number of occurrences")
    ax.set_title("Histogram of running distances with selected percentiles.")

In [None]:
def monthly_scatterplot(ax, n_years_back=2):
    """Scatter the summed distance per month for the current and earlier years.

    Args:
        ax: Matplotlib axes shared by all years.
        n_years_back: Number of years before ``current_year`` to include;
            each year gets the colour ``colors[offset]`` and a legend entry.
    """
    for offset in range(n_years_back + 1):
        year = current_year - offset
        monthly_totals = dfs[year].groupby(["month"], as_index=False)["distance"].sum()
        monthly_totals.plot(
            x="month",
            y="distance",
            kind="scatter",
            ax=ax,
            color=colors[offset],
            label=year,
        )

    ax.legend()
    ax.set_title("Monthly distances across years")
    ax.set_ylabel("distance [km]")
    ax.set_xlabel("month")

In [None]:
def monthly_histogram(ax, n_years_back=2):
    """Histogram (5 bins) of monthly distance totals, one series per year.

    Args:
        ax: Matplotlib axes to draw on.
        n_years_back: Number of years before ``current_year`` to include.
    """
    selected_years = [current_year - offset for offset in range(n_years_back + 1)]

    # One series of monthly totals per selected year, in the same order as
    # the legend labels.
    monthly_totals = []
    for year in selected_years:
        per_month = dfs[year].groupby(["month"], as_index=False)["distance"].sum()
        monthly_totals.append(per_month["distance"])

    ax.hist(
        monthly_totals,
        5,
        density=False,
        label=selected_years,
        color=colors[:n_years_back + 1],
    )
    ax.legend()
    ax.set_title("Histogram of monthly distances across years. #bins=5.")
    ax.set_xlabel("distance [km]")

In [None]:
def monthly_avg_run_distance(ax, n_years_back=2):
    """Scatter the mean distance of a single run per month, one series per year.

    Args:
        ax: Matplotlib axes shared by all years.
        n_years_back: Number of years before ``current_year`` to include;
            each year gets the colour ``colors[offset]`` and a legend entry.
    """
    for offset in range(n_years_back + 1):
        year = current_year - offset
        monthly_means = dfs[year].groupby(["month"], as_index=False)["distance"].mean()
        monthly_means.plot(
            x="month",
            y="distance",
            kind="scatter",
            ax=ax,
            color=colors[offset],
            label=year,
        )

    ax.legend()
    ax.set_title("Average individual run distance per month.")
    ax.set_ylabel("distance [km]")
    ax.set_xlabel("month")

In [None]:
def longest_duration_between_two_runs(year):
    """Find the longest pause between two chronologically consecutive runs.

    Args:
        year: Key into ``dfs`` selecting the frame whose "date" column is scanned.

    Returns:
        Tuple ``(max_diff, max_diff_t)``: the longest gap between two
        neighbouring run dates and the date of the run that opens that gap.
        With a single run the gap is zero and the date is that run's date.

    Raises:
        ValueError: If the frame for ``year`` contains no runs.
    """
    # sort_values() returns a new object, so its result has to be kept;
    # otherwise gaps are measured in storage order. Converting to a list also
    # makes "first element" positional instead of a lookup of index label 0.
    run_dates = dfs[year]["date"].sort_values().tolist()
    if not run_dates:
        raise ValueError(f"No runs recorded for {year}.")

    max_diff = run_dates[0] - run_dates[0]  # zero gap of the matching type
    max_diff_t = run_dates[0]
    for earlier, later in zip(run_dates, run_dates[1:]):
        gap = later - earlier
        # Strict comparison keeps the first of several equally long gaps.
        if gap > max_diff:
            max_diff = gap
            max_diff_t = earlier

    print(f"The longest diference between two runs in {year} was: {max_diff}")
    print(f"The gap started on: {max_diff_t}")
    return max_diff, max_diff_t

In [None]:
def n_weeks_without_running(year):
    """Count the weeks of ``year`` in which no run was recorded.

    The count is 52 minus the number of distinct values in the "week" column.
    NOTE(review): a year whose week numbering reaches 53 is not accounted
    for — confirm how transform fills the "week" column.

    Args:
        year: Key into ``dfs``.

    Returns:
        Number of weeks without a run.
    """
    weeks_with_runs = dfs[year]["week"].nunique()
    n_weeks = 52 - weeks_with_runs
    # The message previously omitted the noun ("There have been 5 without running").
    print(f"There have been {n_weeks} weeks without running in {year}")
    return n_weeks

In [None]:
def max_distance_consecutive_days(ax, year, window_range=20):
    """Plot and return the largest total distance run within k consecutive days.

    For every window length k from 1 to ``window_range`` the maximum sum of
    daily distances over any k consecutive calendar days of ``year`` is
    computed and scattered against k.

    Args:
        ax: Matplotlib axes that receives the scatter plot, title and labels.
        year: Key into ``dfs``; also the calendar year whose day count is used.
        window_range: Largest window length, in days, to evaluate.

    Returns:
        List of maximum sums; element ``i`` belongs to a window of ``i + 1`` days.
    """
    runs = dfs[year]

    # Add up all runs that fall on the same day so that every list position
    # below stands for exactly one calendar day. Keeping same-day runs as
    # separate entries would make a "k-day" window cover fewer than k days.
    distance_by_day = {}
    for day_number, distance in zip(runs["date"].dt.dayofyear.tolist(), runs["distance"].tolist()):
        distance_by_day[day_number] = distance_by_day.get(day_number, 0) + distance

    # Day-of-year of 31 December is the number of days in the year (365 or 366).
    n_days = datetime.date(year, 12, 31).timetuple().tm_yday
    daily_distances = [distance_by_day.get(day_number, 0) for day_number in range(1, n_days + 1)]

    def get_max(numbers, window_size):
        """Return ``(best_sum, start_index)`` of the best window of ``window_size`` items."""
        current_value = sum(numbers[:window_size])
        max_value = current_value
        max_left_pointer = 0

        # Slide the window one position at a time: drop the element that
        # leaves on the left, add the one that enters on the right.
        for current_left_pointer in range(1, len(numbers) - window_size + 1):
            current_value -= numbers[current_left_pointer - 1]
            current_value += numbers[current_left_pointer + window_size - 1]
            if current_value > max_value:
                max_value = current_value
                max_left_pointer = current_left_pointer

        return max_value, max_left_pointer

    window_sizes = list(range(1, window_range + 1))
    max_sums = [get_max(daily_distances, window_size)[0] for window_size in window_sizes]

    ax.scatter(window_sizes, max_sums)
    ax.set_title("Max distance run in k consecutive days.")
    ax.set_ylabel("Distance [km]")
    ax.set_xlabel("Number of days")

    return max_sums

In [None]:
def running_fraction_among_exercise(year):
    """Print and return the share of a year's activities that were runs.

    The denominator is the number of rows obtained from the events fetched
    with the "color" filter (presumably all exercise activities — confirm
    against request.get_filtered_events); the numerator is ``len(dfs[year])``.

    Args:
        year: Key into ``dfs``, ``start_dates`` and ``end_dates``.

    Returns:
        The fraction as a float.
    """
    all_events = request.get_filtered_events(start_dates[year], end_dates[year], "color")
    df_all = transform.get_dataframe(all_events)

    n_runs = len(dfs[year])
    n_activities = len(df_all)
    running_fraction = n_runs / n_activities

    print(f"In {year}, {running_fraction} of all activities were running.")
    return running_fraction

In [None]:
def histogram_runs_per_hour(ax, year, n_bins=10):
    """Histogram of the "hour" column of one year's runs.

    Args:
        ax: Matplotlib axes to draw on.
        year: Key into ``dfs``.
        n_bins: Number of histogram bins; also quoted in the title.
    """
    run_hours = dfs[year]["hour"]
    run_hours.plot.hist(bins=n_bins, ax=ax)

    ax.set_xlabel("hour of the day")
    ax.set_ylabel("number of occurrences")
    ax.set_title(f"Histogram of runs by hours of the day. #bins={n_bins}")

In [None]:
def distribution_distance_per_hour(ax, year):
    """Box plot of run distance for every value of the "hour" column.

    Args:
        ax: Matplotlib axes to draw on.
        year: Key into ``dfs``.
    """
    # Pass the caller's axes explicitly. Without ``ax=`` seaborn draws on
    # whatever axes is current and the ``ax`` argument is silently ignored.
    sns.boxplot(x="hour", y="distance", data=dfs[year], ax=ax)
    ax.set_ylabel("distance [km]")
    ax.set_xlabel("hour of the day")
    ax.set_title("Empirical distributions of distances per hour. ")

# Analysis

## Yearly performance

Total distance

In [None]:
# Sum of the "distance" column for each analysed year.
for year in years:
    total_distance = dfs[year]["distance"].sum()
    print(f"{year}: {total_distance}")

Average distance

In [None]:
# Mean distance of a single run for each analysed year.
for year in years:
    average_distance = dfs[year]["distance"].mean()
    print(f"{year}: {average_distance}")

Longest runs

In [None]:
# Ten largest distances of the current year, longest first.
runs_by_distance = dfs[current_year].sort_values(by=["distance"], ascending=False)
runs_by_distance["distance"].iloc[:10]

Longest duration between two runs

In [None]:
# Prints the longest gap and its start; the returned (gap, start) tuple is
# also shown as the cell output.
longest_duration_between_two_runs(current_year)

How many weeks without running?

In [None]:
# Prints the count; the returned number is also shown as the cell output.
n_weeks_without_running(current_year)

Max distance in consecutive days

In [None]:
# Evaluate windows of 1 to 25 consecutive days for the current year.
fig, ax = plt.subplots()
max_distance_consecutive_days(ax, current_year, window_range=25)

Fraction of running among activities

In [None]:
# Current year and the two years before it.
for offset in range(3):
    running_fraction_among_exercise(current_year - offset)

How many runs?

In [None]:
# Number of recorded runs for the current year and the two years before it.
for offset in range(3):
    year = current_year - offset
    n_runs = len(dfs[year])
    print(f"{year} had {n_runs} running activities.")

## Daily performance

In [None]:
# Fresh figure for the per-run distance scatter plot of the current year.
fig, ax = plt.subplots()
run_scatterplot(ax)

In [None]:
# Fresh figure for the distance histogram with percentile markers.
fig, ax = plt.subplots()
run_histogram_with_percentiles(ax)

## Monthly performance

In [None]:
# Summed distance per value of the "month" column for the current year.
dfs[current_year].groupby("month")["distance"].sum()

In [None]:
# Monthly totals for the current year and the two years before it.
fig, ax = plt.subplots()
monthly_scatterplot(ax, n_years_back=2)

In [None]:
# Distribution of monthly totals for the current year and the two years before it.
fig, ax = plt.subplots()
monthly_histogram(ax, n_years_back=2)

In [None]:
# Mean single-run distance per month for the current year and the two years before it.
fig, ax = plt.subplots()
monthly_avg_run_distance(ax, n_years_back=2)

## Weekly performance

In [None]:
# Summed distance per value of the "week" column for the current year.
fig, ax = plt.subplots()
weekly_totals = dfs[current_year].groupby(["week"], as_index=False)["distance"].sum()
weekly_totals.plot(x="week", y="distance", kind="scatter", ax=ax)
ax.set_ylabel("distance [km]")

In [None]:
# Distribution of the weekly distance totals of the current year.
fig, ax = plt.subplots()
weekly_totals = dfs[current_year].groupby(["week"], as_index=False)["distance"].sum()
weekly_totals["distance"].plot.hist(bins=10, density=False, ax=ax)
ax.set_xlabel("distance [km]")
ax.set_title("Histogram of weekly distances. #bins=10.")

## Hourly performance

In [None]:
# Histogram of the "hour" column of the current year's runs, 10 bins.
fig, ax = plt.subplots()
histogram_runs_per_hour(ax, current_year, n_bins=10)

In [None]:
# Box plot of distance per value of the "hour" column for the current year.
fig, ax = plt.subplots()
distribution_distance_per_hour(ax, current_year)
# Uncomment to also write the figure to a PNG file:
# ax.get_figure().savefig("hourly_distance_boxplot.png")

# Compilation