In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# Path of the segment test results CSV.
# NOTE(review): machine-specific absolute path — replace with the correct path
# for your environment (ideally relative to a configurable data directory).
RESULTS_CSV = (
    "/Users/deniz/Projects/grand_tours/results/segment_test_results_tdf_2024.csv"
)

# Display labels for the raw model identifiers found in the "Model" column.
MODEL_LABELS = {
    "linear_regression": "Linear",
    "random_forest_regressor": "Random Forest",
    "xgboost": "XGBoost",
}

# (metric column, figure title, y-axis label) for each per-rider line chart.
# Plot 1: RMSE, Plot 2: MAPE
RIDER_METRIC_PLOTS = [
    ("test_rmse", "Test RMSE per Rider by Model", "Test RMSE"),
    ("test_mape", "Test MAPE per Rider by Model", "Test MAPE"),
]


def shorten_rider_name(full_name):
    """Shorten a rider name to ``FIRSTTOKEN I`` for use as an axis label.

    The first whitespace-separated token is upper-cased and followed by the
    upper-cased first character of the second token (the "LASTNAME F" form;
    presumably names are stored last-name-first — verify against the data).

    Args:
        full_name: The value from the ``name`` column.

    Returns:
        The shortened label. A single-token name is returned upper-cased
        without an initial; non-string values (e.g. NaN) and blank strings
        are returned unchanged instead of raising.
    """
    if not isinstance(full_name, str):
        return full_name
    tokens = full_name.split()
    if not tokens:
        return full_name
    if len(tokens) == 1:
        return tokens[0].upper()
    return f"{tokens[0].upper()} {tokens[1][0].upper()}"


# Load your dataset
df = pd.read_csv(RESULTS_CSV)

# Format model names. Series.map yields NaN for keys missing from the mapping,
# so fall back to the raw identifier to avoid silently dropping unknown models
# from the plots and group-bys that follow.
df["Model"] = df["Model"].map(MODEL_LABELS).fillna(df["Model"])

# Shorten names to LASTNAME F
df["short_name"] = df["name"].apply(shorten_rider_name)

# Plotting style
plt.style.use("bmh")
plt.rcParams.update({"font.size": 12, "figure.dpi": 100})

# One line chart per metric, one line per model, riders along the x-axis.
# NOTE(review): lineplot aggregates rows that share the same x value and hue,
# so riders whose short names collide would be merged — confirm that
# short_name is unique per rider in this dataset.
for metric_column, figure_title, y_label in RIDER_METRIC_PLOTS:
    plt.figure(figsize=(14, 6))
    sns.lineplot(
        data=df,
        x="short_name",
        y=metric_column,
        hue="Model",
        style="Model",
        markers=True,
        dashes=False,
    )
    plt.xticks(rotation=90, ha="right", rotation_mode="anchor")
    plt.title(figure_title)
    plt.xlabel("Rider")
    plt.ylabel(y_label)
    plt.tight_layout()
    plt.show()

In [None]:
# Keep only rows whose RMSE and MAPE are both at or below the 95th percentile
# of their respective columns (a row failing either check is dropped).
rmse_threshold = df["test_rmse"].quantile(0.95)
mape_threshold = df["test_mape"].quantile(0.95)

within_rmse = df["test_rmse"].le(rmse_threshold)
within_mape = df["test_mape"].le(mape_threshold)
df_filtered = df.loc[within_rmse & within_mape]

plt.style.use("bmh")
plt.rcParams.update({"font.size": 12, "figure.dpi": 100})

# (metric column, figure title, y-axis label): RMSE first, then MAPE.
filtered_plot_specs = (
    (
        "test_rmse",
        "Test RMSE per Rider by Model (Outliers Removed)",
        "Test RMSE",
    ),
    (
        "test_mape",
        "Test MAPE per Rider by Model (Outliers Removed)",
        "Test MAPE",
    ),
)

for filtered_metric, filtered_title, filtered_ylabel in filtered_plot_specs:
    plt.figure(figsize=(14, 6))
    sns.lineplot(
        data=df_filtered,
        x="short_name",
        y=filtered_metric,
        hue="Model",
        style="Model",
        markers=True,
        dashes=False,
    )
    plt.xticks(rotation=90, ha="right", rotation_mode="anchor")
    plt.title(filtered_title)
    plt.xlabel("Rider")
    plt.ylabel(filtered_ylabel)
    plt.tight_layout()
    plt.show()

In [None]:
# 3. Boxplot of RMSE grouped by model
box_fig, box_ax = plt.subplots(figsize=(6, 5))
sns.boxplot(data=df, x="Model", y="test_rmse", ax=box_ax)
box_ax.set_title("Distribution of Test RMSE by Model")
box_fig.tight_layout()
plt.show()

# 4. Scatter plot of RMSE vs MAPE, one colour per model
scatter_fig, scatter_ax = plt.subplots(figsize=(6, 5))
sns.scatterplot(
    data=df,
    x="test_rmse",
    y="test_mape",
    hue="Model",
    s=80,
    ax=scatter_ax,
)
scatter_ax.set_title("RMSE vs MAPE by Model")
scatter_fig.tight_layout()
plt.show()

In [None]:
# Mean test RMSE for each model, rounded to two decimals for nicer display,
# returned as a two-column frame ("Model", "test_rmse").
average_rmse_per_model = (
    df.groupby("Model")["test_rmse"]
    .mean()
    .round(2)
    .reset_index()
)

print("Average RMSE per model across all riders:")
print(average_rmse_per_model)