## Predicting the Champion of the 2025 Scotties Tournament of Hearts in Thunder Bay, Ontario

In [None]:
import pandas as pd
import shap
import numpy as np
from sklearn.preprocessing import MinMaxScaler 
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from PIL import Image, ImageOps
from sklearn.metrics import ndcg_score

In [None]:
# Load the raw tournament statistics from the CSV next to the notebook and
# preview the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer="scotties_toh_raw_data.csv")
df.head(5)

In [None]:
#df.info()

In [None]:
# Show every column name in the raw data as a plain Python list.
print(df.columns.tolist())

In [None]:
# Columns fed to the scaler and the model as inputs.
features = [
    "shot_percentage",
    "ave_points_for",
    "ave_points_against",
    "ave_ends_won",
    "ave_ends_lost",
    "ave_blank_ends",
    "ave_stolen_ends",
]
# Sanity check: number of model inputs.
print(len(features))

In [None]:
# Split by season. NOTE: here `x` is the training set (every year before 2025)
# and `y` is the hold-out set (the 2025 rows) -- they are NOT the usual
# features/target pair. Copies are taken so that later in-place scaling does
# not write back into `df`.
x = df.loc[df["year"].lt(2025)].copy()
y = df.loc[df["year"].eq(2025)].copy()

In [None]:
# Learn the min/max of each feature on the training rows only, then apply the
# same scaling to the 2025 rows so no information from the hold-out set is
# used when fitting the scaler.
scaler = MinMaxScaler()
x[features] = scaler.fit_transform(x[features])
y[features] = scaler.transform(y[features])

In [None]:
# Random-forest regressor that predicts `champion_share` from the scaled
# features; `random_state` is fixed so repeated runs build the same forest.
randomForest = RandomForestRegressor(
    n_estimators=95,
    max_depth=3,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=5,
)
randomForest.fit(x[features], x["champion_share"])

In [None]:
# Score the 2025 rows and keep the result as a one-column frame that shares
# the index of `y`, so it can be lined up with the team metadata afterwards.
predictions = pd.DataFrame(
    {"predicted_share": randomForest.predict(y[features])},
    index=y.index,
)

In [None]:
# Put the identifying columns next to the predicted share and order the
# result so the highest predicted share comes first within each year.
team_info = y[["year", "team", "skip", "champion_share"]]
frame = pd.concat([team_info, predictions], axis=1)
frame = frame.sort_values(by=["year", "predicted_share"], ascending=False)

In [None]:
# Explain the fitted forest on the 2025 rows.
explainer = shap.Explainer(randomForest)
shap_values = explainer.shap_values(y[features])

# Mean absolute SHAP value of each feature across the explained rows.
feature_importance = np.mean(np.abs(shap_values), axis=0)

# Positions of the features to plot. This selects every feature in its
# original order; `feature_importance` is not used to filter or reorder here.
imp_features = np.arange(len(features))
imp_features_list = [features[position] for position in imp_features]

# Matching feature values and SHAP columns for the selected features.
y_imp_features = y[imp_features_list]
shap_values_top = shap_values[:, imp_features]

In [None]:
# Bar chart of the SHAP values per feature. show=False is passed so the
# figure is not displayed yet and can still be resized and relabelled below.
shap.summary_plot(shap_values_top, y_imp_features, feature_names=imp_features_list, plot_type="bar", color="firebrick", show=False)

# The calls below act on pyplot's current figure/axes (presumably the one
# summary_plot just drew on), so they must stay after the call above.
plt.gcf().set_size_inches(10, 6)
plt.title("Ordering Features with the Highest Impact on Model")
plt.xlabel("Average Impact on Model Output Magnitude")
plt.ylabel("Feature")
plt.show()

In [None]:
# Report the predicted share for every 2025 team, in the order `frame` was
# sorted in.
print("Predicted Champion Share for 2025 Scotties")
print(frame.loc[:, ["team", "skip", "predicted_share"]])

In [None]:
# Average Precision Metric
def find_ap(frame):
    """Return the average precision of the champion within the predicted ranking.

    The row with the highest ``champion_share`` is treated as the actual
    champion. Rows are then ranked by ``predicted_share`` (highest first) and,
    each time a row whose ``team`` matches the champion is met, the precision
    at that position (matches so far / rows seen so far) is recorded. The
    result is the mean of the recorded precisions; with a single matching row
    this equals ``1 / rank`` of the champion.

    Args:
        frame: DataFrame with ``team``, ``champion_share`` and
            ``predicted_share`` columns.

    Returns:
        The average precision as a float, or ``0.0`` when no ranked row
        matches the champion (for example an empty frame, or a champion row
        whose team name is missing). Previously this case raised
        ``ZeroDivisionError``.
    """
    actual = frame.sort_values("champion_share", ascending=False).head(1)
    champion_teams = actual["team"].values

    predicted = frame.sort_values("predicted_share", ascending=False)

    precisions = []
    # Only the team column is needed, so iterate it directly instead of
    # building a Series per row with iterrows().
    for rank, team in enumerate(predicted["team"], start=1):
        if team in champion_teams:
            precisions.append((len(precisions) + 1) / rank)

    if not precisions:
        # Nothing matched: report the worst possible score rather than
        # dividing by zero.
        return 0.0
    return sum(precisions) / len(precisions)

In [None]:
# Accumulators filled by the backtest: one prediction frame, one average
# precision and one NDCG value per evaluated year.
all_predictions, aps, ndcgs = [], [], []

In [None]:
# Backtest window: the ten seasons from 2015 through 2024 inclusive.
years = list(range(2015, 2025))

In [None]:
#Summarizing the Champion Average Precision metric, Champion Rank, and Normalized Discounted Cumulative Gain
# Walk-forward backtest: for each year, train on every earlier year and score
# that year's teams. Loop-local names (train/test/year_*) are used so the
# 2025 objects built earlier in the notebook (x, y, scaler, frame,
# predictions, and the fitted randomForest) are not overwritten.
for year in years:
    train = df[df["year"] < year].copy()
    test = df[df["year"] == year].copy()

    # Check for missing data before scaling/fitting: those steps fail on
    # zero rows, so a guard placed after them can never take effect.
    if train.empty or test.empty:
        print(year, "skipped: no training or test rows available")
        continue

    # Fit the scaler on the training years only, then apply it to both sets.
    year_scaler = MinMaxScaler().fit(train[features])
    train[features] = year_scaler.transform(train[features])
    test[features] = year_scaler.transform(test[features])

    # Fresh, unfitted model with the same hyper-parameters as `randomForest`;
    # the model used for the 2025 prediction and SHAP cells stays untouched.
    year_model = RandomForestRegressor(**randomForest.get_params())
    year_model.fit(train[features], train['champion_share'])
    predicted_share = year_model.predict(test[features])
    predictions_df = pd.DataFrame(predicted_share, columns=["predicted_share"], index=test.index)
    year_frame = pd.concat([test[["team", "champion_share", "year"]], predictions_df], axis=1)

    year_frame['team/year'] = year_frame['team'] + ' ' + year_frame['year'].astype(str)

    all_predictions.append(year_frame)
    ap = find_ap(year_frame)
    aps.append(ap)

    ndcg = ndcg_score(year_frame["champion_share"].values.reshape(1, -1), predicted_share.reshape(1, -1))
    ndcgs.append(ndcg)

    #Champion Rank is the reciprocal of the Champion AP to give a numerical rank value
    # Invert first and round afterwards: 1 / round(ap, 3) turns a rank of 3
    # into 3.003..., and would divide by zero if ap rounded down to 0.
    champion_rank = round(1 / ap, 2) if ap > 0 else float("inf")
    print((year), "Champion AP: ", round(ap, 3), "| Champion Rank: ", champion_rank, "| NDCG: ", round(ndcg, 3))

In [None]:
# Summary statistics over the whole backtest. They do not depend on the loop
# below, so they are computed once here; this also guarantees the names exist
# for the summary cell even when no year was evaluated.
if aps:
    average_ndcg = np.mean(ndcgs)
    average_ap = np.mean(aps)

    last_n_year_aps = aps[-10:]
    min_ap = np.min(last_n_year_aps)
    max_ap = np.max(last_n_year_aps)
else:
    last_n_year_aps = []
    average_ndcg = average_ap = min_ap = max_ap = float("nan")

# Print the ten highest predicted shares for every evaluated year.
for year_frame in all_predictions:
    # Take the year from the frame itself rather than from years[idx], so the
    # label stays correct even if some year produced no frame.
    year = int(year_frame["year"].iloc[0])
    rankings = year_frame.sort_values("predicted_share", ascending=False).head(10)

    print(f"\n{year} Top 10 Predicted Teams")
    print(rankings[["team", "predicted_share", "champion_share"]])

In [None]:
# `average_ap` is a mean of average-precision scores (between 0 and 1), not a
# rank, so it is labelled as AP.
print("Average Champion AP:", round(average_ap, 2), "| NDCG:", round(average_ndcg, 2))
# Rank is the reciprocal of AP. Invert first and round afterwards: rounding
# the AP first distorts the rank (1 / 0.14 = 7.14 for a true rank of 7).
# The smallest AP gives the numerically largest (worst) rank, reported here
# as the "lowest" rank; the largest AP gives the best rank.
print("Lowest Champion Rank:", round(1 / min_ap, 2))
print("Highest Champion Rank:", round(1 / max_ap, 2))