In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the preprocessed matches and order them chronologically
# (season first, then stage, then date).
df_matches = pd.read_csv('../../data/preprocessed/preprocessed_1.csv').sort_values(
    by=["season", "stage", "date"]
)

# One-hot encode the season (first level dropped) and attach the indicator
# columns to the match rows; the join aligns on the row index.
season_dummies = pd.get_dummies(
    df_matches['season'],
    prefix='season',
    drop_first=True,
)
df_matches = df_matches.join(season_dummies)

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score


# Hyperparameters used whenever the caller does not pass its own `params`.
_DEFAULT_XGB_PARAMS = {
    'colsample_bytree': 0.8829682348067726,
    'gamma': 2.153140019195803,
    'learning_rate': 0.2839181641252695,
    'max_depth': 10,
    'n_estimators': 750,
    'reg_alpha': 0.05456053939633371,
    'reg_lambda': 0.014211434927705319,
    'subsample': 0.8413541436147373,
}

# Identifier / label columns that are never passed to the model as features.
_NON_FEATURE_COLS = (
    "match_api_id", "result_match", "season", "stage", "date", "home_team", "away_team",
)


def train_and_evaluate(train_df, val_df, test_df, season, stage, params=None):
    """Fit an XGBoost classifier on one split and score it on the test rows.

    Every column of ``train_df`` except the identifier/label columns listed in
    ``_NON_FEATURE_COLS`` is used as a feature; ``result_match`` is the label.

    Parameters
    ----------
    train_df, val_df, test_df : pandas.DataFrame
        Training, validation and test rows. All three must contain the
        feature columns found in ``train_df`` plus ``result_match``.
    season, stage
        Passed through unchanged in the return value so the caller can label
        the result.
    params : dict, optional
        Keyword arguments for ``XGBClassifier``. When omitted, the tuned
        defaults in ``_DEFAULT_XGB_PARAMS`` are used. Entries here take
        precedence over the fixed settings (``random_state``,
        ``use_label_encoder``, ``eval_metric``).

    Returns
    -------
    tuple
        ``(f1, season, stage, y_test, y_pred)`` where ``f1`` is the binary F1
        score on the test rows and ``y_test`` / ``y_pred`` are plain lists.
    """
    feature_cols = [col for col in train_df.columns if col not in _NON_FEATURE_COLS]

    X_train, y_train = train_df[feature_cols], train_df["result_match"]
    X_val, y_val = val_df[feature_cols], val_df["result_match"]
    X_test, y_test = test_df[feature_cols], test_df["result_match"]

    # Fixed settings first, so caller-supplied params can override them
    # instead of colliding as duplicate keyword arguments.
    model_kwargs = {
        "random_state": 42,
        "use_label_encoder": False,
        "eval_metric": "logloss",
    }
    model_kwargs.update(_DEFAULT_XGB_PARAMS if params is None else params)
    model = XGBClassifier(**model_kwargs)

    # The eval_set is only handed to fit(); no early-stopping option is
    # configured here, so the validation rows do not alter the fitted model
    # as far as this code shows.
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=False)

    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average="binary")

    return f1, season, stage, y_test.tolist(), y_pred.tolist()

In [None]:
# ---------------------
# Backtesting over multiple seasons
# ---------------------
# The most recent season is always the evaluation target. For each training
# window size (1 .. N-1 previous seasons) and each stage of the target season,
# train on the previous seasons plus the target-season stages before the
# validation stage, validate on stage - 1 and test on the stage itself.

seasons = sorted(df_matches["season"].unique(), reverse=True)
backtest_results = []

if len(seasons) > 1:
    # These do not depend on the window size, so compute them once.
    target_season = seasons[0]
    target_season_df = df_matches[df_matches["season"] == target_season]
    target_stages = sorted(target_season_df["stage"].unique())
    # The first two stages are skipped (same cut-off as `stage <= min + 1`).
    last_skipped_stage = min(target_stages) + 1

    for n_previous in range(1, len(seasons)):
        previous_seasons = seasons[1:1 + n_previous]
        # Invariant across stages for a given window size.
        train_df_prev = df_matches[df_matches["season"].isin(previous_seasons)]

        for stage in target_stages:
            if stage <= last_skipped_stage:
                continue

            train_df_target = target_season_df[target_season_df["stage"] < (stage - 1)]
            train_df = pd.concat([train_df_prev, train_df_target], ignore_index=True)

            val_df = target_season_df[target_season_df["stage"] == (stage - 1)]
            test_df = target_season_df[target_season_df["stage"] == stage]

            if train_df.empty or val_df.empty or test_df.empty:
                continue

            # season/stage are echoed back by the helper; ignore the echoes so
            # the loop variable is not reassigned mid-iteration.
            f1, _, _, y_test, y_pred = train_and_evaluate(
                train_df, val_df, test_df, target_season, stage
            )

            backtest_results.append({
                "season": f"{target_season} - {len(previous_seasons)}",
                "stage": stage,
                "train_size": len(train_df),
                "f1_score": f1,
                "y_test": y_test,
                "y_pred": y_pred
            })

results_df = pd.DataFrame(backtest_results)
print("Backtesting results for each season and stage:")
# Show the scores; the per-row label/prediction lists are left out to keep
# the printed table readable.
print(results_df.drop(columns=["y_test", "y_pred"], errors="ignore"))

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.metrics import confusion_matrix
#
# # Filter worst stages
# worst_stages = results_df.sort_values(by="f1_score", ascending=True).head(5)
#
# for _, row in worst_stages.iterrows():
#     y_test = row["y_test"]
#     y_pred = row["y_pred"]
#
#     cm = confusion_matrix(y_test, y_pred)
#
#     plt.figure(figsize=(5, 4))
#     sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Home not win", "Home win"], yticklabels=["Home not win", "Home win"])
#     plt.xlabel("Predicted")
#     plt.ylabel("Actual")
#     plt.title(f"Confusion Matrix - {row['season']} Stage {row['stage']}")
#     plt.show()

In [None]:
# Backtest rows whose F1 score is exactly zero.
zero_f1_mask = results_df['f1_score'].eq(0.0)
all_worst_stages = results_df.loc[zero_f1_mask]

In [None]:
import shap
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier

def train_and_analyze_shap_for_stage(df_matches, worst_stage_row):
    """Re-train the backtest model for one result row and compute SHAP values.

    Parameters
    ----------
    df_matches : pandas.DataFrame
        Full match table, including the ``season`` and ``stage`` columns, the
        ``result_match`` label and the feature columns.
    worst_stage_row : mapping
        A backtest result row. ``worst_stage_row["season"]`` must have the
        form ``"<season> - <number of previous seasons>"`` and
        ``worst_stage_row["stage"]`` is the stage to test on.

    Returns
    -------
    tuple or None
        ``(model, X_test, shap_values, y_test)``, or ``None`` (after printing
        a message) when the train, validation or test split is empty.
    """
    season_info = worst_stage_row["season"]
    stage = worst_stage_row["stage"]

    # Split on the LAST separator so a season label that itself contains
    # " - " still parses into (season, count).
    season, prev_seasons_count = season_info.rsplit(" - ", 1)
    prev_seasons_count = int(prev_seasons_count)

    # The label is a string, but the season column may hold another type
    # (the label was produced by string formatting). Match on the string form
    # and then filter with the original value.
    seasons_desc = sorted(df_matches["season"].unique(), reverse=True)
    season_labels = [str(value) for value in seasons_desc]
    if season not in season_labels:
        print(f"Skipping {season} - Stage {stage} due to empty dataset.")
        return None
    target_pos = season_labels.index(season)

    # Same training logic as the backtest: the N seasons immediately before
    # the target. Positioned relative to the target so the target season is
    # never part of "previous" data; identical to [1:1 + N] when the target is
    # the most recent season.
    target_season_df = df_matches[df_matches["season"] == seasons_desc[target_pos]]
    previous_seasons = seasons_desc[target_pos + 1:target_pos + 1 + prev_seasons_count]

    train_df_prev = df_matches[df_matches["season"].isin(previous_seasons)]
    train_df_target = target_season_df[target_season_df["stage"] < (stage - 1)]
    train_df = pd.concat([train_df_prev, train_df_target], ignore_index=True)

    val_df = target_season_df[target_season_df["stage"] == (stage - 1)]
    test_df = target_season_df[target_season_df["stage"] == stage]

    if train_df.empty or val_df.empty or test_df.empty:
        print(f"Skipping {season} - Stage {stage} due to empty dataset.")
        return None

    feature_cols = [col for col in train_df.columns if col not in
                    ["match_api_id", "result_match", "season", "stage", "date", "home_team", "away_team"]]

    X_train = train_df[feature_cols]
    y_train = train_df["result_match"]

    X_val = val_df[feature_cols]
    y_val = val_df["result_match"]

    X_test = test_df[feature_cols]
    y_test = test_df["result_match"]

    model = XGBClassifier(
        colsample_bytree=0.8829682348067726,
        gamma=2.153140019195803,
        learning_rate=0.2839181641252695,
        max_depth=10,
        n_estimators=750,
        reg_alpha=0.05456053939633371,
        reg_lambda=0.014211434927705319,
        subsample=0.8413541436147373,
        random_state=42,
        use_label_encoder=False,
        eval_metric="logloss"
    )

    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=False)

    # Cast the season indicator columns to int8 before explaining.
    # NOTE(review): presumably needed because the explainer cannot handle
    # boolean dummy columns - confirm.
    season_columns = X_test.filter(like='season').columns
    X_test = X_test.astype({col: "int8" for col in season_columns})

    # Compute SHAP values for the test rows, using the test rows themselves
    # as the explainer's background data.
    # NOTE(review): explaining predict_proba likely yields one set of values
    # per class - confirm the downstream plots expect that layout.
    explainer = shap.Explainer(model.predict_proba, X_test)
    shap_values = explainer(X_test)

    return model, X_test, shap_values, y_test


In [None]:
def plot_shap_summary(shap_values, X_test, season, stage):
    """Show a SHAP summary plot for one stage under a descriptive title.

    ``shap_values`` and ``X_test`` are forwarded to ``shap.summary_plot``;
    ``season`` and ``stage`` only appear in the figure title.
    """
    column_names = np.array(X_test.columns)
    heading = f"SHAP Summary - {season} Stage {stage}"

    # Let SHAP size the figure via plot_size, and keep it open (show=False)
    # so the title can be added before it is displayed.
    shap.summary_plot(
        shap_values,
        X_test,
        feature_names=column_names,
        plot_size=(15, 25),
        show=False,
    )

    # A suptitle with y slightly above 1 sits over the whole plot.
    plt.suptitle(heading, y=1.02)
    plt.tight_layout()
    plt.show()


def plot_shap_comparison(shap_values, X_test, y_test, y_pred, season, stage):
    """Show separate SHAP summary plots for correct and incorrect predictions.

    Parameters
    ----------
    shap_values
        SHAP values for the rows of ``X_test``; must support indexing with an
        array of row positions.
    X_test : pandas.DataFrame
        Feature rows the SHAP values were computed for.
    y_test, y_pred : array-like
        True and predicted labels, in the same row order as ``X_test``.
    season, stage
        Used only in the figure titles.

    A subset with no rows (e.g. no prediction was correct) is reported with a
    printed message and skipped instead of being passed to SHAP.
    """
    # Compare as plain arrays so the result is positional regardless of
    # whether the labels arrive as a Series, list or ndarray.
    is_correct = np.asarray(y_test) == np.asarray(y_pred)
    subsets = (
        ("Correct", np.flatnonzero(is_correct)),
        ("Incorrect", np.flatnonzero(~is_correct)),
    )
    column_names = np.array(X_test.columns)

    for label, row_positions in subsets:
        if row_positions.size == 0:
            print(f"No {label.lower()} predictions for {season} Stage {stage}; skipping SHAP plot.")
            continue

        # show=False keeps the figure open so the title can be added.
        shap.summary_plot(
            shap_values[row_positions],
            X_test.iloc[row_positions],
            feature_names=column_names,
            plot_size=(15, 25),
            show=False
        )
        plt.suptitle(f"SHAP Values - {label} Predictions ({season} Stage {stage})", y=1.02)
        plt.tight_layout()
        plt.show()

In [None]:
# Loop over worst-performing stages and analyze SHAP values
for _, worst_stage_row in all_worst_stages.iterrows():
    analysis = train_and_analyze_shap_for_stage(df_matches, worst_stage_row)

    # The helper returns a bare None (not a 4-tuple) when one of its splits is
    # empty, so check before unpacking to avoid a TypeError.
    if analysis is None:
        continue

    model, X_test, shap_values, y_test = analysis

    season = worst_stage_row["season"]
    stage = worst_stage_row["stage"]

    # Generate predictions
    y_pred = model.predict(X_test)

    # Plot SHAP analysis
    plot_shap_summary(shap_values, X_test, season, stage)
    plot_shap_comparison(shap_values, X_test, y_test, y_pred, season, stage)

In [None]:
# One F1-versus-stage line per backtest configuration (season label).
plt.figure(figsize=(35, 7))

# sort=False keeps the labels in order of first appearance in results_df.
for season, season_results in results_df.groupby("season", sort=False):
    plt.plot(
        season_results["stage"],
        season_results["f1_score"],
        marker="o",
        label=f"Season {season}",
    )

plt.title("Rolling Backtesting Performance Across Seasons")
plt.xlabel("Stage")
plt.ylabel("F1 Score")
plt.grid(True)
plt.legend(title="Season")
plt.show()

In [None]:
# Mean F1 score per backtest configuration (season label).
avg_results = (
    results_df.groupby("season")["f1_score"]
    .mean()
    .reset_index()
    .rename(columns={"f1_score": "avg_f1_score"})
)
print("\nAverage F1 Score for each season:")
print(avg_results)

# Bar chart of the averages, one bar per season label.
season_labels = avg_results["season"].astype(str)
plt.figure(figsize=(12, 6))
plt.bar(season_labels, avg_results["avg_f1_score"], color='skyblue')
plt.title("Average F1 Score per Season")
plt.xlabel("Season")
plt.ylabel("Average F1 Score")
plt.grid(axis='y')
plt.show()