# 1.0
This notebook trains a simple XGBoost classifier on the constructed features.

In [1]:
import numpy as np
import pandas as pd

# Location of the training table, relative to this notebook.
DATASET_PATH = '../../data/training_data/dataset_v1.parquet'

# Read the parquet file and display its column index as the cell output.
df = pd.read_parquet(DATASET_PATH)
df.columns

Index(['draw_size', 'best_of', 'hth_win_p_a', 'surface_Carpet', 'surface_Clay',
       'surface_Grass', 'surface_Hard', 'tourney_level_A', 'tourney_level_F',
       'tourney_level_G', 'tourney_level_M', 'hand_a_A', 'hand_a_L',
       'hand_a_R', 'hand_a_U', 'hand_b_A', 'hand_b_L', 'hand_b_R', 'hand_b_U',
       'round_BR', 'round_ER', 'round_F', 'round_QF', 'round_R128',
       'round_R16', 'round_R32', 'round_R64', 'round_RR', 'round_SF',
       'height_diff', 'age_diff', 'elo_diff', 'elo_surface_diff', 'p_ace_diff',
       'p_df_diff', 'p_1stIn_diff', 'p_1stWon_diff', 'p_2ndWon_diff',
       'p_2ndWon_inPlay_diff', 'p_bpSaved_diff', 'p_rpw_diff',
       'p_retAceAgainst_diff', 'p_ret1stWon_diff', 'p_ret2ndWon_diff',
       'p_ret2ndWon_inPlay_diff', 'p_bpConv_diff', 'p_totalPtsWon_diff',
       'dominance_ratio_diff', 'ht_diff', 'form_delta_diff',
       'elo_momentum_diff', 'recent_minutes_diff', 'log_rank_points_diff',
       'log_total_matches_diff', 'log_total_surface_matches_dif

In [2]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss
import numpy as np

def time_series_cv(df, model, n_splits=100, debug=True):
    """Score `model` with expanding-window (walk-forward) cross-validation.

    The rows of `df` are cut positionally into ``n_splits + 1`` consecutive
    chunks of ``len(df) // (n_splits + 1)`` rows. For fold ``i`` the model is
    fitted on the first ``i`` chunks and scored on chunk ``i + 1``, so each
    test chunk comes strictly after the rows it was trained on. Rows left
    over after the last full chunk are never used.

    NOTE(review): the split is purely positional, so it is only a genuine
    time-series split if `df` is already sorted chronologically -- confirm
    against the code that builds the dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus a ``"result"`` column holding the target.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``. The same
        instance is re-fitted on every fold.
    n_splits : int, default 100
        Number of train/test folds to evaluate. Must be at least 1.
    debug : bool, default True
        When true, print the metrics of every fold and a final
        mean / standard-deviation summary.

    Returns
    -------
    tuple of (list, list, list)
        Per-fold ROC AUC, accuracy (probabilities thresholded at 0.5) and
        log-loss, in fold order.

    Raises
    ------
    ValueError
        If `n_splits` is smaller than 1, or if `df` has too few rows to give
        every chunk at least one row.
    """
    # Fail early with a clear message instead of dividing by zero
    # (n_splits == -1) or silently returning empty results (n_splits == 0).
    if n_splits < 1:
        raise ValueError(f"n_splits must be at least 1, got {n_splits}")

    fold_size = len(df) // (n_splits + 1)
    # A zero-sized chunk would make the first fold train on an empty frame.
    if fold_size < 1:
        raise ValueError(
            f"df has {len(df)} rows, which is too few for n_splits={n_splits}; "
            f"at least {n_splits + 1} rows are required"
        )

    X = df.drop(columns=["result"])
    y = df["result"].values

    aucs, accs, losses = [], [], []

    for i in range(1, n_splits + 1):
        # Training window grows by one chunk per fold; the test window is the
        # chunk immediately after it.
        train_end = fold_size * i
        test_end = train_end + fold_size
        X_train, y_train = X.iloc[:train_end], y[:train_end]
        X_test, y_test = X.iloc[train_end:test_end], y[train_end:test_end]

        model.fit(X_train, y_train)
        # Column 1 of predict_proba is the probability of the positive class.
        preds = model.predict_proba(X_test)[:, 1]

        auc = roc_auc_score(y_test, preds)
        pred_class = (preds >= 0.5).astype(int)
        acc = accuracy_score(y_test, pred_class)
        loss = log_loss(y_test, preds)

        aucs.append(auc)
        accs.append(acc)
        losses.append(loss)

        if debug:
            print(f"{i} AUC={auc:.4f}, Accuracy={acc:.4f}, LogLoss={loss:.4f}")

    if debug:
        print(f"\nAUC:      mean={np.mean(aucs):.4f}, std={np.std(aucs):.4f}")
        print(f"Accuracy: mean={np.mean(accs):.4f}, std={np.std(accs):.4f}")
        print(f"LogLoss:  mean={np.mean(losses):.4f}, std={np.std(losses):.4f}")

    return aucs, accs, losses

In [3]:
# Hyperparameters of the baseline classifier, kept together so they can be
# inspected or reused without reading the constructor call.
baseline_params = {
    "n_estimators": 723,
    "learning_rate": 0.0073542448230937306,
    "max_depth": 5,
    "subsample": 0.7107128168724214,
    "colsample_bytree": 0.5088728633198565,
    "min_child_weight": 8,
    "objective": 'binary:logistic',
    "eval_metric": 'logloss',
}
model = XGBClassifier(**baseline_params)

# Walk-forward evaluation with the default number of folds; per-fold metrics
# are printed because debug defaults to True.
aucs, accs, losses = time_series_cv(df, model)

1 AUC=0.7649, Accuracy=0.6869, LogLoss=0.5800
2 AUC=0.8255, Accuracy=0.7445, LogLoss=0.5179
3 AUC=0.7687, Accuracy=0.6980, LogLoss=0.5750
4 AUC=0.8133, Accuracy=0.7345, LogLoss=0.5290
5 AUC=0.8185, Accuracy=0.7445, LogLoss=0.5229
6 AUC=0.8388, Accuracy=0.7533, LogLoss=0.5048
7 AUC=0.7974, Accuracy=0.7146, LogLoss=0.5448
8 AUC=0.8304, Accuracy=0.7600, LogLoss=0.5086
9 AUC=0.8291, Accuracy=0.7522, LogLoss=0.5108
10 AUC=0.8254, Accuracy=0.7511, LogLoss=0.5190
11 AUC=0.8464, Accuracy=0.7688, LogLoss=0.4918
12 AUC=0.8054, Accuracy=0.7345, LogLoss=0.5366
13 AUC=0.8301, Accuracy=0.7600, LogLoss=0.5099
14 AUC=0.8208, Accuracy=0.7600, LogLoss=0.5209
15 AUC=0.8304, Accuracy=0.7611, LogLoss=0.5111
16 AUC=0.7891, Accuracy=0.7223, LogLoss=0.5554
17 AUC=0.8337, Accuracy=0.7423, LogLoss=0.5039
18 AUC=0.8099, Accuracy=0.7312, LogLoss=0.5321
19 AUC=0.8119, Accuracy=0.7467, LogLoss=0.5296
20 AUC=0.7932, Accuracy=0.7268, LogLoss=0.5514
21 AUC=0.8126, Accuracy=0.7312, LogLoss=0.5293
22 AUC=0.7959, Accurac

KeyboardInterrupt: 

With sensible parameters, the XGBoost model achieves a cross-validated AUC of 0.7273 and an accuracy of 0.6643. As expected, the model's performance improved as more training data became available. However, accuracy drops significantly on the most recent data. This drop could be caused by a regime shift in the data resulting from COVID and the retirement of the Big Three.

A coarse grid search is performed to find better hyperparameters.

### Hyperparameter Optimisation

In [4]:
from itertools import product

def tune_xgb(df, param_grid, metric="auc", n_splits=100):
    """Exhaustive grid search over XGBoost hyperparameters.

    Every combination in the Cartesian product of `param_grid` is used to
    build an ``XGBClassifier`` and scored with ``time_series_cv``. Progress
    is printed after each combination: the new best parameters and score, or
    ``No Change`` when the combination did not improve on the best so far.

    Parameters
    ----------
    df : pandas.DataFrame
        Data passed unchanged to ``time_series_cv``.
    param_grid : dict
        Maps an ``XGBClassifier`` keyword argument name to the list of values
        to try for it.
    metric : str, default "auc"
        ``"auc"`` maximises the mean fold AUC. ``"logloss"`` (aliases
        ``"log_loss"`` and ``"loss"``) minimises the mean fold log-loss.
    n_splits : int, default 100
        Number of folds forwarded to ``time_series_cv``.

    Returns
    -------
    dict or None
        The best-scoring parameter combination, or ``None`` if no
        combination scored above ``-inf``.

    Raises
    ------
    ValueError
        If `metric` is not one of the supported names. This is checked
        before any model is trained, so a typo cannot silently switch the
        search to log-loss.
    """
    if metric == "auc":
        use_auc = True
    elif metric in ("logloss", "log_loss", "loss"):
        use_auc = False
    else:
        raise ValueError(
            f"Unknown metric {metric!r}; expected 'auc' or 'logloss'"
        )

    keys = list(param_grid.keys())
    best_score = -np.inf
    best_params = None

    for values in product(*param_grid.values()):
        params = dict(zip(keys, values))

        model = XGBClassifier(
            **params,
            objective="binary:logistic",
            eval_metric="logloss"
        )

        aucs, _, losses = time_series_cv(df, model, n_splits=n_splits, debug=False)

        # Higher is always better: log-loss is negated so that both metrics
        # can share the same comparison below.
        score = np.mean(aucs) if use_auc else -np.mean(losses)

        if score > best_score:
            best_score = score
            best_params = params
            print("\nBest:", best_params, "\nScore:", best_score)
        else:
            print("\nNo Change")

    return best_params
    
# Coarse search space for tune_xgb: 2 * 1 * 2 * 2 * 3 * 2 = 48 combinations,
# each of which is cross-validated in full.
param_grid = dict(
    max_depth=[4, 5],
    learning_rate=[0.02],
    subsample=[0.7, 0.9],
    colsample_bytree=[0.7, 0.9],
    min_child_weight=[1, 5, 10],
    n_estimators=[300, 600],
)

# best_params = tune_xgb(df, param_grid)

A coarse grid search takes exponentially longer to run as the number of hyperparameters grows, so we use Bayesian optimisation instead. Rather than picking hyperparameter values at random, the sampler uses the results of past trials to prefer values that previously looked good and avoid values that looked bad, making the search smarter over time.

In [5]:
import optuna
import numpy as np
from xgboost import XGBClassifier

def tune_xgb_optuna(df, n_trials=100, n_splits=20, seed=42, debug=True):
    """Tune XGBoost hyperparameters with Optuna's TPE sampler.

    Each trial samples one hyperparameter configuration, evaluates it with
    ``time_series_cv`` on `df`, and reports the mean fold AUC, which the
    study maximises. The best value and parameters are printed at the end.

    Parameters
    ----------
    df : pandas.DataFrame
        Data passed unchanged to ``time_series_cv``.
    n_trials : int, default 100
        Number of configurations to evaluate.
    n_splits : int, default 20
        Number of folds forwarded to ``time_series_cv``.
    seed : int or None, default 42
        Seed for the Optuna sampler, so that repeated runs propose the same
        sequence of configurations. Pass ``None`` for an unseeded sampler.
        The XGBoost models always use ``random_state=42``.
    debug : bool, default True
        Forwarded to ``time_series_cv``; when true, per-fold metrics are
        printed for every trial.

    Returns
    -------
    optuna.study.Study
        The finished study; ``study.best_params`` and ``study.best_value``
        hold the winning configuration and its mean AUC.
    """

    def objective(trial):
        # Sample one configuration from the search space.
        params = {
            "max_depth": trial.suggest_int("max_depth", 3, 8),
            # Log scale: learning rates are explored evenly across orders
            # of magnitude.
            "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
            "subsample": trial.suggest_float("subsample", 0.4, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
            "n_estimators": trial.suggest_int("n_estimators", 300, 1000),
        }

        model = XGBClassifier(
            **params,
            objective="binary:logistic",
            eval_metric="logloss",
            tree_method="hist",
            random_state=42,
        )

        aucs, _, _ = time_series_cv(df, model, n_splits=n_splits, debug=debug)

        # Value maximised by the study.
        return np.mean(aucs)

    # Seeding the sampler makes the whole search reproducible; seeding only
    # the XGBoost models is not enough.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction="maximize", sampler=sampler)  # maximise AUC
    study.optimize(objective, n_trials=n_trials)

    print("\nBest AUC:", study.best_value)
    print("Best params:")
    for k, v in study.best_params.items():
        print(f"  {k}: {v}")

    return study

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# study = tune_xgb_optuna(df)