In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from collections import defaultdict
from joblib import Parallel, delayed

import sqlite3
import sys
import time
import math
#import tqdm
from tqdm.auto import tqdm
import datetime
import os
import pickle
from pathlib import Path

from glicko2 import Player
import multiprocessing

tqdm.pandas()

if os.path.exists('/workspace/data_2'):
    # Load the dictionary of DataFrames from the pickle
    data_path = '/workspace/data_2/'
else:
    data_path = '../data/'

## Load the single-set model saved from earlier

In [2]:
from sklearn.metrics import accuracy_score, log_loss

dataset_df = pd.read_pickle(data_path + 'dataset_full.pkl')
dataset_df.sort_index(inplace=True) # For convenience, mostly. Not really necessary, we use .loc[] anyways

# Note that there is no real reason to keep separately computing the individual probabilities of winning each individual set.
# Let's just compute them all at once here.

single_set_model = None
with open(data_path + 'single_set_model.pkl', 'rb') as f:
    single_set_model = pickle.load(f)

# Make sure these match up with what features the model was trained on.
features_all_everything = ['p1_default_elo', 'p2_default_elo', 'p1_default_rd', 'p2_default_rd',
       'p1_default_updates', 'p2_default_updates', 'matchup_1', 'matchup_2',
       'matchup_3', 'matchup_4', 'matchup_5', 'matchup_6', 'matchup_7',
       'matchup_8', 'matchup_9', 'matchup_10', 'p1_m1_usage', 'p2_m1_usage',
       'p1/m1/m1_alt2_elo', 'p1/m1/m1_alt2_rd', 'p1/m1/m1_alt2_updates',
       'p2/m1/m1_alt2_elo', 'p2/m1/m1_alt2_rd', 'p2/m1/m1_alt2_updates',
       'p1/m1_alt3_elo', 'p1/m1_alt3_rd', 'p1/m1_alt3_updates',
       'p2/m1_alt3_elo', 'p2/m1_alt3_rd', 'p2/m1_alt3_updates']

dataset_df['p1_win_prob'] = single_set_model.predict_proba(dataset_df[features_all_everything])[:,1]

# As a sanity check, let's verify the accuracy and log loss on 2024 data
# Total accuracy and log loss (including lots of data that the model was trained on)
date_filter = (dataset_df['start'] >= datetime.datetime(2024,1,1)) & (dataset_df['end'] <= datetime.datetime(2024,12,31))
print("2024 single-set performance metrics")
print()
print("On all sets:")
print("Log loss: ", round(log_loss(dataset_df[date_filter]['winner'], dataset_df[date_filter]['p1_win_prob']), 3))
print("Accuracy: ", round(100.0 * accuracy_score(dataset_df[date_filter]['winner'], dataset_df[date_filter]['p1_win_prob'] >= 0.5), 1))
print()
print("Restricting to top 8 sets only:")
print("Log loss: ", round(log_loss(dataset_df[date_filter & dataset_df['top_8']]['winner'], dataset_df[date_filter & dataset_df['top_8']]['p1_win_prob']), 3))
print("Accuracy: ", round(100.0 * accuracy_score(dataset_df[date_filter & dataset_df['top_8']]['winner'], dataset_df[date_filter & dataset_df['top_8']]['p1_win_prob'] >= 0.5), 1))

2024 single-set performance metrics

On all sets:
Log loss:  0.441
Accuracy:  79.8

Restricting to top 8 sets only:
Log loss:  0.514
Accuracy:  75.0


## Load and clean up the tournament dataframe

In particular, this dataframe contains info on all top 8 players, along with the paths that they took to get to where they are in the tournament.

In [3]:
tournament_df = pd.read_pickle(data_path + 'top_8_tournament_previous_sets_and_results_with_winners_df')

# Filter down to tournaments which actually have valid top 8 data, and previous data on getting there.
tournament_df = tournament_df.loc[tournament_df[['LN_A_p1', 'LN_A_p2', 'LN_B_p1', 'LN_B_p2',
                                                 'WSF_A_p1', 'WSF_A_p2', 'WSF_B_p1', 'WSF_B_p2',
                                                 'LN_A_p1_non_top_8_sets', 'LN_A_p2_non_top_8_sets',
                                                 'LN_B_p1_non_top_8_sets', 'LN_B_p2_non_top_8_sets',
                                                 'WSF_A_p1_non_top_8_sets', 'WSF_A_p2_non_top_8_sets',
                                                 'WSF_B_p1_non_top_8_sets', 'WSF_B_p2_non_top_8_sets']].dropna().index]


# Very rarely (not sure where the problem is) you get something not actually in the single-set dataframe
# It is not actually that common though, so let's just delete those instances.
def references_valid_sets(prev_sets):
    for x in prev_sets:
        if x[0] not in dataset_df.index:
            return False
        
    return True

filter = tournament_df[['LN_A_p1_non_top_8_sets', 'LN_A_p2_non_top_8_sets',
                        'LN_B_p1_non_top_8_sets', 'LN_B_p2_non_top_8_sets',
                        'WSF_A_p1_non_top_8_sets', 'WSF_A_p2_non_top_8_sets',
                        'WSF_B_p1_non_top_8_sets', 'WSF_B_p2_non_top_8_sets']].map(references_valid_sets).all(axis=1)

tournament_df = tournament_df[filter]

# Likewise, some of these sets don't seem to have a valid winner
tournament_df = tournament_df[~tournament_df['winner_id'].isna()]

# A bit more cleanup, for sanity
min_date = datetime.datetime(2015,1,1)
max_date = datetime.datetime(2024,12,31)

tournament_df = tournament_df[(tournament_df['start'] >= min_date) &
                              (tournament_df['end'] >= min_date) &
                              (tournament_df['start'] <= max_date) &
                              (tournament_df['end'] <= max_date)]

# We will only be dealing with data from 2023 onwards, because the single-set predictor that we will be using
# was trained on data up to the end of 2022, and we don't want it leaking data.
# This will also speed up computations by not performing them on data we don't care about.
tournament_df = tournament_df[tournament_df['start'] >= datetime.datetime(2023,1,1)]

tournament_df.sort_values(by=['end', 'start'], inplace=True)

tournament_df[['winner_id', 'LN_A_p1', 'LN_A_p2', 'LN_B_p1', 'LN_B_p2', 'WSF_A_p1', 'WSF_A_p2', 'WSF_B_p1', 'WSF_B_p2']]

Unnamed: 0,winner_id,LN_A_p1,LN_A_p2,LN_B_p1,LN_B_p2,WSF_A_p1,WSF_A_p2,WSF_B_p1,WSF_B_p2
18944,374889,212438,262548,378028,1727716,1971084,374889,29873,21515
18951,2931588,1652036,2720652,2512785,3188220,2931588,1470721,3188222,2551763
18948,701767,2670568,1693244,2159124,2382715,2998349,579172,2207927,701767
18955,26574,1482652,264041,2326935,650242,3024047,26574,674084,557126
18950,631158,2559523,1240442,1077370,3013285,631158,1007980,3188449,2557927
...,...,...,...,...,...,...,...,...,...
39657,2788878,3596098,3540721,4110509,287537,2410418,2788878,2956817,3815971
39672,180292,2298994,519020,36152,4916,36285,180292,37001,25701
39598,19641,405463,1445708,246491,769516,19641,2249893,3822249,148391
39031,267849,533918,56918,55591,342875,893866,267849,216979,512704


## A bit more feature engineering

Here, we compute a "score" that shows how well a player is doing relative to their skill level and the skill level of their opponents in this tournament. If they are beating players well beyond their normal skill level, their score goes up. If they lose, it goes down.

More specifically, we compute the probability that they win, and add the negative log of that probability if they indeed win. Losses are similar, but based on the probability that they lose, and subtracted instead.

In [4]:
# Compute a score based on how likely it was that they actually made it to the top.
# A sort of sum of negative log probabilities, where wins are positive and losses are negative
def prev_set_score(prev_sets):
    result = 0

    for x in prev_sets:
        data = dataset_df.loc[x[0], ['winner', 'p1_win_prob']]
        outcome = x[1]

        # We don't know if this player is p1 or p2 in this list, but this can determine it without looking at player id
        # Compare if (player we are interested in wins) vs (did p1 win)
        if outcome == (data['winner'] == 1.0): # The player is p1
            if outcome: # player wins, as p1
                result += (-np.log(data['p1_win_prob']))
            else:       # player loses, as p1
                result -= (-np.log(1-data['p1_win_prob']))
        else:                                  # The player is p2
            if outcome: # player wins, as p2
                result += (-np.log(1-data['p1_win_prob']))
            else:       # player loses, as p2
                result -= (-np.log(data['p1_win_prob']))

    return result

# Example data (note that there is a consistent ELO throughout this entire dataset)
prev_sets = tournament_df.iloc[10000]['LN_A_p1_non_top_8_sets']
print(prev_sets)
print()
print(dataset_df.loc[[x[0] for x in prev_sets], ['p1_default_elo', 'p2_default_elo', 'winner', 'p1_win_prob']])
print()
print("Previous set score: ", prev_set_score(tournament_df.iloc[10000]['LN_A_p1_non_top_8_sets']))

[(1531552, True), (1531561, True), (1531566, False), (1531590, True)]

         p1_default_elo  p2_default_elo  winner  p1_win_prob
1531552     1744.647701     1068.270218     1.0     0.970317
1531561     1694.260131     1744.647701     0.0     0.388579
1531566     1744.647701     1500.000000     0.0     0.871613
1531590     1599.596283     1744.647701     0.0     0.135352

Previous set score:  -1.3851688378848603


Generally, we will keep track of all of the necessary data for each player in the top 8 by keeping track of what their starting position was.

In [5]:
top_8_pos = ['LN_A_p1', 'LN_A_p2', 'LN_B_p1', 'LN_B_p2', 'WSF_A_p1', 'WSF_A_p2', 'WSF_B_p1', 'WSF_B_p2']
top_8_prevs = [x + '_non_top_8_sets' for x in top_8_pos]
top_8_prevs_lengths = [x + "_len" for x in top_8_prevs] # These columns will just keep track of how many sets the player went through to get to the top 8
top_8_prevs_scores = [x + "_score" for x in top_8_prevs] # These will keep track of their "score" that shows how "well" they are performing relative to their predicted odds.

tournament_df[top_8_prevs_lengths] = tournament_df[top_8_prevs].map(lambda x: len(x)).to_numpy()
tournament_df[top_8_prevs_scores] = tournament_df[top_8_prevs].map(prev_set_score).to_numpy()
tournament_df[top_8_prevs + top_8_prevs_lengths + top_8_prevs_scores].head()

Unnamed: 0,LN_A_p1_non_top_8_sets,LN_A_p2_non_top_8_sets,LN_B_p1_non_top_8_sets,LN_B_p2_non_top_8_sets,WSF_A_p1_non_top_8_sets,WSF_A_p2_non_top_8_sets,WSF_B_p1_non_top_8_sets,WSF_B_p2_non_top_8_sets,LN_A_p1_non_top_8_sets_len,LN_A_p2_non_top_8_sets_len,...,WSF_B_p1_non_top_8_sets_len,WSF_B_p2_non_top_8_sets_len,LN_A_p1_non_top_8_sets_score,LN_A_p2_non_top_8_sets_score,LN_B_p1_non_top_8_sets_score,LN_B_p2_non_top_8_sets_score,WSF_A_p1_non_top_8_sets_score,WSF_A_p2_non_top_8_sets_score,WSF_B_p1_non_top_8_sets_score,WSF_B_p2_non_top_8_sets_score
18944,"[(1028338, True), (1028347, True), (1028351, F...","[(1028335, True), (1028345, True), (1028350, F...","[(1028337, True), (1028346, False), (1028361, ...","[(1028330, True), (1028341, True), (1028348, F...","[(1028342, True), (1028349, True)]","[(1028340, True), (1028348, True)]","[(1028344, True), (1028350, True)]","[(1028346, True), (1028351, True)]",4,4,...,2,2,0.348273,3.581404,3.483719,2.586742,0.356917,0.155761,0.487661,0.352467
18951,"[(1028523, False), (1028528, True)]","[(1028522, False)]","[(1028521, False)]","[(1028519, True), (1028520, False)]","[(1028520, True)]","[(1028521, True)]","[(1028522, True)]","[(1028523, True)]",2,1,...,1,1,0.670509,-0.701147,-0.68033,0.592403,0.227003,0.68033,0.701147,0.316062
18948,"[(1028465, True), (1028469, False), (1028478, ...","[(1028463, True), (1028468, False), (1028479, ...","[(1028459, True), (1028466, False), (1028477, ...","[(1028461, True), (1028467, False), (1028476, ...","[(1028460, True), (1028467, True)]","[(1028458, True), (1028466, True)]","[(1028464, True), (1028469, True)]","[(1028462, True), (1028468, True)]",3,3,...,2,2,0.420448,0.89846,1.131986,-0.424911,0.785745,0.172685,0.21494,0.13257
18955,"[(1028610, True), (1028614, False), (1028622, ...","[(1028608, True), (1028613, False), (1028623, ...","[(1028606, True), (1028612, False), (1028624, ...","[(1028605, True), (1028611, False), (1028625, ...","[(1028607, True), (1028612, True)]","[(1028611, True)]","[(1028609, True), (1028614, True)]","[(1028613, True)]",3,3,...,2,1,1.194331,-0.036581,-0.60635,1.206678,0.893398,0.069473,0.323127,0.247039
18950,"[(1028500, True), (1028504, False)]","[(1028501, True), (1028505, False), (1028510, ...","[(1028500, False), (1028511, True)]","[(1028501, False), (1028512, True)]","[(1028502, True)]","[(1028503, True)]","[(1028504, True)]","[(1028505, True)]",2,3,...,1,1,-0.934742,-0.438122,-0.14944,-0.249527,0.057306,0.092795,1.442649,1.039704


In [6]:
# Create a column that contains the top 8 starting position of the winner
# (or more specifically, the numeric index in top_8_pos)

tournament_df['winner_index'] = 8 # Dummy value, has to be 0-7

for i,position in enumerate(top_8_pos):
    found_filter = (tournament_df['winner_id'] == tournament_df[top_8_pos[i]])
    tournament_df.loc[found_filter, 'winner_index'] = i

(tournament_df['winner_index'] == 8).sum() # Should be zero (everything found)

0

Start pulling in data (like ELO) for each of the top 8 players in the tournaments.

In [7]:
# We will need to have all of the features we've engineered for each of the players that made it to the top 8
# We can be clever and pull most of them (elo-based features) from the previous sets in the dataframe
# The rest (player vs player stats, also called "matchup", but might be renamed) have to be pulled in manually

features_elo = ['p1_default_elo', 'p2_default_elo', 'p1_default_rd', 'p2_default_rd',
       'p1_default_updates', 'p2_default_updates', 'p1_m1_usage', 'p2_m1_usage',
       'p1/m1/m1_alt2_elo', 'p1/m1/m1_alt2_rd', 'p1/m1/m1_alt2_updates',
       'p2/m1/m1_alt2_elo', 'p2/m1/m1_alt2_rd', 'p2/m1/m1_alt2_updates',
       'p1/m1_alt3_elo', 'p1/m1_alt3_rd', 'p1/m1_alt3_updates',
       'p2/m1_alt3_elo', 'p2/m1_alt3_rd', 'p2/m1_alt3_updates']

features_matchup = ['matchup_1', 'matchup_2', 'matchup_3', 'matchup_4', 'matchup_5',
                    'matchup_6', 'matchup_7', 'matchup_8', 'matchup_9', 'matchup_10']

In [8]:
# In order to avoid an organizational nightmare, each of the following:
#     'LN_A_p1', 'LN_A_p2', 'LN_B_p1', 'LN_B_p2', 'WSF_A_p1', 'WSF_A_p2', 'WSF_B_p1', 'WSF_B_p2'
# will have their data stored in separate dataframes, each held in the following dictionary

top_8_stats = {}

def pull_data_from_set(loc, outcome):
    set_data = dataset_df.loc[loc]
    player_num = 'p1' if outcome == (set_data['winner'] == 1.0) else 'p2' # Sneaky way of getting the player number

    features_to_pull = [x for x in features_elo if player_num in x]
    pulled_data = set_data[features_to_pull].copy()
    pulled_data.index = [x.replace(player_num, '') for x in pulled_data.index] # We will add player numbers on an as-needed basis later

    return pulled_data

for top_8_position in tqdm(top_8_pos):
    # First, pull in player 1 data from a previous match.
    # Note that the player might NOT be player 1 in the match that we are pulling from
    top_8_stats[top_8_position] = tournament_df[top_8_position + '_non_top_8_sets'].apply(lambda x: pull_data_from_set(x[0][0], x[0][1]))

  0%|          | 0/8 [00:00<?, ?it/s]

The most important information about the players in the top 8 is essentially how likely each player is to win against the other player. We can use our single-set predictor to estimate this. We can also make use of the information on how well each player has been doing in the tournament relative to their skill level/opponent skill levels, possibly in order to "adjust" the aforementioned probabilities. Or just to toss it as extra information in whatever ML model.

In [9]:
# Removing more than half the data when we look up the matchup data makes it faster by roughly 10x
smaller_df = dataset_df[dataset_df['matchup_1']!=.5].sort_values('end').copy()
smaller_df.shape[0]

600206

Get rid any sets that do not have a pair of players who have appeared in the same top 8 of some tournament.

In [10]:
import pandas as pd
import itertools

# Assuming 'tournament_df' and 'smaller_df' are your DataFrames

# Define the columns that contain the top 8 players in each tournament
top_8_columns = [
    'LN_A_p1', 'LN_A_p2', 'LN_B_p1', 'LN_B_p2',
    'WSF_A_p1', 'WSF_A_p2', 'WSF_B_p1', 'WSF_B_p2'
]

# Initialize a list to store all unique pairs of players who appeared in the same top 8
pairs_list = []

# Iterate over each tournament to generate all possible pairs of top 8 players
for index, row in tqdm(tournament_df.iterrows()):
    # Extract the top 8 players for the current tournament
    top_8_players = row[top_8_columns].dropna().unique()
    
    # Generate all unordered pairs of top 8 players
    for p1, p2 in itertools.combinations(top_8_players, 2):
        min_id, max_id = sorted([p1, p2])
        pairs_list.append({'min_id': min_id, 'max_id': max_id})

# Create a DataFrame of all relevant player pairs and remove duplicates
relevant_pairs_df = pd.DataFrame(pairs_list).drop_duplicates()

# Prepare 'smaller_df' by adding 'min_id' and 'max_id' columns for efficient merging
smaller_df['min_id'] = smaller_df[['p1_id', 'p2_id']].min(axis=1)
smaller_df['max_id'] = smaller_df[['p1_id', 'p2_id']].max(axis=1)

# Merge 'smaller_df' with 'relevant_pairs_df' to filter relevant sets
filtered_smaller_df = pd.merge(
    smaller_df,
    relevant_pairs_df,
    on=['min_id', 'max_id'],
    how='inner'
)

# Drop the 'min_id' and 'max_id' columns if they are no longer needed
filtered_smaller_df = filtered_smaller_df.drop(columns=['min_id', 'max_id'])

# The 'filtered_smaller_df' now contains only the relevant sets
print(filtered_smaller_df.shape[0])  # This prints the number of relevant sets


0it [00:00, ?it/s]

362291


This is the slower function.

In [11]:
def get_matchup(p1_id, p2_id, start, dataset_df=filtered_smaller_df):
    """Gets the most recent matchup data from before the start of the tournament.
        This is pretty slow and will likely slow down this cell a lot.
    Args:
        p1_id: The ID of player 1.
        p2_id: The ID of player 2.
        start: The start of the tournament.
        dataset_df: Use only the data that have values other than .5 in matchup_1

    Returns:
        A pd.Series containing the matchup data or default values if no prior sets exist.
    """
    # Ensure the dataset is sorted by 'end'
    assert dataset_df['end'].is_monotonic_increasing, "Dataset must be sorted by 'end'."

    # Use NumPy for faster filtering
    p1_mask = (dataset_df['p1_id'] == p1_id) & (dataset_df['p2_id'] == p2_id)
    p2_mask = (dataset_df['p1_id'] == p2_id) & (dataset_df['p2_id'] == p1_id)
    mask = (p1_mask | p2_mask).to_numpy()

    # Find indices where mask is True
    valid_indices = np.where(mask & (dataset_df['end'].to_numpy() < start))[0]

    if len(valid_indices) == 0:
        # No prior matches, return default values
        return pd.Series(0.5, index=[f'matchup_{n}' for n in range(1, 11)])

    # Get the last valid index
    last_index = valid_indices[-1]
    last_row = dataset_df.iloc[last_index]

    # Define matchup columns
    matchup_cols = [f'matchup_{n}' for n in range(1, 11)]

    # Determine if we need to swap values
    if p1_id == last_row['p1_id'] and p2_id == last_row['p2_id']:
        return last_row[matchup_cols]
    elif p1_id == last_row['p2_id'] and p2_id == last_row['p1_id']:
        return 1 - last_row[matchup_cols]
    else:
        # Should not happen but acts as a fallback
        print("Matchup Data Failed")
        return pd.Series(0.5, index=[f'matchup_{n}' for n in range(1, 11)])
    
%timeit get_matchup(dataset_df.iloc[1_200_000]['p1_id'], dataset_df.iloc[1_200_000]['p2_id'], dataset_df.iloc[1_200_000]['start'])

72.6 ms ± 149 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


This cell has the optimized function.

In [12]:
# Assume 'filtered_smaller_df' is your dataset_df
dataset_df_2 = filtered_smaller_df.copy()

# Create 'min_id' and 'max_id' columns to represent unordered player pairs
dataset_df_2['min_id'] = dataset_df_2[['p1_id', 'p2_id']].min(axis=1)
dataset_df_2['max_id'] = dataset_df_2[['p1_id', 'p2_id']].max(axis=1)

# Create a 'pair_key' column as a tuple of (min_id, max_id)
dataset_df_2['pair_key'] = list(zip(dataset_df_2['min_id'], dataset_df_2['max_id']))

# Sort the dataset by 'end' to ensure matchups are in chronological order
dataset_df_2 = dataset_df_2.sort_values('end').reset_index(drop=True)

# Group the dataset by 'pair_key' for efficient lookups
grouped_pairs = dataset_df_2.groupby('pair_key', sort=False)

def get_matchup(p1_id, p2_id, start, grouped_pairs=grouped_pairs):
    """Gets the most recent matchup data from before the start of the tournament.
    
    Args:
        p1_id: The ID of player 1.
        p2_id: The ID of player 2.
        start: The start of the tournament.
        grouped_pairs: The preprocessed grouped DataFrame by player pairs.
        
    Returns:
        A pd.Series containing the matchup data or default values if no prior sets exist.
    """
    # Create the pair key
    min_id, max_id = min(p1_id, p2_id), max(p1_id, p2_id)
    pair_key = (min_id, max_id)
    
    # Check if the pair exists in the grouped data
    if pair_key not in grouped_pairs.groups:
        # No prior matches, return default values
        return pd.Series(0.5, index=[f'matchup_{n}' for n in range(1, 11)])
    
    # Get the group DataFrame for the pair
    group_df = grouped_pairs.get_group(pair_key)
    
    # Filter matches that occurred before the 'start' time
    prior_matches = group_df[group_df['end'] < start]
    
    if prior_matches.empty:
        # No prior matches before 'start', return default values
        return pd.Series(0.5, index=[f'matchup_{n}' for n in range(1, 11)])
    
    # Get the last match (most recent before 'start')
    last_row = prior_matches.iloc[-1]
    
    # Define matchup columns
    matchup_cols = [f'matchup_{n}' for n in range(1, 11)]
    
    # Determine if we need to swap the matchup data
    if (p1_id == last_row['p1_id']) and (p2_id == last_row['p2_id']):
        return last_row[matchup_cols]
    elif (p1_id == last_row['p2_id']) and (p2_id == last_row['p1_id']):
        return 1 - last_row[matchup_cols]
        # This case should not occur but acts as a fallback
        print("Matchup Data Failed")
        return pd.Series(0.5, index=matchup_cols)

%timeit get_matchup(dataset_df.iloc[1_200_000]['p1_id'], dataset_df.iloc[1_200_000]['p2_id'], dataset_df.iloc[1_200_000]['start'], grouped_pairs=grouped_pairs)

571 μs ± 134 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:

 


# First, let's compute pairwise probabilities of one player in the top 8 winning against another player
def compute_pairwise_prob(row):
    # Row represents p1, column represents p2 (or specifically, the index in top_8_pos). Always follows this order:
    players=['LN_A_p1', 'LN_A_p2', 'LN_B_p1', 'LN_B_p2', 'WSF_A_p1', 'WSF_A_p2', 'WSF_B_p1', 'WSF_B_p2']
    pairwise_probs = np.zeros(shape=(8,8))
    # pairwise_probs = np.full((8, 8), 0.5)
    
    # For convenience, put everything into one dataframe and then run the single set model
    # This is probably more efficient than doing things line by line
    combination_stats = []

    for r in range(0,8):
        for c in range(0,8):
            p1_data = top_8_stats[top_8_pos[r]].loc[row.name]
            p1_data.index = ['p1' + x for x in p1_data.index]

            p2_data = top_8_stats[top_8_pos[c]].loc[row.name]
            p2_data.index = ['p2' + x for x in p2_data.index]

            #TODO: Actually populate this with proper data!
            #      This is currently only just placeholder data,
            #      indicating that the players have never played together before (0.5).
            matchup_data = get_matchup(row[players[r]], row[players[c]], row['start'], grouped_pairs=grouped_pairs)

            total_data = pd.concat([p1_data, p2_data, matchup_data])
            total_data = total_data[features_all_everything] # Entries need to be in the correct order

            combination_stats.append(total_data)

    combination_stats = pd.DataFrame(combination_stats)

    y_prob = single_set_model.predict_proba(combination_stats)

    # Now actually populate this probability matrix with data.
    # Note that we can just use the same nested loop and read off the entries of the 1D probability array one at a time.
    # This will put things in the correct order.
    i = 0
    for r in range(0,8):
        for c in range(0,8):
            pairwise_probs[r,c] = y_prob[i,1]
            i += 1

    # NOTE: Some models that we train are highly non-symmetric, even though they very much should be,
    #       given that we have randomized the players. We can fix that issue here.
    pairwise_probs = 0.5 * (pairwise_probs + (1 - pairwise_probs.T))

    return pairwise_probs.flatten()

pairwise_prob = tournament_df.progress_apply(compute_pairwise_prob, axis=1)
pairwise_prob = np.stack(pairwise_prob.to_numpy()) # Fixes "single column of np arrays" nonsense

# Add that data as columns in the original dataframe
pairwise_prob_cols = []
for r in range(0,8):
    for c in range(0,8):
        pairwise_prob_cols.append("pairprob/" + top_8_pos[r] + "/" + top_8_pos[c])

tournament_df[pairwise_prob_cols] = pairwise_prob

tournament_df[pairwise_prob_cols]

  0%|          | 0/15293 [00:00<?, ?it/s]

Unnamed: 0,pairprob/LN_A_p1/LN_A_p1,pairprob/LN_A_p1/LN_A_p2,pairprob/LN_A_p1/LN_B_p1,pairprob/LN_A_p1/LN_B_p2,pairprob/LN_A_p1/WSF_A_p1,pairprob/LN_A_p1/WSF_A_p2,pairprob/LN_A_p1/WSF_B_p1,pairprob/LN_A_p1/WSF_B_p2,pairprob/LN_A_p2/LN_A_p1,pairprob/LN_A_p2/LN_A_p2,...,pairprob/WSF_B_p1/WSF_B_p1,pairprob/WSF_B_p1/WSF_B_p2,pairprob/WSF_B_p2/LN_A_p1,pairprob/WSF_B_p2/LN_A_p2,pairprob/WSF_B_p2/LN_B_p1,pairprob/WSF_B_p2/LN_B_p2,pairprob/WSF_B_p2/WSF_A_p1,pairprob/WSF_B_p2/WSF_A_p2,pairprob/WSF_B_p2/WSF_B_p1,pairprob/WSF_B_p2/WSF_B_p2
18944,0.5,0.890509,0.852432,0.872104,0.370693,0.070255,0.446105,0.197773,0.109491,0.5,...,0.5,0.294398,0.802227,0.886035,0.868897,0.881124,0.718817,0.127591,0.705602,0.5
18951,0.5,0.341775,0.355264,0.413097,0.141854,0.176179,0.413097,0.268372,0.658225,0.5,...,0.5,0.347865,0.731628,0.634808,0.610338,0.652135,0.294078,0.586523,0.652135,0.5
18948,0.5,0.357299,0.420535,0.282338,0.328865,0.097825,0.141711,0.082159,0.642701,0.5,...,0.5,0.149739,0.917841,0.935993,0.917265,0.920165,0.913170,0.669152,0.850261,0.5
18955,0.5,0.494464,0.514107,0.458946,0.500059,0.060690,0.167319,0.200874,0.505536,0.5,...,0.5,0.467538,0.799126,0.784415,0.774007,0.796112,0.745964,0.109419,0.532462,0.5
18950,0.5,0.229262,0.574498,0.417159,0.068485,0.151151,0.756258,0.414889,0.770738,0.5,...,0.5,0.293731,0.585111,0.340115,0.610354,0.462603,0.043111,0.214632,0.706269,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39657,0.5,0.243661,0.221014,0.338694,0.194618,0.235543,0.232614,0.339257,0.756339,0.5,...,0.5,0.684905,0.660743,0.312041,0.335139,0.456473,0.266067,0.312205,0.315095,0.5
39672,0.5,0.380903,0.398434,0.174738,0.179369,0.054565,0.041974,0.054029,0.619097,0.5,...,0.5,0.615689,0.945971,0.940585,0.945976,0.741612,0.866365,0.302917,0.384311,0.5
39598,0.5,0.741958,0.147051,0.901396,0.057887,0.452737,0.169667,0.127674,0.258042,0.5,...,0.5,0.407935,0.872326,0.934296,0.461910,0.925452,0.084816,0.826207,0.592065,0.5
39031,0.5,0.134118,0.358433,0.743080,0.093580,0.050296,0.067764,0.133463,0.865882,0.5,...,0.5,0.675258,0.866537,0.654290,0.741645,0.936694,0.647324,0.090804,0.324742,0.5


In [14]:
# Add that data as columns in the original dataframe

# Note that we only really need the strictly lower triangular part of the probability matrix
# Everything else is redundant (the diagonal is 0.5, upper triangular part is 1 - lower triangular)
# The entire matrix is kept in there just for easy reading.

pairwise_prob_cols = []
pairwise_prob_cols_reduced = []

for r in range(0,8):
    for c in range(0,8):
        col_name = "pairprob/" + top_8_pos[r] + "/" + top_8_pos[c]

        pairwise_prob_cols.append(col_name)

        if r > c:
            pairwise_prob_cols_reduced.append(col_name)

tournament_df[pairwise_prob_cols] = pairwise_prob
tournament_df[pairwise_prob_cols_reduced]

Unnamed: 0,pairprob/LN_A_p2/LN_A_p1,pairprob/LN_B_p1/LN_A_p1,pairprob/LN_B_p1/LN_A_p2,pairprob/LN_B_p2/LN_A_p1,pairprob/LN_B_p2/LN_A_p2,pairprob/LN_B_p2/LN_B_p1,pairprob/WSF_A_p1/LN_A_p1,pairprob/WSF_A_p1/LN_A_p2,pairprob/WSF_A_p1/LN_B_p1,pairprob/WSF_A_p1/LN_B_p2,...,pairprob/WSF_B_p1/LN_B_p2,pairprob/WSF_B_p1/WSF_A_p1,pairprob/WSF_B_p1/WSF_A_p2,pairprob/WSF_B_p2/LN_A_p1,pairprob/WSF_B_p2/LN_A_p2,pairprob/WSF_B_p2/LN_B_p1,pairprob/WSF_B_p2/LN_B_p2,pairprob/WSF_B_p2/WSF_A_p1,pairprob/WSF_B_p2/WSF_A_p2,pairprob/WSF_B_p2/WSF_B_p1
18944,0.109491,0.147568,0.560340,0.127896,0.778422,0.733066,0.629307,0.858608,0.836600,0.858480,...,0.783884,0.418945,0.086071,0.802227,0.886035,0.868897,0.881124,0.718817,0.127591,0.705602
18951,0.658225,0.644736,0.527030,0.586903,0.488552,0.455060,0.858146,0.810027,0.780279,0.791784,...,0.500000,0.208216,0.451674,0.731628,0.634808,0.610338,0.652135,0.294078,0.586523,0.652135
18948,0.642701,0.579465,0.421759,0.717662,0.580664,0.640270,0.671135,0.606114,0.607849,0.472069,...,0.790583,0.774104,0.331855,0.917841,0.935993,0.917265,0.920165,0.913170,0.669152,0.850261
18955,0.505536,0.485893,0.541610,0.541054,0.534828,0.495115,0.499941,0.528534,0.482402,0.504324,...,0.782730,0.744929,0.093361,0.799126,0.784415,0.774007,0.796112,0.745964,0.109419,0.532462
18950,0.770738,0.425502,0.268448,0.582841,0.366259,0.648973,0.931515,0.937490,0.938933,0.938377,...,0.283199,0.087591,0.197225,0.585111,0.340115,0.610354,0.462603,0.043111,0.214632,0.706269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39657,0.756339,0.778986,0.504637,0.661306,0.367962,0.364845,0.805382,0.627646,0.584842,0.698226,...,0.637497,0.345618,0.470329,0.660743,0.312041,0.335139,0.456473,0.266067,0.312205,0.315095
39672,0.619097,0.601566,0.244213,0.825262,0.752130,0.852503,0.820631,0.722906,0.830637,0.448308,...,0.873133,0.913572,0.494360,0.945971,0.940585,0.945976,0.741612,0.866365,0.302917,0.384311
39598,0.258042,0.852949,0.899345,0.098604,0.153377,0.089267,0.942113,0.938737,0.890623,0.948434,...,0.918402,0.070573,0.639354,0.872326,0.934296,0.461910,0.925452,0.084816,0.826207,0.592065
39031,0.865882,0.641567,0.309669,0.256920,0.091599,0.069609,0.906420,0.656821,0.756968,0.922970,...,0.953487,0.552122,0.096877,0.866537,0.654290,0.741645,0.936694,0.647324,0.090804,0.324742


Finally, we can (disclaimer: slightly inaccurately) compute all possible paths throughout the top 8, and the probability of winning along each path.

In [15]:
# A slighltly incorrectly implemented algorithm that goes through all possible paths throughout the top 8,
# and computes the corresponding probabilities of each player making it through.
def compute_path_prob(row):
    pairwise_probs = row[pairwise_prob_cols].to_numpy().astype(float).reshape((8,8))
    pairwise_probs_zero_diagonal = pairwise_probs - 0.5 * np.identity(8) # Used for janky computations

    # Now start building the tree structure of how the tournament can play out.
    # Each "cell" will represent some set in the tournament played by some p1 and p2.
    # The cell will have to keep track of all of the probabilities of each player making it to that point.
    #
    # Links should have the form (cell, 'winner') or (cell, 'loser'),
    # describing if it is the winner or the loser of the previous that gets to this one
    class cell:
        def __init__(self, p1=None, p2=None, p1_link=None, p2_link=None):
            if p1==None:
                self.p1_probs = None
            else:
                self.p1_probs = np.zeros(8)
                self.p1_probs[p1] = 1.0

            if p2==None:
                self.p2_probs = None
            else:
                self.p2_probs = np.zeros(8)
                self.p2_probs[p2] = 1.0

            # Links to previous cells
            self.p1_link = p1_link
            self.p2_link = p2_link

            # Used for a (hopefully) temporary patch on the fact that these computations are not entirely accurate
            self.pairwise_probs_zero_diagonal = pairwise_probs - 0.5 * np.identity(8)

        # Get the probabilities from the previous cell.
        # Should not be called if there are no links to previous cells.
        def fetch_probs(self):
            self.p1_probs = self.p1_link[0].compute_winner_probs() if self.p1_link[1] == 'winner' else self.p1_link[0].compute_loser_probs()
            self.p2_probs = self.p2_link[0].compute_winner_probs() if self.p2_link[1] == 'winner' else self.p2_link[0].compute_loser_probs()
        
        # Probability of making it to this cell, and then proceeding to win
        def compute_winner_probs(self):
            if self.p1_probs is None or self.p2_probs is None:
                self.fetch_probs()

            probs = np.zeros(8)

            # Old code, far less efficient. Might make the numpy operations make sense though.
            '''
            for p1 in range(0,8):
                # Save a result for p1.
                # It will be the sum over all p2 of
                # (probability that p1 got there) * (probability that p2 got there) * (probability p1 beats p2)
                for p2 in range(0,8):
                    probs[p1] += self.p1_probs[p1] * self.p2_probs[p2] * pairwise_probs[p1, p2]
                    probs[p2] += self.p1_probs[p1] * self.p2_probs[p2] * (1.0 - pairwise_probs[p1, p2])
            '''
            # Just remember that 1-pairwise_probs is the transpose of pairwise_probs, by symmetry
            #
            # TODO: I just realized that the probability of a certain player becoming p1 and another becoming p2 are NOT independent.
            #       In particular, these probabilities become correlated when you could potentially have the same player as p1 or p2.
            #       This is a bit of a janky patch that hopefully gives accurate enough probabilities, but we should come up with a proper fix.       
            probs += self.p1_probs * (pairwise_probs_zero_diagonal @ self.p2_probs) # Probability that (specific p1) wins
            probs += self.p2_probs * (pairwise_probs_zero_diagonal @ self.p1_probs) # Same but p2

            probs /= probs.sum() # Purely due to zeroing out the diagonal of pairwise_probs

            return probs

        # Probability of making it to this cell, and then proceeding to lose
        def compute_loser_probs(self):
            if self.p1_probs is None or self.p2_probs is None:
                self.fetch_probs()

            probs = np.zeros(8)

            '''
            for p1 in range(0,8):
                # Same, except use probability of p1 losing
                for p2 in range(0,8):
                    probs[p1] += self.p1_probs[p1] * self.p2_probs[p2] * (1.0 - pairwise_probs[p1, p2])
                    probs[p2] += self.p1_probs[p1] * self.p2_probs[p2] * pairwise_probs[p1, p2]
            '''
            # TODO: Same janky patch as in winners case here.
            probs += self.p1_probs * (pairwise_probs_zero_diagonal.T @ self.p2_probs)
            probs += self.p2_probs * (pairwise_probs_zero_diagonal.T @ self.p1_probs)

            probs /= probs.sum()
             
            return probs
        
    # 'LN_A_p1', 'LN_A_p2', 'LN_B_p1', 'LN_B_p2', 'WSF_A_p1', 'WSF_A_p2', 'WSF_B_p1', 'WSF_B_p2'
    WSFA = cell(p1=4, p2=5)
    WSFB = cell(p1=6, p2=7)
    LNA  = cell(p1=0, p2=1)
    LNB  = cell(p1=2, p2=3)

    WF = cell(p1_link=(WSFA, 'winner'), p2_link=(WSFB, 'winner'))

    LQFA = cell(p1_link=(WSFA, 'loser'), p2_link=(LNA, 'winner'))
    LQFB = cell(p1_link=(WSFB, 'loser'), p2_link=(LNB, 'winner'))

    LSF = cell(p1_link=(LQFA, 'winner'), p2_link=(LQFB, 'winner'))

    LF = cell(p1_link=(WF, 'loser'), p2_link=(LSF, 'winner'))

    GF = cell(p1_link=(WF, 'winner'), p2_link=(LF, 'winner'))

    # From the Grand Final onwards, some special cases are required, due to how the Grand Final Reset works
    GF.fetch_probs()

    # TODO: Again, same janky fix as before, "removing" correlation between p1 and p2
    win_as_p1_probs = GF.p1_probs * (pairwise_probs_zero_diagonal @ GF.p2_probs) # direct win as p1 (WF winner)
    win_as_p1_probs += GF.p1_probs * ((pairwise_probs_zero_diagonal.T * pairwise_probs_zero_diagonal) @ GF.p2_probs) # p2 win, then p1 win in GFR

    win_as_p2_probs = GF.p2_probs * ((pairwise_probs_zero_diagonal ** 2) @ GF.p1_probs) # win by 2 required for LF winner

    probs = win_as_p1_probs + win_as_p2_probs
    probs /= probs.sum() # Again due to that janky fix

    return probs

## Benchmarking and baselines

Here, we start comparing the performance of a few "obvious" models, such as just choosing the person with the highest ELO out of the top 8 (or winners' side of the top 8), or simulating all possible paths throughout the top 8 and computing the probability of winning as a result.

In [16]:
features = []
features += pairwise_prob_cols_reduced
features += top_8_prevs_lengths + top_8_prevs_scores

In [17]:
# Baseline of "who has the higher elo"

def pull_elo_from_set(loc, outcome):
    set_data = dataset_df.loc[loc]
    player_num = 'p1' if outcome == (set_data['winner'] == 1.0) else 'p2' # Sneaky way of getting the player number

    feature_to_pull = player_num + '_default_elo'
    pulled_data = set_data[feature_to_pull]

    return pulled_data

# First, pull in player 1 data from a previous match.
# Note that the player might NOT be player 1 in the match that we are pulling from
tournament_df[[x + '_elo' for x in top_8_pos]] = tournament_df[[x + '_non_top_8_sets' for x in top_8_pos]].map(lambda x: pull_elo_from_set(x[0][0], x[0][1])).to_numpy()

tournament_df['elo_prediction'] = tournament_df[[x + '_elo' for x in top_8_pos]].idxmax(axis=1).apply(lambda x: x.replace('_elo', ''))
tournament_df['elo_prediction'] = tournament_df['elo_prediction'].apply(lambda x: top_8_pos.index(x))

tournament_df['elo_WSF_prediction'] = tournament_df[[x + '_elo' for x in top_8_pos if "WSF" in x]].idxmax(axis=1).apply(lambda x: x.replace('_elo', ''))
tournament_df['elo_WSF_prediction'] = tournament_df['elo_WSF_prediction'].apply(lambda x: top_8_pos.index(x))

In [18]:
# Predictor of "who has the highest path probability, taking into account all possible paths on how the top 8 will play out"

result = tournament_df.apply(compute_path_prob, axis=1)
tournament_df = pd.concat([tournament_df, pd.DataFrame(np.stack(result.to_numpy()), index=result.index, columns=[x + '_winprob' for x in top_8_pos])], axis=1)

tournament_df['path_prediction'] = tournament_df[[x + '_winprob' for x in top_8_pos]].idxmax(axis=1).apply(lambda x: x.replace('_winprob', ''))
tournament_df['path_prediction'] = tournament_df['path_prediction'].apply(lambda x: top_8_pos.index(x))

In [19]:
train_df = tournament_df[(tournament_df['start'] >= datetime.datetime(2023,1,1)) & (tournament_df['end'] <= datetime.datetime(2023,12,31))].copy()
test_df  = tournament_df[(tournament_df['start'] >= datetime.datetime(2024,1,1)) & (tournament_df['end'] <= datetime.datetime(2024,12,31))].copy()

In [20]:
train_df

Unnamed: 0,game,key,cleaned_name,source,tournament_name,tournament_event,season,rank,start,end,...,elo_WSF_prediction,LN_A_p1_winprob,LN_A_p2_winprob,LN_B_p1_winprob,LN_B_p2_winprob,WSF_A_p1_winprob,WSF_A_p2_winprob,WSF_B_p1_winprob,WSF_B_p2_winprob,path_prediction
18944,melee,kyojin-dojo__melee-singles,Kyojin Dojo,gg,kyojin-dojo,melee-singles,22,,2023-01-01 11:00:00,2023-01-01 19:55:00,...,5,0.001199,3.573428e-06,1.565439e-05,0.000029,0.024960,0.860298,0.022492,0.091003,5
18951,melee,smash-scott-s-14__melee-singles,Smash @ Scott's #14,gg,smash-scott-s-14,melee-singles,23,,2023-01-01 20:00:00,2023-01-01 23:00:00,...,4,0.000413,6.159430e-03,8.491993e-03,0.004781,0.609582,0.090675,0.072382,0.207516,4
18948,melee,fight-of-the-fearless-33__fight-of-the-fearles...,Fight of the Fearless 33,gg,fight-of-the-fearless-33,fight-of-the-fearless-single,23,,2023-01-01 18:00:00,2023-01-01 23:59:00,...,7,0.000038,1.443519e-04,6.080430e-05,0.000386,0.006876,0.272241,0.052058,0.668195,7
18955,melee,kalvar-cup-73-new-year-new-cup__melee-singles,kalvar Cup #73 New year New Cup,gg,kalvar-cup-73-new-year-new-cup,melee-singles,23,,2023-01-02 00:00:00,2023-01-02 03:00:00,...,5,0.000242,3.515723e-04,3.821340e-04,0.000260,0.010260,0.877779,0.049333,0.061392,5
18950,melee,the-oven-no-29-ultimate-melee-weekly__smash-me...,The Oven No. 29 | Ultimate/Melee Weekly!,gg,the-oven-no-29-ultimate-melee-weekly,smash-melee-singles-bracket,23,,2023-01-01 19:00:00,2023-01-02 04:30:00,...,4,0.000278,4.895094e-03,3.953563e-04,0.002500,0.913414,0.046387,0.012365,0.019766,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32849,melee,melee-at-night-175__singles,Melee at Night #175,gg,melee-at-night-175,singles,23,,2023-12-30 05:00:00,2023-12-30 08:00:00,...,5,0.001771,1.002784e-02,1.270200e-03,0.000028,0.010421,0.930893,0.004782,0.040807,5
32850,melee,smol-end-of-year-edition__melee-singles,SMOL: End of Year Edition,gg,smol-end-of-year-edition,melee-singles,23,,2023-12-30 07:00:00,2023-12-30 10:59:00,...,6,0.000712,1.311718e-03,6.772164e-03,0.279017,0.033600,0.053238,0.596631,0.028718,6
32848,melee,your-12th-restock__melee-singles,Your 12th Restock!,gg,your-12th-restock,melee-singles,23,,2023-12-30 05:00:00,2023-12-30 11:00:00,...,5,0.000028,2.389435e-05,2.039743e-05,0.000012,0.287602,0.221832,0.097256,0.393226,7
32851,melee,super-smash-bxl-winter-chill-editions-1__dorfball,Super Smash BXL: Winter Chill Editions 1,gg,super-smash-bxl-winter-chill-editions-1,dorfball,23,,2023-12-29 23:00:00,2023-12-30 22:00:00,...,5,0.000013,4.081673e-03,2.942790e-05,0.000179,0.006610,0.797727,0.003410,0.187950,5


In [21]:
print("Train set baselines:")
print("Highest ELO out of top 8, accuracy:               ", round(100.0 * (train_df['winner_index'] == train_df['elo_prediction']).astype(float).mean(), 1))
print("Highest ELO out of WSF, accuracy:                 ", round(100.0 * (train_df['winner_index'] == train_df['elo_WSF_prediction']).astype(float).mean(), 1))
print("Computing all ways top 8 can play out, accuracy:  ", round(100.0 * (train_df['winner_index'] == train_df['path_prediction']).astype(float).mean(), 1))

Train set baselines:
Highest ELO out of top 8, accuracy:                66.3
Highest ELO out of WSF, accuracy:                  68.5
Computing all ways top 8 can play out, accuracy:   69.3


## A more advanced model and hyperparameter tuning

Just because choosing the highest ELO seems to be such a powerful predictor to begin with, we will make sure to add those to the list of features that we will use. We won't add any of the other old engineered features, because the model already has a bunch of new "fancier" engineered features, and too many might make it perform worse.

**NOTE:** Hyperparameter tuning trials was reduced down to a very low number, so that this entire notebook could be run in a reasonable amount of time. Optimal parameters from a proper number of trials have already been found and are already provided after the hyperparameter tuning block.

In [22]:
features += [x + '_elo' for x in top_8_pos]

In [23]:
# Some features should NOT be scaled, like probabilities.
# Let's just manually scale ELO (note that we will be using regularization)
if tournament_df[[x + '_elo' for x in top_8_pos][0]].mean() >= 400.0: # Prevents this from accidentally being run twice.
    print("Scaling ELO!")
    tournament_df[[x + '_elo' for x in top_8_pos]] /= 1500.0

Scaling ELO!


In [None]:
# Perform hyperparameter tuning on XGBoost
import xgboost as xgb
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
import optuna

def objective(trial):
    max_depth        = trial.suggest_int("max_depth", 2, 15, step=1)
    learning_rate    = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
    n_estimators     = trial.suggest_int("n_estimators", 50, 1000, step=25)
    subsample        = trial.suggest_float("subsample", 0.6, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.6, 1.0)
    gamma            = trial.suggest_float("gamma", 0.0, 5.0)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 10)
    reg_lambda       = trial.suggest_float("lambda", 1e-3, 10.0, log=True)
    reg_alpha        = trial.suggest_float("alpha", 1e-3, 10.0, log=True)

    model = xgb.XGBClassifier(max_depth=max_depth,
                              learning_rate=learning_rate,
                              n_estimators=n_estimators,
                              subsample=subsample,
                              colsample_bytree=colsample_bytree,
                              gamma=gamma,
                              min_child_weight=min_child_weight,
                              reg_lambda=reg_lambda,
                              reg_alpha=reg_alpha)
    
    # The percentage of people that win the tournament when starting from losers side is very small, but not zero
    # Hence, it is probably good to use a stratified k-fold split for cross-validation

    n_splits = 3

    skf = StratifiedKFold(n_splits=n_splits, random_state=0, shuffle=True)
    results = np.zeros(n_splits)
    
    for i, (train_index, val_index) in enumerate(skf.split(train_df[features], train_df['winner_index'])):
        model.fit(train_df.iloc[train_index][features], train_df.iloc[train_index]['winner_index'])

        y_pred = model.predict(train_df.iloc[val_index][features])
        results[i] = accuracy_score(train_df.iloc[val_index]['winner_index'], y_pred)
    
    return results.mean()

if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=50, timeout=3600)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))


[I 2024-11-29 19:31:30,415] A new study created in memory with name: no-name-484766e6-3642-4321-8991-8888be910c13


[I 2024-11-29 19:31:37,880] Trial 0 finished with value: 0.6842623329543227 and parameters: {'max_depth': 8, 'learning_rate': 0.1947534461077171, 'n_estimators': 300, 'subsample': 0.6017574047996106, 'colsample_bytree': 0.8497990504804047, 'gamma': 2.4833642202341433, 'min_child_weight': 10, 'lambda': 3.7789607073314855, 'alpha': 0.023684603933503544}. Best is trial 0 with value: 0.6842623329543227.
[I 2024-11-29 19:31:53,583] Trial 1 finished with value: 0.6944504600375395 and parameters: {'max_depth': 9, 'learning_rate': 0.024632629061069267, 'n_estimators': 1000, 'subsample': 0.936956598811366, 'colsample_bytree': 0.6101009247093665, 'gamma': 4.20615154900435, 'min_child_weight': 10, 'lambda': 0.3644492590384484, 'alpha': 0.05375044209447076}. Best is trial 1 with value: 0.6944504600375395.
[I 2024-11-29 19:32:22,326] Trial 2 finished with value: 0.6902778978072175 and parameters: {'max_depth': 8, 'learning_rate': 0.01180346695248142, 'n_estimators': 900, 'subsample': 0.970832843080

KeyboardInterrupt: 

In [37]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
import optuna
import numpy as np

def objective(trial):
    param = {
        "objective": "binary:logistic",  # Adjust based on your problem
        "eval_metric": "logloss",        # You can change this to other metrics
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 500, step=50),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10.0, log=True),
    }
    
    model = xgb.XGBClassifier(**param)
    
    n_splits = 3
    skf = StratifiedKFold(n_splits=n_splits, random_state=0, shuffle=True)
    results = []
    
    for train_index, val_index in skf.split(train_df[features], train_df['winner_index']):
        X_train, X_val = train_df.iloc[train_index][features], train_df.iloc[val_index][features]
        y_train, y_val = train_df.iloc[train_index]['winner_index'], train_df.iloc[val_index]['winner_index']
        
        model.fit(
            X_train, y_train,
        )
        
        y_pred = model.predict(X_val)
        acc = accuracy_score(y_val, y_pred)
        results.append(acc)
    
    return np.mean(results)

if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100, timeout=3600, show_progress_bar=True)  # Increased trials if feasible

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")


[I 2024-11-29 19:41:50,554] A new study created in memory with name: no-name-3bb6ff78-7726-4c63-a7b8-386c6481b11a


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2024-11-29 19:41:59,070] Trial 0 finished with value: 0.6915395625505889 and parameters: {'max_depth': 11, 'learning_rate': 0.10699616715731652, 'n_estimators': 200, 'subsample': 0.8050347328590993, 'colsample_bytree': 0.960056991948947, 'gamma': 4.243102240338591, 'min_child_weight': 9, 'reg_lambda': 0.004402583550744662, 'reg_alpha': 1.8930354942554257}. Best is trial 0 with value: 0.6915395625505889.
[I 2024-11-29 19:42:44,614] Trial 1 finished with value: 0.6810599192611383 and parameters: {'max_depth': 10, 'learning_rate': 0.1664639531247863, 'n_estimators': 300, 'subsample': 0.7496115264813294, 'colsample_bytree': 0.8258281258583777, 'gamma': 0.11838893198305278, 'min_child_weight': 2, 'reg_lambda': 0.0017921279591632353, 'reg_alpha': 1.7856120248686316}. Best is trial 0 with value: 0.6915395625505889.
[I 2024-11-29 19:43:00,712] Trial 2 finished with value: 0.6882404192501239 and parameters: {'max_depth': 6, 'learning_rate': 0.11453221839150102, 'n_estimators': 500, 'subsampl

In [43]:
# Plucked from the above hyperparameter tuning session.
# Doesn't really seem to be getting much better.
xgb_tuned = xgb.XGBClassifier(max_depth=13,
                              learning_rate=0.013278022811244645,
                              n_estimators=500,
                              subsample=0.7451791331241274,
                              colsample_bytree=0.7785615347277948,
                              gamma=2.819394518381603,
                              min_child_weight=6,
                              reg_lambda=0.41270859938111326,
                              reg_alpha=1.9859484482942584,
                              eval_metric='logloss')

# xgb_tuned = xgb.XGBClassifier(max_depth=3,
#                               learning_rate=0.014622126761423044,
#                               n_estimators=400,
#                               subsample=0.6011955245324696,
#                               colsample_bytree=0.7577195728994964,
#                               gamma=3.008563253857424,
#                               min_child_weight=4,
#                               reg_lambda=0.07168139363768657,
#                               reg_alpha=0.17863019947679695,
#                               eval_metric='logloss')

    
xgb_tuned.fit(train_df[features], train_df['winner_index'])
print("Accuracy of XGBoost on test set: ", round(100.0 * accuracy_score(test_df['winner_index'], xgb_tuned.predict(test_df[features])), 1))

Accuracy of XGBoost on test set:  70.2


In [53]:
print("Test set baselines:")
print()
print("Highest ELO out of top 8, accuracy:               ", round(100.0 * (test_df['winner_index'] == test_df['elo_prediction']).astype(float).mean(), 1))
print("Highest ELO out of WSF, accuracy:                 ", round(100.0 * (test_df['winner_index'] == test_df['elo_WSF_prediction']).astype(float).mean(), 1))
print("Computing all ways top 8 can play out, accuracy:  ", round(100.0 * (test_df['winner_index'] == test_df['path_prediction']).astype(float).mean(), 1))

Test set baselines:

Highest ELO out of top 8, accuracy:                67.5
Highest ELO out of WSF, accuracy:                  70.1
Computing all ways top 8 can play out, accuracy:   69.5


In [68]:
no_upset = test_df[test_df['winner_index'] == test_df['elo_WSF_prediction']]
upset = test_df[test_df['winner_index'] != test_df['elo_WSF_prediction']]

print(f"no upsets = {no_upset.shape[0]}")
print(f"upsets = {upset.shape[0]}")
print()
print(f"Accuracy on no upset {(no_upset['winner_index'] == no_upset['path_prediction']).sum()/no_upset.shape[0]:.2%}")
print(f"Accuracy on upset {(upset['winner_index'] == upset['path_prediction']).sum()/upset.shape[0]:.2%}")
print()
print(f"The model predicted something different {(test_df['elo_WSF_prediction']!=test_df['path_prediction']).sum()/test_df.shape[0]:.2%} of the time.")
print(f"The when the model predicted something different, it was right {((test_df['elo_WSF_prediction']!=test_df['path_prediction']) & (test_df['winner_index'] == test_df['path_prediction'])).sum()/(test_df['elo_WSF_prediction']!=test_df['path_prediction']).sum():.2%} of the time.")

no upsets = 3479
upsets = 1482

Accuracy on no upset 92.56%
Accuracy on upset 15.45%

The model predicted something different 12.11% of the time.
The when the model predicted something different, it was right 38.10% of the time.


In [62]:
upset.shape[0]/test_df.shape[0]

0.2987300947389639