In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from collections import defaultdict
from joblib import Parallel, delayed

import sqlite3
import sys
import time
import math
#import tqdm
from tqdm.auto import tqdm
import datetime
import os
import pickle
from pathlib import Path

from glicko2 import Player
import multiprocessing

# Register the progress_apply/progress_map helpers on pandas objects.
tqdm.pandas()

# Use the /workspace data directory when it exists; otherwise fall back to the
# data directory that sits next to the notebook folder.
data_path = '/workspace/data/' if os.path.exists('/workspace/data') else '../data/'

In [None]:
from sklearn.metrics import accuracy_score, log_loss

dataset_df = pd.read_pickle(data_path + 'dataset_full.pkl')
dataset_df.sort_index(inplace=True) # For convenience, mostly. Not really necessary, we use .loc[] anyways

# Note that there is no real reason to keep separately computing the individual probabilities of winning each individual set.
# Let's just compute them all at once here.

# NOTE(security): pickle.load (and pd.read_pickle above) can execute arbitrary code while loading.
#                 Only point data_path at files that were produced by this project.
with open(data_path + 'single_set_model.pkl', 'rb') as f:
    single_set_model = pickle.load(f)

# Make sure these match up with what features the model was trained on.
features_all_everything = ['p1_default_elo', 'p2_default_elo', 'p1_default_rd', 'p2_default_rd',
       'p1_default_updates', 'p2_default_updates', 'matchup_1', 'matchup_2',
       'matchup_3', 'matchup_4', 'matchup_5', 'matchup_6', 'matchup_7',
       'matchup_8', 'matchup_9', 'matchup_10', 'p1_m1_usage', 'p2_m1_usage',
       'p1/m1/m1_alt2_elo', 'p1/m1/m1_alt2_rd', 'p1/m1/m1_alt2_updates',
       'p2/m1/m1_alt2_elo', 'p2/m1/m1_alt2_rd', 'p2/m1/m1_alt2_updates',
       'p1/m1_alt3_elo', 'p1/m1_alt3_rd', 'p1/m1_alt3_updates',
       'p2/m1_alt3_elo', 'p2/m1_alt3_rd', 'p2/m1_alt3_updates']

# Column 1 of predict_proba is used as the probability that p1 wins the set.
dataset_df['p1_win_prob'] = single_set_model.predict_proba(dataset_df[features_all_everything])[:,1]

# As a sanity check, let's verify the accuracy and log loss on 2024 data
# Total accuracy and log loss (including lots of data that the model was trained on)
#
# The upper bound is exclusive at 2025-01-01 so that sets ending at any time of day on
# 2024-12-31 are kept; comparing with `<= datetime(2024,12,31)` would cut off at midnight.
date_filter = (dataset_df['start'] >= datetime.datetime(2024,1,1)) & (dataset_df['end'] < datetime.datetime(2025,1,1))

# Filter once and reuse, instead of re-filtering the full frame for every metric argument.
sets_2024 = dataset_df.loc[date_filter, ['winner', 'p1_win_prob']]

print("2024 single-set performance metrics")
print("Log loss: ", round(log_loss(sets_2024['winner'], sets_2024['p1_win_prob']), 3))
print("Accuracy: ", round(100.0 * accuracy_score(sets_2024['winner'], sets_2024['p1_win_prob'] >= 0.5), 1))

In [None]:
# NOTE(security): pd.read_pickle can execute arbitrary code while loading; only load project-generated files.
tournament_df = pd.read_pickle(data_path + 'top_8_tournament_previous_sets_and_results_with_winners_df')

# Filter down to tournaments which actually have valid top 8 data, and previous data on getting there.
required_columns = ['LN_A_p1', 'LN_A_p2', 'LN_B_p1', 'LN_B_p2',
                    'WSF_A_p1', 'WSF_A_p2', 'WSF_B_p1', 'WSF_B_p2',
                    'LN_A_p1_non_top_8_sets', 'LN_A_p2_non_top_8_sets',
                    'LN_B_p1_non_top_8_sets', 'LN_B_p2_non_top_8_sets',
                    'WSF_A_p1_non_top_8_sets', 'WSF_A_p2_non_top_8_sets',
                    'WSF_B_p1_non_top_8_sets', 'WSF_B_p2_non_top_8_sets']

# dropna(subset=...) removes exactly the rows with a missing value in one of these columns.
# Unlike selecting by the surviving index labels, it cannot duplicate rows when the index
# contains repeated labels.
tournament_df = tournament_df.dropna(subset=required_columns)


# Very rarely (not sure where the problem is) you get something not actually in the single-set dataframe
# It is not actually that common though, so let's just delete those instances.
def references_valid_sets(prev_sets):
    """Return True when every entry of ``prev_sets`` points at a known set.

    Each entry is indexed as ``entry[0]`` to get the set identifier, which
    must be a label of ``dataset_df.index``. An empty ``prev_sets`` is valid.
    """
    return all(entry[0] in dataset_df.index for entry in prev_sets)

prev_set_columns = ['LN_A_p1_non_top_8_sets', 'LN_A_p2_non_top_8_sets',
                    'LN_B_p1_non_top_8_sets', 'LN_B_p2_non_top_8_sets',
                    'WSF_A_p1_non_top_8_sets', 'WSF_A_p2_non_top_8_sets',
                    'WSF_B_p1_non_top_8_sets', 'WSF_B_p2_non_top_8_sets']

# Keep only tournaments whose previous-set lists all point at sets present in dataset_df.
# (Deliberately not named `filter`, which would shadow the builtin for the rest of the notebook.)
valid_sets_mask = tournament_df[prev_set_columns].map(references_valid_sets).all(axis=1)

tournament_df = tournament_df[valid_sets_mask]

# Likewise, some of these sets don't seem to have a valid winner
tournament_df = tournament_df[tournament_df['winner_id'].notna()]

# A bit more cleanup, for sanity
min_date = datetime.datetime(2015,1,1)
max_date = datetime.datetime(2024,12,31)

# max_date is the last calendar day to keep. Compare against the following midnight
# (exclusive) so tournaments that start or end during max_date itself are not dropped.
end_exclusive = max_date + datetime.timedelta(days=1)

tournament_df = tournament_df[(tournament_df['start'] >= min_date) &
                              (tournament_df['end'] >= min_date) &
                              (tournament_df['start'] < end_exclusive) &
                              (tournament_df['end'] < end_exclusive)]

# Re-assign rather than sorting in place: tournament_df is a filtered slice at this point.
tournament_df = tournament_df.sort_values(by=['end', 'start'])

tournament_df[['winner_id', 'LN_A_p1', 'LN_A_p2', 'LN_B_p1', 'LN_B_p2', 'WSF_A_p1', 'WSF_A_p2', 'WSF_B_p1', 'WSF_B_p2']]

In [None]:
def prev_set_score(prev_sets):
    """Score how surprising a player's run into the top 8 was.

    ``prev_sets`` is an iterable of entries where ``entry[0]`` is a label of
    ``dataset_df`` and ``entry[1]`` says whether the player of interest won
    that set. Each set contributes the negative log of the model probability
    of what happened to the player: added for a win, subtracted for a loss.

    Returns the signed sum (``0`` for an empty list).
    """
    score = 0

    for entry in prev_sets:
        set_id, player_won = entry[0], entry[1]
        set_row = dataset_df.loc[set_id, ['winner', 'p1_win_prob']]
        p1_prob = set_row['p1_win_prob']

        # The player's slot is not stored, but it can be recovered without the player id:
        # the player is p1 exactly when "player won" agrees with "p1 won".
        player_is_p1 = player_won == (set_row['winner'] == 1.0)

        if player_is_p1:
            surprisal = -np.log(p1_prob) if player_won else -np.log(1 - p1_prob)
        else:
            surprisal = -np.log(1 - p1_prob) if player_won else -np.log(p1_prob)

        if player_won:
            score += surprisal
        else:
            score -= surprisal

    return score

# Worked example on a single tournament row
# (note that there is a consistent ELO throughout this entire dataset)
example_row = tournament_df.iloc[10010]
prev_sets = example_row['LN_A_p1_non_top_8_sets']
print(prev_sets)
print()

example_set_ids = [entry[0] for entry in prev_sets]
example_columns = ['p1_default_elo', 'p2_default_elo', 'winner', 'p1_win_prob']
print(dataset_df.loc[example_set_ids, example_columns])
print()

print("Previous set score: ", prev_set_score(prev_sets))

In [None]:
# The eight top-8 seats, in the order used everywhere else in this notebook.
top_8_pos = ['LN_A_p1', 'LN_A_p2', 'LN_B_p1', 'LN_B_p2', 'WSF_A_p1', 'WSF_A_p2', 'WSF_B_p1', 'WSF_B_p2']

# Column holding each seat's list of sets played before the top 8.
top_8_prevs = [f'{pos}_non_top_8_sets' for pos in top_8_pos]
# How many sets the player went through to get to the top 8.
top_8_prevs_lengths = [f'{col}_len' for col in top_8_prevs]
# The player's "score": how well they performed relative to their predicted odds.
top_8_prevs_scores = [f'{col}_score' for col in top_8_prevs]

prev_sets_by_seat = tournament_df[top_8_prevs]
tournament_df[top_8_prevs_lengths] = prev_sets_by_seat.map(len).to_numpy()
tournament_df[top_8_prevs_scores] = prev_sets_by_seat.map(prev_set_score).to_numpy()

tournament_df[top_8_prevs + top_8_prevs_lengths + top_8_prevs_scores]

In [None]:
# We will need to have all of the features we've engineered for each of the players that made it to the top 8
# We can be clever and pull most of them (elo-based features) from the previous sets in the dataframe
# The rest (player vs player stats, also called "matchup", but might be renamed) have to be pulled in manually

# Rating-style features; every name starts with the player slot ("p1" or "p2").
features_elo = [
    # default rating
    'p1_default_elo', 'p2_default_elo',
    'p1_default_rd', 'p2_default_rd',
    'p1_default_updates', 'p2_default_updates',
    # m1 usage
    'p1_m1_usage', 'p2_m1_usage',
    # m1/m1_alt2 rating
    'p1/m1/m1_alt2_elo', 'p1/m1/m1_alt2_rd', 'p1/m1/m1_alt2_updates',
    'p2/m1/m1_alt2_elo', 'p2/m1/m1_alt2_rd', 'p2/m1/m1_alt2_updates',
    # m1_alt3 rating
    'p1/m1_alt3_elo', 'p1/m1_alt3_rd', 'p1/m1_alt3_updates',
    'p2/m1_alt3_elo', 'p2/m1_alt3_rd', 'p2/m1_alt3_updates',
]

# Player-vs-player features: matchup_1 ... matchup_10.
features_matchup = [f'matchup_{slot}' for slot in range(1, 11)]

In [None]:
# To keep things organised, each of the top-8 seats
#     'LN_A_p1', 'LN_A_p2', 'LN_B_p1', 'LN_B_p2', 'WSF_A_p1', 'WSF_A_p2', 'WSF_B_p1', 'WSF_B_p2'
# gets its own dataframe of player data, stored in this dictionary under the seat name.

top_8_stats = dict()

def pull_data_from_set(loc, outcome):
    """Extract one player's rating features from a row of ``dataset_df``.

    Parameters
    ----------
    loc : label of the set in ``dataset_df``.
    outcome : whether the player of interest won that set.

    Returns
    -------
    pandas.Series
        The ``features_elo`` values belonging to that player, with the leading
        ``'p1'`` / ``'p2'`` stripped from the labels (e.g. ``'_default_elo'``)
        so the caller can re-attach whichever slot it needs.
    """
    set_data = dataset_df.loc[loc]

    # Sneaky way of getting the player number without the player id:
    # the player is p1 exactly when "player won" agrees with "p1 won".
    player_num = 'p1' if outcome == (set_data['winner'] == 1.0) else 'p2'

    # Match on the prefix only. A substring test would also pick up the other player's
    # features whenever "p1"/"p2" appears mid-name (e.g. "top1"), and str.replace would
    # strip those inner occurrences from the label as well.
    features_to_pull = [name for name in features_elo if name.startswith(player_num)]
    pulled_data = set_data[features_to_pull].copy()
    pulled_data.index = [name[len(player_num):] for name in features_to_pull] # We will add player numbers on an as-needed basis later

    return pulled_data

def first_prev_set_stats(prev_sets):
    """Pull the player's features from the first entry of their previous-set list."""
    # Note that the player might NOT be player 1 in the set that we are pulling from;
    # pull_data_from_set works that out from the recorded outcome.
    set_id, player_won = prev_sets[0][0], prev_sets[0][1]
    return pull_data_from_set(set_id, player_won)

for top_8_position in tqdm(top_8_pos):
    prev_sets_column = tournament_df[top_8_position + '_non_top_8_sets']
    top_8_stats[top_8_position] = prev_sets_column.apply(first_prev_set_stats)

In [None]:
def _play_set(beats, a, b, a_wins):
    """Resolve one set between seats ``a`` and ``b``.

    Returns ``(winner, loser, probability)``, where ``probability`` is
    ``beats[winner][loser]``, i.e. the chance of that particular result.
    """
    winner, loser = (a, b) if a_wins else (b, a)
    return winner, loser, beats[winner][loser]


def _champion_probs(pairwise_probs):
    """Exact probability that each of the 8 seats wins the tournament.

    ``pairwise_probs[i, j]`` is the probability that seat ``i`` beats seat ``j``
    in a single set, with ``pairwise_probs[i, j] + pairwise_probs[j, i] == 1``.
    Seats are indexed as in ``top_8_pos``:
    'LN_A_p1', 'LN_A_p2', 'LN_B_p1', 'LN_B_p2', 'WSF_A_p1', 'WSF_A_p2', 'WSF_B_p1', 'WSF_B_p2'.

    Bracket being evaluated:
        WSF A: 4 v 5            WSF B: 6 v 7
        LN A:  0 v 1            LN B:  2 v 3
        WF:    winner(WSF A) v winner(WSF B)
        LQF A: loser(WSF A)  v winner(LN A)
        LQF B: loser(WSF B)  v winner(LN B)
        LSF:   winner(LQF A) v winner(LQF B)
        LF:    loser(WF)     v winner(LSF)
        GF:    winner(WF)    v winner(LF), with a reset if the LF winner takes the first set

    Who reaches a later set from the winners side and who reaches it from the losers
    side are NOT independent (for example, the loser of WF can never also be the loser
    of their own WSF). Multiplying the two marginal distributions therefore gives the
    wrong answer. Instead, every combination of results for the nine sets before the
    Grand Final is enumerated (2**9 = 512 paths). Along one path the participants of
    each set are known, so multiplying the set probabilities is exact.
    """
    import itertools  # local import: only needed here

    beats = pairwise_probs.tolist()  # plain Python floats index much faster than numpy scalars
    champion = np.zeros(8)

    for wsfa, wsfb, lna, lnb, wf, lqfa, lqfb, lsf, lf in itertools.product((True, False), repeat=9):
        # Opening round: the participants are fixed by the seating.
        wsfa_w, wsfa_l, p_wsfa = _play_set(beats, 4, 5, wsfa)
        wsfb_w, wsfb_l, p_wsfb = _play_set(beats, 6, 7, wsfb)
        lna_w, _, p_lna = _play_set(beats, 0, 1, lna)
        lnb_w, _, p_lnb = _play_set(beats, 2, 3, lnb)

        # Later rounds: the participants follow from the results chosen above.
        wf_w, wf_l, p_wf = _play_set(beats, wsfa_w, wsfb_w, wf)
        lqfa_w, _, p_lqfa = _play_set(beats, wsfa_l, lna_w, lqfa)
        lqfb_w, _, p_lqfb = _play_set(beats, wsfb_l, lnb_w, lqfb)
        lsf_w, _, p_lsf = _play_set(beats, lqfa_w, lqfb_w, lsf)
        lf_w, _, p_lf = _play_set(beats, wf_l, lsf_w, lf)

        path_prob = p_wsfa * p_wsfb * p_lna * p_lnb * p_wf * p_lqfa * p_lqfb * p_lsf * p_lf

        # Grand Final: the LF winner has to win twice in a row (set + reset);
        # in every other case the WF winner takes the tournament.
        p_reset_sweep = beats[lf_w][wf_w] ** 2
        champion[lf_w] += path_prob * p_reset_sweep
        champion[wf_w] += path_prob * (1.0 - p_reset_sweep)

    return champion


def compute_path_prob(row):
    """Probability that each top-8 seat wins the tournament described by ``row``.

    Parameters
    ----------
    row : pandas.Series
        A row of the tournament dataframe. Only ``row.name`` is used, to look
        up each seat's player features in ``top_8_stats``.

    Returns
    -------
    numpy.ndarray
        Length-8 array ordered like ``top_8_pos``; the entries sum to 1.

    Notes
    -----
    Reads the notebook globals ``top_8_stats``, ``top_8_pos``,
    ``features_all_everything`` and ``single_set_model``.
    """
    # Look each seat's features up once and label them for both roles,
    # instead of repeating the lookup for all 64 pairings.
    seat_stats = [top_8_stats[pos].loc[row.name] for pos in top_8_pos]
    as_p1 = [stats.add_prefix('p1') for stats in seat_stats]
    as_p2 = [stats.add_prefix('p2') for stats in seat_stats]

    #TODO: Actually populate this with proper data!
    #      This is currently only just placeholder data,
    #      indicating that the players have never played together before (0.5).
    matchup_data = pd.Series(0.5, index=['matchup_' + str(n) for n in range(1,10+1)])

    # One model input row per ordered pairing (p1 seat r, p2 seat c), r-major.
    # Selecting features_all_everything puts the entries in the order the model expects.
    combination_stats = pd.DataFrame([
        pd.concat([p1_data, p2_data, matchup_data])[features_all_everything]
        for p1_data in as_p1
        for p2_data in as_p2
    ])

    y_prob = np.asarray(single_set_model.predict_proba(combination_stats))

    # Row represents p1, column represents p2 (specifically, the index in top_8_pos).
    # The rows above were generated r-major, so a plain reshape restores the 8x8 layout.
    pairwise_probs = y_prob[:, 1].reshape(8, 8)

    # NOTE: Some models that we train are highly non-symmetric, even though they very much should be,
    #       given that we have randomized the players. We can fix that issue here.
    #       After this step pairwise_probs[i, j] + pairwise_probs[j, i] == 1.
    pairwise_probs = 0.5 * (pairwise_probs + (1 - pairwise_probs.T))

    return _champion_probs(pairwise_probs)


# Quick look at the predicted win probabilities for ten consecutive tournaments.
n = 10000
result = tournament_df.iloc[n:n+10].apply(compute_path_prob, axis=1)

sample_winprob_columns = [f'{pos}_winprob' for pos in top_8_pos]
sample_winprobs = pd.DataFrame(np.stack(result.to_numpy()),
                               index=result.index,
                               columns=sample_winprob_columns)
print(sample_winprobs)

In [None]:
# Show the recorded winner of every remaining tournament.
tournament_df.loc[:, 'winner_id']

In [None]:
# Baseline of "who has the higher elo"

# Evaluation set: every tournament starting on or after 2024-01-01 (copied so it can be extended freely).
test_start = datetime.datetime(2024,1,1)
test_df = tournament_df.loc[tournament_df['start'] >= test_start].copy()
print(test_df.shape)

def pull_elo_from_set(loc, outcome):
    """Return the default elo that a player carried in set ``loc`` of ``dataset_df``.

    ``outcome`` says whether the player of interest won that set; it is used to
    work out whether they were recorded as p1 or p2.
    """
    set_row = dataset_df.loc[loc]
    p1_won = set_row['winner'] == 1.0

    # The player is p1 exactly when "player won" agrees with "p1 won".
    if outcome == p1_won:
        return set_row['p1_default_elo']
    return set_row['p2_default_elo']

# Take each seat's elo from the first entry of their previous-set list.
# Note that the player might NOT be player 1 in the set that we are pulling from.
elo_columns = [pos + '_elo' for pos in top_8_pos]
elo_source_columns = [pos + '_non_top_8_sets' for pos in top_8_pos]
test_df[elo_columns] = (
    test_df[elo_source_columns]
    .map(lambda prev_sets: pull_elo_from_set(prev_sets[0][0], prev_sets[0][1]))
    .to_numpy()
)

# First store the name of the seat with the highest elo, then swap it for the value
# held in that seat's column of the same row.
highest_elo_seat = test_df[elo_columns].idxmax(axis=1).apply(lambda col: col.replace('_elo', ''))
test_df['elo_prediction'] = highest_elo_seat
test_df['elo_prediction'] = test_df.apply(lambda row: row[row['elo_prediction']], axis=1)
test_df

In [None]:
# Predictor of "who has the highest path probability, taking into account all possible paths on how the top 8 will play out"

winprob_columns = [x + '_winprob' for x in top_8_pos]

result = test_df.apply(compute_path_prob, axis=1)

# Assign the columns directly instead of pd.concat(..., axis=1): re-running this cell then
# overwrites the existing *_winprob columns rather than appending a second, duplicate set
# (which would make the column selection below ambiguous).
test_df[winprob_columns] = np.stack(result.to_numpy())

# First store the name of the most likely seat, then swap it for the value held in
# that seat's column of the same row.
test_df['path_prediction'] = test_df[winprob_columns].idxmax(axis=1).apply(lambda x: x.replace('_winprob', ''))
test_df['path_prediction'] = test_df.apply(lambda row: row[row['path_prediction']], axis=1)
test_df

In [None]:
# Actual winner next to the two predictions.
test_df.loc[:, ['winner_id', 'elo_prediction', 'path_prediction']]

In [None]:
# Percentage of test tournaments where each predictor's pick equals the recorded winner.
for label, prediction_column in (("ELO-only accuracy: ", 'elo_prediction'),
                                 ("Pathprob accuracy: ", 'path_prediction')):
    hit_rate = (test_df['winner_id'] == test_df[prediction_column]).astype(float).mean()
    print(label, round(100.0 * hit_rate, 1))