In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn import tree
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from utils.updateStats import getStats, updateStats, createStats
pd.set_option('display.max_columns', None)
import json, hashlib

In [2]:
# Read the cleaned match dataset and obtain the initial stats object.
clean_data = pd.read_csv("../data/cleanedDataset.csv")
previous_stats = createStats()

# Feed the rows to updateStats one at a time, in DataFrame order. Each call
# returns the stats object that the next row is applied to, so after the loop
# previous_stats reflects every row of clean_data.
n_matches = len(clean_data)
for row_idx, match_row in tqdm(clean_data.iterrows(), total=n_matches):
    previous_stats = updateStats(match_row, previous_stats)

100%|██████████| 95375/95375 [00:06<00:00, 14126.14it/s]


In [33]:
# Restore the trained classifier: create an empty XGBClassifier and load the
# saved model file from ../models into it.
xgb_model = XGBClassifier()
xgb_model.load_model("../models/final_xgb_model.json")

# Readable labels for the model's predictions: class 0 is reported as
# "Player 2 Wins", every other value as "Player 1 Wins".
def _describe_outcome(predicted_class):
    if predicted_class == 0:
        return "Player 2 Wins"
    return "Player 1 Wins"


# Vectorized so it can be applied directly to the array returned by predict().
mapper = np.vectorize(_describe_outcome)

In [36]:
# Matchup to score. In the recorded output the *_DIFF features come out as
# player1 minus player2, so swapping the two dicts flips their signs.
player1 = {
    "Name": "Carlos Alcaraz",                   # name, not necessary
    "ID": 207989,                               # ID
    "ATP_POINTS": 0,                            # ATP points -- NOTE(review): 0 looks like a placeholder, confirm
    "ATP_RANK": 2,                              # ATP rank
    "AGE": 22,                                  # age
    "HEIGHT": 183,                              # height
}

player2 = {
    "Name": "Jannik Sinner",
    "ID": 206173,
    "ATP_POINTS": 0,
    "ATP_RANK": 1,
    "AGE": 23,
    "HEIGHT": 191,
}

match = {
    "BEST_OF": 5,                               # 3 or 5 (grand slams)
    "DRAW_SIZE": 128,
    "SURFACE": "Clay",                          # Surface of the match. Options are (Hard, Clay, Grass, Carpet)
}

# Build the feature dict for this matchup with getStats, passing the two
# players, the match settings and the previously accumulated stats.
output = getStats(player1, player2, match, previous_stats)

# One-row frame with the columns in alphabetical order. The model is handed a
# bare array below, so column position is all that ties a value to a feature.
# NOTE(review): presumably the model was trained on the same sorted order -- confirm.
match_data = pd.DataFrame([{feature: output[feature] for feature in sorted(output)}])

# Predicted class for the row, shown as a readable label.
mapper(
    xgb_model.predict(
        np.array(match_data, dtype=object)
    )
)

{'BEST_OF': 5, 'DRAW_SIZE': 128, 'AGE_DIFF': -1, 'HEIGHT_DIFF': -8, 'ATP_RANK_DIFF': 1, 'ATP_POINTS_DIFF': 0, 'ELO_DIFF': -153.16437404770863, 'ELO_SURFACE_DIFF': 108.21368656080813, 'N_GAMES_DIFF': -77, 'H2H_DIFF': 2, 'H2H_SURFACE_DIFF': 0, 'WIN_LAST_3_DIFF': 0, 'ELO_GRAD_LAST_3_DIFF': np.float64(1.7327271277320402), 'P_ACE_LAST_3_DIFF': -0.9046534524275529, 'P_DF_LAST_3_DIFF': 0.26887536623844954, 'P_1STIN_LAST_3_DIFF': -6.015610357875765, 'P_1STWON_LAST_3_DIFF': 10.43896452683994, 'P_2NDWON_LAST_3_DIFF': 14.050224466891144, 'P_BPSAVED_LAST_3_DIFF': -22.222222222222214, 'WIN_LAST_5_DIFF': -1, 'ELO_GRAD_LAST_5_DIFF': np.float64(2.04631826295353), 'P_ACE_LAST_5_DIFF': -0.6334950193023374, 'P_DF_LAST_5_DIFF': 1.6146338159088085, 'P_1STIN_LAST_5_DIFF': -1.3387003726310454, 'P_1STWON_LAST_5_DIFF': 3.171083080704875, 'P_2NDWON_LAST_5_DIFF': 4.582410795259101, 'P_BPSAVED_LAST_5_DIFF': -15.833333333333329, 'WIN_LAST_10_DIFF': -3, 'ELO_GRAD_LAST_10_DIFF': np.float64(-8.11266189248028), 'P_ACE

array(['Player 2 Wins'], dtype='<U13')

In [5]:
# Match settings for the second matchup.
match = {
    "BEST_OF": 3,                               # 3 or 5 (grand slams)
    "DRAW_SIZE": 128,
    "SURFACE": "Clay",                          # Surface of the match. Options are (Hard, Clay, Grass, Carpet)
}

# The two players; getStats receives player1 first and player2 second.
player1 = {
    "Name": "Fritz",                            # name, not necessary
    "ID": 126203,                               # ID
    "ATP_POINTS": 4815,                         # ATP points
    "ATP_RANK": 4,                              # ATP rank
    "AGE": 28,                                  # age
    "HEIGHT": 196,                              # height
}

player2 = {
    "Name": "Ruud",
    "ID": 134770,
    "ATP_POINTS": 2915,
    "ATP_RANK": 14,
    "AGE": 27,
    "HEIGHT": 183,
}

# Build the feature dict for this matchup with getStats, passing the two
# players, the match settings and the previously accumulated stats.
output = getStats(player1, player2, match, previous_stats)

# One-row frame with alphabetically ordered columns; the model receives a bare
# array, so the column position is what identifies each feature.
match_data = pd.DataFrame([{feature: output[feature] for feature in sorted(output)}])

# Predicted class for the row, shown as a readable label.
mapper(
    xgb_model.predict(
        np.array(match_data, dtype=object)
    )
)

array(['Player 2 Wins'], dtype='<U13')

In [6]:
# Show the feature row currently stored in match_data (every column is
# displayed because display.max_columns was set to None in the first cell).
match_data

Unnamed: 0,AGE_DIFF,ATP_POINTS_DIFF,ATP_RANK_DIFF,BEST_OF,DRAW_SIZE,ELO_DIFF,ELO_GRAD_LAST_100_DIFF,ELO_GRAD_LAST_10_DIFF,ELO_GRAD_LAST_200_DIFF,ELO_GRAD_LAST_25_DIFF,ELO_GRAD_LAST_3_DIFF,ELO_GRAD_LAST_50_DIFF,ELO_GRAD_LAST_5_DIFF,ELO_SURFACE_DIFF,H2H_DIFF,H2H_SURFACE_DIFF,HEIGHT_DIFF,N_GAMES_DIFF,P_1STIN_LAST_100_DIFF,P_1STIN_LAST_10_DIFF,P_1STIN_LAST_200_DIFF,P_1STIN_LAST_25_DIFF,P_1STIN_LAST_3_DIFF,P_1STIN_LAST_50_DIFF,P_1STIN_LAST_5_DIFF,P_1STWON_LAST_100_DIFF,P_1STWON_LAST_10_DIFF,P_1STWON_LAST_200_DIFF,P_1STWON_LAST_25_DIFF,P_1STWON_LAST_3_DIFF,P_1STWON_LAST_50_DIFF,P_1STWON_LAST_5_DIFF,P_2NDWON_LAST_100_DIFF,P_2NDWON_LAST_10_DIFF,P_2NDWON_LAST_200_DIFF,P_2NDWON_LAST_25_DIFF,P_2NDWON_LAST_3_DIFF,P_2NDWON_LAST_50_DIFF,P_2NDWON_LAST_5_DIFF,P_ACE_LAST_100_DIFF,P_ACE_LAST_10_DIFF,P_ACE_LAST_200_DIFF,P_ACE_LAST_25_DIFF,P_ACE_LAST_3_DIFF,P_ACE_LAST_50_DIFF,P_ACE_LAST_5_DIFF,P_BPSAVED_LAST_100_DIFF,P_BPSAVED_LAST_10_DIFF,P_BPSAVED_LAST_200_DIFF,P_BPSAVED_LAST_25_DIFF,P_BPSAVED_LAST_3_DIFF,P_BPSAVED_LAST_50_DIFF,P_BPSAVED_LAST_5_DIFF,P_DF_LAST_100_DIFF,P_DF_LAST_10_DIFF,P_DF_LAST_200_DIFF,P_DF_LAST_25_DIFF,P_DF_LAST_3_DIFF,P_DF_LAST_50_DIFF,P_DF_LAST_5_DIFF,WIN_LAST_100_DIFF,WIN_LAST_10_DIFF,WIN_LAST_200_DIFF,WIN_LAST_25_DIFF,WIN_LAST_3_DIFF,WIN_LAST_50_DIFF,WIN_LAST_5_DIFF
0,1,1900,-10,3,128,152.524575,0.36581,6.712093,0.562577,8.732341,-2.265762,3.497649,0.005462,-119.995234,-1,-1,13,81,-2.954803,-2.953234,-3.496765,-4.302408,-10.89653,-2.890145,-8.223294,6.476002,4.654115,5.357632,5.571129,2.546896,7.958218,6.923672,2.13115,-2.187207,0.270346,1.109648,0.31746,1.091473,1.56074,6.716255,3.484558,5.78068,4.073385,2.616184,7.069818,5.276409,7.23054,28.785714,5.540188,21.949533,19.444444,19.476543,24.428571,-0.322109,-0.932926,-0.137076,-0.256273,2.173708,-0.191981,1.304225,6,3,7,6,0,3,1


In [37]:
# How confident is the model? predict_proba is called on the same feature row
# that is currently stored in match_data.
probs = xgb_model.predict_proba(np.array(match_data, dtype=object))

# First row, one column per class: column 1 is read as "player 1 wins" and
# column 0 as "player 2 wins", matching the labels used by mapper.
prob_player1_wins = probs[0, 1]
prob_player2_wins = probs[0, 0]

# Report both probabilities using the names from the current player dicts.
for contender, win_prob in ((player1, prob_player1_wins), (player2, prob_player2_wins)):
    print(f"Probability of {contender['Name']} winning: {win_prob:.2%}")

Probability of Carlos Alcaraz winning: 48.40%
Probability of Jannik Sinner winning: 51.60%
