In [1]:
from random import *
import pandas as pd
import itertools
import numpy as np
import os
from tqdm.auto import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
# Folder on the mounted Google Drive that holds the input CSVs and receives the result files.
base_route = "/content/drive/MyDrive/DSL/"
# NOTE: os.listdir returns entries in arbitrary order, so positions in this list
# are not stable between runs or machines.
file_list = os.listdir(base_route)
file_list

['2001-2018 (1).csv',
 '2019.csv',
 '2020.csv',
 'AO_draw.csv',
 'test.csv',
 'result_gnb_1000_(3).csv']

In [4]:
# Load each dataset by its explicit file name. Indexing into os.listdir() output
# (file_list[0], file_list[1], ...) is fragile: the listing order is arbitrary and
# shifts whenever a file (e.g. a new result_*.csv) is added to the folder.
train_data = pd.read_csv(base_route + '2001-2018 (1).csv', index_col=0)
test_data19 = pd.read_csv(base_route + '2019.csv', index_col=0)
test_data20 = pd.read_csv(base_route + '2020.csv', index_col=0)

## 전처리 및 세팅

In [5]:
def train_test_split(data):
    """Convert winner/loser match rows into labelled player/opponent feature rows.

    Roughly half of the matches (chosen with a fixed seed) are kept from the
    winner's point of view and labelled ``y = 1``; for the remaining matches the
    winner_*/loser_* columns are swapped so the row is seen from the loser's
    point of view and labelled ``y = 0``.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain tourney_date, winner_id, loser_id, surface, best_of,
        Quarter, Month and the winner_/loser_ variants of hand, ht, age, rank
        and rank_points.

    Returns
    -------
    (X, y) : (pd.DataFrame, pd.Series)
        Feature matrix (rank, opponent_rank, the three *_dff differences and
        the one-hot columns) and the 0/1 label.

    Side effects: reseeds NumPy's global RNG and prints the shape of the
    encoded frame and the feature column names.
    """
    matches = data.drop(['tourney_date', 'winner_id', 'loser_id'], axis=1)

    # Fixed seed so the winner-view / loser-view partition is reproducible.
    np.random.seed(202101)
    keep_winner_view = np.random.rand(len(matches)) < 0.5

    winner_view = matches[keep_winner_view].assign(y=1)
    loser_view = matches[~keep_winner_view].assign(y=0)

    # winner_<attr> <-> loser_<attr> for every per-player attribute.
    swap_columns = {}
    for attr in ('hand', 'ht', 'age', 'rank', 'rank_points'):
        swap_columns['winner_' + attr] = 'loser_' + attr
        swap_columns['loser_' + attr] = 'winner_' + attr

    # The swapped (loser-view) rows go first; the two parts are disjoint,
    # so every original match appears exactly once.
    trim_data = pd.concat([loser_view.rename(columns=swap_columns), winner_view])

    # Player-minus-opponent differences replace the raw paired columns.
    trim_data = trim_data.assign(
        ht_dff=trim_data['winner_ht'] - trim_data['loser_ht'],
        rank_points_dff=trim_data['winner_rank_points'] - trim_data['loser_rank_points'],
        age_dff=trim_data['winner_age'] - trim_data['loser_age'],
    )
    trim_data = trim_data.drop(
        columns=['winner_ht', 'loser_ht', 'winner_rank_points',
                 'loser_rank_points', 'winner_age', 'loser_age'])
    trim_data = trim_data.rename(columns={'winner_hand': 'hand',
                                          'loser_hand': 'opponent_hand',
                                          'winner_rank': 'rank',
                                          'loser_rank': 'opponent_rank'})

    # One-hot encoding. Carpet is blanked first so it gets no dummy column.
    # NOTE(review): get_dummies runs before dropna, so Carpet rows are NOT
    # dropped; they stay in the data with all surface_* columns equal to 0.
    # Confirm whether excluding them was the intent.
    trim_data.loc[trim_data.surface == 'Carpet', 'surface'] = np.nan
    encoded = pd.get_dummies(trim_data, columns=["hand", 'opponent_hand', 'best_of', 'surface'])
    final_data = encoded.dropna()

    # Split into features and label.
    print(final_data.shape)
    X = final_data.drop(columns=["Quarter", 'y', 'Month'])
    print(X.columns)
    return X, final_data['y']

In [6]:
X_train, y_train = train_test_split(train_data)
X_test19, y_test19 = train_test_split(test_data19)
X_test20, y_test20 = train_test_split(test_data20)

(46943, 19)
Index(['opponent_rank', 'rank', 'ht_dff', 'rank_points_dff', 'age_dff',
       'hand_L', 'hand_R', 'hand_U', 'opponent_hand_L', 'opponent_hand_R',
       'opponent_hand_U', 'best_of_3', 'best_of_5', 'surface_Clay',
       'surface_Grass', 'surface_Hard'],
      dtype='object')
(841, 19)
Index(['opponent_rank', 'rank', 'ht_dff', 'rank_points_dff', 'age_dff',
       'hand_L', 'hand_R', 'hand_U', 'opponent_hand_L', 'opponent_hand_R',
       'opponent_hand_U', 'best_of_3', 'best_of_5', 'surface_Clay',
       'surface_Grass', 'surface_Hard'],
      dtype='object')
(272, 19)
Index(['opponent_rank', 'rank', 'ht_dff', 'rank_points_dff', 'age_dff',
       'hand_L', 'hand_R', 'hand_U', 'opponent_hand_L', 'opponent_hand_R',
       'opponent_hand_U', 'best_of_3', 'best_of_5', 'surface_Clay',
       'surface_Grass', 'surface_Hard'],
      dtype='object')


위의 train_test_split 함수는 model.ipynb에 있는 함수를 그대로 가져와 사용함.

> 기존에 `[~random_number]`를 지정하지 않았을 때보다 행 수가 정확히 절반으로 줄었다. train_data의 전체 row 수는 약 5.5만 개인데, 과거에는 같은 데이터가 두 번 들어가서 X_train의 row 수가 9만 개를 넘겼었다는 이야기.

In [7]:
# Load the draw by explicit file name; file_list[3] depends on the arbitrary
# order returned by os.listdir and breaks as soon as the folder contents change.
AO_draw = pd.read_csv(base_route + 'AO_draw.csv')

In [8]:
AO_draw.head()

Unnamed: 0,name,rank,rank_point,height,age,hand
0,N.Djokovic,1,12030,188,33.723,Right-Handed
1,J.Chardy,66,950,188,33.995,Right-Handed
2,F.Tiafoe,62,1005,188,23.049,Right-Handed
3,S.Travaglia,71,894,185,29.118,Right-Handed
4,R.Opelka,40,1402,211,23.447,Right-Handed


In [9]:
# Every unordered pair (p, q) of draw positions with p < q. The number of
# players comes from the draw itself instead of a hard-coded 128.
all_list = list(itertools.combinations(range(len(AO_draw)), 2))

In [10]:
predict_prob = []
# Constant tail of every feature row, in the order
# best_of_3, best_of_5, surface_Clay, surface_Grass, surface_Hard:
# the Australian Open is best-of-5 on a hard court.
base = [0, 1, 0, 0, 1]
total_base = []

# Build one model-input row per pairing: the player at draw position p is the
# "player", the one at position q is the "opponent".
for p, q in all_list:
    player, opponent = AO_draw.iloc[p], AO_draw.iloc[q]

    numeric_part = [
        opponent["rank"],
        player["rank"],
        player["height"] - opponent["height"],
        player["rank_point"] - opponent["rank_point"],
        player["age"] - opponent["age"],
    ]

    # One-hot hand encoding in the order (L, R, U).
    # NOTE(review): anything other than "Right-Handed" is encoded as
    # left-handed; the U slot is never set here.
    hand_a = [0, 1, 0] if player["hand"] == "Right-Handed" else [1, 0, 0]
    hand_b = [0, 1, 0] if opponent["hand"] == "Right-Handed" else [1, 0, 0]

    total_base.append(numeric_part + hand_a + hand_b + base)


In [11]:
# Wrap the pairing rows in a DataFrame. The column names are listed in the same
# order as the feature columns printed for X_train above, and each row of
# total_base was assembled in that order.
match_input = pd.DataFrame(total_base, columns=['opponent_rank', 'rank', 'ht_dff', 'rank_points_dff', 'age_dff',
       'hand_L', 'hand_R', 'hand_U', 'opponent_hand_L', 'opponent_hand_R',
       'opponent_hand_U', 'best_of_3', 'best_of_5', 'surface_Clay',
       'surface_Grass', 'surface_Hard'])

## 모델링

### GNB

In [12]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

In [13]:
scaler = StandardScaler()

In [14]:
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [15]:
# NOTE(review): the scaled array is only displayed, never assigned. Every model
# below is fit on the unscaled X_train, and the test sets and match_input are
# never transformed either, so the scaler currently has no effect on results.
scaler.transform(X_train)

array([[-0.21856992, -0.61715743,  0.83778367, ..., -0.6956252 ,
        -0.34900395,  0.93983751],
       [-0.3992428 , -0.24499811,  1.35958522, ..., -0.6956252 ,
        -0.34900395,  0.93983751],
       [-0.42049843, -0.05360188, -0.72762099, ..., -0.6956252 ,
        -0.34900395,  0.93983751],
       ...,
       [-0.66493821, -0.75538804, -1.87558441, ..., -0.6956252 ,
        -0.34900395,  0.93983751],
       [-0.71807729, -0.75538804, -1.04070192, ..., -0.6956252 ,
        -0.34900395,  0.93983751],
       [-0.76058856, -0.71285555,  1.04650429, ..., -0.6956252 ,
        -0.34900395,  0.93983751]])

In [16]:
# Gaussian Naive Bayes baseline, evaluated on the 2019 matches.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred19 = gnb.predict(X_test19)
n_wrong19 = (y_test19 != y_pred19).sum()
print(f"Number of mislabeled points out of a total {X_test19.shape[0]} points : {n_wrong19}")

Number of mislabeled points out of a total 841 points : 308


### 로지스틱

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
# Logistic regression with a fixed random_state, evaluated on the 2019 matches.
LR = LogisticRegression(random_state=202101)
LR.fit(X_train, y_train)
y_pred19 = LR.predict(X_test19)
n_wrong19 = (y_test19 != y_pred19).sum()
print(f"Number of mislabeled points out of a total {X_test19.shape[0]} points : {n_wrong19}")

Number of mislabeled points out of a total 841 points : 302


### XGBoost

In [19]:
from xgboost import XGBClassifier, plot_importance

In [20]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [21]:
# Misclassification counts of the XGBoost model on the 2019 and 2020 matches.
y_pred19 = xgb.predict(X_test19)
y_pred20 = xgb.predict(X_test20)
for X_eval, y_true, y_hat in ((X_test19, y_test19, y_pred19),
                              (X_test20, y_test20, y_pred20)):
    print(f"Number of mislabeled points out of a total {X_eval.shape[0]} points : {(y_true != y_hat).sum()}")

Number of mislabeled points out of a total 841 points : 306
Number of mislabeled points out of a total 272 points : 85


### Catboost

In [22]:
from catboost import CatBoostClassifier

In [23]:
cat = CatBoostClassifier(n_estimators = 1000, learning_rate =0.05, verbose=False)
cat.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7fbf1cd37f90>

In [24]:
# Misclassification counts of the CatBoost model on the 2019 and 2020 matches.
y_pred19 = cat.predict(X_test19)
y_pred20 = cat.predict(X_test20)
for X_eval, y_true, y_hat in ((X_test19, y_test19, y_pred19),
                              (X_test20, y_test20, y_pred20)):
    print(f"Number of mislabeled points out of a total {X_eval.shape[0]} points : {(y_true != y_hat).sum()}")

Number of mislabeled points out of a total 841 points : 305
Number of mislabeled points out of a total 272 points : 82


## input 전처리

In [25]:
def model_apply(match_input, model):
    '''
    Attach each pairing's predicted win probability to its (rank, opponent_rank).

    match_input: pd.DataFrame
        The pairing feature frame built above; must contain the columns
        'rank' and 'opponent_rank'.

    model: callable (not a str)
        A bound probability-prediction method, e.g. ``LR.predict_proba``.
        It is called as ``model(match_input)`` and must return an array-like
        of shape (len(match_input), 2).

    Returns
    -------
    pd.DataFrame with columns ['rank', 'opponent_rank', 'win'], indexed like
    match_input. 'win' is column 1 of the prediction, i.e. the probability of
    label 1. In the training data label 1 marks rows where the 'rank' player
    won, so 'win' is the probability that 'rank' beats 'opponent_rank'.

    Raises
    ------
    ValueError
        If the prediction does not have exactly two columns or one row per
        pairing.
    '''
    proba = np.asarray(model(match_input))
    if proba.ndim != 2 or proba.shape[1] != 2:
        raise ValueError(
            "model must return an array of shape (n, 2); got shape %s" % (proba.shape,))
    if proba.shape[0] != len(match_input):
        raise ValueError(
            "model returned %d rows for %d pairings" % (proba.shape[0], len(match_input)))

    # Assign positionally instead of pd.concat(axis=1): concat aligns on index
    # labels and silently produces NaN-padded rows when match_input does not
    # carry a default 0..n-1 index.
    r_value = match_input[['rank', 'opponent_rank']].copy()
    r_value['win'] = proba[:, 1]

    return r_value

In [26]:
final_input_gnb= model_apply(match_input,gnb.predict_proba)
final_input_LR = model_apply(match_input,LR.predict_proba)
final_input_xgb = model_apply(match_input,xgb.predict_proba)
final_input_cat = model_apply(match_input,cat.predict_proba)

## MC simulator

### class 지정

In [27]:
class tournament(object):

    '''
    Monte Carlo simulator for a single-elimination draw.

    Attributes (set by fit / cycles)

    self.how_many_total_wins: DataFrame with columns 'rank' and 'win'; the
        final result, counting how many simulated tournaments each player won.

    self.match_list: numpy array of (rank, opponent_rank, win probability)
        rows for every possible pairing.

    self.match_counts: number of matches in each round, first round first.

    self.rank_list: the 'rank' column of the draw, in draw order.

    self.cycle: number of simulations requested in the last cycles() call.
    '''

    def fit(self, draw, m_input):
        '''
        Load a draw and the pairwise win probabilities into the simulator.

        draw: pd.DataFrame
            The draw; needs a 'rank' column, rows in bracket order.
        m_input: pd.DataFrame
            Output of model_apply(): columns (rank, opponent_rank, win), where
            'win' is the probability that 'rank' beats 'opponent_rank'.
        '''

        rank_list = draw[["rank"]]
        self.how_many_total_wins = rank_list.copy()
        self.how_many_total_wins.insert(1, "win", 0)
        self.match_list = m_input.copy().to_numpy()
        self.match_counts = []
        self.len = len(rank_list)
        self.rank_list = draw["rank"].copy()

        # Halve the field until a single match (the final) remains.
        for _ in range(len(rank_list)):
            self.len = int(self.len/2)
            self.match_counts.append(self.len)
            if self.len <= 1:
                break

        # Build a (rank, opponent_rank) -> probability table once, so that
        # match_win does not have to scan the whole pairing array for every
        # simulated match. setdefault keeps the first row for a repeated key,
        # matching the previous "first matching row" behaviour.
        self._win_prob = {}
        for row in self.match_list[:, :3].tolist():
            self._win_prob.setdefault((row[0], row[1]), row[2])

    def match_win(self, rank_a, rank_b):
        '''
        Play one match and return the rank of the winner.

        rank_a, rank_b: int
            Ranks of the two players. The pairing must exist in m_input as
            (rank=rank_a, opponent_rank=rank_b); an IndexError is raised
            otherwise.
        '''

        try:
            prob = self._win_prob[(rank_a, rank_b)]
        except KeyError:
            raise IndexError(
                "no win probability for pairing (%s, %s)" % (rank_a, rank_b))
        rand_num = uniform(0, 1)

        # rank_a advances when its win probability exceeds the random draw.
        if prob > rand_num:
            return rank_a
        else:
            return rank_b

    def tournament_one_round(self, matches, rank_list_1):
        '''
        Play every match of one round and return the winners in bracket order.

        matches: int
            Number of matches in this round.

        rank_list_1: list
            Ranks of the players entering the round; neighbours
            (0 vs 1, 2 vs 3, ...) play each other.
        '''
        winners = []

        for i in range(matches):
            rank_a = rank_list_1[2*i]
            rank_b = rank_list_1[2*i + 1]
            winners.append(self.match_win(rank_a, rank_b))
        return winners

    def tournament_one_cycle(self):
        '''
        Simulate one complete tournament and credit the champion with a win.
        '''

        # Work on a plain positional list: indexing the Series with [0], [1], ...
        # is label-based and fails when the draw has a non-default index.
        rank_list_2 = self.rank_list.tolist()
        for matches in self.match_counts:
            rank_list_2 = self.tournament_one_round(matches, rank_list_2)
        champion = rank_list_2[0]
        self.how_many_total_wins.loc[self.how_many_total_wins['rank'] == champion, 'win'] += 1

    def cycles(self, num=1):
        '''
        Run the whole tournament num times and return the win counts.

        num: int
            Number of simulated tournaments.
        '''

        self.cycle = num

        for _ in tqdm(range(num)):
            self.tournament_one_cycle()
        return(self.how_many_total_wins)

In [28]:
def file_name(model, num, file_type, directory=None):
    '''
    Build a result file name that does not clash with existing result files.

    model: str
        Name of the model that was used.
    num: int
        Number of simulation cycles.
    file_type: str
        File extension without the dot, e.g. "csv".
    directory: str, optional
        Folder to inspect for existing result files. Defaults to the
        notebook-level base_route.

    Returns "result_<model>_<num>.<file_type>" when no result for this exact
    model/num exists yet, otherwise "result_<model>_<num>_(k).<file_type>".
    '''

    if directory is None:
        directory = base_route

    result_name = "result_" + model + "_" + str(num)
    extension = "." + file_type
    existing = set(os.listdir(directory))

    def _candidate(count):
        # count == 0 -> plain name, otherwise the "_(k)" variant.
        middle = "" if count == 0 else "_(%s)" % (str(count))
        return result_name + middle + extension

    def _is_own_result(entry):
        # Accept only "<result_name><extension>" or "<result_name>_(digits)<extension>".
        # A plain substring test would also count "result_gnb_1000..." files
        # when looking for "result_gnb_100".
        if len(entry) < len(result_name) + len(extension):
            return False
        if not (entry.startswith(result_name) and entry.endswith(extension)):
            return False
        middle = entry[len(result_name):len(entry) - len(extension)]
        if middle == "":
            return True
        return middle.startswith("_(") and middle.endswith(")") and middle[2:-1].isdigit()

    result_name_count = sum(1 for entry in existing if _is_own_result(entry))

    # If earlier results were deleted, the count may point at a name that is
    # still taken; advance until the name is free so nothing is overwritten.
    while _candidate(result_name_count) in existing:
        result_name_count += 1

    return _candidate(result_name_count)


def rank_to_name(result):
    '''
    Add player names and win percentages to a Monte Carlo result table.

    result: pd.DataFrame
        Simulation result with the columns 'rank' and 'win'.

    Prints the enriched table and returns it restricted to the columns
    ['name', 'rank', 'win', 'win_rate'], where win_rate is the share of all
    recorded wins in percent, rounded to two decimals. Names are looked up in
    the notebook-level AO_draw; the first draw row with a matching rank is used.
    '''

    # Total number of tournaments that were simulated.
    cycle = result['win'].sum()

    name_list = [
        AO_draw.loc[AO_draw['rank'] == rank]['name'].tolist()[0]
        for rank in result['rank']
    ]

    result_1 = result.copy()
    result_1['name'] = name_list
    result_1['win_rate'] = (result_1['win'] / cycle * 100).round(2)
    print(result_1)

    return result_1[['name', 'rank', 'win', 'win_rate']]


def tourn_model(num, final_input, model):
    '''
    Convenience wrapper: simulate the tournament and save the result as CSV.

    num: int
        Number of simulation cycles.

    final_input: pd.DataFrame
        Pairwise win probabilities as returned by model_apply().

    model: str
        Model name; used in the progress message and in the output file name.

    The result is written to base_route under the name chosen by file_name();
    nothing is returned.
    '''

    print('tournament by %s in %d' % (model, num))

    simulator = tournament()
    simulator.fit(AO_draw, final_input)
    win_counts = simulator.cycles(num)

    # Most frequent champions first, then attach names and win rates.
    ranked = win_counts.sort_values(by=['win'], axis=0, ascending=False)
    named = rank_to_name(ranked)

    target_path = base_route + file_name(model, simulator.cycle, "csv")
    named.to_csv(target_path, index=False)





### simulator 실행

In [29]:
cycle_list = [100, 1000, 10000, 100000]  # simulation sizes to run

# Model name and its pairwise-probability table, in the order they are simulated.
model_inputs = [
    ("gnb", final_input_gnb),
    ("LR", final_input_LR),
    ("xgb", final_input_xgb),
    ("cat", final_input_cat),
]

for cycle in cycle_list:
    for model_name, model_input in model_inputs:
        tourn_model(cycle, model_input, model_name)

tournament by gnb in 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


     rank  win         name  win_rate
0       1   47   N.Djokovic      47.0
95      4   20   D.Medvedev      20.0
32      3   16      D.Thiem      16.0
127     2   11      R.Nadal      11.0
96      6    2  S.Tsitsipas       2.0
..    ...  ...          ...       ...
41     43    0      M.Cilic       0.0
40     19    0   G.Dimitrov       0.0
39     32    0    U.Humbert       0.0
38    106    0   Y.Uchiyama       0.0
64      8    0     A.Rublev       0.0

[128 rows x 4 columns]
tournament by LR in 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


     rank  win         name  win_rate
0       1   46   N.Djokovic      46.0
127     2   19      R.Nadal      19.0
32      3   18      D.Thiem      18.0
95      4   10   D.Medvedev      10.0
96      6    3  S.Tsitsipas       3.0
..    ...  ...          ...       ...
40     19    0   G.Dimitrov       0.0
39     32    0    U.Humbert       0.0
38    106    0   Y.Uchiyama       0.0
37     47    0    N.Kyrgios       0.0
65    101    0   Y.Hanfmann       0.0

[128 rows x 4 columns]
tournament by xgb in 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


     rank  win        name  win_rate
0       1   33  N.Djokovic      33.0
32      3   22     D.Thiem      22.0
95      4   18  D.Medvedev      18.0
127     2   14     R.Nadal      14.0
64      8    3    A.Rublev       3.0
..    ...  ...         ...       ...
42    174    0      A.Bolt       0.0
41     43    0     M.Cilic       0.0
40     19    0  G.Dimitrov       0.0
39     32    0   U.Humbert       0.0
66     83    0  T.Monteiro       0.0

[128 rows x 4 columns]
tournament by cat in 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


     rank  win        name  win_rate
0       1   37  N.Djokovic      37.0
95      4   22  D.Medvedev      22.0
32      3   20     D.Thiem      20.0
127     2   18     R.Nadal      18.0
31      7    2    A.Zverev       2.0
..    ...  ...         ...       ...
41     43    0     M.Cilic       0.0
40     19    0  G.Dimitrov       0.0
39     32    0   U.Humbert       0.0
38    106    0  Y.Uchiyama       0.0
64      8    0    A.Rublev       0.0

[128 rows x 4 columns]
tournament by gnb in 1000


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


     rank  win         name  win_rate
0       1  456   N.Djokovic      45.6
95      4  210   D.Medvedev      21.0
32      3  178      D.Thiem      17.8
127     2   60      R.Nadal       6.0
96      6   45  S.Tsitsipas       4.5
..    ...  ...          ...       ...
42    174    0       A.Bolt       0.0
41     43    0      M.Cilic       0.0
39     32    0    U.Humbert       0.0
38    106    0   Y.Uchiyama       0.0
66     83    0   T.Monteiro       0.0

[128 rows x 4 columns]
tournament by LR in 1000


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


     rank  win         name  win_rate
0       1  483   N.Djokovic      48.3
127     2  173      R.Nadal      17.3
32      3  157      D.Thiem      15.7
95      4  148   D.Medvedev      14.8
96      6   21  S.Tsitsipas       2.1
..    ...  ...          ...       ...
40     19    0   G.Dimitrov       0.0
39     32    0    U.Humbert       0.0
38    106    0   Y.Uchiyama       0.0
37     47    0    N.Kyrgios       0.0
64      8    0     A.Rublev       0.0

[128 rows x 4 columns]
tournament by xgb in 1000


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


     rank  win        name  win_rate
0       1  319  N.Djokovic      31.9
32      3  205     D.Thiem      20.5
127     2  173     R.Nadal      17.3
95      4  167  D.Medvedev      16.7
31      7   36    A.Zverev       3.6
..    ...  ...         ...       ...
41     43    0     M.Cilic       0.0
40     19    0  G.Dimitrov       0.0
39     32    0   U.Humbert       0.0
38    106    0  Y.Uchiyama       0.0
65    101    0  Y.Hanfmann       0.0

[128 rows x 4 columns]
tournament by cat in 1000


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


     rank  win        name  win_rate
0       1  359  N.Djokovic      35.9
32      3  222     D.Thiem      22.2
127     2  193     R.Nadal      19.3
95      4  160  D.Medvedev      16.0
31      7   19    A.Zverev       1.9
..    ...  ...         ...       ...
42    174    0      A.Bolt       0.0
41     43    0     M.Cilic       0.0
39     32    0   U.Humbert       0.0
38    106    0  Y.Uchiyama       0.0
65    101    0  Y.Hanfmann       0.0

[128 rows x 4 columns]
tournament by gnb in 10000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


     rank   win         name  win_rate
0       1  4704   N.Djokovic     47.04
95      4  1877   D.Medvedev     18.77
32      3  1780      D.Thiem     17.80
127     2   739      R.Nadal      7.39
96      6   371  S.Tsitsipas      3.71
..    ...   ...          ...       ...
54    128     0      C.Stebe      0.00
53    105     0  J.Duckworth      0.00
52    120     0    D.Dzumhur      0.00
51    229     0      B.Tomic      0.00
66     83     0   T.Monteiro      0.00

[128 rows x 4 columns]
tournament by LR in 10000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


     rank   win         name  win_rate
0       1  4879   N.Djokovic     48.79
127     2  1811      R.Nadal     18.11
32      3  1513      D.Thiem     15.13
95      4  1444   D.Medvedev     14.44
96      6   163  S.Tsitsipas      1.63
..    ...   ...          ...       ...
42    174     0       A.Bolt      0.00
41     43     0      M.Cilic      0.00
40     19     0   G.Dimitrov      0.00
39     32     0    U.Humbert      0.00
65    101     0   Y.Hanfmann      0.00

[128 rows x 4 columns]
tournament by xgb in 10000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


     rank   win         name  win_rate
0       1  3264   N.Djokovic     32.64
32      3  2116      D.Thiem     21.16
127     2  1762      R.Nadal     17.62
95      4  1591   D.Medvedev     15.91
96      6   387  S.Tsitsipas      3.87
..    ...   ...          ...       ...
50    104     0     Y.Sugita      0.00
46     41     0  K.Nishikori      0.00
44    178     0  K.Coppejans      0.00
43     89     0     N.Gombos      0.00
65    101     0   Y.Hanfmann      0.00

[128 rows x 4 columns]
tournament by cat in 10000


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


     rank   win         name  win_rate
0       1  3472   N.Djokovic     34.72
32      3  2088      D.Thiem     20.88
127     2  2054      R.Nadal     20.54
95      4  1717   D.Medvedev     17.17
96      6   249  S.Tsitsipas      2.49
..    ...   ...          ...       ...
43     89     0     N.Gombos      0.00
42    174     0       A.Bolt      0.00
41     43     0      M.Cilic      0.00
39     32     0    U.Humbert      0.00
65    101     0   Y.Hanfmann      0.00

[128 rows x 4 columns]
tournament by gnb in 100000


HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


     rank    win          name  win_rate
0       1  47204    N.Djokovic     47.20
95      4  18710    D.Medvedev     18.71
32      3  17922       D.Thiem     17.92
127     2   7290       R.Nadal      7.29
96      6   3594   S.Tsitsipas      3.59
..    ...    ...           ...       ...
98    266      0  T.Kokkinakis      0.00
68   1001      0          L.Tu      0.00
42    174      0        A.Bolt      0.00
38    106      0    Y.Uchiyama      0.00
66     83      0    T.Monteiro      0.00

[128 rows x 4 columns]
tournament by LR in 100000


HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


     rank    win         name  win_rate
0       1  49744   N.Djokovic     49.74
127     2  17900      R.Nadal     17.90
32      3  14589      D.Thiem     14.59
95      4  14505   D.Medvedev     14.50
96      6   1510  S.Tsitsipas      1.51
..    ...    ...          ...       ...
24     35      0  A.Mannarino      0.00
89    198      0      R.Haase      0.00
54    128      0      C.Stebe      0.00
91    206      0      Q.Halys      0.00
53    105      0  J.Duckworth      0.00

[128 rows x 4 columns]
tournament by xgb in 100000


HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


     rank    win         name  win_rate
0       1  31968   N.Djokovic     31.97
32      3  20816      D.Thiem     20.82
127     2  18090      R.Nadal     18.09
95      4  16237   D.Medvedev     16.24
96      6   3664  S.Tsitsipas      3.66
..    ...    ...          ...       ...
60    209      0     A.Muller      0.00
59    113      0   A.Karatsev      0.00
54    128      0      C.Stebe      0.00
53    105      0  J.Duckworth      0.00
65    101      0   Y.Hanfmann      0.00

[128 rows x 4 columns]
tournament by cat in 100000


HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


     rank    win         name  win_rate
0       1  34811   N.Djokovic     34.81
32      3  21381      D.Thiem     21.38
127     2  20147      R.Nadal     20.15
95      4  17203   D.Medvedev     17.20
96      6   2428  S.Tsitsipas      2.43
..    ...    ...          ...       ...
52    120      0    D.Dzumhur      0.00
51    229      0      B.Tomic      0.00
50    104      0     Y.Sugita      0.00
45     69      0     J.Vesely      0.00
65    101      0   Y.Hanfmann      0.00

[128 rows x 4 columns]
