In [1]:
from random import *
import pandas as pd
import itertools
import numpy as np
import os
from tqdm import tqdm

In [2]:
file_list = os.listdir()
file_list

['.ipynb_checkpoints',
 '2001-2018 (1).csv',
 '2019.csv',
 '2020.csv',
 'AO_draw.csv',
 'MC.ipynb',
 'model.ipynb']

In [3]:
train_data = pd.read_csv(file_list[1], index_col=0)
test_data19 = pd.read_csv(file_list[2], index_col=0)
test_data20 = pd.read_csv(file_list[3], index_col=0)

## 전처리 및 세팅

In [4]:
def train_test_split(data):
    data = data.copy()
    data.drop(['tourney_date', 'winner_id', 'loser_id'], axis=1, inplace=True)
    np.random.seed(202101)
    random_number = np.random.rand(len(data), ) < 0.5
    data_win = data.copy()[random_number]
    data_lose = data.copy()[~random_number]
    data_win['y'] = 1
    data_lose['y'] = 0
    trim_data = data_lose.rename(columns={'winner_hand': 'loser_hand', 'loser_hand': 'winner_hand',
                                          'winner_ht': 'loser_ht', 'loser_ht': 'winner_ht',
                                          'winner_age': 'loser_age', 'loser_age': 'winner_age',
                                          'winner_rank': 'loser_rank', 'loser_rank': 'winner_rank',
                                          'winner_rank_points': 'loser_rank_points', 'loser_rank_points': 'winner_rank_points',
                                         })
    trim_data = pd.concat([trim_data, data_win])
    # more trimming
    trim_data['ht_dff'] = trim_data['winner_ht'] - trim_data['loser_ht']
    trim_data['rank_points_dff'] = trim_data['winner_rank_points'] - trim_data['loser_rank_points']
    trim_data['age_dff'] = trim_data['winner_age'] - trim_data['loser_age']
    
    trim_data.drop(['winner_ht', 'loser_ht', 'winner_rank_points', 'loser_rank_points', 'winner_age', 'loser_age'], axis=1, inplace=True)
    
    trim_data.rename(columns={'winner_hand': 'hand',
                              'loser_hand': 'opponent_hand',
                              'winner_rank': 'rank',
                              'loser_rank': 'opponent_rank'}, inplace=True)
    # encoding
    trim_data.loc[(trim_data.surface == 'Carpet'),'surface'] = np.nan
    final_data = pd.get_dummies(trim_data, columns=["hand",'opponent_hand', 'best_of', 'surface'])
    final_data.dropna(inplace=True)
    # split
    print(final_data.shape)
    X = final_data.drop(["Quarter", 'y', 'Month'], axis=1)
    print(X.columns)
    y = final_data['y']
    return X, y

In [5]:
X_train, y_train = train_test_split(train_data)
X_test19, y_test19 = train_test_split(test_data19)
X_test20, y_test20 = train_test_split(test_data20)

(46943, 19)
Index(['opponent_rank', 'rank', 'ht_dff', 'rank_points_dff', 'age_dff',
       'hand_L', 'hand_R', 'hand_U', 'opponent_hand_L', 'opponent_hand_R',
       'opponent_hand_U', 'best_of_3', 'best_of_5', 'surface_Clay',
       'surface_Grass', 'surface_Hard'],
      dtype='object')
(841, 19)
Index(['opponent_rank', 'rank', 'ht_dff', 'rank_points_dff', 'age_dff',
       'hand_L', 'hand_R', 'hand_U', 'opponent_hand_L', 'opponent_hand_R',
       'opponent_hand_U', 'best_of_3', 'best_of_5', 'surface_Clay',
       'surface_Grass', 'surface_Hard'],
      dtype='object')
(272, 19)
Index(['opponent_rank', 'rank', 'ht_dff', 'rank_points_dff', 'age_dff',
       'hand_L', 'hand_R', 'hand_U', 'opponent_hand_L', 'opponent_hand_R',
       'opponent_hand_U', 'best_of_3', 'best_of_5', 'surface_Clay',
       'surface_Grass', 'surface_Hard'],
      dtype='object')


In [6]:
AO_draw = pd.read_csv("AO_draw.csv")

In [7]:
AO_draw.head()

Unnamed: 0,name,rank,rank_point,height,age,hand
0,N.Djokovic,1,12030,188,33.723,Right-Handed
1,J.Chardy,66,950,188,33.995,Right-Handed
2,F.Tiafoe,62,1005,188,23.049,Right-Handed
3,S.Travaglia,71,894,185,29.118,Right-Handed
4,R.Opelka,40,1402,211,23.447,Right-Handed


In [8]:
all_list = list(itertools.combinations(range(128),2))

In [9]:
predict_prob = []
base = [0,1,0,0,1]
total_base = []
for p, q in all_list:
    a = AO_draw.iloc[p]
    b = AO_draw.iloc[q]
    op_rank = b["rank"]
    rank = a["rank"]
    ht_dff = a["height"]-b["height"]
    point_dff = a["rank_point"]-b["rank_point"]
    age_dff = a["age"]-b["age"]
    if a["hand"]=="Right-Handed":
        hand_a = [0,1,0]
    else:
        hand_a = [1,0,0]
    if b["hand"]=="Right-Handed":
        hand_b = [0,1,0]
    else:
        hand_b = [1,0,0]
    p_q_match = [op_rank,rank,ht_dff,point_dff,age_dff]+hand_a+hand_b+base
    total_base.append(p_q_match)
    # 여기에서 model.ipynb와 같은 input cell을 만들어낸다.

# 이후 모델에 적용한 뒤에 append하여 probability추가하고 나머지 p,q제외하고 drop

In [10]:
match_input = pd.DataFrame(total_base, columns=['opponent_rank', 'rank', 'ht_dff', 'rank_points_dff', 'age_dff',
       'hand_L', 'hand_R', 'hand_U', 'opponent_hand_L', 'opponent_hand_R',
       'opponent_hand_U', 'best_of_3', 'best_of_5', 'surface_Clay',
       'surface_Grass', 'surface_Hard'])

## 모델링

### GNB

In [11]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

In [12]:
scaler = StandardScaler()

In [13]:
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [14]:
scaler.transform(X_train)

array([[-0.21856992, -0.61715743,  0.83778367, ..., -0.6956252 ,
        -0.34900395,  0.93983751],
       [-0.3992428 , -0.24499811,  1.35958522, ..., -0.6956252 ,
        -0.34900395,  0.93983751],
       [-0.42049843, -0.05360188, -0.72762099, ..., -0.6956252 ,
        -0.34900395,  0.93983751],
       ..., 
       [-0.66493821, -0.75538804, -1.87558441, ..., -0.6956252 ,
        -0.34900395,  0.93983751],
       [-0.71807729, -0.75538804, -1.04070192, ..., -0.6956252 ,
        -0.34900395,  0.93983751],
       [-0.76058856, -0.71285555,  1.04650429, ..., -0.6956252 ,
        -0.34900395,  0.93983751]])

In [15]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred19 = gnb.predict(X_test19)
print("Number of mislabeled points out of a total %d points : %d"
      % (X_test19.shape[0], (y_test19 != y_pred19).sum()))

Number of mislabeled points out of a total 841 points : 308


### 1-2 로지스틱

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
LR = LogisticRegression(random_state=202101)
LR.fit(X_train, y_train)
y_pred19 = LR.predict(X_test19)
print("Number of mislabeled points out of a total %d points : %d"
      % (X_test19.shape[0], (y_test19 != y_pred19).sum()))

Number of mislabeled points out of a total 841 points : 302


## 추가 전처리

In [18]:
def model_apply(match_input, model):
    
    prediction = model(match_input)
    prediction_df = pd.DataFrame(prediction, columns = ["win","lose"])
    r_value = pd.concat([match_input[['rank', 'opponent_rank']],prediction_df[['lose']]], axis=1)
    r_value = r_value.rename(columns={"lose":"win"})
    return r_value

In [19]:
final_input_gnb= model_apply(match_input,gnb.predict_proba)

In [20]:
final_input_log = model_apply(match_input,LR.predict_proba)

In [21]:
class tournament(object):
    
    def fit(self, draw, m_input):
        rank_list = draw[["rank"]]
        self.how_many_total_wins=rank_list.copy()
        self.how_many_total_wins.insert(1,"win",0)
        self.match_list = m_input.copy()
        self.match_counts = []
        self.len = len(rank_list)
        self.rank_list = draw["rank"].copy()
        
        for i in range(len(rank_list)):
            self.len = int(self.len/2)
            self.match_counts.append(self.len)
            if self.len <=1:
                break

        
        
    def match_win(self, rank_a, rank_b):

        prob = self.match_list.loc[(self.match_list['rank']==rank_a)&(self.match_list['opponent_rank']==rank_b)]['win']
        prob = prob.tolist()[0]
        rand_num = uniform(0,1)

        if prob > rand_num:
            return rank_a
        else:
            return rank_b
        
    def tournament_one_round(self, matches, rank_list_1):
        
        winners = []
        
        for i in range(matches):
            a = 2*i
            b = 2*i + 1
            rank_a = rank_list_1[a]
            rank_b = rank_list_1[b]
            winner = self.match_win(rank_a, rank_b)
            winners.append(winner)
        return winners
        
    def tournament_one_cycle(self):

        rank_list_2 = self.rank_list
        for matches in self.match_counts:
            rank_list_2 = self.tournament_one_round(matches,rank_list_2)
        self.how_many_total_wins.loc[self.how_many_total_wins['rank']==rank_list_2[0],'win']+=1
        
    def cycles(self, num=1):
#         num = int(input("cycles? \n"))
        self.cycle = num
        for i in tqdm(range(num)):

            self.tournament_one_cycle()
        return(self.how_many_total_wins)

In [22]:
def file_name(model,num,file_type):
    result_name = "result_"+model+"_"+str(num)
    result_name_count = 0
    for file_name in file_list:
        if result_name in file_name:
            result_name_count += 1
    result_name_count += 1
    
    final_name = result_name +"_("+str(result_name_count)+")."+file_type
    return final_name

In [None]:
to_gnb = tournament()
to_gnb.fit(AO_draw, final_input_gnb)
result_gnb = to_gnb.cycles(10000).sort_values(by=['win'], axis=0, ascending=False)
file_name_gnb = file_name("gnb",to_gnb.cycle,"csv")
result_gnb.to_csv(file_name_gnb,index=False)

 92%|██████████████████████████████████████████████████████████████████████▌      | 9165/10000 [18:45<01:43,  8.09it/s]

In [None]:
to_log = tournament()
to_log.fit(AO_draw, final_input_log)
result_log = to_log.cycles(10000).sort_values(by=['win'], axis=0, ascending=False)
file_name_log = file_name("log",to_gnb.cycle,"csv")
result_log.to_csv(file_name_log,index=False)