In [2]:
import pandas as pd
import os
import math
from collections import defaultdict
from glicko2 import Player as Glicko2Player
import trueskill
import re
import numpy as np
from sklearn.preprocessing import MinMaxScaler
pd.set_option('display.max_columns', None)

In [3]:
data = pd.DataFrame()
dataset_path = "../dataset"
for file in os.listdir(dataset_path):
    data_holder = pd.read_csv(f"../dataset/{file}")
    data = pd.concat([data, data_holder], ignore_index=True)


In [4]:
data['outcome'] = 1
data['tourney_date'] = pd.to_datetime(data['tourney_date'], format='%Y%m%d')
data['tourney_datetime'] = data['tourney_date'] + pd.to_timedelta(
    data.groupby(['tourney_date']).cumcount(), unit='s'
)

data = data.sort_values(by='tourney_datetime').reset_index(drop=True)
data['match_id'] = data.index
data = data.drop(columns=['tourney_date'])

data_orig = data.copy()

In [5]:
np.random.seed(42)

indices_para_inverter = data.sample(frac=0.5).index

colunas_pares = [
    ('winner_id', 'loser_id'),
    ('winner_seed', 'loser_seed'),
    ('winner_entry', 'loser_entry'),
    ('winner_name', 'loser_name'),
    ('winner_hand', 'loser_hand'),
    ('winner_ht', 'loser_ht'),
    ('winner_ioc', 'loser_ioc'),
    ('winner_age', 'loser_age'),
    ('winner_rank', 'loser_rank'),
    ('winner_rank_points', 'loser_rank_points'),

    ('w_ace', 'l_ace'),
    ('w_df', 'l_df'),
    ('w_svpt', 'l_svpt'),
    ('w_1stIn', 'l_1stIn'),
    ('w_1stWon', 'l_1stWon'),
    ('w_2ndWon', 'l_2ndWon'),
    ('w_SvGms', 'l_SvGms'),
    ('w_bpSaved', 'l_bpSaved'),
    ('w_bpFaced', 'l_bpFaced'),
]

for col_a, col_b in colunas_pares:
    temp = data.loc[indices_para_inverter, col_a].copy()
    data.loc[indices_para_inverter, col_a] = data.loc[indices_para_inverter, col_b]
    data.loc[indices_para_inverter, col_b] = temp

data.loc[indices_para_inverter, 'outcome'] = 0 

rename_dict = {}

for col in data.columns:
    if col.startswith('winner_'):
        rename_dict[col] = col.replace('winner_', 'player_A_')
    elif col.startswith('loser_'):
        rename_dict[col] = col.replace('loser_', 'player_B_')
    elif col.startswith('w_'):
        rename_dict[col] = col.replace('w_', 'player_A_')
    elif col.startswith('l_'):
        rename_dict[col] = col.replace('l_', 'player_B_')

data.rename(columns=rename_dict, inplace=True)

In [6]:
def inverter_score(score_str):
    if not isinstance(score_str, str):
        return score_str
    
    score_str = score_str.strip().upper()
    especiais = ['RET', 'W/O', 'WO', 'RETIRE']
    
    if any(score_str.startswith(e) for e in especiais):
        return score_str  # não inverte
    
    sets = score_str.split()
    sets_invertidos = []
    
    for s in sets:
        match = re.match(r'(\d+)-(\d+)(\(\d+\))?', s)
        if match:
            p1, p2, tb = match.groups()
            novo_set = f"{p2}-{p1}{tb or ''}"
            sets_invertidos.append(novo_set)
        else:
            sets_invertidos.append(s)  
            
    return ' '.join(sets_invertidos)

data.loc[data['outcome'] == 0, 'score'] = data.loc[data['outcome'] == 0, 'score'].apply(inverter_score)

In [7]:
def compute_rating_features(df):
    elo        = defaultdict(lambda: 1500)
    glicko     = defaultdict(lambda: Glicko2Player())
    ts_env     = trueskill.TrueSkill()
    ts_ratings = defaultdict(ts_env.Rating)

    rows = []

    for _, row in df.iterrows():
        A, B, result = row['player_A_name'], row['player_B_name'], row['outcome']

        rA_e, rB_e = elo[A], elo[B]
        pA,  pB    = glicko[A], glicko[B]
        tA,  tB    = ts_ratings[A], ts_ratings[B]

        exp_e = 1 / (1 + 10 ** ((rB_e - rA_e) / 400))
        q     = math.log(10) / 400
        g_phi = 1 / math.sqrt(1 + (3 * q**2 * pB.rd**2) / math.pi**2)
        exp_g = 1 / (1 + 10 ** (-g_phi * (pA.rating - pB.rating) / 400))

        quality = ts_env.quality_1vs1(tA, tB)

        rows.append({
            'tourney_datetime': row['tourney_datetime'],
            'player_A_name':        A,  
            'player_B_name':        B, 
            'winner_elo':          rA_e, 
            'loser_elo':          rB_e, 
            'elo_diff':       rA_e - rB_e, 
            'winner_elo_exp':       exp_e,
            'winner_glicko':       pA.rating,
            'winner_glicko_rd':    pA.rd,
            'loser_glicko':       pB.rating,
            'loser_glicko_rd':    pB.rd,
            'glicko_diff':    pA.rating - pB.rating, 
            'winner_glicko_exp':    exp_g,
            'winner_ts_mu':        tA.mu,
            'winner_ts_sigma':     tA.sigma,
            'loser_ts_mu':        tB.mu, 
            'loser_ts_sigma':     tB.sigma,
            'ts_quality':     quality,
            'match_id':       row['match_id']
        })

        K = 32
        elo[A] += K * (result - exp_e)
        elo[B] += K * ((1 - result) - (1 - exp_e))

        pA.update_player([pB.rating], [pB.rd], [result])
        pB.update_player([pA.rating], [pA.rd], [1 - result])

        if result == 1:
            new_tA, new_tB = ts_env.rate_1vs1(tA, tB)
        else:
            new_tB, new_tA = ts_env.rate_1vs1(tB, tA)
        ts_ratings[A], ts_ratings[B] = new_tA, new_tB

    return pd.DataFrame(rows)

df_features = compute_rating_features(data)

In [8]:
def invert_score_if_needed(score: str, outcome: int) -> str:
    """Inverte os sets na string de score se outcome==0."""
    if not isinstance(score, str):
        return score
    s = score.strip().upper()
    if outcome == 1 or s in {'W/O','WO','RET','RETIRE'}:
        return score  # não inverte

    partes = s.split()
    invertidos = []
    for p in partes:
        m = re.match(r'(\d+)-(\d+)(\(\d+\))?$', p)
        if m:
            a, b, tb = m.groups()
            invertidos.append(f"{b}-{a}{tb or ''}")
        else:
            invertidos.append(p)
    return ' '.join(invertidos)


def extract_score_features(row):
    """Retorna dict com features de score, já invertendo se necessário."""
    score = invert_score_if_needed(row['score'], row['outcome'])
    s = score.strip().upper() if isinstance(score, str) else ''
    if s in {'W/O','WO','RET','RETIRE'} or pd.isna(score):
        return {
            'score': score,
            'is_walkover': 1,
            'sets_A': np.nan, 'sets_B': np.nan,
            'games_A': np.nan, 'games_B': np.nan,
            'set_diff': np.nan, 'game_diff': np.nan,
            'n_tiebreaks': np.nan, 'pct_games_A': np.nan
        }
    pattern = r'(\d+)-(\d+)(?:\((\d+)\))?'
    raw = re.findall(pattern, score)
    sets = [(int(a), int(b), int(tb) if tb else 0) for a,b,tb in raw]
    sets_A = sum(1 for a,b,_ in sets if a > b)
    sets_B = sum(1 for a,b,_ in sets if b > a)
    games_A = sum(a for a,_,_ in sets)
    games_B = sum(b for _,b,_ in sets)
    n_tie  = sum(1 for *_,tb in sets if tb>0)
    total_games = games_A + games_B

    return {
        'score': score,
        'is_walkover': 0,
        'sets_A': sets_A,
        'sets_B': sets_B,
        'games_A': games_A,
        'games_B': games_B,
        'set_diff': sets_A - sets_B,
        'game_diff': games_A - games_B,
        'n_tiebreaks': n_tie,
        'pct_games_A': (games_A / total_games) if total_games else np.nan
    }

score_feats = data.apply(extract_score_features, axis=1, result_type='expand')

df_feats = pd.concat([
    data[['tourney_datetime','player_A_name','player_B_name','score','outcome','match_id']],
    score_feats
], axis=1)
score_feats = data.apply(extract_score_features, axis=1, result_type='expand')

df_feats = pd.concat([
    data[['match_id','tourney_datetime','player_A_name','player_B_name','score','outcome']],
    score_feats
], axis=1)

In [9]:
df_final = df_feats.sort_values(['tourney_datetime','match_id']).reset_index(drop=True)
df_final = (df_feats
            .sort_values('tourney_datetime')
            .reset_index(drop=True))
df_final['match_id'] = df_final.index

stats_cols = ['sets_A','sets_B','games_A','games_B','n_tiebreaks']
long = pd.concat([
    df_final[['match_id','tourney_datetime','player_A_name']+stats_cols]
      .rename(columns={'player_A_name':'player'}),
    df_final[['match_id','tourney_datetime','player_B_name']+stats_cols]
      .rename(columns={'player_B_name':'player'})
]).sort_values(['player','tourney_datetime','match_id'])

for c in stats_cols:
    long[f'avg_{c}'] = (long
                       .groupby('player')[c]
                       .transform(lambda x: x.expanding().mean().shift()))

long_stats = long[['match_id','player']+[f'avg_{c}' for c in stats_cols]]

df_model = (df_final
            .merge(long_stats, left_on=['match_id','player_A_name'],
                   right_on=['match_id','player'], how='left')
            .rename(columns={f'avg_{c}':f'player_A_avg_{c}' for c in stats_cols})
            .drop(columns=['player']))

df_model = (df_model
            .merge(long_stats, left_on=['match_id','player_B_name'],
                   right_on=['match_id','player'], how='left')
            .rename(columns={f'avg_{c}':f'player_B_avg_{c}' for c in stats_cols})
            .drop(columns=['player']))

to_drop = ['score','sets_A','sets_B','games_A','games_B','n_tiebreaks',
           'is_walkover','set_diff','game_diff','pct_games_A']
df_model = df_model.drop(columns=[c for c in to_drop if c in df_model.columns])

In [10]:
data[['match_id',"tourney_datetime",'player_A_name',"player_B_name"]].sort_values(by='match_id',ascending=False)

Unnamed: 0,match_id,tourney_datetime,player_A_name,player_B_name
129558,129558,2024-12-18 00:00:14,Nishesh Basavareddy,Luca Van Assche
129557,129557,2024-12-18 00:00:13,Juncheng Shang,Nishesh Basavareddy
129556,129556,2024-12-18 00:00:12,Luca Van Assche,Juncheng Shang
129555,129555,2024-12-18 00:00:11,Alex Michelsen,Nishesh Basavareddy
129554,129554,2024-12-18 00:00:10,Alex Michelsen,Luca Van Assche
...,...,...,...,...
4,4,1985-01-07 00:00:04,David Lewis,Leif Shiras
3,3,1985-01-07 00:00:03,Russell Barlow,Brad Drewett
2,2,1985-01-07 00:00:02,Howard Sands,Jonathan Canter
1,1,1985-01-07 00:00:01,Mark Wooldridge,Karl Meiler


In [11]:
df_model[['match_id',"tourney_datetime",'player_A_name',"player_B_name"]].sort_values(by='match_id',ascending=False)

Unnamed: 0,match_id,tourney_datetime,player_A_name,player_B_name
129558,129558,2024-12-18 00:00:14,Nishesh Basavareddy,Luca Van Assche
129557,129557,2024-12-18 00:00:13,Juncheng Shang,Nishesh Basavareddy
129556,129556,2024-12-18 00:00:12,Luca Van Assche,Juncheng Shang
129555,129555,2024-12-18 00:00:11,Alex Michelsen,Nishesh Basavareddy
129554,129554,2024-12-18 00:00:10,Alex Michelsen,Luca Van Assche
...,...,...,...,...
4,4,1985-01-07 00:00:04,David Lewis,Leif Shiras
3,3,1985-01-07 00:00:03,Russell Barlow,Brad Drewett
2,2,1985-01-07 00:00:02,Howard Sands,Jonathan Canter
1,1,1985-01-07 00:00:01,Mark Wooldridge,Karl Meiler


In [12]:
df_final2 = data.merge(df_model, on = ['outcome','match_id',"tourney_datetime",'player_A_name',"player_B_name"], how = 'left')
df_final2 = df_final2.merge(df_features, on = ['match_id',"tourney_datetime",'player_A_name',"player_B_name"], how = 'left')

In [13]:
df_final2 = df_final2.drop(columns=
[
'player_B_ioc',
'player_A_ioc',
'match_id',
'player_A_rank_points',
'player_A_rank',
'player_B_rank_points',
'player_B_rank',
'score',
'player_A_entry',
'player_A_seed',
'player_A_id',
'player_B_entry',
'player_B_seed',
'player_B_id',
'match_num',
'tourney_id',
'minutes',
'player_A_ace',
'player_A_df',
'player_A_svpt',
'player_A_1stIn',
'player_A_1stWon',
'player_A_2ndWon',
'player_A_SvGms',
'player_A_bpSaved',
'player_A_bpFaced',
'player_B_ace',
'player_B_df',
'player_B_svpt',
'player_B_1stIn',
'player_B_1stWon',
'player_B_2ndWon',
'player_B_SvGms',
'player_B_bpSaved',
'player_B_bpFaced',
'draw_size'
])

# Dummies


In [14]:
df_dummies = pd.get_dummies(df_final2, columns=['best_of','surface','round'], drop_first=True)

In [15]:
def reduce_top_categories(df, config):
    """
    Para cada (col, top_n) em config, cria coluna col_reduzido com as top_n categorias
    e agrupa o resto em 'Outros', depois aplica get_dummies em todas de uma vez.
    """
    reduced_cols = []
    for col, n in config:
        top = df[col].value_counts().nlargest(n).index
        reduced_col = f"{col}_reduzido"
        df[reduced_col] = df[col].where(df[col].isin(top), 'Outros')
        reduced_cols.append(reduced_col)
        
    return pd.get_dummies(df, columns=reduced_cols, drop_first=True)

config = [
    ('tourney_name', 20),
    ('tourney_level', 3),
    ('player_A_hand', 1),
    ('player_B_hand', 1),
]

df_dummies = reduce_top_categories(df_dummies, config)
df_dummies = df_dummies.drop(columns=["tourney_name",'tourney_level','player_A_hand','player_B_hand'])

In [16]:
scaler = MinMaxScaler()
df_dummies[['player_B_ht', 'player_B_age','player_A_ht', 'player_A_age']] = scaler.fit_transform(df_dummies[['player_B_ht', 'player_B_age','player_A_ht', 'player_A_age']])

In [17]:
df_dummies

Unnamed: 0,player_A_name,player_A_ht,player_A_age,player_B_name,player_B_ht,player_B_age,outcome,tourney_datetime,player_A_avg_sets_A,player_A_avg_sets_B,player_A_avg_games_A,player_A_avg_games_B,player_A_avg_n_tiebreaks,player_B_avg_sets_A,player_B_avg_sets_B,player_B_avg_games_A,player_B_avg_games_B,player_B_avg_n_tiebreaks,winner_elo,loser_elo,elo_diff,winner_elo_exp,winner_glicko,winner_glicko_rd,loser_glicko,loser_glicko_rd,glicko_diff,winner_glicko_exp,winner_ts_mu,winner_ts_sigma,loser_ts_mu,loser_ts_sigma,ts_quality,best_of_5,surface_Clay,surface_Grass,surface_Hard,round_ER,round_F,round_QF,round_R128,round_R16,round_R32,round_R64,round_RR,round_SF,tourney_name_reduzido_Barcelona,tourney_name_reduzido_Canada Masters,tourney_name_reduzido_Cincinnati Masters,tourney_name_reduzido_Indian Wells Masters,tourney_name_reduzido_Indianapolis,tourney_name_reduzido_Kitzbuhel,tourney_name_reduzido_Madrid Masters,tourney_name_reduzido_Memphis,tourney_name_reduzido_Miami Masters,tourney_name_reduzido_Monte Carlo Masters,tourney_name_reduzido_Outros,tourney_name_reduzido_Paris Masters,tourney_name_reduzido_Queen's Club,tourney_name_reduzido_Roland Garros,tourney_name_reduzido_Rome Masters,tourney_name_reduzido_Rotterdam,tourney_name_reduzido_US Open,tourney_name_reduzido_Vienna,tourney_name_reduzido_Washington,tourney_name_reduzido_Wimbledon,tourney_level_reduzido_G,tourney_level_reduzido_M,tourney_level_reduzido_Outros,player_A_hand_reduzido_R,player_B_hand_reduzido_R
0,Kelvin Belcher,0.875000,0.294915,John Fitzgerald,0.865385,0.320132,0,1985-01-07 00:00:00,,,,,,,,,,,1500.000000,1500.000000,0.000000,0.500000,1500.000000,350.000000,1500.000000,350.000000,0.000000,0.500000,25.000000,8.333333,25.000000,8.333333,0.447214,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,True
1,Mark Wooldridge,0.889423,0.271186,Karl Meiler,0.841346,0.702970,0,1985-01-07 00:00:01,,,,,,,,,,,1500.000000,1500.000000,0.000000,0.500000,1500.000000,350.000000,1500.000000,350.000000,0.000000,0.500000,25.000000,8.333333,25.000000,8.333333,0.447214,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,True
2,Howard Sands,0.817308,0.284746,Jonathan Canter,0.875000,0.171617,1,1985-01-07 00:00:02,,,,,,,,,,,1500.000000,1500.000000,0.000000,0.500000,1500.000000,350.000000,1500.000000,350.000000,0.000000,0.500000,25.000000,8.333333,25.000000,8.333333,0.447214,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,True
3,Russell Barlow,0.817308,0.254237,Brad Drewett,0.875000,0.399340,0,1985-01-07 00:00:03,,,,,,,,,,,1500.000000,1500.000000,0.000000,0.500000,1500.000000,350.000000,1500.000000,350.000000,0.000000,0.500000,25.000000,8.333333,25.000000,8.333333,0.447214,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,David Lewis,0.850962,0.196610,Leif Shiras,0.865385,0.363036,0,1985-01-07 00:00:04,,,,,,,,,,,1500.000000,1500.000000,0.000000,0.500000,1500.000000,350.000000,1500.000000,350.000000,0.000000,0.500000,25.000000,8.333333,25.000000,8.333333,0.447214,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129554,Alex Michelsen,0.913462,0.196610,Luca Van Assche,0.841346,0.204620,1,2024-12-18 00:00:10,2.155844,0.467532,14.948052,10.000000,0.610390,2.241379,0.551724,15.517241,10.586207,0.620690,1698.350371,1507.511677,190.838695,0.749989,1883.000355,64.854386,1712.478470,67.086100,170.521886,0.723138,30.874094,0.981368,27.896286,1.147192,0.859322,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True,True
129555,Alex Michelsen,0.913462,0.196610,Nishesh Basavareddy,0.850962,0.174917,1,2024-12-18 00:00:11,2.166667,0.474359,14.923077,10.025641,0.628205,,,,,,1706.350710,1500.000000,206.350710,0.766356,1889.546758,64.804578,1500.000000,350.000000,389.546758,0.817624,30.966481,0.977791,25.000000,8.333333,0.485200,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True,True
129556,Luca Van Assche,0.841346,0.203390,Juncheng Shang,0.850962,0.181518,1,2024-12-18 00:00:12,2.254237,0.559322,15.474576,10.610169,0.644068,2.135593,0.542373,15.457627,10.932203,0.491525,1499.511338,1672.973717,-173.462378,0.269231,1705.677028,66.932407,1877.955410,65.798391,-172.278383,0.274726,27.770281,1.138877,30.955710,1.082360,0.843026,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True,False
129557,Juncheng Shang,0.850962,0.179661,Nishesh Basavareddy,0.850962,0.174917,0,2024-12-18 00:00:13,2.150000,0.550000,15.433333,10.933333,0.516667,3.000000,1.000000,14.000000,12.000000,2.000000,1649.589096,1492.523403,157.065692,0.711800,1860.856320,65.662927,1449.373336,301.780405,411.482984,0.846921,30.714637,1.072524,21.584862,6.782215,0.391436,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,True


In [18]:
df_dummies.loc[df_dummies.tourney_datetime =='2024-12-18 00:00:14' ]

Unnamed: 0,player_A_name,player_A_ht,player_A_age,player_B_name,player_B_ht,player_B_age,outcome,tourney_datetime,player_A_avg_sets_A,player_A_avg_sets_B,player_A_avg_games_A,player_A_avg_games_B,player_A_avg_n_tiebreaks,player_B_avg_sets_A,player_B_avg_sets_B,player_B_avg_games_A,player_B_avg_games_B,player_B_avg_n_tiebreaks,winner_elo,loser_elo,elo_diff,winner_elo_exp,winner_glicko,winner_glicko_rd,loser_glicko,loser_glicko_rd,glicko_diff,winner_glicko_exp,winner_ts_mu,winner_ts_sigma,loser_ts_mu,loser_ts_sigma,ts_quality,best_of_5,surface_Clay,surface_Grass,surface_Hard,round_ER,round_F,round_QF,round_R128,round_R16,round_R32,round_R64,round_RR,round_SF,tourney_name_reduzido_Barcelona,tourney_name_reduzido_Canada Masters,tourney_name_reduzido_Cincinnati Masters,tourney_name_reduzido_Indian Wells Masters,tourney_name_reduzido_Indianapolis,tourney_name_reduzido_Kitzbuhel,tourney_name_reduzido_Madrid Masters,tourney_name_reduzido_Memphis,tourney_name_reduzido_Miami Masters,tourney_name_reduzido_Monte Carlo Masters,tourney_name_reduzido_Outros,tourney_name_reduzido_Paris Masters,tourney_name_reduzido_Queen's Club,tourney_name_reduzido_Roland Garros,tourney_name_reduzido_Rome Masters,tourney_name_reduzido_Rotterdam,tourney_name_reduzido_US Open,tourney_name_reduzido_Vienna,tourney_name_reduzido_Washington,tourney_name_reduzido_Wimbledon,tourney_level_reduzido_G,tourney_level_reduzido_M,tourney_level_reduzido_Outros,player_A_hand_reduzido_R,player_B_hand_reduzido_R
129558,Nishesh Basavareddy,0.850962,0.172881,Luca Van Assche,0.841346,0.20462,0,2024-12-18 00:00:14,3.0,1.0,14.5,10.5,1.5,2.266667,0.566667,15.45,10.616667,0.666667,1515.301007,1522.895959,-7.594952,0.489072,1819.452715,269.713818,1723.900602,66.776296,95.552113,0.631368,29.710595,5.003827,28.037036,1.126736,0.737168,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True,True


In [19]:
data_orig.loc[(data_orig.winner_name.str.contains("Jannik Sinner")) | (data_orig.loser_name.str.contains("Jannik Sinner"))].head(5)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,outcome,tourney_datetime,match_id
114605,2019-7648,Budapest,Clay,32,A,290,111513,5.0,,Laslo Djere,R,188.0,SRB,23.8,206173,,LL,Jannik Sinner,R,191.0,ITA,17.6,6-3 6-1,3,R16,90.0,5.0,2.0,64.0,37.0,26.0,15.0,8.0,4.0,4.0,1.0,1.0,70.0,38.0,20.0,16.0,8.0,5.0,9.0,33.0,1246.0,314.0,96.0,1,2019-04-22 00:00:57,114605
114614,2019-7648,Budapest,Clay,32,A,279,206173,,LL,Jannik Sinner,R,191.0,ITA,17.6,200218,,WC,Mate Valkusz,R,183.0,HUN,20.6,6-2 0-6 6-4,3,R32,103.0,1.0,2.0,68.0,36.0,24.0,10.0,12.0,2.0,8.0,2.0,5.0,82.0,54.0,27.0,15.0,12.0,9.0,15.0,314.0,96.0,323.0,91.0,1,2019-04-22 00:01:06,114614
114754,2019-M009,Rome Masters,Clay,64,M,277,126774,8.0,,Stefanos Tsitsipas,R,193.0,GRE,20.7,206173,,WC,Jannik Sinner,R,191.0,ITA,17.7,6-3 6-2,3,R32,77.0,1.0,1.0,55.0,34.0,22.0,14.0,9.0,2.0,3.0,1.0,0.0,48.0,28.0,16.0,7.0,8.0,2.0,6.0,7.0,3790.0,263.0,170.0,1,2019-05-13 00:00:23,114754
114774,2019-M009,Rome Masters,Clay,64,M,252,206173,,WC,Jannik Sinner,R,191.0,ITA,17.7,105449,,,Steve Johnson,R,188.0,USA,29.3,1-6 6-1 7-5,3,R64,111.0,3.0,3.0,78.0,47.0,33.0,14.0,13.0,1.0,4.0,5.0,3.0,79.0,46.0,31.0,18.0,13.0,3.0,7.0,263.0,170.0,59.0,885.0,1,2019-05-13 00:00:43,114774
114828,2019-7694,Lyon,Clay,32,A,284,106137,,LL,Tristan Lamasine,R,183.0,FRA,26.2,206173,,Q,Jannik Sinner,R,191.0,ITA,17.7,6-0 7-6(5),3,R32,77.0,1.0,1.0,66.0,50.0,38.0,6.0,9.0,4.0,5.0,3.0,2.0,55.0,31.0,14.0,17.0,9.0,3.0,7.0,270.0,153.0,229.0,215.0,1,2019-05-20 00:00:42,114828


In [20]:
df_dummies.loc[(df_dummies.player_A_name.str.contains("Jannik Sinner")) | (df_dummies.player_B_name.str.contains("Jannik Sinner"))].head(5)

Unnamed: 0,player_A_name,player_A_ht,player_A_age,player_B_name,player_B_ht,player_B_age,outcome,tourney_datetime,player_A_avg_sets_A,player_A_avg_sets_B,player_A_avg_games_A,player_A_avg_games_B,player_A_avg_n_tiebreaks,player_B_avg_sets_A,player_B_avg_sets_B,player_B_avg_games_A,player_B_avg_games_B,player_B_avg_n_tiebreaks,winner_elo,loser_elo,elo_diff,winner_elo_exp,winner_glicko,winner_glicko_rd,loser_glicko,loser_glicko_rd,glicko_diff,winner_glicko_exp,winner_ts_mu,winner_ts_sigma,loser_ts_mu,loser_ts_sigma,ts_quality,best_of_5,surface_Clay,surface_Grass,surface_Hard,round_ER,round_F,round_QF,round_R128,round_R16,round_R32,round_R64,round_RR,round_SF,tourney_name_reduzido_Barcelona,tourney_name_reduzido_Canada Masters,tourney_name_reduzido_Cincinnati Masters,tourney_name_reduzido_Indian Wells Masters,tourney_name_reduzido_Indianapolis,tourney_name_reduzido_Kitzbuhel,tourney_name_reduzido_Madrid Masters,tourney_name_reduzido_Memphis,tourney_name_reduzido_Miami Masters,tourney_name_reduzido_Monte Carlo Masters,tourney_name_reduzido_Outros,tourney_name_reduzido_Paris Masters,tourney_name_reduzido_Queen's Club,tourney_name_reduzido_Roland Garros,tourney_name_reduzido_Rome Masters,tourney_name_reduzido_Rotterdam,tourney_name_reduzido_US Open,tourney_name_reduzido_Vienna,tourney_name_reduzido_Washington,tourney_name_reduzido_Wimbledon,tourney_level_reduzido_G,tourney_level_reduzido_M,tourney_level_reduzido_Outros,player_A_hand_reduzido_R,player_B_hand_reduzido_R
114605,Laslo Djere,0.889423,0.315254,Jannik Sinner,0.903846,0.108911,1,2019-04-22 00:00:57,2.126984,0.380952,15.047619,10.52381,0.666667,,,,,,1641.739834,1500.0,141.739834,0.69337,1803.285736,64.222815,1500.0,350.0,303.285736,0.762801,29.710659,1.028647,25.0,8.333333,0.516954,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,True
114614,Jannik Sinner,0.903846,0.105085,Mate Valkusz,0.865385,0.207921,1,2019-04-22 00:01:06,2.0,0.0,12.0,4.0,0.0,3.0,0.5,21.0,11.5,0.5,1490.187827,1481.364713,8.823114,0.512695,1430.60476,286.085866,1390.366838,255.768001,40.237922,0.544839,21.14834,6.67266,19.755081,5.934083,0.546128,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,True
114754,Jannik Sinner,0.903846,0.108475,Stefanos Tsitsipas,0.913462,0.211221,0,2019-05-13 00:00:23,2.0,0.5,12.0,8.0,0.0,2.15873,0.420635,14.928571,10.690476,0.587302,1505.781595,1985.833032,-480.051437,0.059334,1549.182602,241.433109,2115.146115,65.492852,-565.963513,0.039552,24.309258,5.808377,35.292107,0.869071,0.296329,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,True,True
114774,Steve Johnson,0.889423,0.501695,Jannik Sinner,0.903846,0.112211,0,2019-05-13 00:00:43,2.165563,0.390728,15.195364,10.705298,0.569536,2.0,0.333333,12.0,7.0,0.0,1618.20182,1503.882894,114.318927,0.658827,1810.433834,62.471473,1537.02149,233.568516,273.412344,0.77978,31.21699,0.801948,23.457903,5.363677,0.460172,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,True,True
114828,Tristan Lamasine,0.865385,0.39661,Jannik Sinner,0.903846,0.112211,1,2019-05-20 00:00:42,2.166667,0.333333,14.5,10.0,0.166667,2.0,0.5,12.5,8.25,0.0,1479.284453,1524.965367,-45.680914,0.434636,1484.300364,179.810495,1734.950901,207.797508,-250.650537,0.230674,22.244751,3.659277,29.114391,4.283344,0.506795,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,True


In [21]:
df_dummies.columns

Index(['player_A_name', 'player_A_ht', 'player_A_age', 'player_B_name',
       'player_B_ht', 'player_B_age', 'outcome', 'tourney_datetime',
       'player_A_avg_sets_A', 'player_A_avg_sets_B', 'player_A_avg_games_A',
       'player_A_avg_games_B', 'player_A_avg_n_tiebreaks',
       'player_B_avg_sets_A', 'player_B_avg_sets_B', 'player_B_avg_games_A',
       'player_B_avg_games_B', 'player_B_avg_n_tiebreaks', 'winner_elo',
       'loser_elo', 'elo_diff', 'winner_elo_exp', 'winner_glicko',
       'winner_glicko_rd', 'loser_glicko', 'loser_glicko_rd', 'glicko_diff',
       'winner_glicko_exp', 'winner_ts_mu', 'winner_ts_sigma', 'loser_ts_mu',
       'loser_ts_sigma', 'ts_quality', 'best_of_5', 'surface_Clay',
       'surface_Grass', 'surface_Hard', 'round_ER', 'round_F', 'round_QF',
       'round_R128', 'round_R16', 'round_R32', 'round_R64', 'round_RR',
       'round_SF', 'tourney_name_reduzido_Barcelona',
       'tourney_name_reduzido_Canada Masters',
       'tourney_name_reduzido_Cinci

In [None]:
df_dummies.to_parquet("df_ml.parquet")