In [1]:
import os
import numpy as np
import pandas as pd
import datetime as dt

from definitions import INPUT_PATH
from pre_process_utils import logger

# Read all ATP players
atp_players = pd.read_csv(os.path.join(INPUT_PATH, 'atp_results/atp_players.csv'), parse_dates=['birthdate'])

# Load all matches from odds files
match_odds = []
for year in range(2001, 2019 + 1):
    if year < 2013:
        match_odds.append(pd.read_excel(os.path.join(INPUT_PATH, 'odds/' + str(year) + '.xls'), 
                                          parse_dates=['Date']))
    else:
        match_odds.append(pd.read_excel(os.path.join(INPUT_PATH, 'odds/' + str(year) + '.xlsx'), 
                                         parse_dates=['Date']))

match_odds = pd.concat(match_odds, sort=False)

processed_matches = pd.read_hdf(os.path.join(INPUT_PATH, 'processed_matches_FINAL.h5'), key='matches')

# Uncomment if debug
#odds_players = odds_players.loc[odds_players['Date'] >= '20190101']

In [2]:
@logger
def start_pipe(df):
    return df.copy()

@logger
def clean_odds(df):
    filter_cols = ['ATP', 'Date', 'Winner', 'Loser', 'CBW', 'CBL', 'GBW', 'GBL', 'IWW', 'IWL', 'SBW', 'SBL', 
                  'B365W', 'B365L', 'B&WW', 'B&WL', 'EXW', 'EXL', 'PSW', 'PSL']
    df = df.filter(filter_cols)
    df.columns = map(str.lower, df.columns)
    df.fillna(0, inplace=True)
    df['tourney_date'] = dt.date(1970, 1, 1)
    df['avg_ratio'] = float('nan')
    df['max_w'] = float('nan')
    df['max_l'] = float('nan')
    df['broker_max_w'] = ''
    df['broker_max_l'] = ''
    df['winner'] = df['winner'].str.lower().str.strip()
    df['loser'] = df['loser'].str.lower().str.strip()
    
    return df

@logger
def calc_odds(df):
    t_date = dt.date(1970, 1, 1)
    t_num = 0
    
    def wrapper(match):
        nonlocal t_date, t_num
        
        if match['atp'] != t_num:
            t_num = match['atp']
            t_date = match['date']
        
        return calc_match_odds(match, t_date)
    return df.apply(wrapper, axis=1)

def calc_match_odds(match, t_date):
    match['tourney_date'] = t_date
    
    brokers = ['cb', 'gb', 'iw', 'sb', 'b365', 'b&w', 'ex', 'ps']
    total_ratio = 0
    n = 0
    max_w = 0
    max_l = 0
    broker_max_w = ''
    broker_max_l = ''
    
    for b in brokers:
        odds_w = match[b + 'w']
        odds_l = match[b + 'l']
        
        if isinstance(odds_w, str):
            odds_w = float(odds_w.replace(',', ''))
        
        if isinstance(odds_l, str):
            odds_l = float(odds_l.replace(',', ''))
        
        if odds_w > 0 and odds_l > 0:
            if odds_w > max_w:
                max_w = odds_w
                broker_max_w = b

            if odds_l > max_l:
                max_l = odds_l
                broker_max_l = b

            total_ratio += odds_w / odds_l
            n += 1
    
    if n > 0:
        match['max_w'] = max_w
        match['max_l'] = max_l
        match['broker_max_w'] = broker_max_w
        match['broker_max_l'] = broker_max_l
        match['avg_ratio'] = total_ratio/n
        
    return match 

@logger
def prune_odds(df):
    brokers = ['cb', 'gb', 'iw', 'sb', 'b365', 'b&w', 'ex', 'ps']
    
    for b in brokers:
        df.drop([b + 'w', b + 'l'], inplace=True, axis=1)
    
    df.dropna(inplace=True)
    
    return df

proc_match_odds = (match_odds
    .pipe(start_pipe)
    .pipe(clean_odds)
    .pipe(calc_odds)
    .pipe(prune_odds))

proc_match_odds

start_pipe took=0:00:00.108254 shape=(51945, 54)
clean_odds took=0:00:00.162722 shape=(51945, 26)
calc_odds took=0:00:18.538828 shape=(51945, 26)
prune_odds took=0:00:00.074152 shape=(50968, 10)


Unnamed: 0,atp,date,winner,loser,tourney_date,avg_ratio,max_w,max_l,broker_max_w,broker_max_l
2,1,2001-01-01,haas t.,smith l.,2001-01-01,0.297143,1.12,4.00,gb,gb
4,1,2001-01-01,hewitt l.,arthurs w.,2001-01-01,0.341308,1.28,3.80,cb,gb
16,1,2001-01-01,haas t.,malisse x.,2001-01-01,0.441155,1.35,3.20,cb,gb
18,1,2001-01-01,hewitt l.,phau b.,2001-01-01,0.205455,1.13,5.50,sb,sb
22,1,2001-01-01,massu n.,clement a.,2001-01-01,1.684578,2.70,1.45,cb,iw
...,...,...,...,...,...,...,...,...,...,...
2605,66,2019-11-15,nadal r.,tsitsipas s.,2019-11-10,0.475008,1.44,3.26,b365,ps
2606,66,2019-11-15,zverev a.,medvedev d.,2019-11-10,1.097765,2.14,1.90,ps,b365
2607,66,2019-11-16,tsitsipas s.,federer r.,2019-11-10,2.755928,3.75,1.33,ps,ps
2608,66,2019-11-16,thiem d.,zverev a.,2019-11-10,0.888095,1.84,2.10,ps,ps


In [3]:
@logger
def clean_players(df):
    df = df.filter(['player_id', 'firstname', 'lastname'])
    df['lastname'] = df['lastname'].str.lower()
    df['firstname'] = df['firstname'].str.lower()
    df['fullname'] = ''
    return df

@logger
def transform_name(df):
    return df.apply(transform_player_name, axis=1)

def transform_player_name(player):
    fullname = str(player['lastname']) + ' '
    
    for n in str(player['firstname']).split(' '):
        if len(n) > 0:
            fullname += n[0] + '.'
            
    player['fullname'] = fullname.strip()
    
    return player
    
proc_atp_players = (atp_players
    .pipe(start_pipe)
    .pipe(clean_players)
    .pipe(transform_name))

proc_atp_players

start_pipe took=0:00:00.009036 shape=(54405, 6)
clean_players took=0:00:00.058922 shape=(54405, 4)
transform_name took=0:00:09.051721 shape=(54405, 4)


Unnamed: 0,player_id,firstname,lastname,fullname
0,100001,gardnar,mulloy,mulloy g.
1,100002,pancho,segura,segura p.
2,100003,frank,sedgman,sedgman f.
3,100004,giuseppe,merlo,merlo g.
4,100005,richard pancho,gonzales,gonzales r.p.
...,...,...,...,...
54400,209366,joe,herin,herin j.
54401,209367,geronimo,marcolini,marcolini g.
54402,209368,juan martin,fumeaux buenaventura,fumeaux buenaventura j.m.
54403,209369,vadim,kontseba,kontseba v.


In [12]:
@logger
def merge_names(df, atp):
    df['winner_id'] = df['winner']
    df['loser_id'] = df['loser']
    
    ws = df['winner'].to_numpy()
    ls = df['loser'].to_numpy()
    ps = np.unique(np.append(ws, ls))
    pids = []
    
    # Match by player name
    for p in ps:
        # Match by full name, exact match
        pid = atp.loc[atp['fullname'] == p]
        
        # If no fullname match, get lastname
        if len(pid) == 0:
            lastname = max(p.split(), key=len)
            pid = atp.loc[atp['lastname'] == lastname]
        
        # Extract ATP info
        if len(pid) == 1:      
            pids.append(pid.iloc[0]['player_id'])
        elif len(pid) > 1:
            pid_ids = ','.join([str(v.player_id) + ' ' + v.fullname for v in pid.itertuples()])
            pids.append(pid_ids)
        else:
            pids.append(0)
                            
    df['winner_id'] = df['winner_id'].replace(ps, pids)
    df['loser_id'] = df['loser_id'].replace(ps, pids)
    
    return df

proc_match_odds = (proc_match_odds
    .pipe(start_pipe)
    .pipe(merge_names, proc_atp_players))

proc_match_odds

start_pipe took=0:00:00.004897 shape=(50968, 10)
merge_names took=0:00:13.070740 shape=(50968, 12)


Unnamed: 0,atp,date,winner,loser,tourney_date,avg_ratio,max_w,max_l,broker_max_w,broker_max_l,winner_id,loser_id
2,1,2001-01-01,haas t.,smith l.,2001-01-01,0.297143,1.12,4.00,gb,gb,103163,"102875 smith l.,132437 smith l.,141608 smith l."
4,1,2001-01-01,hewitt l.,arthurs w.,2001-01-01,0.341308,1.28,3.80,cb,gb,103720,101885
16,1,2001-01-01,haas t.,malisse x.,2001-01-01,0.441155,1.35,3.20,cb,gb,103163,103598
18,1,2001-01-01,hewitt l.,phau b.,2001-01-01,0.205455,1.13,5.50,sb,sb,103720,103451
22,1,2001-01-01,massu n.,clement a.,2001-01-01,1.684578,2.70,1.45,cb,iw,103454,103096
...,...,...,...,...,...,...,...,...,...,...,...,...
2605,66,2019-11-15,nadal r.,tsitsipas s.,2019-11-10,0.475008,1.44,3.26,b365,ps,104745,126774
2606,66,2019-11-15,zverev a.,medvedev d.,2019-11-10,1.097765,2.14,1.90,ps,b365,100644,106421
2607,66,2019-11-16,tsitsipas s.,federer r.,2019-11-10,2.755928,3.75,1.33,ps,ps,126774,103819
2608,66,2019-11-16,thiem d.,zverev a.,2019-11-10,0.888095,1.84,2.10,ps,ps,106233,100644


In [24]:
# add list of name suggestions to id
# if player_match_type = full, partial, none
# drop if none
# lookup match get id by other player name

@logger
def merge_by_opp(df, atp_matches):
    return df.apply(merge_by_opp_match, axis=1, args=(atp_matches,))

def merge_by_opp_match(match, atp_matches):
    winner_id = match['winner_id']
    loser_id = match['loser_id']
    
    wid_not_num = not isinstance(winner_id, int)
    lid_not_num = not isinstance(loser_id, int)
    
    if wid_not_num & lid_not_num:
        match['winner_id'] = float('nan')
        match['loser_id'] = float('nan')
    elif wid_not_num or lid_not_num:
        if wid_not_num:
            search_id = loser_id
            search_player = 'loser_id'
            get_player = 'winner_id'
            alts = winner_id
        else:
            search_id = winner_id
            search_player = 'winner_id'
            get_player = 'loser_id'
            alts = loser_id
        
        mask = (
            (atp_matches['tourney_date'] == match['tourney_date']) & 
            (atp_matches[search_player] == match[search_player]))
        potential = atp_matches.loc[mask]
        
        if len(potential) == 1:
            potential = potential.iloc[0]
            alts = alts.split(',')
            hit = False
            
            for a in alts:
                a = a.split()
                player_name = a[1]
                
                if player_name == potential[get_player]:
                    match[get_player] == a[0]
                    hit = True
                    break
            
            if not hit:
                match[get_player] = float('nan')
            
        else:
            match[get_player] = float('nan')
        
    return match    
    
merged_match_odds = (proc_match_odds
    .pipe(start_pipe)
    .pipe(merge_by_opp, processed_matches))

start_pipe took=0:00:00.009519 shape=(50968, 12)
merge_by_opp took=0:01:09.442206 shape=(50968, 12)


In [27]:
@logger
def prune_matches(df):
    mask = ((np.isnan(df['winner_id'])) | (np.isnan(df['loser_id'])))
    missed = len(df.loc[mask])
    
    print("Merged %.2f", (1 - missed/len(df)))
    
    df.dropna(inplace=True)
    
    return df

pruned_match_odds = (merged_match_odds
    .pipe(start_pipe)
    .pipe(prune_matches))

start_pipe took=0:00:00.018895 shape=(50968, 12)
Merged %.2f 0.7142128394286611
prune_matches took=0:00:00.030954 shape=(36402, 12)
