# Point-by-Point Dataset

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

# ATP ELO rankings
elo = pd.read_pickle("atp_elo.pkl")
elo['id'] = elo.index
elo = elo.set_index('name')
print(elo.shape)

(476, 40)


In [2]:
player_name_set = set(elo.index.values)
print("ATP ELO Player set length: " + str(len(player_name_set)))

df_pbp = pd.read_pickle("pbp.pkl") # same pbp dataset only with surface added
df_pbp = df_pbp[df_pbp.apply(lambda x: x['server1'] in player_name_set and x['server2'] in player_name_set, axis = 1)]

# Calculating the Elo values for both servers
def get_elo(date, player):
    prev_date = str(date)[:4] + "0700" if (int(date) % 1000)>700 else str(date)[:4] + "0100"
    return elo.loc[player, prev_date]  
    
df_pbp['elo1'] = df_pbp.apply(lambda x: get_elo(x['date_new'], x['server1']), axis = 1)
df_pbp['elo2'] = df_pbp.apply(lambda x: get_elo(x['date_new'], x['server2']), axis = 1)
df_pbptr = df_pbp[df_pbp.apply(lambda x: int(x['date'][-2:]) <= 15, axis = 1)]
df_pbpte = df_pbp[df_pbp.apply(lambda x: int(x['date'][-2:]) > 15, axis = 1)]
print("df_pbptr shape is " + str(df_pbptr.shape))
print("df_pbpte shape is " + str(df_pbpte.shape))
print("PBP Player set length: " + str(len(player_name_set)))


ATP ELO Player set length: 476
df_pbptr shape is (7857, 16)
df_pbpte shape is (1518, 16)
PBP Player set length: 476


## 1. PBP dataset, Player vs All, Player vs ELO, Player vs Player

In [127]:
# Player vs All Probabilities Model
serv_pr_tr = {x:(0,0) for x in player_name_set}
serv_pr_tr_surf = {x:{surf:(0,0) for surf in ['Grass', 'Clay', 'Hard']} for x in player_name_set}

# Match is row from df_te where both players are in serv_pr_te
def update_serv(match, serv_pr):
    
    pbp = [x for x in match['pbp'].replace(".", ";").split(";") if x.strip != ""]
    # Turning all TieBreak matches into blanks
    pbp = ["" if "/" in x else x for x in pbp]
    
    pbp_1 = ''.join(pbp[0:][::2]) # even indices
    pbp_2 = ''.join(pbp[1:][::2]) # odd indices
    s1 = match['server1']
    s2 = match['server2']
    win1 = pbp_1.count("S") + pbp_1.count("A")
    win2 = pbp_2.count("S") + pbp_2.count("A")
    
    serv_pr[s1] = (serv_pr[s1][0] + win1, serv_pr[s1][1] + len(pbp_1))
    serv_pr[s2] = (serv_pr[s2][0] + win2, serv_pr[s2][1] + len(pbp_2))
    
    return serv_pr

# Match is row from df_te where both players are in serv_pr_te
def update_serv_surf(match, serv_pr):
    
    pbp = [x for x in match['pbp'].replace(".", ";").split(";") if x.strip != ""]
    # Turning all TieBreak matches into blanks
    pbp = ["" if "/" in x else x for x in pbp]
    
    pbp_1 = ''.join(pbp[0:][::2]) # even indices
    pbp_2 = ''.join(pbp[1:][::2]) # odd indices
    s1 = match['server1']
    s2 = match['server2']
    surf = match['surface']
    win1 = pbp_1.count("S") + pbp_1.count("A")
    win2 = pbp_2.count("S") + pbp_2.count("A")
    
    serv_pr[s1][surf] = (serv_pr[s1][surf][0] + win1, serv_pr[s1][surf][1] + len(pbp_1))
    serv_pr[s2][surf] = (serv_pr[s2][surf][0] + win2, serv_pr[s2][surf][1] + len(pbp_2))
    
    return serv_pr

#Getting the True service win percentages
for i,row in df_pbptr.iterrows():
    serv_pr_tr = update_serv(row, serv_pr_tr)
    serv_pr_tr_surf = update_serv_surf(row, serv_pr_tr_surf)



In [4]:
from tqdm import tqdm

# Player vs Player Probabilities Model
def update_serv_opp(match, serv_pr):
    pbp = [x for x in match['pbp'].replace(".", ";").split(";") if x.strip != ""]
    # Turning all TieBreak matches into blanks
    pbp = ["" if "/" in x else x for x in pbp]
    
    pbp_1 = ''.join(pbp[0:][::2]) # even indices
    pbp_2 = ''.join(pbp[1:][::2]) # odd indices
    s1 = match['server1']
    s2 = match['server2']
    win1 = pbp_1.count("S") + pbp_1.count("A")
    win2 = pbp_2.count("S") + pbp_2.count("A")
    
    serv_pr[s1][s2] = (serv_pr[s1][s2][0] + win1, serv_pr[s1][s2][1] + len(pbp_1))
    serv_pr[s2][s1] = (serv_pr[s2][s1][0] + win2, serv_pr[s2][s1][1] + len(pbp_2))

    return serv_pr

serv_pr_tr_opp = {x:{a:b for a,b in zip(list(player_name_set), [(0,0)]*len(player_name_set))} for x in player_name_set}
# Getting the True service win percentages
for i,row in tqdm(df_pbptr.iterrows()):
    serv_pr_tr_opp = update_serv_opp(row, serv_pr_tr_opp)

7857it [00:00, 8504.63it/s]


In [129]:
# Player vs ELO Probabilities Model

import calendar
month_num = {v: k for k,v in enumerate(calendar.month_abbr)}

# Returning most previous ELO
def get_elo(date, name):
    # Finding the most previous date from the given date
    # given date format 01 Jan 17
    date_s = date.strip().split(" ")
    if (month_num[date_s[1]] >= 7):
        return elo.loc[name, "20" + date_s[2] + "0700"]
    return elo.loc[name, "20" + date_s[2] + "0100"]    

# Returning the difference in elo bucket
def get_elo_bucket(elo, opp_elo):
    diff = opp_elo - elo
    if diff < -300:
        return "negfar"
    elif diff < -100:
        return "negmed"
    elif diff <= 0:
        return "negclo"
    elif diff <= 100:
        return "posclo"
    elif diff <= 300:
        return "posmed"
    return "posfar"

# Match is row from df_te where both players are in serv_pr_te
def update_serv_elo(match, serv_pr):
    
    pbp = [x for x in match['pbp'].replace(".", ";").split(";") if x.strip != ""]
    # Turning all TieBreak matches into blanks
    pbp = ["" if "/" in x else x for x in pbp]
    
    pbp_1 = ''.join(pbp[0:][::2]) # even indices
    pbp_2 = ''.join(pbp[1:][::2]) # odd indices
    s1 = match['server1']
    s2 = match['server2']
    win1 = pbp_1.count("S") + pbp_1.count("A")
    win2 = pbp_2.count("S") + pbp_2.count("A")
    
    elo1 = get_elo(match['date'], s1)
    elo2 = get_elo(match['date'], s2)
    b1 = get_elo_bucket(elo1, elo2) # based on opponent elo
    b2 = get_elo_bucket(elo2, elo1) # based on opponent elo
    
    serv_pr[s1][b1] = (serv_pr[s1][b1][0] + win1, serv_pr[s1][b1][1] + len(pbp_1))
    serv_pr[s2][b2] = (serv_pr[s2][b2][0] + win2, serv_pr[s2][b2][1] + len(pbp_2))

    return serv_pr

bckts = ['negfar', 'negmed', 'negclo', 'posclo', 'posmed', 'posfar']
bckts2 = ['negfar', 'negmed', 'med', 'posmed', 'posfar']
serv_pr_tr_elo = {x:{a:b for a,b in zip(bckts, [(0,0)]*len(bckts))} for x in player_name_set}

# Getting the True service win percentages
for i,row in tqdm(df_pbptr.iterrows()):
    serv_pr_tr_elo = update_serv_elo(row, serv_pr_tr_elo)
    
    
# Match is row from df_te where both players are in serv_pr_te
def update_serv_elosurf(match, serv_pr):
    
    pbp = [x for x in match['pbp'].replace(".", ";").split(";") if x.strip != ""]
    # Turning all TieBreak matches into blanks
    pbp = ["" if "/" in x else x for x in pbp]
    
    pbp_1 = ''.join(pbp[0:][::2]) # even indices
    pbp_2 = ''.join(pbp[1:][::2]) # odd indices
    s1 = match['server1']
    s2 = match['server2']
    win1 = pbp_1.count("S") + pbp_1.count("A")
    win2 = pbp_2.count("S") + pbp_2.count("A")
    
    elo1 = get_elo(match['date'], s1)
    elo2 = get_elo(match['date'], s2)
    b1 = get_elo_bucket(elo1, elo2) # based on opponent elo
    b2 = get_elo_bucket(elo2, elo1) # based on opponent elo
    surf = match['surface']
    serv_pr[s1][surf][b1] = (serv_pr[s1][surf][b1][0] + win1, serv_pr[s1][surf][b1][1] + len(pbp_1))
    serv_pr[s2][surf][b2] = (serv_pr[s2][surf][b2][0] + win2, serv_pr[s2][surf][b2][1] + len(pbp_2))

    return serv_pr  

serv_pr_tr_elo_surf = {x:{surf:{a:b for a,b in zip(bckts, [(0,0)]*len(bckts))} for surf in ['Grass', "Hard", 'Clay']} for x in player_name_set}
for i,row in tqdm(df_pbptr.iterrows()):
    serv_pr_tr_elo_surf = update_serv_elosurf(row,serv_pr_tr_elo_surf)
#5 Using ELO conditioned on surface



7857it [00:01, 5907.35it/s]
7857it [00:01, 5437.15it/s]


In [130]:
from sklearn.metrics import log_loss

""" Calculating Error for four of these point probability models """
# returns variance calculation and error calculation
# returns variance and number of points
#1. Overall Service Win Percentage
all_points = ''.join(df_pbptr['pbp'].values).replace(';', '').replace('.', '').replace('/','').rstrip()
all_serv_win = (all_points.count("S") + all_points.count("A"))/len(all_points)
y_true = []
y_pred = []
for i,row in df_pbpte.iterrows():
    true,pred = calc_error(all_serv_win, all_serv_win, row)
    y_true.extend(true)
    y_pred.extend(pred)
print("Overall service win percentage log-loss is " + str(log_loss(y_true,y_pred,eps=1e-15)))

df_pbptrg = df_pbptr[df_pbptr['surface'].apply(lambda x: x == 'Grass')]
df_pbptrh = df_pbptr[df_pbptr['surface'].apply(lambda x: x == 'Hard')]
df_pbptrc = df_pbptr[df_pbptr['surface'].apply(lambda x: x == 'Clay')]
all_pointsg = ''.join(df_pbptrg['pbp'].values).replace(';', '').replace('.', '').replace('/','').rstrip()
all_serv_wing = (all_points.count("S") + all_points.count("A"))/len(all_points)
all_pointsh = ''.join(df_pbptrh['pbp'].values).replace(';', '').replace('.', '').replace('/','').rstrip()
all_serv_winh = (all_points.count("S") + all_points.count("A"))/len(all_points)
all_pointsc = ''.join(df_pbptrc['pbp'].values).replace(';', '').replace('.', '').replace('/','').rstrip()
all_serv_winc = (all_points.count("S") + all_points.count("A"))/len(all_points)

#2. Overall serve win percentage per player
y_true = []
y_pred = []
for i,row in df_pbpte.iterrows():
    true,pred = calc_error(serv_pr_tr[row['server1']][0]/serv_pr_tr[row['server1']][1], serv_pr_tr[row['server2']][0]/serv_pr_tr[row['server2']][1], row) if serv_pr_tr[row['server1']][1] > 0 and serv_pr_tr[row['server2']][1] > 0 else calc_error(all_serv_win, all_serv_win, row)
    y_true.extend(true)
    y_pred.extend(pred)
print("Overall service win percentage log-loss for individual player is " + str(log_loss(y_true,y_pred,eps=1e-15)))

#3. Using ELO
y_true = []
y_pred = []
for i,row in df_pbpte.iterrows():
    b1 = get_elo_bucket(get_elo(row['date'], row['server1']), get_elo(row['date'], row['server2']))
    b2 = get_elo_bucket(get_elo(row['date'], row['server2']), get_elo(row['date'], row['server1']))
    
    s1_p = serv_pr_tr_elo[row['server1']][b1][0]/serv_pr_tr_elo[row['server1']][b1][1] if serv_pr_tr_elo[row['server1']][b1][1] > 0 else  all_serv_win
    s2_p = serv_pr_tr_elo[row['server2']][b2][0]/serv_pr_tr_elo[row['server2']][b2][1] if serv_pr_tr_elo[row['server2']][b2][1] > 0 else  all_serv_win

    true,pred = calc_error(s1_p, s2_p, row)
    y_true.extend(true)
    y_pred.extend(pred)
    
    
print("Overall service win percentage log-loss for individual player with ELO categories is " + str(log_loss(y_true,y_pred,eps=1e-15)))
    
#4. Using Player v Player, only using service win percentage per player when no past information of plaayervplayer
y_true = []
y_pred = []
for i,row in df_pbpte.iterrows():
    s1_p = serv_pr_tr_opp[row['server1']][row['server2']][0]/serv_pr_tr_opp[row['server1']][row['server2']][1] if serv_pr_tr_opp[row['server1']][row['server2']][1] > 0 else all_serv_win
    s2_p = serv_pr_tr_opp[row['server2']][row['server1']][0]/serv_pr_tr_opp[row['server2']][row['server1']][1] if serv_pr_tr_opp[row['server2']][row['server1']][1] > 0 else all_serv_win
    true,pred = calc_error(s1_p, s2_p, row)
    y_true.extend(true)
    y_pred.extend(pred)
    
print("Overall service win percentage log-loss for individual players given opponent is " + str(log_loss(y_true,y_pred,eps=1e-15)))
    


Overall service win percentage log-loss is 0.6544080237031278
Overall service win percentage log-loss for individual player is 0.6539744207688547
Overall service win percentage log-loss for individual player with ELO categories is 0.6544815392906047
Overall service win percentage log-loss for individual players given opponent is 0.6566809851688695


## 2. PBP dataset, Logistic Regression, with surface

In [7]:
df_pbptrc = df_pbptr[df_pbptr['surface'].apply(lambda x: x == 'Clay')]
df_pbptrh = df_pbptr[df_pbptr['surface'].apply(lambda x: x == 'Hard')]
df_pbptrg = df_pbptr[df_pbptr['surface'].apply(lambda x: x == 'Grass')]

df_pbptec = df_pbpte[df_pbpte['surface'].apply(lambda x: x == 'Clay')]
df_pbpteh = df_pbpte[df_pbpte['surface'].apply(lambda x: x == 'Hard')]
df_pbpteg = df_pbpte[df_pbpte['surface'].apply(lambda x: x == 'Grass')]

print("Training Shapes for Clay, Hard, Grass: %d %d %d" % (df_pbptrc.shape[0], df_pbptrh.shape[0], df_pbptrg.shape[0]))
print("Testing Shapes for Clay, Hard, Grass: %d %d %d" % (df_pbptec.shape[0], df_pbpteh.shape[0], df_pbpteg.shape[0]))



Training Shapes for Clay, Hard, Grass: 2331 4684 842
Testing Shapes for Clay, Hard, Grass: 603 701 214


In [9]:
# Splitting based on the Elo

# Takes in a dataframe, split into a dictionary of dataframes of keys
# farneg, neg, mid, pos, farpos - 
def elo_split(df):
    df_far = df[df.apply(lambda x: np.abs(int(x['elo1']) - int(x['elo2'])) >= 150, axis = 1)]
    df_far['serverad'] = df_far.apply(lambda x: 1 if x['elo1'] > x['elo2'] else 2, axis = 1)
    
    df_mid = df[df.apply(lambda x: np.abs(int(x['elo1']) - int(x['elo2'])) >= 50 and np.abs(int(x['elo1']) - int(x['elo2'])) < 150, axis = 1)]
    df_mid['serverad'] = df_mid.apply(lambda x: 1 if x['elo1'] > x['elo2'] else 2, axis = 1)
    
    df_sam = df[df.apply(lambda x: np.abs(int(x['elo1']) - int(x['elo2'])) < 50, axis = 1)]
    df_sam['serverad'] = df_sam.apply(lambda x: 1 if x['elo1'] > x['elo2'] else 0 if x['elo1'] == x['elo2'] else 2, axis = 1)
    
    return df_far, df_mid, df_sam

df_fartrc, df_midtrc, df_nertrc = elo_split(df_pbptrc)
df_fartrg, df_midtrg, df_nertrg = elo_split(df_pbptrg)
df_fartrh, df_midtrh, df_nertrh = elo_split(df_pbptrh)

df_fartec, df_midtec, df_nertec = elo_split(df_pbptec)
df_farteg, df_midteg, df_nerteg = elo_split(df_pbpteg)
df_farteh, df_midteh, df_nerteh = elo_split(df_pbpteh)

# Evaluating the size of datasets

print("Training Shapes for Clay - Far, Mid, Near ELO: %d %d %d" % (df_fartrc.shape[0], df_midtrc.shape[0], df_nertrc.shape[0]))
print("Testing Shapes for Clay - Far, Mid, Near ELO: %d %d %d" % (df_fartec.shape[0], df_midtec.shape[0], df_nertec.shape[0]))

print("Training Shapes for Grass - Far, Mid, Near ELO: %d %d %d" % (df_fartrg.shape[0], df_midtrg.shape[0], df_nertrg.shape[0]))
print("Testing Shapes for Grass - Far, Mid, Near ELO: %d %d %d" % (df_farteg.shape[0], df_midteg.shape[0], df_nerteg.shape[0]))

print("Training Shapes for Hard - Far, Mid, Near ELO: %d %d %d" % (df_fartrh.shape[0], df_midtrh.shape[0], df_nertrh.shape[0]))
print("Testing Shapes for Hard - Far, Mid, Near ELO: %d %d %d" % (df_farteh.shape[0], df_midteh.shape[0], df_nerteh.shape[0]))

df_pbptrc['serverad'] = df_pbptrc.apply(lambda x: 1 if x['elo1'] > x['elo2'] else 0 if x['elo1'] == x['elo2'] else 2, axis = 1)
df_pbptrh['serverad'] = df_pbptrh.apply(lambda x: 1 if x['elo1'] > x['elo2'] else 0 if x['elo1'] == x['elo2'] else 2, axis = 1)
df_pbptrg['serverad'] = df_pbptrg.apply(lambda x: 1 if x['elo1'] > x['elo2'] else 0 if x['elo1'] == x['elo2'] else 2, axis = 1)
df_pbptec['serverad'] = df_pbptec.apply(lambda x: 1 if x['elo1'] > x['elo2'] else 0 if x['elo1'] == x['elo2'] else 2, axis = 1)
df_pbpteh['serverad'] = df_pbpteh.apply(lambda x: 1 if x['elo1'] > x['elo2'] else 0 if x['elo1'] == x['elo2'] else 2, axis = 1)
df_pbpteg['serverad'] = df_pbpteg.apply(lambda x: 1 if x['elo1'] > x['elo2'] else 0 if x['elo1'] == x['elo2'] else 2, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


Training Shapes for Clay - Far, Mid, Near ELO: 1034 847 450
Testing Shapes for Clay - Far, Mid, Near ELO: 243 232 128
Training Shapes for Grass - Far, Mid, Near ELO: 393 289 160
Testing Shapes for Grass - Far, Mid, Near ELO: 106 67 41
Training Shapes for Hard - Far, Mid, Near ELO: 2179 1647 858
Testing Shapes for Hard - Far, Mid, Near ELO: 314 242 145


In [10]:
from sklearn import svm
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler


            
def log_predict(Xtr, Ytr, Xte, Yte):
    est = sm.Logit(Ytr, Xtr)
    est2 = est.fit()
    print(est2.summary())
    teYpred = est2.predict(Xte)
    return Yte, teYpred
    #return log_loss(Yte, teYpred, eps=1e-15)

def rfr_predict(Xtr,Ytr,Xte,Yte):
    Xtr_stand = StandardScaler().fit_transform(Xtr)
    Xte_stand = StandardScaler().fit_transform(Xte)
    
    rfr = RandomForestRegressor(n_estimators = 200, min_samples_split = 10)
    rfr.fit(Xtr_stand, Ytr)
    teYpred = rfr.predict(Xte_stand)
    Yte_pred = np.round(teYpred)
    return Yte, teYpred
    #return log_loss(Yte, teYpred, eps=1e-15)

  from pandas.core import datetools


In [16]:
"""Looking at predictions given match state"""
from tqdm import tqdm

def get_break_count(pbp, play_num):
    games1 = pbp[play_num-1:][::2]
    games2 = pbp[int(2-play_num):][::2]
    
    games1broke = len([x for x in games1 if x[-1] == 'R' or x[-1] == 'D'])
    games2broke = len([x for x in games2 if x[-1] == 'R' or x[-1] == 'D'])
    return games2broke-games1broke

def turn_tiebreak(tb):
    tbs = tb.split("/")
    if len(tbs) % 2 == 0:
        if (tbs[-1].replace(".","")[-1] == 'S' or tbs[-1].replace(".","")[-1] == 'A'):
            return "R."
        else:
            return "S."
    if len(tbs) % 2 == 1:
        if (tbs[-1].replace(".","")[-1] == 'S' or tbs[-1].replace(".","")[-1] == 'A'):
            return "S."
        else:
            return "R."

# Per a given row and a pos/neg elo marker, return the matrix with above characteristics
# diff value takes on pos/neg
def get_rows_score(match, pos = True, n=4):
    # Determining target player to look at
    if pos:
        play_num = 1 if match['serverad'] == 1 else 2
        elo_target1 = match['elo1'] if match['serverad'] == 1 else match['elo2']
        elo_target2 = match['elo2'] if match['serverad'] == 1 else match['elo1']
    else:
        play_num = 2 if match['serverad'] == 1 else 1
        elo_target1 = match['elo2'] if match['serverad'] == 1 else match['elo1']
        elo_target2 = match['elo1'] if match['serverad'] == 1 else match['elo2']

    match_pbp = match['pbp'][:-1] if match['pbp'][-1] == '.' else match['pbp']
    pbp = [x for x in match_pbp.rstrip().replace(".", ".;").split(";") if x.strip != ""]
    
    # Turning all TieBreak matches into blanks
    pbp = [turn_tiebreak(x) if "/" in x else x for x in pbp]

    start_game = pbp[3 + play_num]
    pbp_start = pbp[:3+play_num]
        
    # Service win proportion
    pbp_start_sw = ''.join(pbp_start[play_num-1:][::2])
    sw_prop = (pbp_start_sw.count("S") + pbp_start_sw.count("A"))/len(pbp_start_sw)
    
    # Return win proportion
    pbp_start_rw = ''.join(pbp_start[int(2-play_num):][::2])
    rw_prop = (pbp_start_rw.count("R") + pbp_start_rw.count("D"))/len(pbp_start_rw)
    
    # Momentum
    momentstr = ''.join(pbp_start)[-4:].replace("R", "W").replace("D", "W").replace("S", "L").replace("A", "L")
    moment = momentstr.count("S") + momentstr.count("A")
    
    # break player advantage
    break_cnt = get_break_count(pbp_start, play_num)
    
    state = [pbp_start_sw.count("S")+ pbp_start_sw.count("A"), len(pbp_start_sw), pbp_start_rw.count("R") + pbp_start_rw.count("D"), len(pbp_start_rw), moment, momentstr]
    X = []
    Y = []
    
    i = 3+play_num
    match_score1, match_score2 = 0,0
    game_cnt = i
    while i < len(pbp):
        # first game is target player's service
        game = pbp[i]
        game_score1, game_score2 = 0,0
        
        for j in game:
            if j == '.':
                match_score1 += 1 if game.replace('.','')[-1] == 'S' or game.replace(".",'')[-1] == 'A' else 0
                match_score2 += 1 if game.replace('.','')[-1] == 'D' or game.replace(".",'')[-1] == 'R' else 0
                game_cnt = 0
                break
            # add into the X and Y arrays
            res = 1 if j == "S" or j == "A" else 0
            X.append([state[0]/state[1], state[2]/state[3], state[4],elo_target1, elo_target2, game_score1, game_score2, break_cnt, game_cnt, match_score1, match_score2])
            Y.append(res)
            
            state[0] += res
            state[1] += 1
            state[4] -= 1 if state[5][0] == "W" else 0
            state[4] += res
            state[5] = (state[5][1:] + "W") if res == 1 else (state[5][1:] + "L")
            
            game_score1 += res
            game_score2 += 1 if res == 0 else 0
            
        game_cnt += 1
        if len(game) == 0:
            print(match['pbp_id'])
            print()
        if not (game.replace('.', '')[-1] == 'S' or game.replace(".", '')[-1] == 'A'):
            break_cnt -= 1
            
        if i + 1 == len(pbp):
            break
            
        # second game is target player's return, only update momentum and return win
        i += 1
        game = pbp[i]
        
        for j in game:
            if j == '.':
                match_score1 += 1 if game.replace('.','')[-1] == 'D' or game.replace(".",'')[-1] == 'R' else 0
                match_score2 += 1 if game.replace('.','')[-1] == 'S' or game.replace(".",'')[-1] == 'A' else 0
                game_cnt = 0
                break
            res = 1 if j == "D" or j == "R" else 0
            state[2] += res
            state[3] += 1
            state[4] -= 1 if state[5][0] == "W" else 0
            state[4] += res
            state[5] = (state[5][1:] + "W") if res == 1 else (state[5][1:] + "L")
            
        if not (game.replace('.', '')[-1] == 'S' or game.replace(".", '')[-1] == 'A'):
            break_cnt += 1

        game_cnt += 1
        i += 1     
        
    return X,Y

    
def get_matrix_score(df, pos=True):
    X_tot, Y_tot = [], []
    
    for i,row in tqdm(df.iterrows()):
        X_row, Y_row = get_rows_score(row, True, n=4)
        X_tot.extend(X_row)
        Y_tot.extend(Y_row)
        X_row, Y_row = get_rows_score(row, False, n=4)
        X_tot.extend(X_row)
        Y_tot.extend(Y_row)
    return X_tot, Y_tot

In [17]:
y_true_l = []
y_pred_l = []
y_true_r = []
y_pred_r = []

Xtr,Ytr = get_matrix_score(df_pbptrg)
Xte,Yte = get_matrix_score(df_pbpteg)

cols = ['serv_win_%', 'ret_win_%', 'momentum', 'elo_s', 'elo_r', 'game_score_s', 'game_score_r', 'break_adv_s', 'game_cnt', 'match_score_s', 'match_score_r']

true,pred = log_predict(pd.DataFrame(Xtr, columns = cols), Ytr, pd.DataFrame(Xte, columns = cols), Yte)
pred_round = np.round(pred)
print("error for grass is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Number of predicted correctly misses is " + str(sum([1 for x,y in zip(pred_round, true) if x == 0 and y == 0])))
print("Length of Yte_pred is " + str(len(pred)))
print("Number of predicted 0 values is " + str(len(pred_round)-sum(pred_round)))
print("min and max is %f and %f " % (float(max(pred)), float(min(pred))))
inds = [i for i,x in enumerate(list(true)) if x == 0]
true0 = [x for i,x in enumerate(list(true)) if i in inds]
pred0 = [x for i,x in enumerate(list(pred)) if i in inds]
print("Accuracy is %f" % (len([i for i,x in enumerate(list(true0)) if np.round(pred0)[i] == x])/len(true0)))

true_r,pred_r = rfr_predict(Xtr,Ytr,Xte,Yte)
pred_rround = np.round(pred_r)
print("rfr_error for grass is %f" % float(log_loss(true_r, pred_r, eps=1e-15)))
print("Number of predicted correctly misses is " + str(sum([1 for x,y in zip(pred_rround, true_r) if x == 0 and y == 0])))
print("Length of Yte_pred is " + str(len(pred_r)))
print("Number of predicted 0 values is " + str(len(pred_rround)-sum(pred_rround)))
print("min and max is %f and %f " % (float(max(pred_r)), float(min(pred_r))))
inds = [i for i,x in enumerate(list(true_r)) if x == 0]
true0 = [x for i,x in enumerate(list(true_r)) if i in inds]
pred0 = [x for i,x in enumerate(list(pred_r)) if i in inds]
print("Accuracy is %f" % (len([i for i,x in enumerate(list(true0)) if np.round(pred0)[i] == x])/len(true0)))
y_true_r.extend(list(true_r))
y_pred_r.extend(list(pred_r))
y_true_l.extend(list(true))
y_pred_l.extend(list(pred))

Xtr,Ytr = get_matrix_score(df_pbptrh)
Xte,Yte = get_matrix_score(df_pbpteh)
true,pred = log_predict(pd.DataFrame(Xtr, columns = cols), Ytr, pd.DataFrame(Xte, columns = cols), Yte)
pred_round = np.round(pred)
print("error for hard is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Number of predicted correctly misses is " + str(sum([1 for x,y in zip(pred_round, true) if x == 0 and y == 0])))
print("Length of Yte_pred is " + str(len(pred)))
print("Number of predicted 0 values is " + str(len(pred_round)-sum(pred_round)))
print("min and max is %f and %f " % (float(max(pred)), float(min(pred))))
inds = [i for i,x in enumerate(list(true)) if x == 0]
true0 = [x for i,x in enumerate(list(true)) if i in inds]
pred0 = [x for i,x in enumerate(list(pred)) if i in inds]
print("Accuracy is %f" % (len([i for i,x in enumerate(list(true0)) if np.round(pred0)[i] == x])/len(true0)))

true_r,pred_r = rfr_predict(Xtr,Ytr,Xte,Yte)
pred_rround = np.round(pred_r)
print("rfr_error for hard is %f" % float(log_loss(true_r, pred_r, eps=1e-15)))
print("Number of predicted correctly misses is " + str(sum([1 for x,y in zip(pred_rround, true_r) if x == 0 and y == 0])))
print("Length of Yte_pred is " + str(len(pred_r)))
print("Number of predicted 0 values is " + str(len(pred_rround)-sum(pred_rround)))
print("min and max is %f and %f " % (float(max(pred_r)), float(min(pred_r))))
inds = [i for i,x in enumerate(list(true_r)) if x == 0]
true0 = [x for i,x in enumerate(list(true_r)) if i in inds]
pred0 = [x for i,x in enumerate(list(pred_r)) if i in inds]
print("Accuracy is %f" % (len([i for i,x in enumerate(list(true0)) if np.round(pred0)[i] == x])/len(true0)))
y_true_r.extend(list(true_r))
y_pred_r.extend(list(pred_r))
y_true_l.extend(list(true))
y_pred_l.extend(list(pred))

Xtr,Ytr = get_matrix_score(df_pbptrc)
Xte,Yte = get_matrix_score(df_pbptec)
true,pred = log_predict(pd.DataFrame(Xtr, columns = cols), Ytr, pd.DataFrame(Xte, columns = cols), Yte)
pred_round = np.round(pred)
print("error for clay is %f" % float(log_loss(true,pred_round,eps=1e-15)))
print("Number of predicted correctly misses is " + str(sum([1 for x,y in zip(pred_round, true) if x == 0 and y == 0])))
print("Length of Yte_pred is " + str(len(pred)))
print("Number of predicted 0 values is " + str(len(pred_round)-sum(pred_round)))
print("min and max is %f and %f " % (float(max(pred)), float(min(pred))))
print("Number classified as 0s: %d" % np.count_nonzero(np.round(pred) == 0))
inds = [i for i,x in enumerate(list(true)) if x == 0]
true0 = [x for i,x in enumerate(list(true)) if i in inds]
pred0 = [x for i,x in enumerate(list(pred)) if i in inds]
print("Accuracy is %f" % (len([i for i,x in enumerate(list(true0)) if np.round(pred0)[i] == x])/len(true0)))

true_r,pred_r = rfr_predict(Xtr,Ytr,Xte,Yte)
pred_rround = np.round(pred_r)
print("rfr_error for clay is %f" % float(log_loss(true_r, pred_r, eps=1e-15)))
print("Number of predicted correctly misses is " + str(sum([1 for x,y in zip(pred_rround, true_r) if x == 0 and y == 0])))
print("Length of Yte_pred is " + str(len(pred_r)))
print("Number of predicted 0 values is " + str(len(pred_rround)-sum(pred_rround)))
print("min and max is %f and %f " % (float(max(pred_r)), float(min(pred_r))))
inds = [i for i,x in enumerate(list(true_r)) if x == 0]
true0 = [x for i,x in enumerate(list(true_r)) if i in inds]
pred0 = [x for i,x in enumerate(list(pred_r)) if i in inds]
print("Accuracy is %f" % (len([i for i,x in enumerate(list(true0)) if np.round(pred0)[i] == x])/len(true0)))
y_true_r.extend(list(true_r))
y_pred_r.extend(list(pred_r))
y_true_l.extend(list(true))
y_pred_l.extend(list(pred))

print("Total log error is " + str(log_loss(y_true_l,y_pred_l,eps=1e-15)))
print("Total rfr error is " + str(log_loss(y_true_r,y_pred_r,eps=1e-15)))"""

'y_true_l = []\ny_pred_l = []\ny_true_r = []\ny_pred_r = []\n\nXtr,Ytr = get_matrix_score(df_pbptrg)\nXte,Yte = get_matrix_score(df_pbpteg)\n\ncols = [\'serv_win_%\', \'ret_win_%\', \'momentum\', \'elo_s\', \'elo_r\', \'game_score_s\', \'game_score_r\', \'break_adv_s\', \'game_cnt\', \'match_score_s\', \'match_score_r\']\n\ntrue,pred = log_predict(pd.DataFrame(Xtr, columns = cols), Ytr, pd.DataFrame(Xte, columns = cols), Yte)\npred_round = np.round(pred)\nprint("error for grass is %f" % float(log_loss(true,pred,eps=1e-15)))\nprint("Number of predicted correctly misses is " + str(sum([1 for x,y in zip(pred_round, true) if x == 0 and y == 0])))\nprint("Length of Yte_pred is " + str(len(pred)))\nprint("Number of predicted 0 values is " + str(len(pred_round)-sum(pred_round)))\nprint("min and max is %f and %f " % (float(max(pred)), float(min(pred))))\ninds = [i for i,x in enumerate(list(true)) if x == 0]\ntrue0 = [x for i,x in enumerate(list(true)) if i in inds]\npred0 = [x for i,x in enume

In [None]:
gams = [.01, 1, 10, 100, 500, 1000]

Xtr,Ytr = get_matrix_score(df_pbptrg)
Xte,Yte = get_matrix_score(df_pbpteg)

for g in gams:
    clf = svm.SVC(gamma = g)
    clf.fit(np.array(Xtr), np.array(Ytr))
    Yte_pred = clf.predict(np.array(Xte))
    print("Misclassification error with gamma %f is: %f" % (g, np.mean(Yte_pred != Yte)))
    print("Number of predicted correctly misses is " + str(sum([1 for x,y in zip(Yte_pred, Yte) if x == 0 and y == 0])))
    print("Length of Yte_pred is " + str(len(Yte_pred)))
    print("Number of predicted 0 values is " + str(len(Yte_pred)-sum(Yte_pred)))

# Using MCP Dataset

In [23]:
import pandas as pd
df_mcp_old = pd.read_pickle("mcp_old_final.pkl")
df_mcp_new = pd.read_pickle("mcp_new_final.pkl")
print(len(set(df_mcp_old['id'].values)))
print(len(set(df_mcp_new['id'].values)))

717
1938


In [39]:
df_mcp_old = pd.read_pickle("mcp_old_final.pkl")
df_mcp_new = pd.read_pickle("mcp_new_final.pkl")
df_mcp = pd.concat([df_mcp_old, df_mcp_new])

print("The number of Hard points is " + str(df_mcp[df_mcp['surface'].apply(lambda x: x == 'Hard')].shape))
print("The number of Clay points is " + str(df_mcp[df_mcp['surface'].apply(lambda x: x == 'Clay')].shape))
print("The number of Grass points is " + str(df_mcp[df_mcp['surface'].apply(lambda x: x == 'Grass')].shape))

df_mcp_oldg = df_mcp_old[df_mcp_old['surface'].apply(lambda x: x == 'Grass')]
df_mcp_oldh = df_mcp_old[df_mcp_old['surface'].apply(lambda x: x == 'Hard')]
df_mcp_oldc = df_mcp_old[df_mcp_old['surface'].apply(lambda x: x == 'Clay')]

df_mcp_newg = df_mcp_new[df_mcp_new['surface'].apply(lambda x: x == 'Grass')]
df_mcp_newh = df_mcp_new[df_mcp_new['surface'].apply(lambda x: x == 'Hard')]
df_mcp_newc = df_mcp_new[df_mcp_new['surface'].apply(lambda x: x == 'Clay')]

df_mcp_oldgte = df_mcp_oldg[df_mcp_oldg.apply(lambda x: x['id'] > '2017', axis = 1)]
df_mcp_oldhte = df_mcp_oldh[df_mcp_oldh.apply(lambda x: x['id'] > '2017', axis = 1)]
df_mcp_oldcte = df_mcp_oldc[df_mcp_oldc.apply(lambda x: x['id'] > '2017', axis = 1)]
df_mcp_oldgtr = df_mcp_oldg[df_mcp_oldg.apply(lambda x: x['id'] <= '2017', axis = 1)]
df_mcp_oldhtr = df_mcp_oldh[df_mcp_oldh.apply(lambda x: x['id'] <= '2017', axis = 1)]
df_mcp_oldctr = df_mcp_oldc[df_mcp_oldc.apply(lambda x: x['id'] <= '2017', axis = 1)]

df_mcp_newgte = df_mcp_newg[df_mcp_newg.apply(lambda x: x['id'] > '2018', axis = 1)]
df_mcp_newhte = df_mcp_newh[df_mcp_newh.apply(lambda x: x['id'] > '2018', axis = 1)]
df_mcp_newcte = df_mcp_newc[df_mcp_newc.apply(lambda x: x['id'] > '2018', axis = 1)]
df_mcp_newgtr = df_mcp_newg[df_mcp_newg.apply(lambda x: x['id'] <= '2018', axis = 1)]
df_mcp_newhtr = df_mcp_newh[df_mcp_newh.apply(lambda x: x['id'] <= '2018', axis = 1)]
df_mcp_newctr = df_mcp_newc[df_mcp_newc.apply(lambda x: x['id'] <= '2018', axis = 1)]

print("df_mcp_oldgte: " + str(len(set(df_mcp_oldgte['id'].values))))
print("df_mcp_oldhte: " + str(len(set(df_mcp_oldhte['id'].values))))
print("df_mcp_oldcte: " + str(len(set(df_mcp_oldcte['id'].values))))
print("df_mcp_oldgtr: " + str(len(set(df_mcp_oldgtr['id'].values))))
print("df_mcp_oldhtr: " + str(len(set(df_mcp_oldhtr['id'].values))))
print("df_mcp_oldctr: " + str(len(set(df_mcp_oldctr['id'].values))))

print("df_mcp_newgte: " + str(len(set(df_mcp_newgte['id'].values))))
print("df_mcp_newhte: " + str(len(set(df_mcp_newhte['id'].values))))
print("df_mcp_newcte: " + str(len(set(df_mcp_newcte['id'].values))))
print("df_mcp_newgtr: " + str(len(set(df_mcp_newgtr['id'].values))))
print("df_mcp_newhtr: " + str(len(set(df_mcp_newhtr['id'].values))))
print("df_mcp_newctr: " + str(len(set(df_mcp_newctr['id'].values))))

The number of Hard points is (267201, 107)
The number of Clay points is (121752, 107)
The number of Grass points is (52890, 107)
df_mcp_oldgte: 10
df_mcp_oldhte: 121
df_mcp_oldcte: 46
df_mcp_oldgtr: 32
df_mcp_oldhtr: 400
df_mcp_oldctr: 108
df_mcp_newgte: 88
df_mcp_newhte: 322
df_mcp_newcte: 171
df_mcp_newgtr: 136
df_mcp_newhtr: 775
df_mcp_newctr: 418


In [25]:
new_comp_list = ['elo_p1', 'elo_p2', 'serve_pct_p1', 'serve_pct_p2', 'point_num', 'acesw_perc1', 'acesw_perc2', 'first_deep1', 'first_deep2', 'second_deep1', 'second_deep2', 'unfor_perc1', 'unfor_perc2', 'winner1', 'winner2', 'current_serv_num1','p1_ptwinsprob', 'p2_ptwinsprob','breaks_p1*game_no', 'gamept_marginp1*gamept_no', 'match_score_p1', 'match_score_p2']

old_comp_list = ['elo_p1', 'elo_p2', 'serve_pct_p1', 'serve_pct_p2', 'point_num', 'acesw_perc1', 'acesw_perc2', 'first_deep1', 'first_deep2', 'second_deep1', 'second_deep2', 'unfor_perc1', 'unfor_perc2', 'winner1', 'winner2', 'prev_acesw_perc1', 'prev_acesw_perc2', 'prev_first_deep1', 'prev_first_deep2', 'prev_second_deep1', 'prev_second_deep2', 'prev_winner1', 'prev_winner2', 'prev_unfor_perc1', 'prev_unfor_perc2', 'rec1', 'rec2', 'current_serv_num1', 'p1_ptwinsprob', 'p2_ptwinsprob', 'breaks_p1*game_no', 'gamept_marginp1*gamept_no', 'match_score_p1', 'match_score_p2']


new_comp_list_s = ['elo_p1', 'elo_p2', 'serve_pct_p1', 'serve_pct_p2', 'point_num', 'acesw_perc1', 'acesw_perc2', 'first_deep1', 'first_deep2', 'second_deep1', 'second_deep2', 'unfor_perc1', 'unfor_perc2', 'winner1', 'winner2','p1_ptwinsprob', 'p2_ptwinsprob','breaks_p1*game_no', 'gamept_marginp1*gamept_no', 'match_score_p1', 'match_score_p2']

old_comp_list_s = ['elo_p1', 'elo_p2', 'serve_pct_p1', 'serve_pct_p2', 'point_num', 'acesw_perc1', 'acesw_perc2', 'first_deep1', 'first_deep2', 'second_deep1', 'second_deep2', 'unfor_perc1', 'unfor_perc2', 'winner1', 'winner2', 'prev_acesw_perc1', 'prev_acesw_perc2', 'prev_first_deep1', 'prev_first_deep2', 'prev_second_deep1', 'prev_second_deep2', 'prev_winner1', 'prev_winner2', 'prev_unfor_perc1', 'prev_unfor_perc2', 'rec1', 'rec2', 'p1_ptwinsprob', 'p2_ptwinsprob', 'breaks_p1*game_no', 'gamept_marginp1*gamept_no', 'match_score_p1', 'match_score_p2']

column_num = [x for x in df_mcp_old.columns if '1' in x or '2' in x]
column_num_tup = [(x, x.replace("1", "2")) if '1' in x else (x,x.replace("2", "1")) for x in column_num ]
col_rename_dict = {x[0]:x[1] for x in column_num_tup}

def swap(df):
    df2 = df.copy()
    df2.rename(columns = col_rename_dict,inplace=True)
    return df2

def createX(dfc, comp_list):
    # returner record, server record
    #return dfc[['breaks_p1', 'game_no', 'gamept_marginp1', 'gamept_no'] + comp_list]
    return dfc[['breaks_p1', 'game_no','game_score_p1', 'game_score_p2', 'gamept_no', 'momentum'] + comp_list]

    
# NEED TO CHANGE SUCH THAT THE PERSON SERVING IS ALWAYS P1
def createXY_point(df, comp_list):
    df_toswap = df[df.apply(lambda x: x['current_serv_num1'] == 0, axis = 1)]
    df_okay = df[df.apply(lambda x: x['current_serv_num1'] == 1, axis = 1)]
    df = pd.concat([df_okay, swap(df_toswap)])
    #dfc = df.copy()[['breaks_p1', 'game_no', 'gamept_marginp1', 'gamept_no','pt_winner1'] + comp_list].dropna()
    dfc = df.copy()[['breaks_p1', 'game_no', 'game_score_p1', 'game_score_p2', 'gamept_no', 'momentum', 'pt_winner1'] + comp_list].dropna()
    X = createX(dfc, comp_list)
    Y = dfc['pt_winner1'].copy()
    
    return X,Y


In [26]:
def log_predict(Xtr, Ytr, Xte, Yte):
    est = sm.Logit(Ytr, Xtr)
    est2 = est.fit()
    print(est2.summary())
    teYpred = est2.predict(Xte)
    return Yte, teYpred, est2

def rfr_predict(Xtr,Ytr,Xte,Yte):
    Xtr_stand = StandardScaler().fit_transform(Xtr)
    Xte_stand = StandardScaler().fit_transform(Xte)
    
    rfr = RandomForestRegressor(n_estimators = 200, min_samples_split = 10)
    rfr.fit(Xtr_stand, Ytr)
    teYpred = rfr.predict(Xte_stand)
    return Yte, teYpred

def svm_predict(Xtr, Ytr, Xte, Yte):
    clf = svm.SVC()
    clf.fit(np.array(Xtr), np.array(Ytr))
    Yte_pred = clf.predict(np.array(Xte))
    return np.sum(Yte_pred != Yte)

In [27]:
"""Using MCP Data to predict next point outcome"""
"""Looking specifically at old games"""
y_true_l = []
y_pred_l = []

Xtr,Ytr = createXY_point(df_mcp_oldgtr, old_comp_list_s)
Xte,Yte = createXY_point(df_mcp_oldgte, old_comp_list_s)
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for old grass is %f" % float(log_loss(true,pred,eps=1e-15)))
pred_round = np.round(pred)
print("Number of predicted correctly misses is " + str(sum([1 for x,y in zip(pred_round, true) if x == 0 and y == 0])))
print("Length of Yte_pred is " + str(len(pred)))
print("Number of predicted 0 values is " + str(len(pred_round)-sum(pred_round)))
print("min and max is %f and %f " % (float(max(pred)), float(min(pred))))
inds = [i for i,x in enumerate(list(true)) if x == 0]
true0 = [x for i,x in enumerate(list(true)) if i in inds]
pred0 = [x for i,x in enumerate(list(pred)) if i in inds]
print("Accuracy is %f" % (len([i for i,x in enumerate(list(true0)) if np.round(pred0)[i] == x])/len(true0)))
y_true_l.extend(list(true))
y_pred_l.extend(list(pred))

coefs = {x:[] for x in Xtr.columns}
for i,coef,p in zip(Xtr.columns, est.params.values, est.pvalues):
    if p < .01:
        coefs[i].append("{:.2e}".format(coef) + "**")
    elif p <=.05:
        coefs[i].append("{:.2e}".format(coef) + "*")
    else:
        coefs[i].append("{:.2e}".format(coef))
    


Xtr,Ytr = createXY_point(df_mcp_oldhtr, old_comp_list_s)
Xte,Yte, = createXY_point(df_mcp_oldhte, old_comp_list_s)
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for old hard is %f" % float(log_loss(true,pred,eps=1e-15)))
pred_round = np.round(pred)
print("Number of predicted correctly misses is " + str(sum([1 for x,y in zip(pred_round, true) if x == 0 and y == 0])))
print("Length of Yte_pred is " + str(len(pred)))
print("Number of predicted 0 values is " + str(len(pred_round)-sum(pred_round)))
print("min and max is %f and %f " % (float(max(pred)), float(min(pred))))
inds = [i for i,x in enumerate(list(true)) if x == 0]
true0 = [x for i,x in enumerate(list(true)) if i in inds]
pred0 = [x for i,x in enumerate(list(pred)) if i in inds]
print("Accuracy is %f" % (len([i for i,x in enumerate(list(true0)) if np.round(pred0)[i] == x])/len(true0)))
y_true_l.extend(list(true))
y_pred_l.extend(list(pred))
for i,coef,p in zip(Xtr.columns, est.params.values, est.pvalues):
    if p < .01:
        coefs[i].append("{:.2e}".format(coef) + "**")
    elif p <=.05:
        coefs[i].append("{:.2e}".format(coef) + "*")
    else:
        coefs[i].append("{:.2e}".format(coef))

Xtr,Ytr = createXY_point(df_mcp_oldctr, old_comp_list_s)
Xte,Yte = createXY_point(df_mcp_oldcte, old_comp_list_s)
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for old clay is %f" % float(log_loss(true,pred,eps=1e-15)))
pred_round = np.round(pred)
print("error for grass is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Number of predicted correctly misses is " + str(sum([1 for x,y in zip(pred_round, true) if x == 0 and y == 0])))
print("Length of Yte_pred is " + str(len(pred)))
print("Number of predicted 0 values is " + str(len(pred_round)-sum(pred_round)))
print("min and max is %f and %f " % (float(max(pred)), float(min(pred))))
inds = [i for i,x in enumerate(list(true)) if x == 0]
true0 = [x for i,x in enumerate(list(true)) if i in inds]
pred0 = [x for i,x in enumerate(list(pred)) if i in inds]
print("Accuracy is %f" % (len([i for i,x in enumerate(list(true0)) if np.round(pred0)[i] == x])/len(true0)))
y_true_l.extend(list(true))
y_pred_l.extend(list(pred))
for i,coef,p in zip(Xtr.columns, est.params.values, est.pvalues):
    if p < .01:
        coefs[i].append("{:.2e}".format(coef) + "**")
    elif p <=.05:
        coefs[i].append("{:.2e}".format(coef) + "*")
    else:
        coefs[i].append("{:.2e}".format(coef))

"""Looking specifically at new games"""
Xtr,Ytr = createXY_point(df_mcp_newgtr, new_comp_list_s)
Xte,Yte = createXY_point(df_mcp_newgte, new_comp_list_s)
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for new grass is %f" % float(log_loss(true,pred,eps=1e-15)))
pred_round = np.round(pred)
print("error for grass is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Number of predicted correctly misses is " + str(sum([1 for x,y in zip(pred_round, true) if x == 0 and y == 0])))
print("Length of Yte_pred is " + str(len(pred)))
print("Number of predicted 0 values is " + str(len(pred_round)-sum(pred_round)))
print("min and max is %f and %f " % (float(max(pred)), float(min(pred))))
inds = [i for i,x in enumerate(list(true)) if x == 0]
true0 = [x for i,x in enumerate(list(true)) if i in inds]
pred0 = [x for i,x in enumerate(list(pred)) if i in inds]
print("Accuracy is %f" % (len([i for i,x in enumerate(list(true0)) if np.round(pred0)[i] == x])/len(true0)))
y_true_l.extend(list(true))
y_pred_l.extend(list(pred))
for i,coef,p in zip(Xtr.columns, est.params.values, est.pvalues):
    if p < .01:
        coefs[i].append("{:.2e}".format(coef) + "**")
    elif p <=.05:
        coefs[i].append("{:.2e}".format(coef) + "*")
    else:
        coefs[i].append("{:.2e}".format(coef))

Xtr,Ytr = createXY_point(df_mcp_newhtr, new_comp_list_s)
Xte,Yte = createXY_point(df_mcp_newhte, new_comp_list_s)
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for new hard is %f" % float(log_loss(true,pred,eps=1e-15)))
pred_round = np.round(pred)
print("error for grass is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Number of predicted correctly misses is " + str(sum([1 for x,y in zip(pred_round, true) if x == 0 and y == 0])))
print("Length of Yte_pred is " + str(len(pred)))
print("Number of predicted 0 values is " + str(len(pred_round)-sum(pred_round)))
print("min and max is %f and %f " % (float(max(pred)), float(min(pred))))
inds = [i for i,x in enumerate(list(true)) if x == 0]
true0 = [x for i,x in enumerate(list(true)) if i in inds]
pred0 = [x for i,x in enumerate(list(pred)) if i in inds]
print("Accuracy is %f" % (len([i for i,x in enumerate(list(true0)) if np.round(pred0)[i] == x])/len(true0)))
y_true_l.extend(list(true))
y_pred_l.extend(list(pred))
for i,coef,p in zip(Xtr.columns, est.params.values, est.pvalues):
    if p < .01:
        coefs[i].append("{:.2e}".format(coef) + "**")
    elif p <=.05:
        coefs[i].append("{:.2e}".format(coef) + "*")
    else:
        coefs[i].append("{:.2e}".format(coef))

Xtr,Ytr = createXY_point(df_mcp_newctr, new_comp_list_s)
Xte,Yte = createXY_point(df_mcp_newcte, new_comp_list_s)
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for new clay is %f" % float(log_loss(true,pred,eps=1e-15)))
pred_round = np.round(pred)
print("error for grass is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Number of predicted correctly misses is " + str(sum([1 for x,y in zip(pred_round, true) if x == 0 and y == 0])))
print("Length of Yte_pred is " + str(len(pred)))
print("Number of predicted 0 values is " + str(len(pred_round)-sum(pred_round)))
print("min and max is %f and %f " % (float(max(pred)), float(min(pred))))
inds = [i for i,x in enumerate(list(true)) if x == 0]
true0 = [x for i,x in enumerate(list(true)) if i in inds]
pred0 = [x for i,x in enumerate(list(pred)) if i in inds]
print("Accuracy is %f" % (len([i for i,x in enumerate(list(true0)) if np.round(pred0)[i] == x])/len(true0)))
y_true_l.extend(list(true))
y_pred_l.extend(list(pred))
for i,coef,p in zip(Xtr.columns, est.params.values, est.pvalues):
    if p < .01:
        coefs[i].append("{:.2e}".format(coef) + "**")
    elif p <=.05:
        coefs[i].append("{:.2e}".format(coef) + "*")
    else:
        coefs[i].append("{:.2e}".format(coef))

print("Total error using this strategy is " + str(log_loss(y_true_l,y_pred_l, eps=1e-15)))

Optimization terminated successfully.
         Current function value: 0.629938
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:             pt_winner1   No. Observations:                 6085
Model:                          Logit   Df Residuals:                     6046
Method:                           MLE   Df Model:                           38
Date:                Sun, 12 Apr 2020   Pseudo R-squ.:                0.006130
Time:                        13:01:05   Log-Likelihood:                -3833.2
converged:                       True   LL-Null:                       -3856.8
                                        LLR p-value:                    0.1437
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
breaks_p1                    -0.0169      0.088     -0.193      0.847      -0.

Accuracy is 0.000000
Optimization terminated successfully.
         Current function value: 0.668280
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:             pt_winner1   No. Observations:                15349
Model:                          Logit   Df Residuals:                    15310
Method:                           MLE   Df Model:                           38
Date:                Sun, 12 Apr 2020   Pseudo R-squ.:                0.005102
Time:                        13:01:17   Log-Likelihood:                -10257.
converged:                       True   LL-Null:                       -10310.
                                        LLR p-value:                 3.193e-08
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
breaks_p1                     0.0860      0.052      1.64

Accuracy is 0.002399
Optimization terminated successfully.
         Current function value: 0.648622
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:             pt_winner1   No. Observations:                94519
Model:                          Logit   Df Residuals:                    94492
Method:                           MLE   Df Model:                           26
Date:                Sun, 12 Apr 2020   Pseudo R-squ.:                0.008096
Time:                        13:01:32   Log-Likelihood:                -61307.
converged:                       True   LL-Null:                       -61808.
                                        LLR p-value:                2.473e-194
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
breaks_p1                     0.0176      0.022      0.81

In [29]:
"""Old regression from PBP style but on this dataset"""
comp_listpbp = ['serve_pct_p1', 'return_win_p1', 'elo_p1', 'elo_p2', 'momentum']

def createXpbp(dfc, comp_list):
    # returner record, server record
    #return dfc[['breaks_p1', 'game_no', 'gamept_marginp1', 'gamept_no'] + comp_list]
    return dfc[['breaks_p1', 'game_no','game_score_p1', 'game_score_p2', 'match_score_p1', 'match_score_p2'] + comp_list]

    
# NEED TO CHANGE SUCH THAT THE PERSON SERVING IS ALWAYS P1
def createXY_pointpbp(df, comp_list):
    df['return_win_p1'] = 1- df['serve_pct_p2']
    df['return_win_p2'] = 1- df['serve_pct_p1']
    df_toswap = df[df.apply(lambda x: x['current_serv_num1'] == 0, axis = 1)]
    df_okay = df[df.apply(lambda x: x['current_serv_num1'] == 1, axis = 1)]
    df = pd.concat([df_okay, swap(df_toswap)])
    #dfc = df.copy()[['breaks_p1', 'game_no', 'gamept_marginp1', 'gamept_no','pt_winner1'] + comp_list].dropna()
    dfc = df.copy()[['breaks_p1', 'game_no', 'game_score_p1', 'game_score_p2', 'match_score_p1', 'match_score_p2', 'pt_winner1'] + comp_list].dropna()
    X = createXpbp(dfc, comp_list)
    Y = dfc['pt_winner1'].copy()
    
    return X,Y

In [30]:
y_true_l,y_pred_l = [],[]
y_true_l2,y_pred_l2 = [],[]
y_true_l3,y_pred_l3 = [],[]

Xtr,Ytr = createXY_pointpbp(df_mcp_newgtr, comp_listpbp)
Xte,Yte = createXY_pointpbp(df_mcp_newgte, comp_listpbp)
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for new grass is %f" % float(log_loss(true,pred,eps=1e-15)))
y_true_l.extend(list(true))
y_pred_l.extend(list(pred))

Xtr,Ytr = createXY_pointpbp(df_mcp_oldgtr, comp_listpbp)
Xte,Yte = createXY_pointpbp(df_mcp_oldgte, comp_listpbp)
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for old grass is %f" % float(log_loss(true,pred,eps=1e-15)))
y_true_l.extend(list(true))
y_pred_l.extend(list(pred))
print("Total grass log error is " + str(log_loss(y_true_l,y_pred_l,eps=1e-15)))

Xtr,Ytr = createXY_pointpbp(df_mcp_newhtr, comp_listpbp)
Xte,Yte = createXY_pointpbp(df_mcp_newhte, comp_listpbp)
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for new hard is %f" % float(log_loss(true,pred,eps=1e-15)))
y_true_l2.extend(list(true))
y_pred_l2.extend(list(pred))

Xtr,Ytr = createXY_pointpbp(df_mcp_oldhtr, comp_listpbp)
Xte,Yte = createXY_pointpbp(df_mcp_oldhte, comp_listpbp)
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for old hard is %f" % float(log_loss(true,pred,eps=1e-15)))
y_true_l2.extend(list(true))
y_pred_l2.extend(list(pred))
print("Total hard log error is " + str(log_loss(y_true_l2,y_pred_l2,eps=1e-15)))

Xtr,Ytr = createXY_pointpbp(df_mcp_newctr, comp_listpbp)
Xte,Yte = createXY_pointpbp(df_mcp_newcte, comp_listpbp)
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for new clay is %f" % float(log_loss(true,pred,eps=1e-15)))
y_true_l3.extend(list(true))
y_pred_l3.extend(list(pred))

Xtr,Ytr = createXY_pointpbp(df_mcp_oldctr, comp_listpbp)
Xte,Yte = createXY_pointpbp(df_mcp_oldcte, comp_listpbp)
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for old clay is %f" % float(log_loss(true,pred,eps=1e-15)))
y_true_l3.extend(list(true))
y_pred_l3.extend(list(pred))
print("Total grass log error is " + str(log_loss(y_true_l3,y_pred_l3,eps=1e-15)))

y_true_tot = y_true_l + y_true_l2 + y_true_l3
y_pred_tot = y_pred_l + y_pred_l2 + y_pred_l3
print("Total log error is " + str(log_loss(y_true_tot,y_pred_tot,eps=1e-15)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


Optimization terminated successfully.
         Current function value: 0.629728
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:             pt_winner1   No. Observations:                21805
Model:                          Logit   Df Residuals:                    21794
Method:                           MLE   Df Model:                           10
Date:                Sun, 12 Apr 2020   Pseudo R-squ.:                0.007189
Time:                        13:02:31   Log-Likelihood:                -13731.
converged:                       True   LL-Null:                       -13831.
                                        LLR p-value:                 2.812e-37
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
breaks_p1          0.0195      0.023      0.862      0.389      -0.025       0.064
game_no          

Optimization terminated successfully.
         Current function value: 0.662515
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:             pt_winner1   No. Observations:                58621
Model:                          Logit   Df Residuals:                    58610
Method:                           MLE   Df Model:                           10
Date:                Sun, 12 Apr 2020   Pseudo R-squ.:                0.004932
Time:                        13:02:50   Log-Likelihood:                -38837.
converged:                       True   LL-Null:                       -39030.
                                        LLR p-value:                 1.444e-76
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
breaks_p1          0.0410      0.011      3.766      0.000       0.020       0.062
game_no          

# MCP Dataset, to predict Match outcomes

In [31]:
def createXY_set(df, comp_list):
    dfc = df.copy()[['breaks_p1', 'game_no', 'game_score_p1', 'game_score_p2','set_true_label_p1', 'gamept_no', 'momentum'] + comp_list].dropna()
    X = createX(dfc, comp_list)
    Y = dfc['set_true_label_p1'].copy()
    
    return X,Y

def createXY_match(df, comp_list):
    dfc = df.copy()[['breaks_p1', 'game_no', 'game_score_p1', 'game_score_p2', 'gamept_no','momentum', 'match_true_label_p1'] + comp_list].dropna()
    X = createX(dfc, comp_list)
    Y = dfc['match_true_label_p1'].copy()
    
    return X,Y

In [32]:
df_mcp = pd.concat([df_mcp_old, df_mcp_new])

df_mcpg_tr = pd.concat([df_mcp_oldgtr, df_mcp_newgtr])
df_mcph_tr = pd.concat([df_mcp_oldhtr, df_mcp_newhtr])
df_mcpc_tr = pd.concat([df_mcp_oldctr, df_mcp_newctr])
df_mcpg_te = pd.concat([df_mcp_oldgte, df_mcp_newgte])
df_mcph_te = pd.concat([df_mcp_oldhte, df_mcp_newhte])
df_mcpc_te = pd.concat([df_mcp_oldcte, df_mcp_newcte])

In [33]:
"""Using MCP Data to predict overall match outcome, without prev information split"""
y_true_l = []
y_pred_l = []
tot_err = 0
tot_len = 0

Xtr,Ytr = createXY_match(df_mcpg_tr, new_comp_list)
Xtr_s, Ytr_s = createXY_match(swap(df_mcpg_tr), new_comp_list)
Xte,Yte = createXY_match(df_mcpg_te, new_comp_list)
Xte_s, Yte_s = createXY_match(swap(df_mcpg_te), new_comp_list)
Xtr = pd.concat([Xtr, Xtr_s])
Ytr = pd.concat([Ytr, Ytr_s])
Xte = pd.concat([Xte, Xte_s])
Yte = pd.concat([Yte, Yte_s])
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for grass is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(pred) != Yte))
y_true_l.extend(list(true))
y_pred_l.extend(list(pred))
tot_err += sum(np.round(pred) != Yte)
tot_len += len(Yte)

Xtr,Ytr = createXY_match(df_mcph_tr, new_comp_list)
Xtr_s, Ytr_s = createXY_match(swap(df_mcph_tr), new_comp_list)
Xte,Yte = createXY_match(df_mcph_te, new_comp_list)
Xte_s, Yte_s = createXY_match(swap(df_mcph_te), new_comp_list)
Xtr = pd.concat([Xtr, Xtr_s])
Ytr = pd.concat([Ytr, Ytr_s])
Xte = pd.concat([Xte, Xte_s])
Yte = pd.concat([Yte, Yte_s])
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for hard is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(pred) != Yte))
y_true_l.extend(list(true))
y_pred_l.extend(list(pred))
tot_err += sum(np.round(pred) != Yte)
tot_len += len(Yte)

Xtr,Ytr = createXY_match(df_mcpc_tr, new_comp_list)
Xtr_s, Ytr_s = createXY_match(swap(df_mcpc_tr), new_comp_list)
Xte,Yte = createXY_match(df_mcpc_te, new_comp_list)
Xte_s, Yte_s = createXY_match(swap(df_mcpc_te), new_comp_list)
Xtr = pd.concat([Xtr, Xtr_s])
Ytr = pd.concat([Ytr, Ytr_s])
Xte = pd.concat([Xte, Xte_s])
Yte = pd.concat([Yte, Yte_s])
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for clay is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(pred) != Yte))
y_true_l.extend(list(true))
y_pred_l.extend(list(pred))
tot_err += sum(np.round(pred) != Yte)
tot_len += len(Yte)


print("Total error using this strategy is " + str(log_loss(y_true_l,y_pred_l, eps=1e-15)))
print("Misclassification error is %f" % float(tot_err/tot_len))


Optimization terminated successfully.
         Current function value: 0.388150
         Iterations 7
                            Logit Regression Results                           
Dep. Variable:     match_true_label_p1   No. Observations:                50428
Model:                           Logit   Df Residuals:                    50400
Method:                            MLE   Df Model:                           27
Date:                 Sun, 12 Apr 2020   Pseudo R-squ.:                  0.4400
Time:                         13:02:55   Log-Likelihood:                -19574.
converged:                        True   LL-Null:                       -34954.
                                         LLR p-value:                     0.000
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
breaks_p1                    -0.2086      0.039     -5.363      0.000 

error for clay is 0.503829
Misclassification error is 0.249274
Total error using this strategy is 0.5440293990462445
Misclassification error is 0.263990


In [34]:
"""Similar Model but controlling for OLD and NEW, using extra variables"""
"""Using MCP Data to predict overall match outcome, without prev information split"""
y_true_ln = []
y_pred_ln = []
y_true_lo = []
y_pred_lo = []
tot_err_o,tot_err_n = 0,0
tot_len_o,tot_len_n = 0,0

"""OLD ONES"""
Xtr,Ytr = createXY_match(df_mcp_oldgtr, old_comp_list)
Xtr_s, Ytr_s = createXY_match(swap(df_mcp_oldgtr), old_comp_list)
Xte,Yte = createXY_match(df_mcp_oldgte, old_comp_list)
Xte_s, Yte_s = createXY_match(swap(df_mcp_oldgte), old_comp_list)
Xtr = pd.concat([Xtr, Xtr_s])
Ytr = pd.concat([Ytr, Ytr_s])
Xte = pd.concat([Xte, Xte_s])
Yte = pd.concat([Yte, Yte_s])
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for old grass is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(pred) != Yte))
y_true_lo.extend(list(true))
y_pred_lo.extend(list(pred))
tot_err_o += sum(np.round(pred) != Yte)
tot_len_o += len(Yte)

coefs = {x:[] for x in Xtr.columns}
for i,coef,p in zip(Xtr.columns, est.params.values, est.pvalues):
    if p < .01:
        coefs[i].append("{:.2e}".format(coef) + "**")
    elif p <=.05:
        coefs[i].append("{:.2e}".format(coef) + "*")
    else:
        coefs[i].append("{:.2e}".format(coef))

Xtr,Ytr = createXY_match(df_mcp_oldhtr, old_comp_list)
Xtr_s, Ytr_s = createXY_match(swap(df_mcp_oldhtr), old_comp_list)
Xte,Yte = createXY_match(df_mcp_oldhte, old_comp_list)
Xte_s, Yte_s = createXY_match(swap(df_mcp_oldhte), old_comp_list)
Xtr = pd.concat([Xtr, Xtr_s])
Ytr = pd.concat([Ytr, Ytr_s])
Xte = pd.concat([Xte, Xte_s])
Yte = pd.concat([Yte, Yte_s])
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for old hard is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(pred) != Yte))
y_true_lo.extend(list(true))
y_pred_lo.extend(list(pred))
tot_err_o += sum(np.round(pred) != Yte)
tot_len_o += len(Yte)

for i,coef,p in zip(Xtr.columns, est.params.values, est.pvalues):
    if p < .01:
        coefs[i].append("{:.2e}".format(coef) + "**")
    elif p <=.05:
        coefs[i].append("{:.2e}".format(coef) + "*")
    else:
        coefs[i].append("{:.2e}".format(coef))

Xtr,Ytr = createXY_match(df_mcp_oldctr, old_comp_list)
Xtr_s, Ytr_s = createXY_match(swap(df_mcp_oldctr), old_comp_list)
Xte,Yte = createXY_match(df_mcp_oldcte, old_comp_list)
Xte_s, Yte_s = createXY_match(swap(df_mcp_oldcte), old_comp_list)
Xtr = pd.concat([Xtr, Xtr_s])
Ytr = pd.concat([Ytr, Ytr_s])
Xte = pd.concat([Xte, Xte_s])
Yte = pd.concat([Yte, Yte_s])
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for old clay is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(pred) != Yte))
y_true_lo.extend(list(true))
y_pred_lo.extend(list(pred))
tot_err_o += sum(np.round(pred) != Yte)
tot_len_o += len(Yte)

for i,coef,p in zip(Xtr.columns, est.params.values, est.pvalues):
    if p < .01:
        coefs[i].append("{:.2e}".format(coef) + "**")
    elif p <=.05:
        coefs[i].append("{:.2e}".format(coef) + "*")
    else:
        coefs[i].append("{:.2e}".format(coef))

print("Total error using this strategy is " + str(log_loss(y_true_lo,y_pred_lo, eps=1e-15)))
print("Misclassification error is %f" % float(tot_err_o/tot_len_o))

"""NEW ONES"""
Xtr,Ytr = createXY_match(df_mcp_newgtr, new_comp_list)
Xtr_s, Ytr_s = createXY_match(swap(df_mcp_newgtr), new_comp_list)
Xte,Yte = createXY_match(df_mcp_newgte, new_comp_list)
Xte_s, Yte_s = createXY_match(swap(df_mcp_newgte), new_comp_list)
Xtr = pd.concat([Xtr, Xtr_s])
Ytr = pd.concat([Ytr, Ytr_s])
Xte = pd.concat([Xte, Xte_s])
Yte = pd.concat([Yte, Yte_s])
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for new grass is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(pred) != Yte))
y_true_ln.extend(list(true))
y_pred_ln.extend(list(pred))
tot_err_n += sum(np.round(pred) != Yte)
tot_len_n += len(Yte)

for i,coef,p in zip(Xtr.columns, est.params.values, est.pvalues):
    if p < .01:
        coefs[i].append("{:.2e}".format(coef) + "**")
    elif p <=.05:
        coefs[i].append("{:.2e}".format(coef) + "*")
    else:
        coefs[i].append("{:.2e}".format(coef))

Xtr,Ytr = createXY_match(df_mcp_newhtr, new_comp_list)
Xtr_s, Ytr_s = createXY_match(swap(df_mcp_newhtr), new_comp_list)
Xte,Yte = createXY_match(df_mcp_newhte, new_comp_list)
Xte_s, Yte_s = createXY_match(swap(df_mcp_newhte), new_comp_list)
Xtr = pd.concat([Xtr, Xtr_s])
Ytr = pd.concat([Ytr, Ytr_s])
Xte = pd.concat([Xte, Xte_s])
Yte = pd.concat([Yte, Yte_s])
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for new hard is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(pred) != Yte))
y_true_ln.extend(list(true))
y_pred_ln.extend(list(pred))
tot_err_n += sum(np.round(pred) != Yte)
tot_len_n += len(Yte)


for i,coef,p in zip(Xtr.columns, est.params.values, est.pvalues):
    if p < .01:
        coefs[i].append("{:.2e}".format(coef) + "**")
    elif p <=.05:
        coefs[i].append("{:.2e}".format(coef) + "*")
    else:
        coefs[i].append("{:.2e}".format(coef))

Xtr,Ytr = createXY_match(df_mcp_newctr, new_comp_list)
Xtr_s, Ytr_s = createXY_match(swap(df_mcp_newctr), new_comp_list)
Xte,Yte = createXY_match(df_mcp_newcte, new_comp_list)
Xte_s, Yte_s = createXY_match(swap(df_mcp_newcte), new_comp_list)
Xtr = pd.concat([Xtr, Xtr_s])
Ytr = pd.concat([Ytr, Ytr_s])
Xte = pd.concat([Xte, Xte_s])
Yte = pd.concat([Yte, Yte_s])
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for new clay is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(pred) != Yte))
y_true_ln.extend(list(true))
y_pred_ln.extend(list(pred))
tot_err_n += sum(np.round(pred) != Yte)
tot_len_n += len(Yte)

for i,coef,p in zip(Xtr.columns, est.params.values, est.pvalues):
    if p < .01:
        coefs[i].append("{:.2e}".format(coef) + "**")
    elif p <=.05:
        coefs[i].append("{:.2e}".format(coef) + "*")
    else:
        coefs[i].append("{:.2e}".format(coef))

print("Total error using this strategy is " + str(log_loss(y_true_ln,y_pred_ln, eps=1e-15)))
print("Misclassification error is %f" % float(tot_err_n/tot_len_n))


print("Complete Total error using this strategy is " + str(log_loss(y_true_ln + y_true_lo,y_pred_ln+y_pred_lo, eps=1e-15)))
print("Complete Misclassification error is %f" % float((tot_err_n+tot_err_o)/(tot_len_n+tot_len_o)))

Optimization terminated successfully.
         Current function value: 0.275996
         Iterations 8
                            Logit Regression Results                           
Dep. Variable:     match_true_label_p1   No. Observations:                12170
Model:                           Logit   Df Residuals:                    12130
Method:                            MLE   Df Model:                           39
Date:                 Sun, 12 Apr 2020   Pseudo R-squ.:                  0.6018
Time:                         13:03:03   Log-Likelihood:                -3358.9
converged:                        True   LL-Null:                       -8435.6
                                         LLR p-value:                     0.000
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
breaks_p1                     0.1680      0.099      1.693      0.090 

Optimization terminated successfully.
         Current function value: 0.393248
         Iterations 7
                            Logit Regression Results                           
Dep. Variable:     match_true_label_p1   No. Observations:                30698
Model:                           Logit   Df Residuals:                    30658
Method:                            MLE   Df Model:                           39
Date:                 Sun, 12 Apr 2020   Pseudo R-squ.:                  0.4325
Time:                         13:03:05   Log-Likelihood:                -12072.
converged:                        True   LL-Null:                       -21273.
                                         LLR p-value:                     0.000
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
breaks_p1                     0.7017      0.047     14.794      0.000 

Optimization terminated successfully.
         Current function value: 0.458986
         Iterations 6
                            Logit Regression Results                           
Dep. Variable:     match_true_label_p1   No. Observations:               189038
Model:                           Logit   Df Residuals:                   189010
Method:                            MLE   Df Model:                           27
Date:                 Sun, 12 Apr 2020   Pseudo R-squ.:                  0.3378
Time:                         13:03:07   Log-Likelihood:                -86766.
converged:                        True   LL-Null:                   -1.3103e+05
                                         LLR p-value:                     0.000
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
breaks_p1                     0.6274      0.017     36.505      0.000 

In [36]:
"""Similar Model but controlling for OLD and NEW, using extra variables with RFR and SVM"""
"""Using MCP Data to predict overall match outcome, without prev information split"""
y_true_ln = []
y_pred_ln = []
y_true_lo = []
y_pred_lo = []
tot_err_l,tot_err_l2,tot_err_l3 = 0,0,0
tot_len_l,tot_len_l2,tot_len_l3 = 0,0,0
tot_svm_err_l, tot_svm_err_l2, tot_svm_err_l3 = 0,0,0

y_true_l,y_pred_l = [],[]
y_true_l2,y_pred_l2 = [],[]
y_true_l3,y_pred_l3 = [],[]

"""OLD ONES"""
Xtr,Ytr = createXY_match(df_mcp_oldgtr, old_comp_list)
Xtr_s, Ytr_s = createXY_match(swap(df_mcp_oldgtr), old_comp_list)
Xte,Yte = createXY_match(df_mcp_oldgte, old_comp_list)
Xte_s, Yte_s = createXY_match(swap(df_mcp_oldgte), old_comp_list)
Xtr = pd.concat([Xtr, Xtr_s])
Ytr = pd.concat([Ytr, Ytr_s])
Xte = pd.concat([Xte, Xte_s])
Yte = pd.concat([Yte, Yte_s])
true,pred = rfr_predict(Xtr, Ytr, Xte, Yte)
print("error for old grass is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(pred) != Yte))
y_true_l.extend(list(true))
y_pred_l.extend(list(pred))
tot_err_l += sum(np.round(pred) != Yte)
tot_len_l += len(Yte)
#svm_err = svm_predict(Xtr,Ytr,Xte,Yte)
#print("Misclassification error for SVM is %f" % (svm_err/len(Yte)))
#tot_svm_err_o += svm_err

Xtr,Ytr = createXY_match(df_mcp_newgtr, new_comp_list)
Xtr_s, Ytr_s = createXY_match(swap(df_mcp_newgtr), new_comp_list)
Xte,Yte = createXY_match(df_mcp_newgte, new_comp_list)
Xte_s, Yte_s = createXY_match(swap(df_mcp_newgte), new_comp_list)
Xtr = pd.concat([Xtr, Xtr_s])
Ytr = pd.concat([Ytr, Ytr_s])
Xte = pd.concat([Xte, Xte_s])
Yte = pd.concat([Yte, Yte_s])
true,pred = rfr_predict(Xtr, Ytr, Xte, Yte)
print("error for new grass is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(pred) != Yte))
y_true_l.extend(list(true))
y_pred_l.extend(list(pred))
tot_err_l += sum(np.round(pred) != Yte)
tot_len_l += len(Yte)
#svm_err = svm_predict(Xtr,Ytr,Xte,Yte)
#print("Misclassification error for SVM is %f" % (svm_err/len(Yte)))
#tot_svm_err_n += svm_err
print("error for all grass is %f" % float(log_loss(y_true_l,y_pred_l,eps=1e-15)))
print('Misclassification error is %f' % np.mean(np.round(y_pred_l) != y_true_l))

Xtr,Ytr = createXY_match(df_mcp_oldhtr, old_comp_list)
Xtr_s, Ytr_s = createXY_match(swap(df_mcp_oldhtr), old_comp_list)
Xte,Yte = createXY_match(df_mcp_oldhte, old_comp_list)
Xte_s, Yte_s = createXY_match(swap(df_mcp_oldhte), old_comp_list)
Xtr = pd.concat([Xtr, Xtr_s])
Ytr = pd.concat([Ytr, Ytr_s])
Xte = pd.concat([Xte, Xte_s])
Yte = pd.concat([Yte, Yte_s])
true,pred = rfr_predict(Xtr, Ytr, Xte, Yte)
print("error for old hard is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(pred) != Yte))
y_true_l2.extend(list(true))
y_pred_l2.extend(list(pred))
tot_err_l2 += sum(np.round(pred) != Yte)
tot_len_l2 += len(Yte)
#svm_err = svm_predict(Xtr,Ytr,Xte,Yte)
#print("Misclassification error for SVM is %f" % (svm_err/len(Yte)))
#tot_svm_err_o += svm_err


Xtr,Ytr = createXY_match(df_mcp_newhtr, new_comp_list)
Xtr_s, Ytr_s = createXY_match(swap(df_mcp_newhtr), new_comp_list)
Xte,Yte = createXY_match(df_mcp_newhte, new_comp_list)
Xte_s, Yte_s = createXY_match(swap(df_mcp_newhte), new_comp_list)
Xtr = pd.concat([Xtr, Xtr_s])
Ytr = pd.concat([Ytr, Ytr_s])
Xte = pd.concat([Xte, Xte_s])
Yte = pd.concat([Yte, Yte_s])
true,pred = rfr_predict(Xtr, Ytr, Xte, Yte)
print("error for new hard is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(pred) != Yte))
y_true_l2.extend(list(true))
y_pred_l2.extend(list(pred))
tot_err_l2 += sum(np.round(pred) != Yte)
tot_len_l2 += len(Yte)
#svm_err = svm_predict(Xtr,Ytr,Xte,Yte)
#print("Misclassification error for SVM is %f" % svm_err/len(Yte))
#tot_svm_err_o += svm_err

print("error for all hard is %f" % float(log_loss(y_true_l2,y_pred_l2,eps=1e-15)))
print('Misclassification error is %f' % np.mean(np.round(y_pred_l2) != y_true_l2))
#print("Misclassification error for SVM on OLD is %f" % (tot_svm_err_o/tot_len_o))

""" NEW SET """


Xtr,Ytr = createXY_match(df_mcp_oldctr, old_comp_list)
Xtr_s, Ytr_s = createXY_match(swap(df_mcp_oldctr), old_comp_list)
Xte,Yte = createXY_match(df_mcp_oldcte, old_comp_list)
Xte_s, Yte_s = createXY_match(swap(df_mcp_oldcte), old_comp_list)
Xtr = pd.concat([Xtr, Xtr_s])
Ytr = pd.concat([Ytr, Ytr_s])
Xte = pd.concat([Xte, Xte_s])
Yte = pd.concat([Yte, Yte_s])
true,pred = rfr_predict(Xtr, Ytr, Xte, Yte)
print("error for old clay is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(pred) != Yte))
y_true_l3.extend(list(true))
y_pred_l3.extend(list(pred))
tot_err_l3 += sum(np.round(pred) != Yte)
tot_len_l3 += len(Yte)
#svm_err = svm_predict(Xtr,Ytr,Xte,Yte)
#print("Misclassification error for SVM is %f" % (svm_err/len(Yte)))
#tot_svm_err_n += svm_err


Xtr,Ytr = createXY_match(df_mcp_newctr, new_comp_list)
Xtr_s, Ytr_s = createXY_match(swap(df_mcp_newctr), new_comp_list)
Xte,Yte = createXY_match(df_mcp_newcte, new_comp_list)
Xte_s, Yte_s = createXY_match(swap(df_mcp_newcte), new_comp_list)
Xtr = pd.concat([Xtr, Xtr_s])
Ytr = pd.concat([Ytr, Ytr_s])
Xte = pd.concat([Xte, Xte_s])
Yte = pd.concat([Yte, Yte_s])
true,pred = rfr_predict(Xtr, Ytr, Xte, Yte)
print("error for new clay is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(pred) != Yte))
y_true_l3.extend(list(true))
y_pred_l3.extend(list(pred))
tot_err_l3 += sum(np.round(pred) != Yte)
tot_len_l3 += len(Yte)
#svm_err = svm_predict(Xtr,Ytr,Xte,Yte)
#print("Misclassification error for SVM is %f" % (svm_err/len(Yte)))
#tot_svm_err_n += svm_err

print("error for all clay is %f" % float(log_loss(y_true_l3,y_pred_l3,eps=1e-15)))
print('Misclassification error is %f' % np.mean(np.round(y_pred_l3) != y_true_l3))
#print("Misclassification error for SVM on NEW is %f" % (tot_svm_err_n/tot_len_n))


print("Complete Total error using this strategy is " + str(log_loss(y_true_l + y_true_l2+y_true_l3,y_pred_l+y_pred_l2+y_pred_l3, eps=1e-15)))
print("Complete Misclassification error is %f" % np.mean(np.round(y_pred_l + y_pred_l2 + y_pred_l3) != (y_true_l + y_true_l2+y_true_l3)))
#print("Complete Misclassification error for SVM is %f" % float((tot_svm_err_n+tot_svm_err_o)/(tot_len_n+tot_len_o)))

error for old grass is 0.749061
Misclassification error is 0.367755
error for new grass is 2.233454
Misclassification error is 0.345245
error for all grass is 2.056338
Misclassification error is 0.347931
error for old hard is 1.261828
Misclassification error is 0.349715
error for new hard is 0.884644
Misclassification error is 0.316581
error for all hard is 0.986473
Misclassification error is 0.325526
error for old clay is 0.919968
Misclassification error is 0.467044
error for new clay is 0.604889
Misclassification error is 0.305798
error for all clay is 0.670323
Misclassification error is 0.339284
Complete Total error using this strategy is 1.071273350728506
Complete Misclassification error is 0.332700


In [79]:
## Code from JEFF SACKMANN
##calculate the probability of the current server winning
## a 6-game, tiebreak set, given prob. of server winning any
## given service point (s) or return point (u), and the current
## game score (v, w)
 
## some results:
## http://summerofjeff.wordpress.com/2010/12/02/single-set-win-expectancy-tables/

from tennisGameProbability import gameProb
from tennisTiebreakProbability import tiebreakProb
 
def fact(x):
    if x in [0, 1]:  return 1
    r = 1
    for a in range(1, int(x+1)):  r = r*a
    return r
 
def ch(a, b):
    return fact(a)/(fact(b)*fact(a-b))
 
def setOutcome(final, sGames, rGames, vw, g, h):
    pOutcome = 0
    for j in range((int(sGames+1))):
        for k in range(int(rGames+1)):
            if (j + k) == (6 - 1 - vw):
                m = sGames - j
                n = rGames - k
                p = (g**j)*(h**k)*((1-g)**m)*((1-h)**n)*ch(sGames,j)*ch(rGames,k)*final
                pOutcome += p
            else:   continue
    return pOutcome
 
def setGeneral(s, u, v=0, w=0, tb=1):
    ## calculate the probability of the current server winning
    ## a 6-game, tiebreak set, given prob. of server winning any
    ## given service point (s) or return point (u), and the current
    ## game score (v, w)
    ## get prob of current server winning a service game:
    g = gameProb(s)
    ## and current server winning a return game:
    h = gameProb(u)
    ## is set over?
    if tb:
        if v == 7:  return 1
        elif w == 7:    return 0
        elif v == 6 and (v-w) > 1:  return 1
        elif w == 6 and (w-v) > 1:  return 0
        else:   pass
    else:
        if v >= 6 and (v-w) > 1:    return 1
        elif w >= 6 and (w-v) > 1:  return 0
        else:   pass
    ## if not over, re-adjust down to no higher than 6-6
    while True:
        if (v+w) > 12:
            v -= 1
            w -= 1
        else:   break
    ## if no tiebreak, chance that server wins set is ratio of server's prob of winning
    ## two games in a row to returner's prob of winning two games in a row
    if not tb:  deuceprob = (g*h)/((g*h) + (1-g)*(1-h))
    outcomes = {}
    ## special cases, 11 games or more already
    if (v+w) == 12:
        if tb:
            tp = tiebreakProb(s, u)
            outcomes['76'] = tp
            outcomes['67'] = 1 - tp
        else:
            outcomes['75'] = deuceprob
            outcomes['57'] = 1-deuceprob 
    elif (v+w) == 11:
        if tb:
            tp = tiebreakProb((1-u), (1-s))
            if v == 6:
                outcomes['75'] = g
                x = (1-g)
                outcomes['76'] = x*(1 - tp)
                outcomes['67'] = x*tp
            else:
                outcomes['57'] = 1-g
                x = g
                outcomes['76'] = x*(1 - tp)
                outcomes['67'] = x*tp
        else:
            if v == 6:
                outcomes['75'] = g
                outcomes['57'] = 0
                f = 1 - g ## f is p(getting to 6-6)
            else:
                outcomes['57'] = 1-g
                outcomes['75'] = 0
                f = g ## f is p(getting to 6-6)
            outcomes['75'] += f*deuceprob
            outcomes['57'] += f*(1-deuceprob)            
    else:   
        ## win probabilities
        for i in range(5): ## i = 0
            t = 6 + i - v - w ## total games remaining in set
            if t < 1:   continue
            if t % 2 == 0:
                final = h
                sGames = t/2
                rGames = sGames - 1
            else:
                final = g
                sGames = (t-1)/2
                rGames = (t-1)/2
            pOutcome = setOutcome(final, sGames, rGames, v, g, h)
            key = '6' + str(i)
            outcomes[key] = pOutcome
        ## loss probabilities
        ## this section isn't necessary, but I wrote it for informal
        ## testing purposes
        for i in range(5):
            t = 6 + i - v - w ## total games in set; here it's 6
            if t < 1:   continue
            if t % 2 == 0:
                final = 1-h
                sGames = t/2
                rGames = sGames - 1
            else:
                final = 1-g
                sGames = (t-1)/2
                rGames = (t-1)/2
            pOutcome = setOutcome(final, sGames, rGames, w, (1-g), (1-h))
            key = str(i) + '6'
            outcomes[key] = pOutcome       
        ## prob of getting to 5-5
        t = 10 - v - w
        if t % 2 == 0:
            sGames = t/2
            rGames = t/2
        else:
            sGames = (t-1)/2 + 1
            rGames = (t-1)/2
        f = setOutcome(1, sGames, rGames, v, g, h)
        if tb == 1:
            outcomes['75'] = f*g*h
            outcomes['57'] = f*(1-g)*(1-h)
            x = f*g*(1-h) + f*(1-g)*h ## p(getting to 6-6)    
            if (v+w) % 2 == 0:
                tp = tiebreakProb(s, u)
            else:
                tp = tiebreakProb(u, s)
            outcomes['76'] = x*tp
            outcomes['67'] = x - x*tp
        else:
            outcomes['75'] = f*deuceprob
            outcomes['57'] = f*(1-deuceprob)        
    win = 0
    for o in outcomes:
        if o in ['60', '61', '62', '63', '64', '75', '76']:
            win += outcomes[o]
        else:   pass
    return win

In [84]:
## calculates probability of winning a tennis match from any given score dependent on the skill levels
## of the two players
 
def fact(x):
    if x in [0, 1]:  return 1
    r = 1
    for a in range(1, int(x+1)):  r = r*a
    return r
 
def ch(a, b):
    return fact(a)/(fact(b)*fact(a-b))
 
def matchGeneral(e, v=0, w=0, s=3):
    ## calculates probability of winning the match
    ## from the beginning of a set
    ## e is p(winning a set)
    ## v and w is current set score
    ## s is total number of sets ("best of")
    towin = (s+1)/2
    left = towin - v
    if left == 0:   return 1
    remain = s - v - w
    if left > remain:   return 0
    win = 0
    for i in range(int(left), int(remain+1)):
        add = ch((i-1), (left-1))*(e**(left-1))*((1-e)**(i-left))*e
        win += add
    return win
 
def matchProb(s, t, gv=0, gw=0, sv=0, sw=0, mv=0, mw=0, sets=3):
    ## calculates probability of winning a match from any given score,
    ## given:
    ## s, t: p(server wins a service point), p(server wins return point)
    ## gv, gw: current score within the game. e.g. 30-15 is 2, 1
    ## sv, sw: current score within the set. e.g. 5, 4
    ## mv, mw: current score within the match (number of sets for each player)
    ## v's are serving player; w's are returning player
    ## sets: "best of", so default is best of 3
    a = gameProb(s)
    b = gameProb(t)
    c = setGeneral(s, t)
    if gv == 0 and gw == 0: ## no point score
        if sv == 0 and sw == 0: ## no game score
            return matchGeneral(c, v=mv, w=mw, s=sets)
        else:   ## we're in mid-set, no point score
            sWin = setGeneral(s, t, v=sv, w=sw)
            sLoss = 1 - sWin
    elif sv == 6 and sw == 6:         
        sWin = tiebreakProb(s, t, v=gv, w=gw)
        sLoss = 1 - sWin       
    else:
        gWin = gameProb(s, v=gv, w=gw)
        gLoss = 1 - gWin
        sWin = gWin*(1 - setGeneral((1-t), (1-s), v=sw, w=(sv+1)))
        sWin += gLoss*(1 - setGeneral((1-t), (1-s), v=(sw+1), w=sv))
        sLoss = 1 - sWin
    mWin = sWin*matchGeneral(c, v=(mv+1), w=mw, s=sets)
    mWin += sLoss*matchGeneral(c, v=mv, w=(mw+1), s=sets)
    return mWin

#print(matchProb(.678, .4145, 0, 1, 0,0,0,0,3))

In [132]:
"""Calculating error over the OLD and NEW datasets using a Markov Model using a point probability estimate"""

def getMarkovp(row):
    surf = row['surface']
    sfrac = serv_pr_tr_elo_surf[row['server']][surf][get_elo_bucket(row['elo_server'], row['elo_returner'])]
    if sfrac[1] == 0:
        sfrac = serv_pr_tr_surf[row['server']][surf]
    if sfrac[1] == 0:
        if surf == 'Grass':
            sfrac = (all_serv_wing, 1)
        elif surf == 'Hard':
            sfrac = (all_serv_winh, 1)
        else:
            sfrac = (all_serv_winc, 1)
    tfrac = serv_pr_tr_elo_surf[row['returner']][surf][get_elo_bucket(row['elo_server'], row['elo_returner'])]
    if tfrac[1] == 0:
        tfrac = serv_pr_tr_surf[row['returner']][surf]
    if tfrac[1] == 0:
        if surf == 'Grass':
            tfrac = (all_serv_wing, 1)
        elif surf == 'Hard':
            tfrac = (all_serv_winh, 1)
        else:
            tfrac = (all_serv_winc, 1)

    return matchProb(sfrac[0]/sfrac[1], 1-tfrac[0]/tfrac[1], row['game_score_s'], row['game_score_r'], row['set_score_s'], row['set_score_r'], row['match_score_s'], row['match_score_r'], row['bestof'])

df_mcp_oldgtr['serv_prob_markovp'] = df_mcp_oldgtr.apply(lambda row: getMarkovp(row), axis = 1)
df_mcp_oldgte['serv_prob_markovp'] = df_mcp_oldgte.apply(lambda row: getMarkovp(row), axis = 1)
df_mcp_oldhtr['serv_prob_markovp'] = df_mcp_oldhtr.apply(lambda row: getMarkovp(row), axis = 1)
df_mcp_oldhte['serv_prob_markovp'] = df_mcp_oldhte.apply(lambda row: getMarkovp(row), axis = 1)
df_mcp_oldctr['serv_prob_markovp'] = df_mcp_oldctr.apply(lambda row: getMarkovp(row), axis = 1)
df_mcp_oldcte['serv_prob_markovp'] = df_mcp_oldcte.apply(lambda row: getMarkovp(row), axis = 1)

df_mcp_newgtr['serv_prob_markovp'] = df_mcp_newgtr.apply(lambda row: getMarkovp(row), axis = 1)
df_mcp_newgte['serv_prob_markovp'] = df_mcp_newgte.apply(lambda row: getMarkovp(row), axis = 1)
df_mcp_newhtr['serv_prob_markovp'] = df_mcp_newhtr.apply(lambda row: getMarkovp(row), axis = 1)
df_mcp_newhte['serv_prob_markovp'] = df_mcp_newhte.apply(lambda row: getMarkovp(row), axis = 1)
df_mcp_newctr['serv_prob_markovp'] = df_mcp_newctr.apply(lambda row: getMarkovp(row), axis = 1)
df_mcp_newcte['serv_prob_markovp'] = df_mcp_newcte.apply(lambda row: getMarkovp(row), axis = 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [133]:
#serv_pr_tr_elo[df_mcp_oldgtr.iloc[0]['server']]
df_mcp_oldgtr['serv_prob_markovp1'] = df_mcp_oldgtr.apply(lambda row: row['serv_prob_markovp'] if row['current_serv_num1'] == 1 else 1-row['serv_prob_markovp'], axis = 1)
df_mcp_oldgte['serv_prob_markovp1'] = df_mcp_oldgte.apply(lambda row: row['serv_prob_markovp'] if row['current_serv_num1'] == 1 else 1-row['serv_prob_markovp'], axis = 1)
df_mcp_oldhtr['serv_prob_markovp1'] = df_mcp_oldhtr.apply(lambda row: row['serv_prob_markovp'] if row['current_serv_num1'] == 1 else 1-row['serv_prob_markovp'], axis = 1)
df_mcp_oldhte['serv_prob_markovp1'] = df_mcp_oldhte.apply(lambda row: row['serv_prob_markovp'] if row['current_serv_num1'] == 1 else 1-row['serv_prob_markovp'], axis = 1)
df_mcp_oldctr['serv_prob_markovp1'] = df_mcp_oldctr.apply(lambda row: row['serv_prob_markovp'] if row['current_serv_num1'] == 1 else 1-row['serv_prob_markovp'], axis = 1)
df_mcp_oldcte['serv_prob_markovp1'] = df_mcp_oldcte.apply(lambda row: row['serv_prob_markovp'] if row['current_serv_num1'] == 1 else 1-row['serv_prob_markovp'], axis = 1)

df_mcp_newgtr['serv_prob_markovp1'] = df_mcp_newgtr.apply(lambda row: row['serv_prob_markovp'] if row['current_serv_num1'] == 1 else 1-row['serv_prob_markovp'], axis = 1)
df_mcp_newgte['serv_prob_markovp1'] = df_mcp_newgte.apply(lambda row: row['serv_prob_markovp'] if row['current_serv_num1'] == 1 else 1-row['serv_prob_markovp'], axis = 1)
df_mcp_newhtr['serv_prob_markovp1'] = df_mcp_newhtr.apply(lambda row: row['serv_prob_markovp'] if row['current_serv_num1'] == 1 else 1-row['serv_prob_markovp'], axis = 1)
df_mcp_newhte['serv_prob_markovp1'] = df_mcp_newhte.apply(lambda row: row['serv_prob_markovp'] if row['current_serv_num1'] == 1 else 1-row['serv_prob_markovp'], axis = 1)
df_mcp_newctr['serv_prob_markovp1'] = df_mcp_newctr.apply(lambda row: row['serv_prob_markovp'] if row['current_serv_num1'] == 1 else 1-row['serv_prob_markovp'], axis = 1)
df_mcp_newcte['serv_prob_markovp1'] = df_mcp_newcte.apply(lambda row: row['serv_prob_markovp'] if row['current_serv_num1'] == 1 else 1-row['serv_prob_markovp'], axis = 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [134]:
def get_true_pred(df, comp_list):
    dfc = df.copy()[['breaks_p1', 'game_no', 'game_score_p1', 'game_score_p2', 'gamept_no','momentum', 'match_true_label_p1', 'serv_prob_markovp1'] + comp_list].dropna()
    dfc = dfc[dfc.apply(lambda x: x['serv_prob_markovp1'] <= 1 and x['serv_prob_markovp1'] >= 0, axis = 1)]
    return dfc['match_true_label_p1'], dfc['serv_prob_markovp1']

true1,pred1 = get_true_pred(df_mcp_oldgte, old_comp_list)
true2,pred2 = get_true_pred(df_mcp_newgte, new_comp_list)
print("error for all grass is %f" % float(log_loss(list(true1)+list(true2),list(pred1)+list(pred2),eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(list(pred1)+list(pred2)) != (list(true1)+list(true2))))

true1,pred1 = get_true_pred(df_mcp_oldhte, old_comp_list)
true2,pred2 = get_true_pred(df_mcp_newhte, new_comp_list)
print("error for all hard is %f" % float(log_loss(list(true1)+list(true2),list(pred1)+list(pred2),eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(list(pred1)+list(pred2)) != (list(true1)+list(true2))))

true1,pred1 = get_true_pred(df_mcp_oldcte, old_comp_list)
true2,pred2 = get_true_pred(df_mcp_newcte, new_comp_list)
print("error for all clay is %f" % float(log_loss(list(true1)+list(true2),list(pred1)+list(pred2),eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(list(pred1)+list(pred2)) != (list(true1)+list(true2))))


error for all grass is 0.857802
Misclassification error is 0.272154
error for all hard is 0.905029
Misclassification error is 0.371294
error for all clay is 0.821533
Misclassification error is 0.380202


In [None]:
old_comp_list = ['elo_p1', 'elo_p2', 'serve_pct_p1', 'serve_pct_p2', 'point_num', 'acesw_perc1', 'acesw_perc2', 'first_deep1', 'first_deep2', 'second_deep1', 'second_deep2', 'unfor_perc1', 'unfor_perc2', 'winner1', 'winner2', 'prev_acesw_perc1', 'prev_acesw_perc2', 'prev_first_deep1', 'prev_first_deep2', 'prev_second_deep1', 'prev_second_deep2', 'prev_winner1', 'prev_winner2', 'prev_unfor_perc1', 'prev_unfor_perc2', 'rec1', 'rec2', 'current_serv_num1', 'p1_ptwinsprob', 'p2_ptwinsprob', 'breaks_p1*game_no', 'gamept_marginp1*gamept_no', 'match_score_p1', 'match_score_p2']
df_ex = df_mcp_oldhte[df_mcp_oldhte['id'].apply(lambda x: x == '20190311-M-Indian_Wells_Masters-R64-Stanislas_Wawrinka-Marton_Fucsovics')]

Xtr,Ytr = createXY_match(df_mcp_oldhtr, old_comp_list)
Xtr_s, Ytr_s = createXY_match(swap(df_mcp_oldhtr), old_comp_list)
Xte,Yte = createXY_match(df_ex, old_comp_list)
Xtr = pd.concat([Xtr, Xtr_s])
Ytr = pd.concat([Ytr, Ytr_s])
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)

plt.rcParams['figure.figsize'] = [14, 10]
font = {'weight': 'normal','size'   : 12}

plt.rc('font', **font)
plt.plot(Xte['point_num'], pred)
plt.xlabel("Point number")
plt.ylabel("Probability of %s winning the match" % df_ex.iloc[0]['server'])
plt.title("%s vs %s at 2019 Indian Wells R64" % (df_ex.iloc[0]['server'], df_ex.iloc[0]['returner']))
plt.axvline(260, color='r', linestyle='dashed')
plt.text(255,.95,'Fucsovics breaks back', rotation='vertical')
plt.axvline(75, color='r', linestyle='dashed')
plt.text(76, .80, 'Wawrinka breaks to clinch first set 6-4', rotation='vertical')
plt.axvline(51, color='r', linestyle='dashed')
plt.text(52, .95, 'Fucsovics breaks to lead 4-3', rotation='vertical')
plt.axvline(174, color='r', linestyle='dashed')
plt.text(169, .77, 'Fucsovics wins tiebreak to tie set score at 1-1', rotation='vertical')
plt.axvline(183, color='r', linestyle='dashed')
plt.text(184, .75, 'Wawrinka breaks Fucsovics first game', rotation='vertical')
plt.axvline(265, color='r', linestyle='dashed')
plt.text(266,.77,'Wawrinka breaks to take 6-5 lead', rotation='vertical')

In [None]:
"""Old regression from PBP style but on this dataset"""
comp_listpbp = ['serve_pct_p1', 'return_win_p1', 'elo_p1', 'elo_p2', 'momentum']

def createXpbp(dfc, comp_list):
    # returner record, server record
    #return dfc[['breaks_p1', 'game_no', 'gamept_marginp1', 'gamept_no'] + comp_list]
    return dfc[['breaks_p1', 'game_no','game_score_p1', 'game_score_p2', 'match_score_p1', 'match_score_p2'] + comp_list]

    
# NEED TO CHANGE SUCH THAT THE PERSON SERVING IS ALWAYS P1
def createXY_matchpbp(df, comp_list):
    df['return_win_p1'] = 1- df['serve_pct_p2']
    df['return_win_p2'] = 1- df['serve_pct_p1']
    df_toswap = df[df.apply(lambda x: x['current_serv_num1'] == 0, axis = 1)]
    df_okay = df[df.apply(lambda x: x['current_serv_num1'] == 1, axis = 1)]
    df = pd.concat([df_okay, swap(df_toswap)])
    #dfc = df.copy()[['breaks_p1', 'game_no', 'gamept_marginp1', 'gamept_no','pt_winner1'] + comp_list].dropna()
    dfc = df.copy()[['breaks_p1', 'game_no', 'game_score_p1', 'game_score_p2', 'match_score_p1', 'match_score_p2', 'match_true_label_p1'] + comp_list].dropna()
    X = createXpbp(dfc, comp_list)
    Y = dfc['match_true_label_p1'].copy()
    
    return X,Y

In [None]:
y_true_l,y_pred_l = [],[]
y_true_l2,y_pred_l2 = [],[]
y_true_l3,y_pred_l3 = [],[]
mis_1, tot_1 = 0,0
mis_2, tot_2 = 0,0
mis_3, tot_3 = 0,0

Xtr,Ytr = createXY_matchpbp(df_mcp_newgtr, comp_listpbp)
Xte,Yte = createXY_matchpbp(df_mcp_newgte, comp_listpbp)
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for new grass is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(pred) != Yte))
y_true_l.extend(list(true))
y_pred_l.extend(list(pred))
mis_1 += sum(np.round(pred) != Yte)
tot_1 += len(Yte)

Xtr,Ytr = createXY_matchpbp(df_mcp_oldgtr, comp_listpbp)
Xte,Yte = createXY_matchpbp(df_mcp_oldgte, comp_listpbp)
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for old grass is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(pred) != Yte))
y_true_l.extend(list(true))
y_pred_l.extend(list(pred))
mis_1 += sum(np.round(pred) != Yte)
tot_1 += len(Yte)
print("Total grass log error is " + str(log_loss(y_true_l,y_pred_l,eps=1e-15)))
print('Misclassification error for all grass is %f' % float(mis_1/tot_1))

Xtr,Ytr = createXY_matchpbp(df_mcp_newhtr, comp_listpbp)
Xte,Yte = createXY_matchpbp(df_mcp_newhte, comp_listpbp)
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for new hard is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(pred) != Yte))
y_true_l2.extend(list(true))
y_pred_l2.extend(list(pred))
mis_2 += sum(np.round(pred) != Yte)
tot_2 += len(Yte)

Xtr,Ytr = createXY_matchpbp(df_mcp_oldhtr, comp_listpbp)
Xte,Yte = createXY_matchpbp(df_mcp_oldhte, comp_listpbp)
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for old hard is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(pred) != Yte))
y_true_l2.extend(list(true))
y_pred_l2.extend(list(pred))
mis_2 += sum(np.round(pred) != Yte)
tot_2 += len(Yte)
print("Total hard log error is " + str(log_loss(y_true_l2,y_pred_l2,eps=1e-15)))
print('Misclassification error for all hard is %f' % float(mis_2/tot_2))


Xtr,Ytr = createXY_matchpbp(df_mcp_newctr, comp_listpbp)
Xte,Yte = createXY_matchpbp(df_mcp_newcte, comp_listpbp)
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for new clay is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(pred) != Yte))
y_true_l3.extend(list(true))
y_pred_l3.extend(list(pred))
mis_3 += sum(np.round(pred) != Yte)
tot_3 += len(Yte)


Xtr,Ytr = createXY_matchpbp(df_mcp_oldctr, comp_listpbp)
Xte,Yte = createXY_matchpbp(df_mcp_oldcte, comp_listpbp)
true,pred,est = log_predict(Xtr, Ytr, Xte, Yte)
print("error for old clay is %f" % float(log_loss(true,pred,eps=1e-15)))
print("Misclassification error is %f" % np.mean(np.round(pred) != Yte))
y_true_l3.extend(list(true))
y_pred_l3.extend(list(pred))
print("Total grass clay error is " + str(log_loss(y_true_l3,y_pred_l3,eps=1e-15)))
mis_3 += sum(np.round(pred) != Yte)
tot_3 += len(Yte)
print('Misclassification error for all clay is %f' % float(mis_3/tot_3))


y_true_tot = y_true_l + y_true_l2 + y_true_l3
y_pred_tot = y_pred_l + y_pred_l2 + y_pred_l3
print("Total log error is " + str(log_loss(y_true_tot,y_pred_tot,eps=1e-15)))
print("Misclassification error for total is %f" % float((mis_1 + mis_2+mis_3)/(tot_1+tot_2+tot_3)))