In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

pd.options.display.max_columns=150
pd.options.display.max_seq_items=150
pd.options.display.max_rows=150

## This is lightly processed, sampled version of a historical tennis match dataset that can be found at http://tennis-data.co.uk/data.php

In [2]:
df = pd.read_csv('mini_tennis.csv')
print(df.shape)
df.head()

(63790, 52)


Unnamed: 0,Location1,Tournament,Date,Court,Surface,Round,Best of,Player1,Player2,P1Rank,P2Rank,P1Pts,P2Pts,P1_1,P2_1,P1_2,P2_2,P1_3,P2_3,P1_4,P2_4,P1_5,P2_5,P1_sets,P2_sets,B365_P1,B365_P2,PS_P1,PS_P2,year,month,set_win_pct,P1_games,P2_games,games_count,game_win_pct,p1_id,p2_id,location_id,month_id,round_id,indoor,outdoor,grass,clay,hard,carpet,bestof3,bestof5,match_last_night,games_last_night,match_win
0,"doha, qatar",Qatar Exxon Mobil Open,2007-01-01,Outdoor,Hard,1st Round,3,Martin A.,Rochus C.,61.0,78.0,641.0,504.0,4.0,6.0,5.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.9,1.8,2.03,1.885,2007,1,0.0,9.0,13.0,22.0,0.409091,579,762,33,0,0,0,1,0,0,1,0,1,0,0,0.0,0
1,"chennai, india",Chennai Open,2007-01-01,Outdoor,Hard,1st Round,3,Benneteau J.,Gabashvili T.,40.0,114.0,877.0,385.0,6.0,3.0,6.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.53,2.37,1.585,2.61,2007,1,1.0,12.0,7.0,19.0,0.631579,79,284,31,0,0,0,1,0,0,1,0,1,0,0,0.0,1
2,"Adelaide, australia",Next Generation Adelaide International,2007-01-01,Outdoor,Hard,Round Robin,3,Guccione C.,Becker B.,153.0,58.0,287.0,686.0,7.0,5.0,6.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.37,1.53,2.56,1.602,2007,1,1.0,13.0,8.0,21.0,0.619048,351,68,1,0,5,0,1,0,0,1,0,1,0,0,0.0,1
3,"Adelaide, australia",Next Generation Adelaide International,2007-01-01,Outdoor,Hard,Round Robin,3,Hrbaty D.,Luczak P.,21.0,166.0,1238.0,245.0,6.0,7.0,3.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.36,2.87,1.437,3.09,2007,1,0.0,9.0,13.0,22.0,0.409091,394,552,1,0,5,0,1,0,0,1,0,1,0,0,0.0,0
4,"Adelaide, australia",Next Generation Adelaide International,2007-01-01,Outdoor,Hard,Round Robin,3,Johansson J.,Serra F.,193.0,60.0,202.0,665.0,6.0,2.0,3.0,6.0,4.0,6.0,0.0,0.0,0.0,0.0,1.0,2.0,1.33,3.0,1.472,2.92,2007,1,0.333333,13.0,14.0,27.0,0.481481,421,823,1,0,5,0,1,0,0,1,0,1,0,0,0.0,0


## We notice right away that there is not a ton of match-play statistics. We could go find a secondary source, but is there a way to determine player attributes from game results? The idea is to use player embeddings as a collaborative filtering technique. I add a twist to make the embedding conditional on certain features below but the idea should be fairly straightforward.

In [3]:
df = df[['Location1', 'Tournament', 'Date', 'Court', 'Surface', 'Round',
       'Best of', 'Player1', 'Player2', 'P1Rank', 'P2Rank', 'P1Pts', 'P2Pts',
       'P1_1', 'P2_1', 'P1_2', 'P2_2', 'P1_3', 'P2_3', 'P1_4', 'P2_4', 'P1_5',
       'P2_5', 'P1_sets', 'P2_sets','year', 'month', 'set_win_pct', 'P1_games',
       'P2_games', 'games_count','game_win_pct', 'p1_id', 'p2_id', 'location_id', 
       'month_id', 'round_id','indoor', 'outdoor', 'grass', 'clay', 'hard', 'carpet', 
       'bestof3','bestof5', 'match_last_night', 'games_last_night', 'match_win']]

print(df.shape)
df.dropna(inplace=True)
print(df.shape)
le = LabelEncoder()
df['p1_id'] = le.fit_transform(df['Player1'])
df['p2_id'] = le.transform(df['Player2'])

df['Date'] = pd.to_datetime(df['Date'])

df = df[(df['P1Rank']<=100)]  ###Only matchups involving players who rank in the top 100 in the ATP are included.
df = df[(df['P2Rank']<=100)]

print(df.shape)

df['P1Rank'] = df['P1Rank'].astype(float)
df['P2Rank'] = df['P2Rank'].astype(float)

df.reset_index(drop=True,inplace=True)

(63790, 48)
(63364, 48)
(41215, 48)


In [4]:
df.columns

Index(['Location1', 'Tournament', 'Date', 'Court', 'Surface', 'Round',
       'Best of', 'Player1', 'Player2', 'P1Rank', 'P2Rank', 'P1Pts', 'P2Pts',
       'P1_1', 'P2_1', 'P1_2', 'P2_2', 'P1_3', 'P2_3', 'P1_4', 'P2_4', 'P1_5',
       'P2_5', 'P1_sets', 'P2_sets', 'year', 'month', 'set_win_pct',
       'P1_games', 'P2_games', 'games_count', 'game_win_pct', 'p1_id', 'p2_id',
       'location_id', 'month_id', 'round_id', 'indoor', 'outdoor', 'grass',
       'clay', 'hard', 'carpet', 'bestof3', 'bestof5', 'match_last_night',
       'games_last_night', 'match_win'],
      dtype='object')

In [5]:
df_train = df[(df['Date']<pd.to_datetime('2018-01-01'))&
              (df['Date']>pd.to_datetime('2016-01-01'))]
df_train.reset_index(inplace=True)

df_test = df[df['Date']>=pd.to_datetime('2018-01-01')]
df_test.reset_index(inplace=True)

## The functions below are a slightly altered version of the backpropagation algorithm. I modified an old version of backprop in pure numpy that I wrote last year

In [6]:
#First we will initialize our variables.

def initialize_weights_glorot(length,width,depth=1):
    '''
    Please be aware that this same function will be used to initialize biases.
    I am not at all sure this makes sense given the variance argument in paper.
    '''
    std_glo = np.sqrt(2.0/(length+width))
    weights = np.random.normal(loc=0.0,scale=std_glo,size=(depth,length,width))
    return weights

def initialize_weights_zeros(length,width):
    weights = np.zeros(shape=(length,width))
    return weights


def initialize_weights_glorot2D(length,width):
    '''
    Please be aware that this same function will be used to initialize biases.
    I am not at all sure this makes sense given the variance argument in paper.
    '''
    std_glo = np.sqrt(2.0/(length+width))
    weights = np.random.normal(loc=0.0,scale=std_glo,size=(length,width))
    return weights

In [7]:
def get_relu(z):
    return np.where(z<=0,0.0,z)

def feed_forward(weights,weights_1,weights_2,biases,biases_1,biases_2,inputs):
    z = inputs.dot(weights)+biases
    relu_act = get_relu(z)
    z_1 = relu_act.dot(weights_1)+biases_1
    relu_act_1 = get_relu(z_1)
    z_2 = relu_act_1.dot(weights_2)+biases_2
    #soft_preds = get_softmax(z_2)
    return z_2,relu_act,relu_act_1

#Get the derivative functions for updates
def derivatives_weights_final(labels,preds,relu_act):
    dL_dz = 2*(preds - labels)
    dL_w = relu_act.T.dot(dL_dz)
    return dL_w
    
def derivatives_biases_final(labels,preds):
    dL_db = 2*(preds - labels)
    return dL_db
    
def relu_diff(relu_act):
    return np.where(relu_act<=0,0.0,1.0)

def derivatives_weights_1(labels,preds,weights_2,relu_act_1,relu_act):
    dL_dz = 2*(preds - labels)
    dL_dA = dL_dz.dot(weights_2.T)
    dL_dz1 = dL_dA*relu_diff(relu_act_1)
    dL_dw1 = relu_act.T.dot(dL_dz1)
    return dL_dw1

def derivatives_biases_1(labels,preds,weights_2,relu_act_1):
    dL_dz = 2*(preds - labels)
    dL_dA = dL_dz.dot(weights_2.T)
    dL_dz1 = dL_dA*relu_diff(relu_act_1)
    dL_db = dL_dz1
    return dL_db

def derivatives_weights_2(labels,preds,weights_2,relu_act_1,relu_act,weights_1,inputs):
    dL_dz2 = 2*(preds - labels)
    dL_dA1 = dL_dz2.dot(weights_2.T)
    dL_dz1 = dL_dA1*relu_diff(relu_act_1)
    dL_dA = dL_dz1.dot(weights_1.T)
    dL_dz = dL_dA*relu_diff(relu_act)
    dL_dw = inputs.T.dot(dL_dz)
    return dL_dw
    
def derivatives_biases_2(labels,preds,weights_2,relu_act_1,relu_act,weights_1):
    dL_dz2 = 2*(preds - labels)
    dL_dA1 = dL_dz2.dot(weights_2.T)
    dL_dz1 = dL_dA1*relu_diff(relu_act_1)
    dL_dA = dL_dz1.dot(weights_1.T)
    dL_dz = dL_dA*relu_diff(relu_act)
    dL_db = dL_dz
    return dL_db

def get_updates(labels,preds,weights_2,relu_act_1,relu_act,weights_1,inputs):
    dL_w = derivatives_weights_2(labels,preds,weights_2,relu_act_1,relu_act,weights_1,inputs)
    dL_w1 = derivatives_weights_1(labels,preds,weights_2,relu_act_1,relu_act)
    dL_w2 = derivatives_weights_final(labels,preds,relu_act_1)
    dL_b = derivatives_biases_2(labels,preds,weights_2,relu_act_1,relu_act,weights_1)
    dL_b1 = derivatives_biases_1(labels,preds,weights_2,relu_act_1)
    dL_b2 = derivatives_biases_final(labels,preds)
    return dL_w,dL_w1,dL_w2,dL_b,dL_b1,dL_b2



def get_loss(labels,preds):
    return np.mean((labels-preds)**2)

## The variables we are conditioning our embeddings on (and the target set_win_pct which we are not conditioning on):

In [8]:
train_conditionals = df_train[['p1_id','set_win_pct','outdoor', 'grass', 'clay', 'hard', 'games_last_night']]

In [9]:
dict_length = np.max(df['p1_id'].unique())+1

In [10]:
biases = initialize_weights_zeros(dict_length,25)
#biases = biases.reshape(32)
weights = initialize_weights_glorot(5,25,dict_length)
biases_1 = initialize_weights_zeros(100,1)
biases_1 = biases_1.reshape(100)
weights_1 = initialize_weights_glorot2D(25,100)
biases_2 = initialize_weights_zeros(1,1)
biases_2 = biases_2.reshape(1)
weights_2 = initialize_weights_glorot2D(100,1)


In [11]:
input_embedding = train_conditionals[df_train['p1_id']==5]
input_embedding.head()

Unnamed: 0,p1_id,set_win_pct,outdoor,grass,clay,hard,games_last_night


In [12]:
label_embedding = train_conditionals['set_win_pct']
label_embedding[0:5]

0    1.000000
1    0.000000
2    1.000000
3    0.333333
4    0.000000
Name: set_win_pct, dtype: float64

## Notice that we have created a conditional embedding for just one player per matchup. So the first hidden layer should represent the characteristics of this player when playing on a given court type and having player last night or not. We could condition on many other features; this is just an example.

In [13]:
def conditional_embedding_very_specific(train_conditionals,weights,biases,weights_1,biases_1,
                                    weights_2,biases_2,learning_rate=0.0005):
    
    weights_tracker = np.zeros((weights_1.shape))
    bias_tracker = np.zeros((biases_1.shape))
    
    LR = learning_rate
    
    dict_list = train_conditionals['p1_id'].unique()
    
    for i in (dict_list):
        #print(i)
        input_embedding = train_conditionals[train_conditionals['p1_id']==i]
        label_embedding = np.array(input_embedding['set_win_pct'])
        label_embedding = label_embedding.reshape((-1,1))
        
        input_embedding.drop(columns=['p1_id','set_win_pct'],inplace=True)
        
        weight_embedding = weights[i,:,:]
        biases_embedding = biases[i,:]
        
        preds,relu_act,relu_act_1 = feed_forward(weight_embedding,weights_1,weights_2,biases_embedding,
                                                 biases_1,biases_2,input_embedding)
        preds = preds.reshape((-1,1))
        
        
        dL_w,dL_w1,dL_w2,dL_b,dL_b1,dL_b2 = get_updates(label_embedding,preds,weights_2,relu_act_1,relu_act,
                                                        weights_1,input_embedding)

        
        weights[i,:,:] -= LR*dL_w
        biases[i,:] -= LR*np.sum(dL_b,axis=0)    
        weights_1 -= LR*dL_w1/float(input_embedding.shape[0])
        biases_1 -= LR*np.sum(dL_b1,axis=0)/float(input_embedding.shape[0])
        weights_2 -= LR*dL_w2/float(input_embedding.shape[0])
        biases_2 -= LR*np.sum(dL_b2,axis=0)/float(input_embedding.shape[0])
        
        loss_calc = get_loss(label_embedding,preds)
    

In [14]:
for i in range(50):
    conditional_embedding_very_specific(train_conditionals,weights,biases,weights_1,biases_1,weights_2,biases_2)
    
    
    if i%10==0:
        predict_list = []
    
        for idx,item in enumerate(df_test['Date']):
            
            input_embedding = df_test.loc[idx]
            
            weight_emb = weights[input_embedding['p1_id'],:,:]
            bias_emb = biases[input_embedding['p1_id'],:]
            
            input_embedding = input_embedding[['outdoor', 'grass', 'clay', 'hard', 'games_last_night']]
            
            set_pct,_,_ = feed_forward(weight_emb,weights_1,weights_2,bias_emb,biases_1,biases_2,input_embedding)
            
            predict_list.append(set_pct[0])
            
        df_test['pred_set_win'] = predict_list
            
        print(get_loss(df_test['set_win_pct'],df_test['pred_set_win']))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


1.6176564179123343
0.19719817018084315
0.1886608578230616
0.18723561490152382
0.18668765296459403


In [15]:
df_test.to_csv('testset.csv',index=False)

## It is worth noticing that a simple concatenation and some rearranging of matrix shapes is all that is necessary to train an embedding for both players in a matchup simultaneously. The conditional_embedding_very_specific function will have to be changed to gather derivatives observation by observation, but this should be an easy fix for those motivated. These changes will create a CF example in the product recommendation vein and analogies to Ryan Adam's https://arxiv.org/abs/1003.4944 become relevant.