In [1]:
import numpy as np
import pandas as pd
import os
import eli5
from eli5.sklearn import PermutationImportance
import re
import datetime
import itertools

pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

Using TensorFlow backend.


In [2]:
def data_cleaning(df, 
                  tourneys_to_include = ['Grand Slams', 'Masters', '250 or 500', 'Tour Finals', 'Davis Cup'], 
                  start_year=2000 ):
    
    #Renaming columns
    new_cols = [
    'tourney_id', 'tourney_name', 'surface', 'draw_size',
       'tourney_level', 'match_num', 'winner_id', 'winner_seed',
       'winner_entry', 'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc',
       'winner_age', 'winner_rank', 'winner_rank_points', 'loser_id',
       'loser_seed', 'loser_entry', 'loser_name', 'loser_hand', 'loser_ht',
       'loser_ioc', 'loser_age', 'loser_rank', 'loser_rank_points', 'score',
       'best_of', 'round', 'minutes', 'winner_ace', 'winner_df', 'winner_svpt', 'winner_1stIn',
       'winner_1stWon', 'winner_2ndWon', 'winner_SvGms', 'winner_bpSaved', 'winner_bpFaced', 'loser_ace',
       'loser_df', 'loser_svpt', 'loser_1stIn', 'loser_1stWon', 'loser_2ndWon', 'loser_SvGms',
       'loser_bpSaved', 'loser_bpFaced', 'W1', 'W2', 'W3', 'W4', 'W5', 'L1', 'L2',
       'L3', 'L4', 'L5', 'retirement', 'WTB1', 'LTB1', 'WTB2', 'LTB2', 'WTB3',
       'LTB3', 'WTB4', 'LTB4', 'WTB5', 'LTB5', 'tourney_start_date', 'year',
       'match_id'
    ]
    
    df.columns = new_cols
    
    #You can change what matches to include. I've chosen to exclude Futures matches and the Challenger tour
    # tourney_levels = 'Grand Slams', '250 or 500', 'Davis Cup', 'Masters', 'Challenger', 'Tour Finals', 'Futures'
    df = df[(df['tourney_level'].isin(tourneys_to_include)) &\
            (df['year'] >= start_year) & (df['surface'] == 'Hard')&\
            (~df['round'].isin(['Q1', 'Q2', 'Q3', 'Q4']))
           ]

    #Converting dates to datetime
    df.loc[:,'tourney_start_date'] = pd.to_datetime(df['tourney_start_date'])
    df.loc[:,'year'] = pd.to_datetime(df['year'])
    
    #Parsing scores
    scores = df.loc[:,'score'].str.split(' ')
    scores = scores.fillna(0)     
    loser_total_games = []
    winner_total_games = []
    
    for index, value in scores.items():
        loser_game_score = 0
        winner_game_score = 0
        try:
            if value == 0 or value == ['W/O']:            
                loser_total_games.append(loser_game_score)
                winner_total_games.append(winner_game_score)

            else:
                loser_game_score = 0
                winner_game_score = 0

                for set_ in value:                
                    try:
                        text = re.match(r"(\d)\-(\d)",set_ )
                        loser_game_score += int(text.group(2))
                        winner_game_score += int(text.group(1))
                    except:
                        pass
                loser_total_games.append(loser_game_score)
                winner_total_games.append(winner_game_score)
        except:
            print(index, value)

    df.loc[:,'winner_total_games'] = winner_total_games
    df.loc[:,'loser_total_games'] = loser_total_games
    df.loc[:,'total_games'] = df['winner_total_games'] + df['loser_total_games']
    df.loc[:,'loser_RtGms'] = df['winner_SvGms']
    df.loc[:,'winner_RtGms'] = df['loser_SvGms']
    df.loc[:,'loser_bp'] = df['winner_bpFaced']
    df.loc[:,'winner_bp'] = df['loser_bpFaced']


    df.loc[:,'loser_bpWon'] = df['winner_bpFaced'] - df['winner_bpSaved'] 
    df.loc[:,'winner_bpWon'] = df['loser_bpFaced'] - df['loser_bpSaved'] 
    
    #Imputing returns data so we can construct features
    df.loc[:,'winner_2ndIn'] = df['winner_svpt'] - df['winner_1stIn'] - df['winner_df']
    df.loc[:,'loser_2ndIn'] = df['loser_svpt'] - df['loser_1stIn'] - df['loser_df']
    df.loc[:,'loser_rtpt'] = df['winner_svpt']
    df.loc[:,'winner_rtpt'] = df['loser_svpt']
    df.loc[:,'winner_rtptWon'] = df['loser_svpt'] -  df['loser_1stWon'] - df['loser_2ndWon']
    df.loc[:,'loser_rtptWon'] = df['winner_svpt'] -  df['winner_1stWon'] - df['winner_2ndWon']
    df.loc[:,'winner_svptWon'] = df['winner_1stWon'] + df['winner_2ndWon']
    df.loc[:,'loser_svptWon'] = df['loser_1stWon'] + df['loser_2ndWon']
    df.loc[:,'winner_total_points'] = df['winner_svptWon'] + df['winner_rtptWon']
    df.loc[:,'loser_total_points'] = df['loser_svptWon'] + df['loser_rtptWon']
    df.loc[:,'total_points'] = df['winner_total_points'] + df['loser_total_points']
    
    #Dropping columns
    cols_to_drop =[
        'draw_size',
        'winner_seed',
        'winner_entry',
        'loser_seed',
        'loser_entry',
        'score',
        'W1', 'W2', 'W3', 'W4', 'W5', 'L1', 'L2',
        'L3', 'L4', 'L5', 'WTB1', 'LTB1', 'WTB2', 'LTB2', 'WTB3',
        'LTB3', 'WTB4', 'LTB4', 'WTB5', 'LTB5'
        ]
    
    df.drop(cols_to_drop, axis=1, inplace=True)
    
    #Filling nans values
    df.loc[:,'loser_rank'] = df['loser_rank'].fillna(500)
    df.loc[:,'winner_rank'] = df['winner_rank'].fillna(500)
    df = df.fillna(df.mean())
    
    return(df)

def convert_long(df):
    
    #Separating features into winner and loser so we can create rolling averages for each major tournament
    winner_cols = [col for col in df.columns if col.startswith('w')]
    loser_cols = [col for col in df.columns if col.startswith('l')]
    common_cols = [
        'tourney_id', 'tourney_name', 'surface', 'tourney_level',
       'match_num','best_of', 'round',
       'minutes','retirement', 'tourney_start_date', 'year', 'match_id',
        'total_points', 'total_games'
    ]
    
    #Will also add opponent's rank
    df_winner = df[winner_cols + common_cols + ['loser_rank']]
    df_loser = df[loser_cols + common_cols + ['winner_rank']]
    
    df_winner['won'] = 1
    df_loser['won'] = 0
    
    #Renaming columns
    df_winner.columns = [col.replace('winner','player').replace('loser', 'opponent') for col in df_winner.columns]
    df_loser.columns = df_winner.columns
    
    df_long = df_winner.append(df_loser, ignore_index=True)
    
    return(df_long)

def get_new_features(df):
    
    #Creating new features we can play around with, note that not all features may be used
    df.loc[:,'player_serve_win_ratio'] = (df['player_1stWon'] + df['player_2ndWon'])/\
    (df['player_1stIn'] + df['player_2ndIn'] + df['player_df'] )
    
    df.loc[:,'player_return_win_ratio'] = df['player_rtptWon']/df['player_rtpt']
    
    df.loc[:,'player_bp_per_game'] = df['player_bp']/df['player_RtGms']
    
    df.loc[:,'player_bp_conversion_ratio'] = df['player_bpWon']/df['player_bp']
    
    #Setting nans to zero for breakpoint conversion ratio
    df.loc[:,'player_bp_conversion_ratio'].fillna(0, inplace=True)
    
    df.loc[:,'player_game_win_ratio'] = df['player_total_games']/df['total_games']
    
    df.loc[:,'player_point_win_ratio'] = df['player_total_points']/df['total_points']
    
    #df['player_set_Win_Ratio'] = df['Player_Sets_Won']/df['Total_Sets']
    
    df.loc[:,'player_clutch_factor'] = df['player_game_win_ratio'] - df['player_point_win_ratio']
    
    df.loc[:,'player_log_rank'] = np.log(df['player_rank'])
    
    df.loc[:,'player_win_weight'] = df['won'] * np.exp(-df['opponent_rank']/100)

    #Let's try weighting some of the features by the opponent's rank
    
    #df['Player_Set_Win_Ratio_Weighted'] = df['Player_Set_Win_Ratio']*np.exp((df['Player_Rank']-df['Opponent_Rank'])/500)
    df.loc[:,'player_game_win_ratio_weighted'] = df['player_game_win_ratio']*np.exp((df['player_rank']-df['opponent_rank'])/500)
    df.loc[:,'player_point_win_ratio_weighted'] = df['player_point_win_ratio']*np.exp((df['player_rank']-df['opponent_rank'])/500)
    
    return(df)

def get_rolling_features(df, date_df, rolling_cols, last_cols, window):
    
    #This code is basically copied straight from Qile Tan's notebook
    
    df = df.sort_values(['player_name', 'tourney_name', 'tourney_start_date'], ascending=True)
    
    for index, tournament_date in enumerate(date_df.tourney_start_date):
        print(index, tournament_date)
        
        #Subsetting to tournaments at most 1 year before tournament date to reduce computation time
        df_temp = df.loc[(df['tourney_start_date']< tournament_date) & (df['tourney_start_date'] > tournament_date - datetime.timedelta(days=365))]

        #Only taking the most recent value for the feature, if specified in last_cols
        if last_cols != None:
            df_temp_last = df_temp.groupby('player_name')[last_cols].last().reset_index()

        #Taking a rolling average of the x (window_length) most recent matches before specified tournament date,
        #for features specified in rolling_cols
        df_temp = df_temp.groupby('player_name')[rolling_cols].rolling(window,1).mean().reset_index()

        #Only taking the most recent rolling average
        df_temp = df_temp.groupby('player_name').tail(1)

        df_temp = df_temp.merge(df_temp_last, on = 'player_name', how='left')

        #Adding a column telling us what tournament the rolling average is for
        if index == 0:
            df_result = df_temp
            df_result['tournament_date_index'] = tournament_date

        else:
            df_temp['tournament_date_index'] = tournament_date
            df_result = df_result.append(df_temp)
        
    
    df_result.drop('level_1', axis=1, inplace=True)
    
    return(df_result)

def merge_data(df, df_rolling_atp):
    
    df_atp = df.copy()
    #Subsetting match data to Grand Slams and Masters
    df_atp = df_atp.loc[df_atp['tourney_level'].isin(['Grand Slams', 'Masters'])]

    #Removing unnecessary columns from match data
    cols_to_keep = ['winner_name','loser_name','tourney_name','tourney_start_date', 'tourney_level']

    df_atp = df_atp[cols_to_keep]
    df1 = df_atp.copy()
    df1.columns = ['player_1','player_2','tourney_name','tourney_start_date', 'tourney_level']
    df1['player_1_win'] = 1

    df2 = df_atp.copy()
    df2.columns = ['player_2','player_1','tourney_name','tourney_start_date', 'tourney_level']
    df2['player_1_win'] = 0

    df_atp = pd.concat([df1, df2], sort=False)
    df_atp.reset_index(drop=True, inplace=True)
    

    #Joining rolling features for p1 with match data
    df_atp = df_atp.merge(df_rolling_atp, how='left',
                         left_on = ['player_1', 'tourney_start_date'],
                         right_on = ['player_name', 'tournament_date_index'],
                         validate = 'm:1')


    df_atp = df_atp.merge(df_rolling_atp, how='left',
                         left_on = ['player_2', 'tourney_start_date'],
                         right_on = ['player_name', 'tournament_date_index'],
                         validate = 'm:1',
                         suffixes=('_p1', '_p2'))
    
    return(df_atp)

def get_player_difference(df, diff_cols = None):
    
    p1_cols = [i + '_p1' for i in diff_cols] # column names for player 1 stats
    p2_cols = [i + '_p2' for i in diff_cols] # column names for player 2 stats


    # Filling missing values
    df['player_rank_p1'] = df['player_rank_p1'].fillna(500)
    df['player_log_rank_p1'] = df['player_log_rank_p1'].fillna(np.log(500))
    df[p1_cols] = df[p1_cols].fillna(-1)
    
    df['player_rank_p2'] = df['player_rank_p2'].fillna(500)
    df['player_log_rank_p2'] = df['player_log_rank_p2'].fillna(np.log(500))
    df[p2_cols] = df[p2_cols].fillna(-1)

    
    new_column_name = [i + '_diff' for i in diff_cols]

    # Take the difference
    df_p1 = df[p1_cols]
    df_p2 = df[p2_cols]
    
    df_p1.columns=new_column_name
    df_p2.columns=new_column_name
    
    df_diff = df_p1 - df_p2
    df_diff.columns = new_column_name
    
    #Dropping spare columns
    df.drop(p1_cols + p2_cols, axis=1, inplace=True)
    
    # Concat the df_diff and raw_df
    df = pd.concat([df, df_diff], axis=1)
    
    return(df)

In [3]:
deuce_atp = pd.read_csv('Name.csv')
deuce_atp = data_cleaning(deuce_atp, ['Grand Slams', '250 or 500', 'Davis Cup', 'Masters', 'Challenger', 'Tour Finals'])
deuce_atp_long = convert_long(deuce_atp)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
  df = df.fillna(df.mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [4]:
deuce_atp_long = get_new_features(deuce_atp_long)

In [5]:
# These are the tournaments we want to get the rolling average of features for, they will then be used for training
roll_dates = deuce_atp.loc[deuce_atp['tourney_level'].isin(['Grand Slams'])].groupby(['tourney_name', 'tourney_start_date'])\
.size().reset_index()[['tourney_name', 'tourney_start_date']]

# We also want to aggregate matches just before the 2020 AO
roll_dates.loc[-1] = ['Australian Open', pd.to_datetime('2020-01-20')]

last_cols = ['player_rank', 'player_log_rank']
rolling_cols = [
    'player_serve_win_ratio', 'player_return_win_ratio',
    'player_bp_per_game', 'player_bp_conversion_ratio',
    'player_game_win_ratio', 'player_point_win_ratio',
    'player_clutch_factor', 'player_win_weight',
    'player_game_win_ratio_weighted', 'player_point_win_ratio_weighted'
]

rolling_features = get_rolling_features(deuce_atp_long, roll_dates, rolling_cols, last_cols, 21  )

0 2000-01-17 00:00:00
1 2001-01-15 00:00:00
2 2002-01-14 00:00:00
3 2003-01-13 00:00:00
4 2004-01-19 00:00:00
5 2005-01-17 00:00:00
6 2006-01-16 00:00:00
7 2007-01-15 00:00:00
8 2008-01-14 00:00:00
9 2009-01-19 00:00:00
10 2010-01-18 00:00:00
11 2011-01-17 00:00:00
12 2012-01-16 00:00:00
13 2013-01-14 00:00:00
14 2014-01-13 00:00:00
15 2015-01-19 00:00:00
16 2016-01-18 00:00:00
17 2017-01-16 00:00:00
18 2018-01-15 00:00:00
19 2019-01-14 00:00:00
20 2000-08-28 00:00:00
21 2001-08-27 00:00:00
22 2002-08-26 00:00:00
23 2003-08-25 00:00:00
24 2004-08-30 00:00:00
25 2005-08-29 00:00:00
26 2006-08-28 00:00:00
27 2007-08-27 00:00:00
28 2008-08-25 00:00:00
29 2009-08-31 00:00:00
30 2010-08-30 00:00:00
31 2011-08-29 00:00:00
32 2012-08-27 00:00:00
33 2013-08-26 00:00:00
34 2014-08-25 00:00:00
35 2015-08-31 00:00:00
36 2016-08-29 00:00:00
37 2017-08-28 00:00:00
38 2018-08-27 00:00:00
39 2019-08-26 00:00:00
40 2020-01-20 00:00:00


In [6]:
deuce_atp_features = merge_data(deuce_atp, rolling_features)

In [7]:
diff_cols = [
    'player_rank', 'player_log_rank',
    'player_serve_win_ratio', 'player_return_win_ratio',
    'player_bp_per_game', 'player_bp_conversion_ratio',
    'player_game_win_ratio', 'player_point_win_ratio',
    'player_clutch_factor', 'player_win_weight',
    'player_game_win_ratio_weighted', 'player_point_win_ratio_weighted'
]

deuce_atp_final = get_player_difference(deuce_atp_features, diff_cols)

In [8]:
def train_val_split(df_atp_final, ML_cols):
    
    df_atp_train = df_atp_final.loc[(df_atp_final.tourney_start_date<'2018-01-15') &\
                                    (df_atp_final.tourney_level.isin(['Grand Slams'])), ML_cols]
    #df_atp_val = df_atp_final.loc[df_atp_final.tourney_start_date=='2018-01-15', ML_cols]
    df_atp_val = df_atp_final.loc[('2018-01-15' <= df_atp_final.tourney_start_date) &\
                                  (df_atp_final.tourney_start_date < '2019-12-31') &\
                                  (df_atp_final.tourney_level.isin(['Grand Slams'])), ML_cols]

    X_train = df_atp_train.drop('player_1_win', axis =1)
    y_train = df_atp_train['player_1_win']
    X_val = df_atp_val.drop('player_1_win', axis =1)
    y_val = df_atp_val['player_1_win']

    return(X_train, X_val, y_train, y_val)

In [9]:
ML_cols = [
       'player_1_win', 'player_rank_diff', 'player_log_rank_diff',
       'player_serve_win_ratio_diff', 'player_return_win_ratio_diff',
       'player_bp_per_game_diff', 'player_bp_conversion_ratio_diff',
       'player_game_win_ratio_diff', 'player_point_win_ratio_diff',
       'player_clutch_factor_diff', 'player_win_weight_diff',
       'player_game_win_ratio_weighted_diff',
       'player_point_win_ratio_weighted_diff'
]

ML_cols_subset = ['player_log_rank_diff',
 'player_rank_diff',
 'player_serve_win_ratio_diff',
 'player_return_win_ratio_diff',
 'player_game_win_ratio_diff',
 'player_point_win_ratio_weighted_diff',
 'player_1_win']

X_train, X_val, y_train, y_val = train_val_split(deuce_atp_final, ML_cols_subset)

In [10]:
from xgboost import XGBClassifier
#Changing some settings to prevent xgboost from killing the kernal
#see https://stackoverflow.com/questions/51164771/python-xgboost-kernel-died
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [11]:
model = XGBClassifier(
    objective = "binary:logistic",
    n_estimators = 300,
    learning_rate = 0.02,
    max_depth = 6
)

eval_set = [(X_val, y_val)]
model.fit(X_train,
          y_train,
         eval_set = eval_set,
         eval_metric="auc",
         early_stopping_rounds = 20)

[0]	validation_0-auc:0.76966
[1]	validation_0-auc:0.77419
[2]	validation_0-auc:0.77639
[3]	validation_0-auc:0.77479
[4]	validation_0-auc:0.77506
[5]	validation_0-auc:0.77539
[6]	validation_0-auc:0.77673
[7]	validation_0-auc:0.77678
[8]	validation_0-auc:0.77652
[9]	validation_0-auc:0.77671
[10]	validation_0-auc:0.77686
[11]	validation_0-auc:0.77709
[12]	validation_0-auc:0.77693
[13]	validation_0-auc:0.77716
[14]	validation_0-auc:0.77753
[15]	validation_0-auc:0.77750
[16]	validation_0-auc:0.77720
[17]	validation_0-auc:0.77725
[18]	validation_0-auc:0.77730
[19]	validation_0-auc:0.77723
[20]	validation_0-auc:0.77718
[21]	validation_0-auc:0.77693
[22]	validation_0-auc:0.77698
[23]	validation_0-auc:0.77658
[24]	validation_0-auc:0.77685
[25]	validation_0-auc:0.77669
[26]	validation_0-auc:0.77672
[27]	validation_0-auc:0.77690
[28]	validation_0-auc:0.77699
[29]	validation_0-auc:0.77719




[30]	validation_0-auc:0.77729
[31]	validation_0-auc:0.77715
[32]	validation_0-auc:0.77737
[33]	validation_0-auc:0.77764
[34]	validation_0-auc:0.77770
[35]	validation_0-auc:0.77750
[36]	validation_0-auc:0.77799
[37]	validation_0-auc:0.77790
[38]	validation_0-auc:0.77778
[39]	validation_0-auc:0.77818
[40]	validation_0-auc:0.77843
[41]	validation_0-auc:0.77856
[42]	validation_0-auc:0.77897
[43]	validation_0-auc:0.77909
[44]	validation_0-auc:0.77929
[45]	validation_0-auc:0.77923
[46]	validation_0-auc:0.77949
[47]	validation_0-auc:0.77973
[48]	validation_0-auc:0.77984
[49]	validation_0-auc:0.77997
[50]	validation_0-auc:0.78002
[51]	validation_0-auc:0.77995
[52]	validation_0-auc:0.77982
[53]	validation_0-auc:0.77998
[54]	validation_0-auc:0.78014
[55]	validation_0-auc:0.78008
[56]	validation_0-auc:0.78017
[57]	validation_0-auc:0.78010
[58]	validation_0-auc:0.78009
[59]	validation_0-auc:0.78020
[60]	validation_0-auc:0.78032
[61]	validation_0-auc:0.78046
[62]	validation_0-auc:0.78066
[63]	valid

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.02, max_bin=256,
              max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
              max_depth=6, max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=300, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [12]:
import http.client

conn = http.client.HTTPSConnection("tennis-live-data.p.rapidapi.com")

headers = {
    'X-RapidAPI-Key': "57fcc65541msh4841dc21fef332dp1f0486jsn7e8c59408325",
    'X-RapidAPI-Host': "tennis-live-data.p.rapidapi.com"
    }

conn.request("GET", "/players/ATP", headers=headers)

res = conn.getresponse()
data = res.read()

print(data.decode("utf-8"))

{"meta":{"title":"Live Tennis API - Players By Tour: ATP","description":"List of Players in Specified Tour","fields":{"id":"Integer - player_id","first_name":"String","last_name":"String","full_name":"String","country":"String"}},"results":{"players":[{"id":89304,"first_name":"Novak","last_name":"Djokovic","full_name":"Novak Djokovic","country":"Serbia"},{"id":86928,"first_name":"Rafael","last_name":"Nadal","full_name":"Rafael Nadal","country":"Spain"},{"id":342990,"first_name":"Alexander","last_name":"Zverev","full_name":"Alexander Zverev","country":"Germany"},{"id":91668,"first_name":"Potro","last_name":"Del","full_name":"Potro Del","country":"Argentina"},{"id":93378,"first_name":"Kevin","last_name":"Anderson","full_name":"Kevin Anderson","country":"South Africa"},{"id":86064,"first_name":"Roger","last_name":"Federer","full_name":"Roger Federer","country":"Switzerland"},{"id":94410,"first_name":"Kei","last_name":"Nishikori","full_name":"Kei Nishikori","country":"Japan"},{"id":262500,

In [13]:
import json
dictData = json.loads(data.decode("utf-8"))

print(dictData)

{'meta': {'title': 'Live Tennis API - Players By Tour: ATP', 'description': 'List of Players in Specified Tour', 'fields': {'id': 'Integer - player_id', 'first_name': 'String', 'last_name': 'String', 'full_name': 'String', 'country': 'String'}}, 'results': {'players': [{'id': 89304, 'first_name': 'Novak', 'last_name': 'Djokovic', 'full_name': 'Novak Djokovic', 'country': 'Serbia'}, {'id': 86928, 'first_name': 'Rafael', 'last_name': 'Nadal', 'full_name': 'Rafael Nadal', 'country': 'Spain'}, {'id': 342990, 'first_name': 'Alexander', 'last_name': 'Zverev', 'full_name': 'Alexander Zverev', 'country': 'Germany'}, {'id': 91668, 'first_name': 'Potro', 'last_name': 'Del', 'full_name': 'Potro Del', 'country': 'Argentina'}, {'id': 93378, 'first_name': 'Kevin', 'last_name': 'Anderson', 'full_name': 'Kevin Anderson', 'country': 'South Africa'}, {'id': 86064, 'first_name': 'Roger', 'last_name': 'Federer', 'full_name': 'Roger Federer', 'country': 'Switzerland'}, {'id': 94410, 'first_name': 'Kei', 'l

In [14]:
DataList = []
for i in dictData['results']['players']:
    DataList.append(i['full_name'])
print(DataList)

['Novak Djokovic', 'Rafael Nadal', 'Alexander Zverev', 'Potro Del', 'Kevin Anderson', 'Roger Federer', 'Kei Nishikori', 'Dominic Thiem', 'John Isner', 'Marin Cilic', 'Karen Khachanov', 'Stefanos Tsitsipas', 'Borna Coric', 'Milos Raonic', 'Fabio Fognini', 'Daniil Medvedev', 'Lucas Pouille', 'Agut Bautista', 'Marco Cecchinato', 'Diego Schwartzman', 'David Goffin', 'Nikoloz Basilashvili', 'Busta Carreno', 'Grigor Dimitrov', 'Denis Shapovalov', 'Fernando Verdasco', 'Richard Gasquet', 'Minaur De', 'Kyle Edmund', 'Frances Tiafoe', 'Gilles Simon', 'Philipp Kohlschreiber', 'Gael Monfils', 'Steve Johnson', 'Jeremy Chardy', 'John Millman', 'Andreas Seppi', 'Martin Klizan', 'Joao Sousa', 'Taylor Fritz', 'Nicolas Jarry', 'Dusan Lajovic', 'Malek Jaziri', 'Pierre Herbert', 'Matthew Ebden', 'Adrian Mannarino', 'Marton Fucsovics', 'Sam Querrey', 'Leonardo Mayer', 'Hyeon Chung', 'Jan Struff', 'Damir Dzumhur', 'Matteo Berrettini', 'Robin Haase', 'Mikhail Kukushkin', 'Marius Copil', 'Stan Wawrinka', 'Ben

In [15]:
players_df = pd.DataFrame(DataList)
players_df.to_csv('players_api.csv')

In [16]:
player_permutations = list(itertools.permutations(DataList, 2))
dummy_submission_df = pd.DataFrame(player_permutations, columns=['player_1','player_2'])
dummy_submission_df.loc[:,'player_1_win_probability'] = 0.5

In [17]:
dummy_submission_df.to_csv('dummy_submission_2022.csv')

In [18]:
dummy_submission_df

Unnamed: 0,player_1,player_2,player_1_win_probability
0,Novak Djokovic,Rafael Nadal,0.5
1,Novak Djokovic,Alexander Zverev,0.5
2,Novak Djokovic,Potro Del,0.5
3,Novak Djokovic,Kevin Anderson,0.5
4,Novak Djokovic,Roger Federer,0.5
...,...,...,...
818115,August Holmgren,Arthur Fery,0.5
818116,August Holmgren,Philip Sekulic,0.5
818117,August Holmgren,Sebastian Gima,0.5
818118,August Holmgren,Sandro Kopp,0.5


In [19]:
df_predict_atp = pd.read_csv('dummy_submission_2022.csv')
df_predict_atp['player_1'] = df_predict_atp['player_1'].str.lower() 
df_predict_atp['player_2'] = df_predict_atp['player_2'].str.lower()

rolling_features['player_name'] = rolling_features['player_name'].str.lower() 

#Adding tournament date to prediction df
df_predict_atp['tourney_start_date'] = pd.to_datetime('2020-01-20')


df_predict_atp = df_predict_atp.merge(rolling_features, how='left',
                                     left_on = ['player_1', 'tourney_start_date'],
                                     right_on = ['player_name', 'tournament_date_index'],
                                     validate = 'm:1')

In [20]:
#Used to check for players who do not have rolling features to match or players whose names are spelt incorrectly
missing_names = df_predict_atp[df_predict_atp.isnull().any(axis=1)]['player_1'].unique()
missing_names.sort()
missing_names

array(['abedallah shelbayh', 'acosta diaz', 'adam moundir', 'adam walton',
       'aguilar merida', 'agut bautista', 'alafia ayeni', 'alastair gray',
       'albano olivetti', 'alejandro gomez', 'aleksandre metreveli',
       'alessandro petrone', 'alexander igoshin',
       'alexander pavlioutchenkov', 'alexander shevchenko',
       'alexander ward', 'alexander weis', 'alexandr dolgopolov',
       'alexandru jecan', 'alexis gautier', 'alfredo perez',
       'aliassime auger', 'alves meligeni', 'andreas haider-maurer',
       'andrej martin', 'andrew paulson', 'andrey chepelev',
       'andrey kuznetsov', 'anis ghorbel', 'antoine bellier',
       'antoine hoang', 'anton matusevich', 'arthur cazaux',
       'arthur fery', 'arthur fils', 'assche van', 'august holmgren',
       'austin krajicek', 'azorin vidal', 'baena carballes',
       'bague toledo', 'bart stevens', 'bastian malla', 'batalla roca',
       'ben patael', 'ben shelton', 'billy harris', 'bu yunchaokete',
       'busta carr

In [21]:
all_players = deuce_atp_final.player_1.unique()
all_players.sort()
all_players

array(['Adam Pavlasek', 'Adrian Mannarino', 'Adrian Menendez Maceiras',
       'Adrian Ungur', 'Adrian Voinea', 'Agustin Calleri',
       'Aisam Ul Haq Qureshi', 'Alan Mackin', 'Albano Olivetti',
       'Albert Costa', 'Albert Montanes', 'Albert Portas', 'Albert Ramos',
       'Alberto Berasategui', 'Alberto Martin', 'Alejandro Falla',
       'Alejandro Gonzalez', 'Aleksandr Nedovyesov',
       'Alessandro Giannessi', 'Alessio Di Mauro', 'Alex Bogdanovic',
       'Alex Bogomolov Jr', 'Alex Bolt', 'Alex Calatrava',
       'Alex Corretja', 'Alex De Minaur', 'Alex Kim', 'Alex Kuznetsov',
       'Alex Lopez Moron', 'Alex Obrien', 'Alexander Bublik',
       'Alexander Kudryavtsev', 'Alexander Peya', 'Alexander Popp',
       'Alexander Sarkissian', 'Alexander Waske', 'Alexander Zverev',
       'Alexandr Dolgopolov', 'Alexandre Simoni', 'Alexei Popyrin',
       'Aljaz Bedene', 'Alun Jones', 'Amer Delic', 'Amir Weintraub',
       'Andre Agassi', 'Andre Sa', 'Andrea Gaudenzi', 'Andrea Stoppini'

In [22]:
df_predict_atp = df_predict_atp.merge(rolling_features, how='left',
                                     left_on = ['player_2', 'tourney_start_date'],
                                     right_on = ['player_name', 'tournament_date_index'],
                                     validate = 'm:1',
                                     suffixes = ('_p1','_p2'))

In [23]:
#Filling in missing values for players with no aggregates prior to 2020 AO
df_predict_atp.loc[:,'player_rank_p1'].fillna(500, inplace=True)
df_predict_atp.loc[:,'player_rank_p2'].fillna(500, inplace=True)
df_predict_atp.loc[:,'player_log_rank_p1'].fillna(np.log(500), inplace=True)
df_predict_atp.loc[:,'player_log_rank_p2'].fillna(np.log(500), inplace=True)
df_predict_atp.loc[:,['player_log_rank_p1','player_log_rank_p2']].fillna(np.log(500), inplace=True)
df_predict_atp.fillna(-1, inplace=True) ##### <- important

#These players with no previous match history are slightly suspect, good to check if we have filled in their values correctly
df_predict_atp[(df_predict_atp.player_1.isin(['alejandro fokina', 'andrew harris', 'christopher oconnell',
       'hugo gaston', 'james duckworth', 'john patrick smith',
       'michael mmoh', 'yen hsun lu']))]

Unnamed: 0.1,Unnamed: 0,player_1,player_2,player_1_win_probability,tourney_start_date,player_name_p1,player_serve_win_ratio_p1,player_return_win_ratio_p1,player_bp_per_game_p1,player_bp_conversion_ratio_p1,player_game_win_ratio_p1,player_point_win_ratio_p1,player_clutch_factor_p1,player_win_weight_p1,player_game_win_ratio_weighted_p1,player_point_win_ratio_weighted_p1,player_rank_p1,player_log_rank_p1,tournament_date_index_p1,player_name_p2,player_serve_win_ratio_p2,player_return_win_ratio_p2,player_bp_per_game_p2,player_bp_conversion_ratio_p2,player_game_win_ratio_p2,player_point_win_ratio_p2,player_clutch_factor_p2,player_win_weight_p2,player_game_win_ratio_weighted_p2,player_point_win_ratio_weighted_p2,player_rank_p2,player_log_rank_p2,tournament_date_index_p2
98536,98536,michael mmoh,novak djokovic,0.5,2020-01-20,michael mmoh,0.621964,0.392531,0.679286,0.460622,0.523931,0.506639,0.017292,0.074843,0.488298,0.476068,152.0,5.023881,2020-01-20 00:00:00,novak djokovic,0.722207,0.412843,0.670426,0.492210,0.613324,0.560366,0.052957,0.470841,0.561517,0.513802,1.0,0.000000,2020-01-20 00:00:00
98537,98537,michael mmoh,rafael nadal,0.5,2020-01-20,michael mmoh,0.621964,0.392531,0.679286,0.460622,0.523931,0.506639,0.017292,0.074843,0.488298,0.476068,152.0,5.023881,2020-01-20 00:00:00,rafael nadal,0.701927,0.395480,0.639543,0.442393,0.622003,0.542192,0.070236,0.567970,0.567058,0.492981,2.0,0.693147,2020-01-20 00:00:00
98538,98538,michael mmoh,alexander zverev,0.5,2020-01-20,michael mmoh,0.621964,0.392531,0.679286,0.460622,0.523931,0.506639,0.017292,0.074843,0.488298,0.476068,152.0,5.023881,2020-01-20 00:00:00,alexander zverev,0.669859,0.377520,0.666958,0.388043,0.529257,0.517622,0.011636,0.380498,0.458381,0.452615,6.0,1.791759,2020-01-20 00:00:00
98539,98539,michael mmoh,potro del,0.5,2020-01-20,michael mmoh,0.621964,0.392531,0.679286,0.460622,0.523931,0.506639,0.017292,0.074843,0.488298,0.476068,152.0,5.023881,2020-01-20 00:00:00,-1,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,500.0,6.214608,-1
98540,98540,michael mmoh,kevin anderson,0.5,2020-01-20,michael mmoh,0.621964,0.392531,0.679286,0.460622,0.523931,0.506639,0.017292,0.074843,0.488298,0.476068,152.0,5.023881,2020-01-20 00:00:00,kevin anderson,0.646510,0.347592,0.392992,0.526786,0.483566,0.500116,-0.016550,0.417504,0.441765,0.461121,7.0,1.945910,2020-01-20 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501715,501715,christopher oconnell,philip sekulic,0.5,2020-01-20,christopher oconnell,0.686896,0.408094,0.693553,0.415291,0.583213,0.545209,0.038004,0.095245,0.506093,0.478462,596.0,6.390241,2020-01-20 00:00:00,-1,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,500.0,6.214608,-1
501716,501716,christopher oconnell,sebastian gima,0.5,2020-01-20,christopher oconnell,0.686896,0.408094,0.693553,0.415291,0.583213,0.545209,0.038004,0.095245,0.506093,0.478462,596.0,6.390241,2020-01-20 00:00:00,-1,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,500.0,6.214608,-1
501717,501717,christopher oconnell,sandro kopp,0.5,2020-01-20,christopher oconnell,0.686896,0.408094,0.693553,0.415291,0.583213,0.545209,0.038004,0.095245,0.506093,0.478462,596.0,6.390241,2020-01-20 00:00:00,-1,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,500.0,6.214608,-1
501718,501718,christopher oconnell,alfredo perez,0.5,2020-01-20,christopher oconnell,0.686896,0.408094,0.693553,0.415291,0.583213,0.545209,0.038004,0.095245,0.506093,0.478462,596.0,6.390241,2020-01-20 00:00:00,-1,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,500.0,6.214608,-1


In [24]:
df_predict_atp


Unnamed: 0.1,Unnamed: 0,player_1,player_2,player_1_win_probability,tourney_start_date,player_name_p1,player_serve_win_ratio_p1,player_return_win_ratio_p1,player_bp_per_game_p1,player_bp_conversion_ratio_p1,player_game_win_ratio_p1,player_point_win_ratio_p1,player_clutch_factor_p1,player_win_weight_p1,player_game_win_ratio_weighted_p1,player_point_win_ratio_weighted_p1,player_rank_p1,player_log_rank_p1,tournament_date_index_p1,player_name_p2,player_serve_win_ratio_p2,player_return_win_ratio_p2,player_bp_per_game_p2,player_bp_conversion_ratio_p2,player_game_win_ratio_p2,player_point_win_ratio_p2,player_clutch_factor_p2,player_win_weight_p2,player_game_win_ratio_weighted_p2,player_point_win_ratio_weighted_p2,player_rank_p2,player_log_rank_p2,tournament_date_index_p2
0,0,novak djokovic,rafael nadal,0.5,2020-01-20,novak djokovic,0.722207,0.412843,0.670426,0.49221,0.613324,0.560366,0.052957,0.470841,0.561517,0.513802,1.0,0.000000,2020-01-20 00:00:00,rafael nadal,0.701927,0.395480,0.639543,0.442393,0.622003,0.542192,0.070236,0.567970,0.567058,0.492981,2.0,0.693147,2020-01-20 00:00:00
1,1,novak djokovic,alexander zverev,0.5,2020-01-20,novak djokovic,0.722207,0.412843,0.670426,0.49221,0.613324,0.560366,0.052957,0.470841,0.561517,0.513802,1.0,0.000000,2020-01-20 00:00:00,alexander zverev,0.669859,0.377520,0.666958,0.388043,0.529257,0.517622,0.011636,0.380498,0.458381,0.452615,6.0,1.791759,2020-01-20 00:00:00
2,2,novak djokovic,potro del,0.5,2020-01-20,novak djokovic,0.722207,0.412843,0.670426,0.49221,0.613324,0.560366,0.052957,0.470841,0.561517,0.513802,1.0,0.000000,2020-01-20 00:00:00,-1,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,500.0,6.214608,-1
3,3,novak djokovic,kevin anderson,0.5,2020-01-20,novak djokovic,0.722207,0.412843,0.670426,0.49221,0.613324,0.560366,0.052957,0.470841,0.561517,0.513802,1.0,0.000000,2020-01-20 00:00:00,kevin anderson,0.646510,0.347592,0.392992,0.526786,0.483566,0.500116,-0.016550,0.417504,0.441765,0.461121,7.0,1.945910,2020-01-20 00:00:00
4,4,novak djokovic,roger federer,0.5,2020-01-20,novak djokovic,0.722207,0.412843,0.670426,0.49221,0.613324,0.560366,0.052957,0.470841,0.561517,0.513802,1.0,0.000000,2020-01-20 00:00:00,roger federer,0.712826,0.416138,0.731101,0.448214,0.596273,0.553153,0.042751,0.539403,0.555605,0.518114,3.0,1.098612,2020-01-20 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
818115,818115,august holmgren,arthur fery,0.5,2020-01-20,-1,-1.000000,-1.000000,-1.000000,-1.00000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,500.0,6.214608,-1,-1,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,500.0,6.214608,-1
818116,818116,august holmgren,philip sekulic,0.5,2020-01-20,-1,-1.000000,-1.000000,-1.000000,-1.00000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,500.0,6.214608,-1,-1,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,500.0,6.214608,-1
818117,818117,august holmgren,sebastian gima,0.5,2020-01-20,-1,-1.000000,-1.000000,-1.000000,-1.00000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,500.0,6.214608,-1,-1,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,500.0,6.214608,-1
818118,818118,august holmgren,sandro kopp,0.5,2020-01-20,-1,-1.000000,-1.000000,-1.000000,-1.00000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,500.0,6.214608,-1,-1,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,500.0,6.214608,-1


In [25]:
X_train.columns

Index(['player_log_rank_diff', 'player_rank_diff',
       'player_serve_win_ratio_diff', 'player_return_win_ratio_diff',
       'player_game_win_ratio_diff', 'player_point_win_ratio_weighted_diff'],
      dtype='object')

In [31]:
df_predict_atp['player_1']

0          novak djokovic
1          novak djokovic
2          novak djokovic
3          novak djokovic
4          novak djokovic
               ...       
818115    august holmgren
818116    august holmgren
818117    august holmgren
818118    august holmgren
818119    august holmgren
Name: player_1, Length: 818120, dtype: object

In [32]:
pd.Series(model.feature_importances_, index=X_train.columns).sort_values(ascending=False)

player_log_rank_diff                    0.619010
player_game_win_ratio_diff              0.108943
player_point_win_ratio_weighted_diff    0.080545
player_serve_win_ratio_diff             0.075001
player_rank_diff                        0.060340
player_return_win_ratio_diff            0.056161
dtype: float32

In [33]:
perm = PermutationImportance(model).fit(X_val, y_val)
eli5.show_weights(perm, feature_names = X_val.columns.tolist())

Weight,Feature
0.1610  ± 0.0164,player_log_rank_diff
0.0163  ± 0.0176,player_game_win_ratio_diff
0.0136  ± 0.0040,player_rank_diff
0.0024  ± 0.0078,player_point_win_ratio_weighted_diff
0.0012  ± 0.0055,player_return_win_ratio_diff
-0.0047  ± 0.0076,player_serve_win_ratio_diff


In [35]:
df_predict_atp = get_player_difference(df_predict_atp, diff_cols=diff_cols)


atp_preds = model.predict_proba(df_predict_atp[X_train.columns])
df_predict_atp['player_1_win_probability'] = atp_preds[:,1]

atp_pred_submission = df_predict_atp[['player_1', 'player_2', 'player_1_win_probability']]
atp_pred_submission.to_csv('submission_from_api.csv')
atp_pred_submission

Unnamed: 0,player_1,player_2,player_1_win_probability
0,novak djokovic,rafael nadal,0.606594
1,novak djokovic,alexander zverev,0.836237
2,novak djokovic,potro del,0.838423
3,novak djokovic,kevin anderson,0.860619
4,novak djokovic,roger federer,0.446737
...,...,...,...
818115,august holmgren,arthur fery,0.503162
818116,august holmgren,philip sekulic,0.503162
818117,august holmgren,sebastian gima,0.503162
818118,august holmgren,sandro kopp,0.503162


In [36]:
atp_pred_submission.groupby('player_1')['player_1_win_probability'].agg('mean').sort_values(ascending=False).head(10)

player_1
alexander zverev    0.889402
rafael nadal        0.885930
novak djokovic      0.884984
roger federer       0.880868
kei nishikori       0.874128
daniil medvedev     0.862569
karen khachanov     0.860125
gael monfils        0.855037
dominic thiem       0.854107
david goffin        0.848131
Name: player_1_win_probability, dtype: float32

In [37]:
atp_pred_submission.groupby('player_1')['player_1_win_probability'].agg('mean').sort_values(ascending=False).tail(30)

player_1
minaur de               0.380043
min song                0.380043
juan londero            0.380043
juan otegui             0.380043
baena carballes         0.380043
joão monteiro           0.380043
jonathan eysseric       0.380043
nicolas arreche         0.380043
omar jasika             0.380043
olmedo ortega           0.380043
daniel cox              0.380043
oleksii krutykh         0.380043
oleksandr ovcharenko    0.380043
oleg prihodko           0.380043
nuno borges             0.380043
assche van              0.380043
jones pinnington        0.380043
jong de                 0.380043
jordan correia          0.380043
august holmgren         0.380043
jordi samper-montaña    0.380043
austin krajicek         0.380043
jorge panta             0.380043
niels lootsma           0.380043
jose bendeck            0.380043
jose pereira            0.380043
nicolas kicker          0.380043
azorin vidal            0.380043
jover sanchez           0.380043
abedallah shelbayh      0.380043
N