# Predicting ATP matches using multiple years

# Import packages

In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Read in data

In [4]:
# Read one CSV of ATP match results per season, 2019 through 2022
# (the upper bound of range() is exclusive).
tennis_data_list = [
    pd.read_csv('./match-data/atp_matches_' + str(season) + '.csv')
    for season in range(2019, 2023)
]
    

In [5]:
# Stack the per-season frames row-wise into a single frame. The row labels of
# each source file are kept as-is (no ignore_index), so labels repeat across seasons.
tennis = pd.concat(tennis_data_list)

In [6]:
tennis.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2019-M020,Brisbane,Hard,32,A,20181231,300,105453,2.0,,...,54.0,34.0,20.0,14.0,10.0,15.0,9.0,3590.0,16.0,1977.0
1,2019-M020,Brisbane,Hard,32,A,20181231,299,106421,4.0,,...,52.0,36.0,7.0,10.0,10.0,13.0,16.0,1977.0,239.0,200.0
2,2019-M020,Brisbane,Hard,32,A,20181231,298,105453,2.0,,...,27.0,15.0,6.0,8.0,1.0,5.0,9.0,3590.0,40.0,1050.0
3,2019-M020,Brisbane,Hard,32,A,20181231,297,104542,,PR,...,60.0,38.0,9.0,11.0,4.0,6.0,239.0,200.0,31.0,1298.0
4,2019-M020,Brisbane,Hard,32,A,20181231,296,106421,4.0,,...,56.0,46.0,19.0,15.0,2.0,4.0,16.0,1977.0,18.0,1855.0


In [7]:
tennis.tail()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
2912,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,20220304,4,209362,,,...,68.0,42.0,12.0,10.0,11.0,13.0,1103.0,9.0,1130.0,8.0
2913,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,20220304,5,202065,,,...,56.0,40.0,20.0,15.0,4.0,8.0,808.0,23.0,1390.0,4.0
2914,2022-M-DC-2022-WG2-PO-HKG-BEN-01,Davis Cup WG2 PO: HKG vs BEN,Hard,4,D,20220304,1,138846,,,...,54.0,29.0,8.0,11.0,6.0,10.0,1059.0,10.0,1881.0,1.0
2915,2022-M-DC-2022-WG2-PO-HKG-BEN-01,Davis Cup WG2 PO: HKG vs BEN,Hard,4,D,20220304,2,209409,,,...,39.0,24.0,7.0,10.0,5.0,9.0,,,,
2916,2022-M-DC-2022-WG2-PO-HKG-BEN-01,Davis Cup WG2 PO: HKG vs BEN,Hard,4,D,20220304,4,208414,,,...,34.0,24.0,13.0,10.0,5.0,8.0,,,,


In [8]:
tennis.shape

(9918, 49)

# Manipulation of data

Make a copy of the data to work on.

In [9]:
tennis_data = tennis.copy()

Find the number of NAs in each column - this helps to understand how best to handle the missing values.

In [11]:
tennis_data.isna().sum()

tourney_id               0
tourney_name             0
surface                  0
draw_size                0
tourney_level            0
tourney_date             0
match_num                0
winner_id                0
winner_seed           5829
winner_entry          8407
winner_name              0
winner_hand              4
winner_ht               96
winner_ioc               0
winner_age               3
loser_id                 0
loser_seed            7485
loser_entry           7648
loser_name               0
loser_hand               8
loser_ht               257
loser_ioc                0
loser_age                3
score                    0
best_of                  0
round                    0
minutes                521
w_ace                  428
w_df                   428
w_svpt                 428
w_1stIn                428
w_1stWon               428
w_2ndWon               428
w_SvGms                427
w_bpSaved              428
w_bpFaced              428
l_ace                  428
l

Drop the rows with missing match statistics (rows where `w_df` is NA) - it's hard to know what the correct values for these are, so it's worth just dropping them completely.

In [14]:
def remove_tennis_na(df, subset=None):
    """Return a copy of ``df`` without the rows that have NAs in ``subset``.

    Parameters
    ----------
    df : pandas.DataFrame
        Match-level data.
    subset : str or list-like of str, optional
        Column label(s) checked for missing values. Defaults to ``["w_df"]``.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, relabelled with a fresh 0..n-1 index. ``df`` itself
        is not modified.
    """
    if subset is None:
        subset = ["w_df"]
    elif isinstance(subset, str):
        # A bare string would otherwise be split into characters by list().
        subset = [subset]
    else:
        subset = list(subset)

    # drop=True discards the old index directly. The previous
    # reset_index().drop('index', axis=1) failed with KeyError when the index
    # had a name and removed a genuine column called 'index' if one existed.
    atp_all = df.dropna(subset=subset).reset_index(drop=True)
    return atp_all

In [63]:
atp_all = remove_tennis_na(tennis_data)

Create one dataframe containing both the winners and the losers. This is done by creating two separate dataframes and then combining them at the end. Supplementary functions are defined to get the column names, for easier manipulation of the dataframes.

In [98]:
def get_winners_losers_data(df):
    """Reshape match-level rows into one row per player per match.

    The winner-side and loser-side columns (plus the shared match columns) are
    split into two frames, tagged with ``won`` = 1 / 0, given the same
    player-generic column labels, and stacked winners-first.

    Parameters
    ----------
    df : pandas.DataFrame
        Match-level data with winner/loser columns.

    Returns
    -------
    pandas.DataFrame
        Winner rows followed by loser rows; the original row labels are kept,
        so each label appears once in each half.
    """
    cols_for_winner, cols_for_loser = get_winners_losers_columns(df)

    winners = df.loc[:, cols_for_winner].assign(won=1)
    losers = df.loc[:, cols_for_loser].assign(won=0)

    # The loser frame takes the winner frame's new labels positionally, so the
    # two column selections must be in matching order.
    winners.columns = get_new_column_names(winners)
    losers.columns = winners.columns

    return pd.concat([winners, losers], axis=0)
    
    

In [99]:
def get_winners_losers_columns(df):
    """Return the column labels for the winner side and for the loser side.

    Each result holds the columns of ``df`` whose label starts with the side's
    prefixes ("winner"/"w_" or "loser"/"l_"), in ``df`` order, followed by the
    shared match columns (tourney_name, tourney_date, round, match_num).

    Parameters
    ----------
    df : pandas.DataFrame
        Match-level data.

    Returns
    -------
    tuple of pandas.Series
        ``(winner_cols, loser_cols)``.
    """
    shared = pd.Series(["tourney_name", "tourney_date", "round", "match_num"])
    labels = df.columns

    def _side_columns(long_prefix, short_prefix):
        # Labels for one side, with the shared match columns appended.
        mask = labels.str.startswith(long_prefix) | labels.str.startswith(short_prefix)
        return pd.concat([pd.Series(labels[mask]), shared])

    return _side_columns("winner", "w_"), _side_columns("loser", "l_")

In [100]:
def get_new_column_names(df):
    """Return ``df``'s column labels with winner prefixes made player-generic.

    A leading "winner" becomes "player" and a leading "w_" becomes "player_".
    All other labels are returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels should be translated.

    Returns
    -------
    pandas.Index
        The translated labels, in the same order as ``df.columns``.
    """
    # Anchor both patterns at the start of the label: an unanchored "w_" also
    # matches inside other names (e.g. "draw_size" -> "draplayer_size").
    # regex=True is passed explicitly because the default differs between
    # pandas versions.
    new_column_names = df.columns.str.replace("^winner", "player", regex=True)
    new_column_names = new_column_names.str.replace("^w_", "player_", regex=True)

    return new_column_names

In [101]:
atp_long = get_winners_losers_data(atp_all)

Get the features of the data to be used in the model.

In [113]:
def get_data_features(df):
    """Compute the in-game features and return only the modelling columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Player-level data (one row per player per match) containing the
        columns needed by ``get_in_game_features`` and those listed in
        ``feature_columns`` below.

    Returns
    -------
    pandas.DataFrame
        The identifying columns, the two derived serve features and the
        ``won`` label.
    """
    df_created_features = get_in_game_features(df)

    feature_columns = ['player_name', 'player_hand', 'player_age', 'player_rank', 'tourney_name',
                       'tourney_date', 'round', 'match_num', 'Player_Serve_Point_Win_Ratio',
                       'Player_Average_Break_Points_Faced', 'won']

    # Select from the frame returned by the helper rather than from `df`, so
    # the result does not depend on the helper having modified `df` in place.
    df_features = df_created_features.loc[:, feature_columns]

    return df_features
    

In [114]:
def get_in_game_features(df):
    """Add two derived serve columns to ``df`` in place and return it.

    Columns added:

    * ``Player_Serve_Point_Win_Ratio`` =
      (``player_1stWon`` + ``player_2ndWon`` - ``player_df``) / ``player_svpt``
    * ``Player_Average_Break_Points_Faced`` =
      ``player_bpFaced`` / ``player_SvGms``

    Parameters
    ----------
    df : pandas.DataFrame
        Player-level data containing the six source columns named above.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now carrying the two new columns.
    """
    serve_points_won = df["player_1stWon"] + df["player_2ndWon"] - df["player_df"]
    df["Player_Serve_Point_Win_Ratio"] = serve_points_won / df["player_svpt"]

    break_points_faced = df["player_bpFaced"]
    df["Player_Average_Break_Points_Faced"] = break_points_faced.div(df["player_SvGms"])

    return df

In [115]:
atp_long_features = get_data_features(atp_long)

The data now needs to be randomized, to give a random assignment to players who have won and lost. This will be done on the original dataframe.

In [118]:
def get_randomized_matches(df):
    
    tennis_randomized = df.copy().dropna(subset = ["w_df"]).reset_index().drop("index", axis = 1)