In [985]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

### Merge the match data from one year

In [986]:
match_names = ["atp_matches", "atp_matches_futures", "atp_matches_qual_chall"]
frames = []

for name in match_names:
    frames.append(pd.read_csv(f"{name}_2021.csv"))

df = pd.concat(frames)

### Clean up the data

In [987]:
## irrelevant columns
irr_cols = []
player_cols = ["ace", "df", "svpt", "1stIn", "1stWon", "2ndWon", "SvGms", "bpSaved", "bpFaced"]
general_cols = ["winner_name", "tourney_name", "loser_name", "winner_ioc", "loser_ioc", "minutes", "score", "draw_size", "round", "loser_seed", "loser_entry", "loser_rank_points", "winner_rank_points", "winner_seed", "winner_entry", "match_num", "tourney_id", "tourney_level", "best_of"]
## add winner/loser cols
for el in player_cols:
    irr_cols.append(f"w_{el}")
    irr_cols.append(f"l_{el}")

## add the general cols
irr_cols.extend(general_cols)

## drop irrelevant cols
df.drop(columns=irr_cols, inplace=True)

## drop empty values
## NB! Might need more intricate filtering
df.dropna(axis=0, inplace=True)

df

Unnamed: 0,surface,tourney_date,winner_id,winner_hand,winner_ht,winner_age,loser_id,loser_hand,loser_ht,loser_age,winner_rank,loser_rank
0,Hard,20210724,126207,R,188.0,23.493498,126952,R,180.0,23.627652,53.0,71.0
1,Hard,20210724,105526,R,193.0,31.233402,106329,L,183.0,27.134839,48.0,95.0
3,Hard,20210724,105357,R,183.0,32.095825,207518,R,185.0,19.378508,44.0,61.0
6,Hard,20210724,136440,L,180.0,27.222450,105487,L,183.0,31.389459,57.0,80.0
8,Hard,20210724,106218,R,180.0,27.986311,105613,R,193.0,30.932238,64.0,98.0
...,...,...,...,...,...,...,...,...,...,...,...,...
7324,Hard,20211129,208502,L,180.0,19.288159,104735,R,180.0,35.526352,242.0,246.0
7331,Hard,20211129,105060,R,193.0,33.941136,105655,R,170.0,31.121150,428.0,287.0
7344,Clay,20211129,111797,R,198.0,26.135524,106198,R,180.0,28.454483,160.0,119.0
7352,Clay,20211129,106198,R,180.0,28.454483,105180,L,175.0,33.401780,119.0,401.0


### Convert numerical values to floats

In [988]:
numeric_cols = ["winner_rank", "loser_rank", "winner_age", "loser_age", "winner_ht", "loser_ht"]
df[numeric_cols] = df[numeric_cols].astype(float)

### Adjust the dataset for performing predictions

In [989]:
## replace winner/loser with player1 & 2
for (colName, colData) in df.iteritems():
    if "winner" in colName:
        endingVal = colName.split("_")[1]
        df.rename(columns={colName: f"player1_{endingVal}"}, inplace=True)
    elif "loser" in colName:
        endingVal = colName.split("_")[1]
        df.rename(columns={colName: f"player2_{endingVal}"}, inplace=True)

first_half, second_half = df[:int(len(df) / 2)], df[int(len(df) / 2):]
first_half.insert(loc=0, column="label", value=1)

## swap columns
scols = list(second_half.columns)

np.warnings.filterwarnings('ignore')

## swap the columns for the second half
for attr in ["id", "hand", "ht", "age", "rank"]:
    scols[scols.index(f"player1_{attr}")], scols[scols.index(f"player2_{attr}")] = scols[scols.index(f"player2_{attr}")], scols[scols.index(f"player1_{attr}")]
    # second_half.insert(loc=0, column="label", value=0)
    second_half["label"] = 0

# second_half.head(5)
halves = []

df2 = pd.concat([first_half, second_half])
df2.head(5)

Unnamed: 0,label,surface,tourney_date,player1_id,player1_hand,player1_ht,player1_age,player2_id,player2_hand,player2_ht,player2_age,player1_rank,player2_rank
0,1,Hard,20210724,126207,R,188.0,23.493498,126952,R,180.0,23.627652,53.0,71.0
1,1,Hard,20210724,105526,R,193.0,31.233402,106329,L,183.0,27.134839,48.0,95.0
3,1,Hard,20210724,105357,R,183.0,32.095825,207518,R,185.0,19.378508,44.0,61.0
6,1,Hard,20210724,136440,L,180.0,27.22245,105487,L,183.0,31.389459,57.0,80.0
8,1,Hard,20210724,106218,R,180.0,27.986311,105613,R,193.0,30.932238,64.0,98.0


### Visualize data

In [990]:
# cols = ["winner_rank", "loser_rank", "winner_age", "loser_age", "winner_ht", "loser_ht"]

# plt.figure(1, figsize=(18, 8))

# for i in range(6):
#     plt.subplot(2, 3, i + 1)
#     df[cols[i]].plot(kind="hist", title=cols[i])

# # plt.ylim([0, 10])
# # plt.xlim([0, 10])
# plt.show()

### Logistic Regrsession