In [1155]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error

### Merge the match data from one year

In [1156]:
match_names = ["atp_matches", "atp_matches_futures", "atp_matches_qual_chall"]
frames = []

for name in match_names:
    frames.append(pd.read_csv(f"{name}_2021.csv"))

df = pd.concat(frames)

### Clean up the data

In [1157]:
## irrelevant columns
irr_cols = []
player_cols = ["ace", "df", "svpt", "1stIn", "1stWon", "2ndWon", "SvGms", "bpSaved", "bpFaced"]
general_cols = ["winner_name", "tourney_name", "loser_name", "winner_ioc", "loser_ioc", "minutes", "score", "draw_size", "round", "loser_seed", "loser_entry", "loser_rank_points", "winner_rank_points", "winner_seed", "winner_entry", "match_num", "tourney_id", "tourney_level", "best_of"]
## add winner/loser cols
for el in player_cols:
    irr_cols.append(f"w_{el}")
    irr_cols.append(f"l_{el}")

## add the general cols
irr_cols.extend(general_cols)

## drop irrelevant cols
df.drop(columns=irr_cols, inplace=True)

## drop empty values
## NB! Might need more intricate filtering
df.dropna(axis=0, inplace=True)

# df

### Convert numerical values to floats

In [1158]:
numeric_cols = ["winner_rank", "loser_rank", "winner_age", "loser_age", "winner_ht", "loser_ht"]
df[numeric_cols] = df[numeric_cols].astype(float)

### Adjust the dataset for performing predictions

In [1159]:
## replace winner/loser with player1 & 2
for (colName, colData) in df.iteritems():
    if "winner" in colName:
        endingVal = colName.split("_")[1]
        df.rename(columns={colName: f"player1_{endingVal}"}, inplace=True)
    elif "loser" in colName:
        endingVal = colName.split("_")[1]
        df.rename(columns={colName: f"player2_{endingVal}"}, inplace=True)

first_half, second_half = df[:int(len(df) / 2)], df[int(len(df) / 2):]
first_half.insert(loc=0, column="label", value=1)

## swap columns
scols = list(second_half.columns)

np.warnings.filterwarnings('ignore')

## swap the columns for the second half
for attr in ["id", "hand", "ht", "age", "rank"]:
    scols[scols.index(f"player1_{attr}")], scols[scols.index(f"player2_{attr}")] = scols[scols.index(f"player2_{attr}")], scols[scols.index(f"player1_{attr}")]
    # second_half.insert(loc=0, column="label", value=0)
    second_half["label"] = 0

# second_half.head(5)
halves = []

df = pd.concat([first_half, second_half])
df.sample(5)

Unnamed: 0,label,surface,tourney_date,player1_id,player1_hand,player1_ht,player1_age,player2_id,player2_hand,player2_ht,player2_age,player1_rank,player2_rank
2674,0,Hard,20210305,105011,R,185.0,33.478439,122548,U,183.0,23.950719,177.0,387.0
1188,1,Grass,20210614,144750,R,193.0,24.301164,104792,R,193.0,34.784394,51.0,16.0
1723,0,Hard,20210830,207989,R,185.0,18.321697,111815,L,188.0,26.020534,55.0,29.0
2231,0,Clay,20210308,123755,R,191.0,24.720055,104665,R,180.0,35.12115,121.0,55.0
2010,0,Clay,20210426,104731,R,203.0,34.940452,126207,R,188.0,23.263518,105.0,64.0


### Visualize data

In [1160]:
# cols = ["player1_rank", "player2_rank", "player1_age", "player2_age", "player1_ht", "player2_ht"]

# plt.figure(1, figsize=(18, 8))

# for i in range(6):
#     plt.subplot(2, 3, i + 1)
#     df[cols[i]].plot(kind="hist", title=cols[i])

# # plt.ylim([0, 10])
# # plt.xlim([0, 10])
# plt.show()

### Logistic Regrsession

In [1161]:
features = ["player1_age", "player2_age", "player1_rank", "player2_rank"]
X = df[features]
y = df["label"]

## Initialize classifier
clf = LogisticRegression()
clf.fit(X, y)
y_pred = clf.predict(X)
accuracy = clf.score(X, y)

print(f"accuracy: {accuracy}")

ValueError: could not convert string to float: 'R'