In [None]:
# Import packages
import pandas as pd
import numpy as np
import re
from sklearn.naive_bayes import BernoulliNB
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, silhouette_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from scipy.stats import entropy
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive into the Colab runtime so the project CSVs are readable.
from google.colab import drive
drive.mount('/content/drive/')
# Project folder; the trailing slash matters because later cells build paths
# as f"{wd}<file name>".
wd = "/content/drive/MyDrive/project/"

Mounted at /content/drive/


In [None]:
# Read in points csv
# The two decade files are stacked with a fresh 0..n-1 index: each CSV is read
# with its own index starting at 0, so without ignore_index the combined frame
# would carry duplicate row labels.
points = pd.concat(
    [
        pd.read_csv(f"{wd}charting-m-points-2010s.csv"),
        pd.read_csv(f"{wd}charting-m-points-2020s.csv"),
    ],
    ignore_index=True,
)


# Create a function to read in match_data csv
def read_match_data(yr):
    """Load one season of ATP match results.

    Parameters
    ----------
    yr : int or str
        Season, interpolated into the file name ``atp_matches_{yr}.csv``.

    Returns
    -------
    pandas.DataFrame
        Contents of that season's CSV, read from the ``wd`` folder.
    """
    return pd.read_csv(f"{wd}atp_matches_{yr}.csv")

  points = pd.concat([pd.read_csv(f"{wd}charting-m-points-2010s.csv"), pd.read_csv(f"{wd}charting-m-points-2020s.csv")])


In [None]:
# Remove 7-9 (return depth) from both serve-attempt columns.
pattern = r'[7-9]'
for serve_col in ('1st', '2nd'):
    # regex=True so the character class is interpreted rather than matched literally
    points[serve_col] = points[serve_col].str.replace(pattern, '', regex=True)

In [None]:
# Select columns from points Data Frame.
# .copy() makes this an independent frame so the column assignments below do
# not write into a slice of the original (SettingWithCopyWarning).
points = points[['match_id', 'Pt', 'PtWinner', 'Svr', '1st', '2nd']].copy()
# Split the match_id column on '-' into its six components
points[['date', 'gender', 'event', 'round', 'p1', 'p2']] = points['match_id'].str.split('-', expand=True)
# Replace '_' with a space in p1 and p2
for name_col in ('p1', 'p2'):
    points[name_col] = points[name_col].str.replace('_', ' ', regex=False)
# Set Svr and PtWinner cols. as names instead of numbers: code 1 maps to p1,
# every other value maps to p2. Vectorised instead of a row-wise apply.
points['Svr'] = np.where(points['Svr'] == 1, points['p1'], points['p2'])
points['PtWinner'] = np.where(points['PtWinner'] == 1, points['p1'], points['p2'])

In [None]:
# Read the match file of every season from 2010 through 2024 ...
data_list = [read_match_data(yr) for yr in range(2010, 2025)]
# ... and stack them into a single DataFrame with a continuous index
data = pd.concat(data_list, ignore_index=True)

In [None]:
def extract_tennis_point(example_string):
    """Split one charted point string into serve, rally tokens and final shot.

    Parameters
    ----------
    example_string : str or missing value
        Shot-by-shot notation of a single point.

    Returns
    -------
    tuple
        ``(serve, rally_shots, final_shot)``.

        * All three are ``None`` when the input is missing.
        * ``serve`` is the leading serve token, or ``None`` when the string
          does not start with one.
        * ``rally_shots`` is the list of tokens found after the serve, with
          its tail trimmed when a final shot was found.
        * ``final_shot`` is the first token ending in ``@``, ``#`` or ``*``,
          or ``None`` when there is none.
    """
    if example_string is None or pd.isna(example_string):
        return None, None, None

    # Serve: optional c/C prefixes, one digit, optional + * # marker,
    # optional error letter.
    serve_hit = re.match(r'^[cC]*\d[\+\*#]?[wndx!e]?', example_string)
    if serve_hit:
        serve = serve_hit.group()
        rest = example_string[len(serve):]
    else:
        serve = None
        rest = example_string

    # Every element of the rally pattern is optional, so findall also yields
    # empty strings wherever no token starts (including at the end of `rest`).
    rally_shots = re.findall(r'[a-z]?[;\\+=^]?[1-3]?', rest)

    closing = re.search(r'[a-z][;\\+=^]?[1-3]?[nwdx!e]?[;\\+=^]?[@#\*]', rest)
    if closing is None:
        return serve, rally_shots, None

    final_shot = closing.group()
    # Trim the tail of the token list: three entries when the point ended on
    # '@' or '#', two entries otherwise.
    trim = 3 if final_shot.endswith(('@', '#')) else 2
    return serve, rally_shots[:-trim], final_shot

In [None]:
# Parse the '1st' and '2nd' columns: each point string expands into a serve
# token, a list of rally tokens and a final-shot token.
def _parse_point(point_string):
    """Return the three parsed parts as a Series (all None for a missing value)."""
    if pd.notna(point_string):
        parts = extract_tennis_point(point_string)
    else:
        parts = (None, None, None)
    return pd.Series(parts)


for attempt in ('1st', '2nd'):
    parsed_cols = [f'{attempt}_serve', f'{attempt}_rally', f'{attempt}_final_shot']
    points[parsed_cols] = points[attempt].apply(_parse_point)

In [None]:
# Collect all the shots: one output row per shot token, in playing order.


def process_shots(shot_entry):
    """Normalise one parsed cell to a list of shot strings.

    A string is split on ', ', a list is returned unchanged, and anything
    else (None / NaN / unexpected type) becomes an empty list.
    """
    if isinstance(shot_entry, str):
        return shot_entry.split(', ')
    if isinstance(shot_entry, list):
        return shot_entry
    return []


# (source column, shot_type label, is_first_serve flag), in playing order
SHOT_SEGMENTS = [
    ('1st_serve', 'serve', 1),
    ('1st_rally', 'rally', 1),
    ('1st_final_shot', 'final_shot', 1),
    ('2nd_serve', 'serve', 0),
    ('2nd_rally', 'rally', 0),
    ('2nd_final_shot', 'final_shot', 0),
]

shots_data = []

for _, row in points.iterrows():
    rally_shots = []
    shot_types = []
    shot_players = []
    is_first_serve = []

    # The serve is always hit by the server; 'Svr' already holds a player name
    server = row['p1'] if row['Svr'] == row['p1'] else row['p2']

    for column, shot_type, first_flag in SHOT_SEGMENTS:
        segment = process_shots(row[column])
        rally_shots.extend(segment)
        shot_types.extend([shot_type] * len(segment))
        # Only serves have a known hitter up front; the others are filled in below
        hitter = server if shot_type == 'serve' else None
        shot_players.extend([hitter] * len(segment))
        is_first_serve.extend([first_flag] * len(segment))

    # Assign alternating shot players after each serve: a shot without a known
    # hitter goes to p1 when the previous shot was p2's, otherwise to p2.
    for i in range(len(shot_players)):
        if shot_players[i] is None:
            shot_players[i] = row['p1'] if shot_players[i - 1] == row['p2'] else row['p2']

    # NOTE(review): extract_tennis_point's rally pattern is fully optional, so
    # empty-string tokens can appear in the rally lists; they are kept as rows
    # here - confirm whether they should be filtered out.

    # Add all shots to the new dataframe. rally_num is the 1-based position of
    # the shot within the point. enumerate is used because list.index(shot)
    # returns the FIRST occurrence, which gave repeated tokens the same number.
    point_shots = zip(rally_shots, shot_types, shot_players, is_first_serve)
    for rally_num, (shot, shot_type, shot_player, first_serve_flag) in enumerate(point_shots, start=1):
        shots_data.append({
            'match_id': row['match_id'],
            'pt': row['Pt'],
            'Svr': row['Svr'],
            'p1': row['p1'],
            'p2': row['p2'],
            'shot': shot,
            'shot_player': shot_player,
            'shot_type': shot_type,
            'rally_num': rally_num,
            'PtWinner': row['PtWinner'],
            'is_first_serve': first_serve_flag
        })

# Create new DataFrame with the shot-by-shot data
df = pd.DataFrame(shots_data)

In [None]:
# String detection
# Every column added here is a 0/1 flag derived from the text of df['shot'].

# Identify shot types: code letters grouped by the flag they set
fh_shots = ['f', 'r', 'v', 'u', 'o', 'l', 'j', 'h']
bh_shots = ['b', 's', 'z', 'p', 'y', 'm', 'i', 'k']
slices = ['r', 's']
volleys = ['v', 'j', 'h', 'z', 'i', 'k']
drop_shot = ['u', 'y']
overhead = ['o', 'p']
lob = ['l', 'm']

shot_text = df['shot']


def _flag(char):
    """Return a 0/1 Series: does the shot string contain the literal `char`?"""
    return shot_text.str.contains(char, na=False, regex=False).astype(int)


# Add in cols. for shot types (1 when the shot contains any code of the group)
shot_type_codes = {
    'fh': fh_shots,
    'bh': bh_shots,
    'slice': slices,
    'volley': volleys,
    'drop_shot': drop_shot,
    'overhead': overhead,
    'lob': lob,
}
for flag, codes in shot_type_codes.items():
    # codes=codes binds the current list; a bare closure would see the last one
    df[flag] = shot_text.apply(lambda x, codes=codes: 1 if any(char in x for char in codes) else 0)
df['trick_shot'] = _flag('t')
df['unknown_shot'] = _flag('q')

# Detect point end. The same marker means different things on a serve and on
# any other shot, so each flag combines a marker mask with the serve mask.
# Boolean masks replace the former row-wise df.apply calls.
on_serve = df['shot_type'] == 'serve'
has_star = shot_text.str.contains('*', na=False, regex=False)
has_hash = shot_text.str.contains('#', na=False, regex=False)
has_at = shot_text.str.contains('@', na=False, regex=False)
df['ace'] = (has_star & on_serve).astype(int)
df['unreturned_serve'] = (has_hash & on_serve).astype(int)
df['winner'] = (has_star & ~on_serve).astype(int)
df['forced_error'] = (has_hash & ~on_serve).astype(int)
df['unforced_error'] = (has_at & ~on_serve).astype(int)

# Single-character flags, in the original column order
literal_flags = {
    # Detect serve locs
    'serve_wide': '4',
    'serve_body': '5',
    'serve_t': '6',
    'serve_unknown': '0',
    # Detect error types
    'net': 'n',
    'wide': 'w',
    'deep': 'd',
    'deep_wide': 'x',
    'shank': '!',
    'unknown_error': 'e',
    # Detect court position
    'approach': '+',
    'at_net': '-',
    'at_baseline': '=',
    'stop_volley': '^',
    'drop_volley': '~',
    'net_cord': ';',
}
for flag, char in literal_flags.items():
    df[flag] = _flag(char)

In [None]:
# Get player handedness: exactly one row per player.
w_hand = data[['winner_name', 'winner_hand']].rename(columns={'winner_name': 'player', 'winner_hand': 'hand'})
l_hand = data[['loser_name', 'loser_hand']].rename(columns={'loser_name': 'player', 'loser_hand': 'hand'})
hand = pd.concat([w_hand, l_hand], ignore_index=True)
# This table is left-merged onto the shot rows by player name, so a player
# listed with two different hand values would duplicate every one of their
# opponents' shots. De-duplicate on the player alone, preferring rows with a
# definite hand ('R' or 'L' are the only values the direction flags test for).
known_hand = hand['hand'].isin(['R', 'L'])
hand = (
    pd.concat([hand[known_hand], hand[~known_hand]])
    .drop_duplicates(subset='player', keep='first')
    .reset_index(drop=True)
)

In [None]:
# Create a column for opponent: the player of the match who did not hit the
# shot. Vectorised equivalent of the former row-wise apply.
df['opp_player'] = np.where(df['shot_player'] == df['p1'], df['p2'], df['p1'])
# Join opponent with listed handedness (adds the 'player' and 'hand' columns;
# opponents missing from `hand` get NaN)
df = pd.merge(df, hand, left_on='opp_player', right_on='player', how='left')

In [None]:
# Detect groundstroke locations.
# The direction digit in the shot is combined with the opponent's hand:
# '1' counts as to_fh against "R" and to_bh against "L"; '3' is the mirror
# image; '2' is always to_middle. Any other hand value yields 0 for fh/bh.
has_1 = df['shot'].str.contains('1', na=False)
has_3 = df['shot'].str.contains('3', na=False)
opp_is_right = df['hand'] == "R"
opp_is_left = df['hand'] == "L"

df['to_fh'] = np.where((has_1 & opp_is_right) | (has_3 & opp_is_left), 1, 0)
df['to_middle'] = df['shot'].str.contains('2', na=False).astype(int)
df['to_bh'] = np.where((has_3 & opp_is_right) | (has_1 & opp_is_left), 1, 0)

In [None]:
# Build naïve bayes shot wp model (currently not accounting for specific player ability)
# Column-wise comparisons replace the former row-wise df.apply calls.
# If the shot player won the point (response variable)
df['player_win'] = (df['shot_player'] == df['PtWinner']).astype(int)
# If the shot player hit the serve
df['is_server'] = (df['shot_player'] == df['Svr']).astype(int)
# If the shot type is a serve
df['is_serve'] = (df['shot_type'] == 'serve').astype(int)

In [None]:
# Select model features from df for nb model, grouped by what they describe
point_info_cols = ['match_id', 'player_win', 'shot_player', 'is_server', 'is_first_serve', 'is_serve', 'rally_num']
serve_info_cols = ['serve_wide', 'serve_body', 'serve_t']
shot_kind_cols = ['fh', 'bh', 'slice', 'volley', 'drop_shot', 'overhead', 'lob']
shot_location_cols = ['to_fh', 'to_middle', 'to_bh']
court_position_cols = ['approach', 'at_net', 'at_baseline', 'stop_volley', 'drop_volley', 'net_cord']

model_features = df[
    point_info_cols + serve_info_cols + shot_kind_cols + shot_location_cols + court_position_cols
]

In [None]:
# Response variable (y) and predictors (X): everything except the response
# itself and the identifier / position columns.
non_predictor_cols = ['player_win', 'match_id', 'shot_player', 'rally_num']
response = model_features['player_win']
features = model_features.drop(columns=non_predictor_cols)
# Hold out 10% of the shots for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features, response, test_size=0.1, random_state=13210)
# Full rows (identifiers included) of the held-out shots
test_data = model_features.loc[X_test.index]

In [None]:
# Fit a Bernoulli Naïve Bayes classifier on the training shots
model = BernoulliNB()
model.fit(X_train, y_train)

# Classify the held-out shots and report the share classified correctly
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy * 100:.2f}')

Model Accuracy: 56.59


In [None]:
# Score the held-out shots: class-1 probability and hard class label
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)
# Attach both to the held-out rows
for new_col, values in (('predicted_win', y_pred), ('predicted_prob', y_prob)):
    test_data[new_col] = values

In [None]:
# Create serve error rate cols.
# A shot counts as a "<location>_error" when it carries that serve location
# flag AND any of the net / wide / deep / deep_wide error flags. Boolean masks
# replace the three row-wise df.apply calls.
has_error_flag = df[['net', 'wide', 'deep', 'deep_wide']].eq(1).any(axis=1)
for location in ('wide', 'body', 't'):
    df[f'{location}_error'] = (df[f'serve_{location}'].eq(1) & has_error_flag).astype(int)

In [None]:
# Create fh/bh error/winner cols.
# An error is either a forced or an unforced error; masks replace the four
# row-wise df.apply calls.
ended_in_error = df['unforced_error'].eq(1) | df['forced_error'].eq(1)
ended_in_winner = df['winner'].eq(1)
for wing in ('fh', 'bh'):
    df[f'{wing}_error'] = (df[wing].eq(1) & ended_in_error).astype(int)
for wing in ('fh', 'bh'):
    df[f'{wing}_winner'] = (df[wing].eq(1) & ended_in_winner).astype(int)

In [None]:
# Aggregate shot data by player
# Serves and all other shots are summarised separately
serves = df[df['shot_type'] == 'serve']
rallies = df[df['shot_type'] != 'serve']

serve_sum_cols = [
    'ace', 'unreturned_serve',
    'serve_wide', 'serve_body', 'serve_t',
    'wide_error', 'body_error', 't_error',
]
rally_sum_cols = [
    'fh', 'bh', 'slice', 'volley', 'drop_shot',
    'to_fh', 'to_middle', 'to_bh',
    'fh_error', 'bh_error', 'fh_winner', 'bh_winner',
    'winner', 'forced_error', 'unforced_error',
    'approach', 'at_net', 'at_baseline',
]


def _totals_by_player(shots, sum_cols):
    """Sum the 0/1 flag columns per shot_player and add `n`, the row count."""
    per_player = shots.groupby('shot_player')
    return per_player[sum_cols].sum().assign(n=per_player.size()).reset_index()


# Aggregate serve data by player
serve_data = _totals_by_player(serves, serve_sum_cols)
# Aggregate rally data by player
rally_data = _totals_by_player(rallies, rally_sum_cols)

In [None]:
# Collect per-player serve stats in a new serve_df
serve_df = pd.DataFrame()
serve_df['player'] = serve_data['shot_player']

# Rates per serve hit (n = number of serve rows for the player)
per_serve_rates = {
    'ace_rate': 'ace',
    'unreturned_serve_rate': 'unreturned_serve',
    'wide_freq': 'serve_wide',
    'body_freq': 'serve_body',
    't_freq': 'serve_t',
}
for rate_col, count_col in per_serve_rates.items():
    serve_df[rate_col] = serve_data[count_col] / serve_data['n']

# Accuracy per location = serves without an error flag / all serves there.
# serve_<loc> already counts every serve to that location, including the ones
# flagged <loc>_error (that flag requires serve_<loc> == 1), so errors are
# subtracted from the attempts rather than added to the denominator. This is
# the same form as fh_accuracy / bh_accuracy. NaN when there are no attempts.
for location in ('wide', 'body', 't'):
    attempts = serve_data[f'serve_{location}']
    serve_df[f'{location}_accuracy'] = (attempts - serve_data[f'{location}_error']) / attempts

In [None]:
# Collect per-player rally stats in a new rally_df
shots_hit = rally_data['n']  # number of non-serve shot rows per player
rally_df = pd.DataFrame({'player': rally_data['shot_player']})

# Share of the player's shots carrying each flag
for flag in ('fh', 'bh', 'slice', 'volley', 'drop_shot', 'to_fh', 'to_middle', 'to_bh'):
    rally_df[f'{flag}_rate'] = rally_data[flag] / shots_hit

# Per wing: share of shots not ending in an error, then share ending in a winner
for wing in ('fh', 'bh'):
    rally_df[f'{wing}_accuracy'] = (rally_data[wing] - rally_data[f'{wing}_error']) / (rally_data[wing])
for wing in ('fh', 'bh'):
    rally_df[f'{wing}_winner_rate'] = rally_data[f'{wing}_winner'] / rally_data[wing]

# How the player's shots ended, per shot hit
rally_df['winner_rate'] = rally_data['winner'] / shots_hit
rally_df['ue_rate'] = rally_data['unforced_error'] / shots_hit
rally_df['fe_rate'] = rally_data['forced_error'] / shots_hit
# Everything that was neither a forced nor an unforced error
rally_df['in_rate'] = 1 - (rally_df['ue_rate'] + rally_df['fe_rate'])
rally_df['approach_rate'] = rally_data['approach'] / shots_hit

In [None]:
def calculate_entropy(df, cols):
    """Row-wise Shannon entropy over `cols`, normalised by log2(len(cols)).

    Each row of ``df[cols]`` is scaled to sum to 1 and its entropy (base 2) is
    divided by the entropy of a uniform distribution over ``len(cols)``
    categories, so an even spread gives 1 and a single used category gives 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the count or frequency columns.
    cols : list of str
        Names of the category columns.

    Returns
    -------
    pandas.Series
        One value per row, indexed like ``df``. Rows whose values sum to 0
        yield NaN.
    """
    # Number of categories
    n = len(cols)

    # Sum the occurrences for each row
    total = df[cols].sum(axis=1)

    # Calculate probabilities. A zero row total gives 0/0 = NaN for the whole
    # row, which propagates to a NaN entropy. Zero probabilities are left as
    # they are: scipy's entropy counts them as contributing 0, whereas turning
    # them into NaN made the entropy of any row with an unused category NaN.
    probs = df[cols].div(total, axis=0)

    # Compute entropy row-wise and normalize
    return probs.apply(lambda x: entropy(x, base=2) / np.log2(n), axis=1)

# Apply entropy function to cols in the dataframe
serve_df['serve_loc_entropy'] = calculate_entropy(serve_df, ['wide_freq', 'body_freq', 't_freq'])
rally_df['shot_loc_entropy'] = calculate_entropy(rally_df, ['to_fh_rate', 'to_middle_rate', 'to_bh_rate'])
# Get shot mix entropy
# regular_rate is the share of shots that are neither slice, volley nor drop
# shot. The parentheses are required: without them only slice_rate was
# subtracted and the other two rates were added.
rally_df['regular_rate'] = 1 - (rally_df['slice_rate'] + rally_df['volley_rate'] + rally_df['drop_shot_rate'])
rally_df['shot_mix_entropy'] = calculate_entropy(rally_df, ['fh_rate', 'bh_rate', 'regular_rate', 'slice_rate', 'volley_rate', 'drop_shot_rate'])
# Forehands hit per backhand hit
rally_df['fh_to_bh_rate'] = rally_df['fh_rate']/rally_df['bh_rate']

In [None]:
# One row per player present in both tables: serve stats next to rally stats
cluster_data = serve_df.merge(rally_df, on='player')

In [None]:
# Aggregate match data by player
def _player_match_rows(match_rows, me, opp, res):
    """Return one row per match seen from one side of the result.

    Parameters
    ----------
    match_rows : pandas.DataFrame
        Match table with winner_* / loser_* and w_* / l_* columns.
    me, opp : str
        'winner' or 'loser': the side the rows describe and the other side.
    res : int
        Value stored in the 'res' column (1 for winners, 0 for losers).

    Returns
    -------
    pandas.DataFrame
        Own serve counts under neutral names, plus return-side counts derived
        from the opponent's serve columns.
    """
    m, o = me[0], opp[0]  # stat-column prefixes: 'w' / 'l'
    opp_first_in = match_rows[f'{o}_1stIn']
    opp_second_serve_pts = match_rows[f'{o}_svpt'] - opp_first_in
    opp_bp_faced = match_rows[f'{o}_bpFaced']
    return pd.DataFrame({
        'player': match_rows[f'{me}_name'],
        'ht': match_rows[f'{me}_ht'],
        'minutes': match_rows['minutes'],
        # Own serve numbers
        'ace': match_rows[f'{m}_ace'],
        'df': match_rows[f'{m}_df'],
        'svpt': match_rows[f'{m}_svpt'],
        'first_in': match_rows[f'{m}_1stIn'],
        'first_win': match_rows[f'{m}_1stWon'],
        'second_win': match_rows[f'{m}_2ndWon'],
        'svgms': match_rows[f'{m}_SvGms'],
        'bp_saved': match_rows[f'{m}_bpSaved'],
        'bp_faced': match_rows[f'{m}_bpFaced'],
        # Opponent
        'opp_name': match_rows[f'{opp}_name'],
        'opp_ht': match_rows[f'{opp}_ht'],
        # Return numbers, derived from the opponent's serve numbers
        'return_pts': match_rows[f'{o}_svpt'],
        'first_returns': opp_first_in,
        # Points won on return = points the opponent served minus points the
        # opponent won on serve (the *_1stWon / *_2ndWon columns are the
        # server's wins, as their use for first_win / second_win shows).
        'first_return_wins': opp_first_in - match_rows[f'{o}_1stWon'],
        'second_return_wins': opp_second_serve_pts - match_rows[f'{o}_2ndWon'],
        'return_gms': match_rows[f'{o}_SvGms'],
        # Break points the player had = break points the opponent faced;
        # the ones converted are those the opponent did not save.
        'bp_att': opp_bp_faced,
        'bp_won': opp_bp_faced - match_rows[f'{o}_bpSaved'],
        'res': res,
    })


# Get data split by winners
winners = _player_match_rows(data, 'winner', 'loser', 1)
# Get data split by losers
losers = _player_match_rows(data, 'loser', 'winner', 0)
# Aggregate winner/loser data: two rows per match, one per participant
match_data = pd.concat([winners, losers])

In [None]:
# Career totals per player, then percentages derived from those totals
summed_cols = [
    'minutes', 'ace', 'df', 'svpt', 'first_in', 'first_win', 'second_win',
    'svgms', 'bp_saved', 'bp_faced', 'opp_ht',
    'return_pts', 'first_returns', 'first_return_wins', 'second_return_wins',
    'return_gms', 'bp_att', 'bp_won',
]
stats = match_data.groupby('player')[summed_cols].sum().reset_index()

# Serve side
second_serve_pts = stats['svpt'] - stats['first_in']   # points that went to a second serve
second_serves_in = second_serve_pts - stats['df']      # ... minus the double faults
stats['first_pct'] = stats['first_in'] / stats['svpt']
stats['first_win_pct'] = stats['first_win'] / stats['first_in']
stats['second_pct'] = second_serves_in / second_serve_pts
stats['second_win_pct'] = stats['second_win'] / second_serves_in

# Return side
stats['first_return_win_pct'] = stats['first_return_wins'] / stats['first_returns']
stats['second_return_win_pct'] = stats['second_return_wins'] / (stats['return_pts'] - stats['first_returns'])

# Break points and games
stats['bp_saved_pct'] = stats['bp_saved'] / stats['bp_faced']
stats['bp_won_pct'] = stats['bp_won'] / stats['bp_att']
break_points_lost = stats['bp_faced'] - stats['bp_saved']
stats['svgm_win_pct'] = (stats['svgms'] - break_points_lost) / stats['svgms']
stats['retgm_win_pct'] = stats['bp_won'] / stats['return_gms']

In [None]:
# Combine aggregated shot data with aggregated match data for cluster model dataset
# Only the two serve-in percentages are taken from the match stats
serve_rates = stats[['player', 'first_pct', 'second_pct']]
# Left join keeps every player of cluster_data; rows with any missing value
# are then discarded
cluster_data = cluster_data.merge(serve_rates, on='player', how='left').dropna()

In [None]:
# Columns used to cluster players (the player name is kept as the row label)
cluster_feature_cols = [
    'ace_rate',
    'serve_loc_entropy', 'shot_loc_entropy', 'shot_mix_entropy',
    'fh_to_bh_rate',
    'winner_rate', 'in_rate', 'ue_rate', 'fe_rate',
]
cluster_features = cluster_data[['player'] + cluster_feature_cols]

In [None]:
# Build cluster model to get player types based on match and shot data
cluster_model_data = cluster_features.drop(columns=['player']).values

# Silhouette score of a K-Means fit for every k from 2 to 10
silhouette_scores = {}
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=13210, n_init=10)
    cluster_labels = kmeans.fit_predict(cluster_model_data)
    silhouette_scores[k] = silhouette_score(cluster_model_data, cluster_labels)

# Keep the k with the highest score; on a tie the smaller k wins because only
# a strictly better score replaces the current best
best_k, best_score = 2, -1
for k, score in silhouette_scores.items():
    if score > best_score:
        best_k, best_score = k, score

print(f'Optimal number of clusters: {best_k} with silhouette score: {best_score:.3f}')

In [None]:
# Refit K-Means with the selected number of clusters (more restarts than during the search)
final_kmeans = KMeans(n_clusters=best_k, random_state=13210, n_init=25)
cluster_labels_final = final_kmeans.fit_predict(cluster_model_data)
# Store each player's cluster label next to their stats
cluster_data['cluster'] = cluster_labels_final
# Player -> cluster lookup table
clusters = cluster_data[['player', 'cluster']]

In [None]:
# Attach the OPPONENT's cluster to every player-match row. Both tables have a
# 'player' column, so the merge yields player_x (the player of the row) and
# player_y (the opponent found in `clusters`).
match_data_with_clusters = match_data.merge(clusters, left_on='opp_name', right_on='player', how='left')
# Wins and matches played per (player, opponent cluster)
wins = (
    match_data_with_clusters
    .groupby(['player_x', 'cluster'])
    .agg(res=('res', 'sum'), n=('res', 'size'))
    .reset_index()
)
wins['win_pct'] = wins['res'] / wins['n']
cluster_win_pct = wins[['player_x', 'cluster', 'win_pct']].rename(columns={'player_x': 'player'})

In [None]:
# Create matches Data Frame (with selected cols.). .copy() keeps the column
# assignments below from writing into a slice of `data`.
matches = data[['tourney_id', 'tourney_name', 'tourney_date', 'winner_name', 'loser_name']].copy()
# Put the winner in the p1 slot for a random half of the matches and in the p2
# slot for the rest, so the target is not constant. The generator is seeded
# with the same value as every other random step in this notebook, which makes
# the assignment reproducible across runs.
rng = np.random.default_rng(13210)
mask = rng.random(len(matches)) < 0.5
matches['p1'] = np.where(mask, matches['winner_name'], matches['loser_name'])
matches['p2'] = np.where(mask, matches['loser_name'], matches['winner_name'])
# Target: 1 when p1 is the match winner
matches['p1_win'] = (matches['p1'] == matches['winner_name']).astype(int)
# Select cols. for Data Frame
matches = matches[['tourney_id', 'tourney_name', 'tourney_date', "p1", "p2", "p1_win"]]

In [None]:
# Keep only the player name and the derived percentages in stats
pct_cols = [
    'first_pct', 'first_win_pct', 'second_pct', 'second_win_pct',
    'first_return_win_pct', 'second_return_win_pct',
    'bp_saved_pct', 'bp_won_pct', 'svgm_win_pct', 'retgm_win_pct',
]
stats = stats[['player'] + pct_cols]

In [None]:
# Join p1 data first: match stats, then playing-style features. The key
# columns these merges add ('player', 'player_other') are kept because the
# following cells drop them by name.
stats_p1 = pd.merge(matches, stats, left_on='p1', right_on='player', how='left', suffixes=("", "_other"))
stats_p1 = pd.merge(stats_p1, cluster_features, left_on='p1', right_on='player', how='left', suffixes=("", "_other"))
# Then the same two tables for p2. Previously stats_p2 was built but never
# used, and its style features were joined on 'p1' again, so no p2 feature
# reached rf_data. Every p2 column gets a '_p2' suffix so it cannot collide
# with p1's, and the redundant text key is dropped right after each join so
# that only numeric p2 columns are added.
stats_p2 = (
    stats_p1
    .merge(stats.add_suffix('_p2'), left_on='p2', right_on='player_p2', how='left')
    .drop(columns='player_p2')
    .merge(cluster_features.add_suffix('_p2'), left_on='p2', right_on='player_p2', how='left')
    .drop(columns='player_p2')
)
# Apply cluster results: p2's cluster label (key column arrives as 'player2')
cluster_matches = pd.merge(stats_p2, clusters, left_on = ['p2'], right_on = ["player"], how = 'left', suffixes=("", "2"))
# NOTE(review): this looks up p2's win pct against p2's OWN cluster; p1's win
# pct against p2's cluster (left_on=['p1', 'cluster']) may have been intended -
# confirm before changing.
rf_data = pd.merge(cluster_matches, cluster_win_pct, left_on = ['p2', 'cluster'], right_on = ["player", "cluster"], how = 'left', suffixes=("", "_other"))
# Keep only complete, finite rows
rf_data = rf_data.dropna()
rf_data = rf_data[~rf_data.isin([np.inf, -np.inf]).any(axis=1)]

In [None]:
# Display rf_data without the player-name and target columns; the result is
# only shown, rf_data itself is not modified.
preview_drop_cols = ["p1", "p2", "player", "p1_win", "player2", "player_other"]
rf_data.drop(columns=preview_drop_cols)

In [None]:
# Build Random Forest match win classification model
# TODO: apply k-fold cv

# Target (y): did p1 win. Features (X): every column that is not an
# identifier, a player name or the target itself.
non_feature_cols = ["tourney_name", "tourney_id", "tourney_date", "p1", "p2", "player", "p1_win", "player2", "player_other"]
y = rf_data['p1_win']
X = rf_data.drop(columns=non_feature_cols)

# Hold out 20% of the matches for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13210)

# Full rows of the held-out matches, identifiers included
test_data = rf_data.loc[X_test.index]

# Initialize and train the model
model = RandomForestClassifier(n_estimators=100, random_state=13210)
model.fit(X_train, y_train)

In [None]:
# Score the held-out matches
y_prob = model.predict_proba(X_test)[:, 1]  # probability of class 1 (p1 wins)
y_pred = model.predict(X_test)              # hard class label

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)    # correct / all
precision = precision_score(y_test, y_pred)  # TP / (TP + FP)
recall = recall_score(y_test, y_pred)        # TP / (TP + FN)
f1 = f1_score(y_test, y_pred)                # 2 * precision * recall / (precision + recall)

print(f"Model Accuracy: {accuracy * 100:.2f}%")
print(f"Model Precision: {precision * 100:.2f}%")
print(f"Model Recall: {recall * 100:.2f}%")
print(f"Model F1: {f1 * 100:.2f}%")

In [None]:
# Confusion matrix of the held-out matches: rows are actual classes, columns
# are predicted classes
cm = confusion_matrix(y_test, y_pred)

# Draw it as an annotated heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')

In [None]:
# Attach the predicted win probabilities to the held-out matches:
# p1's is the model's class-1 probability, p2's is the complement.
test_data['p1_wp'] = y_prob
test_data['p2_wp'] = 1 - test_data['p1_wp']
# Show who played, who won and what the model predicted
display_cols = ['tourney_name', 'tourney_date', 'p1', 'p2', 'p1_win', 'p1_wp', 'p2_wp']
test_data[display_cols]