# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
from scipy.stats import linregress
import warnings

import os
# Kaggle boilerplate: print the full path of every file found under the input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Directory the match CSVs are read from; file names are appended by string concatenation, so the trailing slash is required
path = "/kaggle/input/tennis-atp-tour-australian-open-final-2019/"
# Directory the generated training/test CSVs are written to (same trailing-slash requirement)
output_path = "/kaggle/working/"

# NOTE(review): this silences every warning for the rest of the session, including pandas chained-assignment warnings
warnings.filterwarnings("ignore")

In [2]:
# Import all data
# Every table is read from `path` and its "Unnamed: 0" column is dropped right after loading
events, points, serves, rallies = (
    pd.read_csv(path + table_name + ".csv").drop("Unnamed: 0", axis = 1)
    for table_name in ("events", "points", "serves", "rallies")
)

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/tennis-atp-tour-australian-open-final-2019/events.csv'

# Data Cleaning, Feature Engineering

In [None]:
def get_true_rally_times(events_data, points_data = points):
    """
    Purpose: Replace the totaltime column of the points dataset with rally durations measured from the events dataset

    Input(s):
        events_data (pd.DataFrame): Event-level data; must provide the rallyid and time columns
        points_data (pd.DataFrame): Rally-level data; must provide the rallyid and totaltime columns
            (defaults to the module-level points frame as it was bound when this function was defined)

    Output(s):
        points_data_clean (pd.DataFrame): points dataset whose totaltime is the latest minus the earliest event time
            of each rally; rallies without a matching event and rally 101 are not part of the result
    """

    # Duration of a rally = latest event time minus earliest event time
    event_times = events_data.groupby("rallyid").time
    rally_durations = (event_times.max() - event_times.min()).rename("elapsed_time").reset_index()

    # Inner join on the shared column(s), then let the measured duration take over the totaltime name
    joined = pd.merge(points_data, rally_durations)
    joined = joined.drop("totaltime", axis = 1).rename(columns = {"elapsed_time": "totaltime"})

    # Rally 101 is excluded from the cleaned points
    points_data_clean = joined[joined.rallyid != 101]

    return points_data_clean

In [None]:
def clean_rally_hitter_receiver_names(events_data):
    """
    Purpose: Repair missing, self-referencing and "__undefined__" receiver values in the events dataset

    Input(s):
        events_data (pd.DataFrame): Contains event-level data for each rally

    Output(s):
        events_data_clean (pd.DataFrame): events dataset restricted to the relevant columns, with the receiver column repaired
            and the hitter column carried over unchanged
    """

    opponent_of = {"Nadal": "Djokovic", "Djokovic": "Nadal"}

    # Manually reviewed strokes as (rallyid, strokeid, hitter); the receiver is always the other player
    reviewed_strokes = [
        (81, 3, "Nadal"), (110, 7, "Djokovic"), (115, 8, "Nadal"), (155, 8, "Djokovic"), (156, 4, "Nadal"),
        (12, 1, "Nadal"), (21, 1, "Nadal"), (29, 1, "Nadal"), (135, 7, "Nadal"), (149, 2, "Djokovic"),
        (152, 11, "Nadal"), (154, 14, "Djokovic"), (159, 1, "Djokovic"), (160, 1, "Djokovic"), (160, 2, "Nadal"),
        (160, 3, "Djokovic"), (161, 7, "Djokovic"), (168, 5, "Nadal"), (175, 2, "Nadal"), (176, 8, "Nadal"),
        (194, 13, "Djokovic"), (196, 1, "Djokovic"), (196, 2, "Nadal"), (196, 3, "Djokovic"), (196, 4, "Nadal"),
    ]
    cleaned_rally_names = pd.DataFrame(
        [(rally, stroke, hitter, opponent_of[hitter]) for rally, stroke, hitter in reviewed_strokes],
        columns=["rallyid", "strokeid", "hitter", "receiver"],
    )

    # Left join keeps every event; the raw names get the _old suffix, the reviewed names the _new suffix
    events_merge = events_data.merge(cleaned_rally_names, how="left", on=["rallyid", "strokeid"], suffixes=("_old", "_new"))

    # The hitter is taken from the raw data as-is
    # NOTE(review): the reviewed hitter names (hitter_new) are never applied - confirm this is intended
    events_merge["hitter"] = events_merge["hitter_old"]

    # A receiver that equals the hitter or is missing is replaced by the reviewed receiver (NaN when no review exists)
    receiver_is_unusable = (events_merge["hitter_old"] == events_merge["receiver_old"]) | events_merge["receiver_old"].isna()
    events_merge["receiver"] = np.where(receiver_is_unusable, events_merge["receiver_new"], events_merge["receiver_old"])

    # An "__undefined__" receiver becomes the opponent of the hitter whenever the hitter is one of the two players
    receiver_is_undefined = events_merge["receiver"] == "__undefined__"
    hitter_is_known = events_merge["hitter"].isin(list(opponent_of))
    events_merge["receiver"] = np.where(
        receiver_is_undefined & hitter_is_known,
        events_merge["hitter"].map(opponent_of),
        events_merge["receiver"],
    )

    # Return relevant columns
    return_cols = ["rallyid", "frameid", "strokeid", "hitter", "receiver", "isserve", "serve", "type", "stroke",
                   "hitter_x", "hitter_y", "receiver_x", "receiver_y", "time"]

    return events_merge[return_cols]

In [None]:
def clean_bad_event_data_info(events_data):
    """
    Purpose: Clean events dataset to correct found data issues in multiple columns

    Input(s):
        events_data (pd.DataFrame): Contains event-level data for each rally; must provide the
            rallyid, strokeid, frameid and stroke columns. It is not modified.

    Output(s):
        events_data_clean (pd.DataFrame): independent copy of the events dataset in which
            - every event of rally 5 and stroke 1 of rally 11 are removed
            - frames 157969 and 157982 are renumbered to strokes 16 and 17
            - rally 101 is relabelled as rally 100
            - "__undefined__" strokes are set to "forehand"
    """

    # Remove data that is an error upon review
    rows_to_keep = (events_data.rallyid != 5) & ~((events_data.rallyid == 11) & (events_data.strokeid == 1))

    # Take an explicit copy: the assignments below must land on a frame of our own, not on a
    # slice of the caller's data (which triggers pandas chained-assignment warnings)
    events_data_clean = events_data[rows_to_keep].copy()

    # Change rally 101 strokes, rally id
    # The stroke ids are patched by frame id first, then the rally itself is folded into rally 100
    events_data_clean.loc[events_data_clean.frameid == 157969, "strokeid"] = 16
    events_data_clean.loc[events_data_clean.frameid == 157982, "strokeid"] = 17

    events_data_clean.loc[events_data_clean.rallyid == 101, "rallyid"] = 100

    # Change any undefined stroke column values
    events_data_clean.loc[events_data_clean.stroke == "__undefined__", "stroke"] = "forehand"

    return events_data_clean

def clean_bad_event_data_locations(events_data):
    """
    Purpose: Move implausible player locations on serve events (e.g. a returner standing inside the court) to fixed positions

    Input(s):
        events_data (pd.DataFrame): Contains event-level data for each rally; must provide the
            isserve, serve, receiver_y and hitter_y columns. The frame is modified in place.

    Output(s):
        events_data (pd.DataFrame): the same frame, with receiver_y and hitter_y overwritten on the affected serve events
    """

    # y-bands that count as bad on a serve: strictly between 12 and 22, or strictly between 1 and 12
    receiver_in_upper_band = (events_data.receiver_y > 12) & (events_data.receiver_y < 22)
    receiver_in_lower_band = (events_data.receiver_y > 1) & (events_data.receiver_y < 12)

    on_first_serve = (events_data.isserve) & (events_data.serve == "first")
    on_second_serve = (events_data.isserve) & (events_data.serve == "second")

    # Receiver: the replacement depends on the band and on the serve number
    receiver_rules = (
        (on_first_serve & receiver_in_lower_band, -4),
        (on_second_serve & receiver_in_lower_band, -0.2),
        (on_first_serve & receiver_in_upper_band, 28.0),
        (on_second_serve & receiver_in_upper_band, 24.0),
    )

    fixed_receiver_y = events_data.receiver_y
    for rule_applies, replacement in receiver_rules:
        fixed_receiver_y = np.where(rule_applies, replacement, fixed_receiver_y)

    events_data["receiver_y"] = fixed_receiver_y

    # Hitter (the server): the replacement depends on the band only
    hitter_in_upper_band = (events_data.hitter_y > 12) & (events_data.hitter_y < 22)
    hitter_in_lower_band = (events_data.hitter_y > 1) & (events_data.hitter_y < 12)

    fixed_hitter_y = np.where((events_data.isserve) & hitter_in_lower_band, -0.2, events_data.hitter_y)
    fixed_hitter_y = np.where((events_data.isserve) & hitter_in_upper_band, 24.0, fixed_hitter_y)

    events_data["hitter_y"] = fixed_hitter_y

    return events_data

def clean_event_data(events_data):
    """
    Purpose: Run the event-level fixes followed by the location fixes to fully clean the event data

    Input(s):
        events_data (pd.DataFrame): Contains event-level data for each rally

    Output(s):
        event_data_cleaned_names_locations (pd.DataFrame): events dataset with data issues fixed
    """

    # Bad rows/values are handled first; the location fixes then run on that result
    return clean_bad_event_data_locations(clean_bad_event_data_info(events_data))

In [None]:
def create_new_features_multioutput_model(events_data):
    """
    Purpose: Add the features needed by the regression model to the events dataset

    Input(s):
        events_data (pd.DataFrame): Contains event data prior to transformation; must provide the
            time, strokeid, isserve, serve, type and stroke columns. The frame is modified in place.

    Output(s):
        events_data (pd.DataFrame): the same frame with
            - time_diff: time since the previous row, 0 for every stroke with strokeid 1
            - type_of_shot: "first_serve" / "second_serve" for serves, otherwise the value of type
            - type_of_shot and stroke converted to the category dtype
    """
    # Time elapsed since the previous row; the first stroke of a rally has no previous hit
    opens_rally = events_data["strokeid"] == 1
    events_data["time_diff"] = events_data["time"].diff()
    events_data.loc[opens_rally, "time_diff"] = 0

    # Serves are split into first and second serve, every other event keeps its type
    serve_event = (events_data.isserve) & (events_data["type"] == "serve")

    shot_label = events_data["type"]
    for serve_number, label in (("second", "second_serve"), ("first", "first_serve")):
        shot_label = np.where(serve_event & (events_data.serve == serve_number), label, shot_label)

    events_data["type_of_shot"] = shot_label

    # Categorical columns are stored with the category dtype
    events_data["type_of_shot"] = events_data["type_of_shot"].astype("category")
    events_data["stroke"] = events_data["stroke"].astype("category")

    return events_data

In [None]:
def perform_ohc_transformation(data, cols_to_encode, one_hot_encoder = None):
    """
    Purpose: With raw dataset, transform input columns with one-hot encoder

    Input(s): 
        data (pd.DataFrame): Raw dataset
        cols_to_encode (list): contains names of column in input
        one_hot_encoder (NoneType or Sklearn object): default None. When None, a new encoder is created and
            fitted on data; otherwise the given (already fitted) encoder is only applied

    Output(s):
        preprocessed_data (NumPy array): Dataset with the encoded columns removed and their one-hot columns appended,
            one row per row of data
        one_hot_encoder (Sklearn object): Newly created or existing OHC object
    """

    # Check if OHC object exists
    if one_hot_encoder is None:
        # Initialize OHC Object with dense output
        # scikit-learn renamed `sparse` to `sparse_output` and later removed the old keyword,
        # so try the current name first and fall back for older installations
        try:
            one_hot_encoder = OneHotEncoder(sparse_output=False)
        except TypeError:
            one_hot_encoder = OneHotEncoder(sparse=False)

        # Fit the encoder to the specified columns and encode them
        encoded_columns = one_hot_encoder.fit_transform(data[cols_to_encode])

    else:
        # Apply the existing encoder without refitting it
        encoded_columns = one_hot_encoder.transform(data[cols_to_encode])
        
    # Convert encoded columns to DataFrame
    # Reuse the index of data: concat aligns on index labels, so a fresh 0..n-1 index would
    # misalign (and pad with NaN) whenever data does not carry a default index
    encoded_cols = pd.DataFrame(
        encoded_columns,
        columns=one_hot_encoder.get_feature_names_out(cols_to_encode),
        index=data.index,
    )
        
    # Concatenate encoded columns with the original DataFrame
    preprocessed_data = np.array(pd.concat([data.drop(columns=cols_to_encode), encoded_cols], axis=1))    
    
    return(preprocessed_data, one_hot_encoder)

In [None]:
# Execute functions to clean events and points data
# Receiver names are repaired first; bad rows/values and serve locations are then fixed on that result
events_clean_names = clean_rally_hitter_receiver_names(events)
events_data_clean_evts = clean_event_data(events_clean_names)

# Set new points, events data to cleaned versions
# points_data is not passed here, so get_true_rally_times uses its default (the points frame)
points_clean = get_true_rally_times(events_data_clean_evts)
events_clean = events_data_clean_evts

In [None]:
#  Perform data transformations for multioutput model dataset
events_multioutput_model_data = create_new_features_multioutput_model(events_clean)

# Columns used by the model: the features plus the two target columns (receiver location)
model_data_cols = ["strokeid", "type_of_shot", "stroke", "time_diff", "hitter_x", "hitter_y", "receiver_x", "receiver_y"]
y_cols = ["receiver_x", "receiver_y"]

# Events with a known receiver location train the model; events without one are the rows to predict
receiver_location_known = events_multioutput_model_data.receiver_x.notna()

train_data = events_multioutput_model_data.loc[receiver_location_known, model_data_cols].reset_index(drop=True)
train_X = train_data.drop(columns = y_cols)
train_y = train_data[y_cols]

predict_data = events_multioutput_model_data.loc[~receiver_location_known, model_data_cols].reset_index(drop=True)
predict_X = predict_data.drop(columns = y_cols)

# Specify columns to be encoded
columns_to_encode = ['type_of_shot', 'stroke']

# One-hot encode the categorical features; the fitted encoder is kept for the prediction rows
X_tr, encoder = perform_ohc_transformation(train_X, columns_to_encode)
y = train_y.to_numpy(copy=True)

In [None]:
# Initialize RandomForest Regressor Model
multioutput_rforest = MultiOutputRegressor(RandomForestRegressor(min_samples_split = 4, max_depth = 4, random_state = 64))

# Fit on every event whose receiver location is known
multioutput_rforest.fit(X_tr, y)

# On dataset with missing values, transform categorical columns with the encoder fitted on the training rows
predict_X_tr, encoder = perform_ohc_transformation(predict_X, columns_to_encode, one_hot_encoder = encoder)

predict_data[["receiver_x", "receiver_y"]] = multioutput_rforest.predict(predict_X_tr)

# Using predictions, replace missing locations with predictions.
# predict_data holds exactly the events with a missing receiver location, in their original order, so the
# predictions are written back positionally. Re-joining them with a merge on the shared feature columns
# (several of them floats) would duplicate rows whenever two events share identical feature values.
missing_receiver_location = events_clean.receiver_x.isna()
imputed_location_event_data = events_clean[missing_receiver_location].copy()

if len(imputed_location_event_data) != len(predict_data):
    raise ValueError("predict_data does not line up with the events that are missing a receiver location")

imputed_location_event_data[y_cols] = predict_data[y_cols].to_numpy()

nonimputed_location_event_data = events_clean[~missing_receiver_location]

# Concat data with and without imputed locations
clean_dataset_cols = ["rallyid", "strokeid", "hitter", "receiver", "stroke", "type_of_shot", "hitter_x", "hitter_y", "receiver_x", "receiver_y", "time_diff"]

# sort_index puts the events back in their original order, which the concat would otherwise
# break by listing all imputed rows first
events_clean_final = pd.concat([imputed_location_event_data, nonimputed_location_event_data]).sort_index()[clean_dataset_cols]

## Location Data Transformations

In [None]:
def transform_player_loc_data(events_data, points_data):
    """
    Purpose: Express every location as server/receiver coordinates and, whenever the server is above y = 11.89,
             reflect both players through x = 5.49 and y = 11.89 so the server always ends up below that line

    Input(s):
        events_data (pd.DataFrame): DataFrame containing the location data for players
            (rallyid, hitter, receiver, hitter_x, hitter_y, receiver_x, receiver_y)
        points_data (pd.DataFrame): DataFrame containing the identity of the server for each point
            (rallyid, server, reason)

    Output(s):
        events_data (pd.DataFrame): New DataFrame in which the hitter/receiver name and location columns are replaced by
            server_x, server_y, receiver_x, receiver_y. Events of rallies that ended with an ace or a double fault,
            and events whose rally is not present in points_data, are not included
    """

    # Reflection parameters
    y_reflect_line = 11.89
    x_reflect_line = 5.49

    # Attach the server of each rally; points decided by an ace or double fault are left out (inner join)
    rallies_in_play = points_data[~points_data.reason.isin(["ace", "double_fault"])]
    events_data = pd.merge(events_data, rallies_in_play[["rallyid", "server"]], on = "rallyid")

    # Re-express hitter/receiver locations as server/receiver-of-serve locations
    hit_by_server = events_data.hitter == events_data.server
    hit_by_returner = events_data.hitter != events_data.server

    server_x = np.where(hit_by_server, events_data.hitter_x, events_data.receiver_x)
    server_y = np.where(hit_by_server, events_data.hitter_y, events_data.receiver_y)
    returner_x = np.where(hit_by_returner, events_data.hitter_x, events_data.receiver_x)
    returner_y = np.where(hit_by_returner, events_data.hitter_y, events_data.receiver_y)

    # Both players are reflected together, and only when the server stands above the reflection line
    server_above_line = server_y > y_reflect_line

    def reflect_if_needed(values, line):
        return np.where(server_above_line, 2 * line - values, values)

    # The raw name/location columns are superseded by the standardized ones
    events_data = events_data.drop(
        ["hitter", "receiver", "server", "hitter_x", "hitter_y", "receiver_x", "receiver_y"], axis = 1
    )

    events_data["server_x"] = reflect_if_needed(server_x, x_reflect_line)
    events_data["server_y"] = reflect_if_needed(server_y, y_reflect_line)
    events_data["receiver_x"] = reflect_if_needed(returner_x, x_reflect_line)
    events_data["receiver_y"] = reflect_if_needed(returner_y, y_reflect_line)

    return events_data

In [None]:
def create_distance_measures_from_baseline(events_data):
    """
    Purpose: Add each player's straight-line distance to the center of a baseline: the point (10.97 / 2, 0) for the
             server and the point (10.97 / 2, 2 * 11.89) for the receiver

    Input(s):
        events_data (pd.DataFrame): Contains event data with transformed location data for server/receiver for each point
            (server_x, server_y, receiver_x, receiver_y). The frame is modified in place.

    Output(s):
        events_data (pd.DataFrame): the same frame with server_distance_from_baseline and
            receiver_distance_from_baseline added
    """

    # Court dimensions used to locate the two baseline centers
    height_court = 10.97
    width_court = 11.89*2
    baseline_center_x = height_court / 2

    def euclidean_distance_to(x_column, y_column, target_y):
        # L2 norm of the offset between a player's location and (baseline_center_x, target_y)
        offset_x = events_data[x_column].to_numpy() - baseline_center_x
        offset_y = events_data[y_column].to_numpy() - target_y
        return np.sqrt(offset_x * offset_x + offset_y * offset_y)

    # The server is measured against the baseline at y = 0, the receiver against the opposite one
    events_data["server_distance_from_baseline"] = euclidean_distance_to("server_x", "server_y", 0)
    events_data["receiver_distance_from_baseline"] = euclidean_distance_to("receiver_x", "receiver_y", width_court)

    return events_data

In [None]:
def create_distance_measures_from_previous_hit(events_data):
    """
    Purpose: Add the location each player had at the previous hit of the same rally, using their transformed location data

    Input(s):
        events_data (pd.DataFrame): Contains event data with transformed location data for server/receiver for each point
            (strokeid, server_x, server_y, receiver_x, receiver_y and, normally, rallyid). The frame is modified in place.

    Output(s):
        events_data (pd.DataFrame): the same frame, rows in their original order, with server_x_prev, server_y_prev,
            receiver_x_prev and receiver_y_prev added. The values are NaN for the first available stroke of a rally
            and for every stroke with strokeid 1
    """

    location_cols = ["server_x", "server_y", "receiver_x", "receiver_y"]

    if "rallyid" in events_data.columns:
        # Look the previous hit up within the same rally, in stroke order. A plain shift over the rows would
        # pick up an unrelated event whenever the rows are not sorted by rally and stroke, or when a rally
        # does not start at stroke 1 (its first row would inherit the last location of the previous rally).
        positional = events_data[["rallyid", "strokeid"] + location_cols].reset_index(drop=True)

        # Stable sort by rally, then stroke; the index labels of `ordered` are the original row positions
        order = np.lexsort((positional["strokeid"].to_numpy(), positional["rallyid"].to_numpy()))
        ordered = positional.iloc[order]

        # Shift inside each rally, then put the rows back into their original positions
        previous = ordered.groupby("rallyid", sort=False)[location_cols].shift(1).sort_index()
    else:
        # Without a rally identifier only the row order is available
        previous = events_data[location_cols].shift(1).reset_index(drop=True)

    # Set all previous locations for serve strokes to NA
    previous.loc[(events_data["strokeid"] == 1).to_numpy()] = np.nan

    # Create previous location variables (assigned positionally, so the index of events_data is irrelevant)
    for col in location_cols:
        events_data[col + "_prev"] = previous[col].to_numpy()

    return events_data

In [None]:
# Perform feature additions/transformations using location data
# 1) standardize locations to server/receiver coordinates, 2) add distances to the baseline centers,
# 3) add the previous-hit location columns
events_data_transformed = transform_player_loc_data(events_clean_final, points_clean)
events_data_transformed = create_distance_measures_from_baseline(events_data_transformed)
events_data_transformed = create_distance_measures_from_previous_hit(events_data_transformed)

# Notebook display of the first 20 rows
events_data_transformed.head(20)

# Dataset Creation

In [None]:
# Baseline Model ("Historical Average")
# reference: https://github.com/JeffSackmann/tennis_MatchChartingProject/blob/master/charting-m-points-2010s.csv
# NOTE(review): the rate 0.6423463 is hard-coded here; presumably it was computed offline from the referenced file - confirm
print(f"% of Points Won by Server (2010-2019): {round(0.6423463 * 100, 1)}", )

In [None]:
# Target outcome: 1 when the player on serve also won the point, 0 otherwise
server_lost_point = points_clean.server != points_clean.winner
points_clean["ServerWinsPoint"] = np.where(server_lost_point, 0, 1)

# Attach the target to every event of the rally (joined on the columns both frames share)
rally_outcomes = points_clean[['rallyid', 'ServerWinsPoint']]
event_model_dataset = events_data_transformed.merge(rally_outcomes)

event_model_dataset.head(20)

In [None]:
# Sequential Model
# Rallies with an id below 169 form the training set
first_test_rally = 169
model_train_set = event_model_dataset[event_model_dataset.rallyid < first_test_rally]

# Rallies from id 169 onward are held out as the test set
# (intended to cover the last 2 games on serve of each player)
model_test_set = event_model_dataset[event_model_dataset.rallyid >= first_test_rally]

# Write both sets to the output directory without the index column
for file_name, dataset in (("training_set.csv", model_train_set), ("test_set.csv", model_test_set)):
    dataset.to_csv(output_path + file_name, index = False)