In [None]:
import numpy as np
import pandas as pd
import warnings

import os
# Print the full path of every file found under the Kaggle input directory
for folder, _subfolders, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

from sklearn.model_selection import KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import optuna
import xgboost as xgb
from xgboost import plot_importance
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import log_loss, brier_score_loss

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model

# Silence library warnings for the whole notebook
warnings.simplefilter("ignore")

# Input data location and the directory where artifacts are written
path = "/kaggle/input/xspaa-data/"
output_path = "/kaggle/working/"

# Seed NumPy and TensorFlow with the same value
for seed_everything in (np.random.seed, tf.random.set_seed):
    seed_everything(909)

In [None]:
# Load the stroke-level train/test sets and the point-level table from `path`
train_data = pd.read_csv(f"{path}training_set.csv")
test_data = pd.read_csv(f"{path}test_set.csv")
points_data = pd.read_csv(f"{path}points_data.csv")

In [None]:
# Feature columns fed to the XGBoost model
xgb_model_data_cols = [
    'stroke',
    'type_of_shot',
    'server_x',
    'server_y',
    'receiver_x',
    'receiver_y',
]

# 'stroke' and 'type_of_shot' are cast to pandas categoricals; the rest keep their dtype
train_X = train_data[xgb_model_data_cols].astype(
    dict.fromkeys(["stroke", "type_of_shot"], 'category')
)

# Label column: one value per stroke row
train_y = train_data["ServerWinsPoint"]

# Baseline

In [None]:
# Constant probability the baseline assigns to "server wins the point"
# NOTE(review): presumably the server win rate observed in training data -- confirm the source of 0.642
baseline_pred_proba = 0.642

# Work on a copy so the baseline never touches test_data itself
baseline_test_data = test_data.copy()

# One row per distinct (rallyid, ServerWinsPoint) pair, carrying the hard
# prediction (rounded probability) and the probability itself
test_rally_outcomes = (
    baseline_test_data[["rallyid", "ServerWinsPoint"]]
    .drop_duplicates()
    .assign(pred=round(baseline_pred_proba), prob=baseline_pred_proba)
)

In [None]:
# Calculate Log Loss and Brier Score for points.
# Both metrics are scored on the predicted probability ("prob"). Passing the
# rounded hard label ("pred") to log_loss would treat it as a probability of
# exactly 0 or 1 and make the score incomparable with the other models.
baseline_outcome_lloss = log_loss(test_rally_outcomes["ServerWinsPoint"], test_rally_outcomes["prob"])
baseline_outcome_brier = brier_score_loss(test_rally_outcomes["ServerWinsPoint"], test_rally_outcomes["prob"])

print(f"Baseline Outcome Model Log Loss: {baseline_outcome_lloss}")
print(f"Baseline Outcome Model Brier Score: {baseline_outcome_brier}")

In [None]:
# xSPW per rally = actual outcome minus predicted probability
test_rally_outcomes["xSPW"] = test_rally_outcomes["ServerWinsPoint"].sub(test_rally_outcomes["prob"])

# Attach the server of each rally, then total xSPW per server
baseline_rallies_with_server = points_data[["rallyid", "server"]].merge(test_rally_outcomes)
baseline_xSPW = baseline_rallies_with_server.groupby("server")["xSPW"].sum().reset_index()
baseline_xSPW

# XGBoost

In [None]:
def objective(trial):
    """
    Purpose: Create objective function to optimize model hyperparameters using Optuna.

    Input(s):
        trial (optuna.trial.Trial): A trial object that suggests hyperparameters.

    Output(s):
        cv_logloss (float): Mean log loss across the 5 cross-validation folds
            (positive value; lower is better, so the study should minimize it).
    """

    # Define the parameter space for XGBClassifier
    param = {
        "verbosity": 0,
        # Logistic objective so predict_proba returns probabilities; a
        # squared-error objective is a regression loss and does not match the
        # log-loss scoring used below or the default objective of the final model.
        "objective": "binary:logistic",
        "booster": "gbtree",
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 9),
        "eta": trial.suggest_float("eta", 0.01, 0.3, log=True),
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
    }

    # Build the classifier for this trial. `use_label_encoder` is deprecated
    # in XGBoost and is not needed here.
    # NOTE(review): assumes train_y is already encoded as 0/1 -- confirm.
    model = xgb.XGBClassifier(**param, eval_metric='logloss', enable_categorical=True)

    # Set up KFold cross-validation; sklearn reports log loss negated
    kf = KFold(n_splits=5, shuffle=True, random_state=909)
    logloss_scores = cross_val_score(model, train_X, train_y, scoring='neg_log_loss', cv=kf, n_jobs=-1)

    # Flip the sign back so the returned value is a plain (positive) log loss
    cv_logloss = -np.mean(logloss_scores)
    return cv_logloss

# Run the optimization study with direction to minimize the objective function.
# TPE is Optuna's default sampler; seeding it makes the sequence of suggested
# hyperparameters repeatable across kernel restarts.
study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=909))
study.optimize(objective, n_trials=50)

print(f"Best Hyperparameters: {study.best_params}")
best_params = study.best_params

In [None]:
# Train final model with best params.
# The objective is stated explicitly (it is not part of study.best_params) and
# the deprecated `use_label_encoder` flag is dropped.
# NOTE(review): assumes train_y is already encoded as 0/1 -- confirm.
xgb_model = xgb.XGBClassifier(**best_params, objective="binary:logistic", enable_categorical=True)
xgb_model.fit(train_X, train_y)

# Feature importance by number of splits that use each feature
plot_importance(xgb_model, importance_type='weight')

In [None]:
# Repeat process for XGBoost, predicting only with the final stroke of each rally
test_data_final_stroke = test_data.groupby("rallyid").strokeid.max().reset_index()

# Keep only the final-stroke rows. Features and labels are both taken from this
# one frame so that row i of test_X_xgb and row i of test_y_xgb always describe
# the same rally.
test_final_strokes = pd.merge(test_data, test_data_final_stroke, on=["rallyid", "strokeid"])
assert test_final_strokes["rallyid"].is_unique, "expected exactly one final stroke per rally"

# Reuse the training categorical dtypes: casting the test columns on their own
# derives the category list from the test values, so the integer codes can
# differ from the ones the model was trained on.
xgb_categorical_dtypes = {col: train_X[col].dtype for col in ("stroke", "type_of_shot")}
test_X_xgb = test_final_strokes[xgb_model_data_cols].astype(xgb_categorical_dtypes)
test_y_xgb = test_final_strokes[["rallyid", "ServerWinsPoint"]].reset_index(drop = True)

test_y_outcomes = test_data.ServerWinsPoint
pred_y = xgb_model.predict(test_X_xgb)
pred_proba_y = xgb_model.predict_proba(test_X_xgb)

In [None]:
# Calculate Log Loss and Brier Score.
# Both metrics are scored on the predicted probability of the positive class
# (column 1 of predict_proba); log loss on hard 0/1 predictions is not a
# meaningful probabilistic score.
xgboost_lloss = log_loss(test_y_xgb.ServerWinsPoint, pred_proba_y[:, 1])
xgboost_brier = brier_score_loss(test_y_xgb.ServerWinsPoint, pred_proba_y[:, 1])

print(f"XGBoost Event Model Log Loss: {xgboost_lloss}")
print(f"XGBoost Event Model Brier Score: {xgboost_brier}")

In [None]:
# Store the positive-class probability, then xSPW = actual outcome - probability
positive_class_proba = pred_proba_y[:, 1]
test_y_xgb["prob"] = positive_class_proba
test_y_xgb["xSPW"] = test_y_xgb["ServerWinsPoint"].sub(test_y_xgb["prob"])

# Attach the server of each rally and total xSPW per server
xgb_rallies_with_server = points_data[["rallyid", "server"]].merge(test_y_xgb)
xgb_xSPW = xgb_rallies_with_server.groupby("server")["xSPW"].sum().reset_index()
xgb_xSPW

# Neural Nets

In [None]:
# Columns used by the neural net: rally/stroke identifiers plus the features.
# Note: this cell rebinds train_X and train_y, replacing the XGBoost versions.
nnet_model_data_cols = [
    "rallyid", "strokeid",
    'stroke', 'type_of_shot', 'time_diff',
    'server_x', 'server_y', 'receiver_x', 'receiver_y',
    'server_distance_from_baseline', 'receiver_distance_from_baseline',
]
nnet_category_casts = {"stroke": 'category', "type_of_shot": 'category'}

train_X = train_data[nnet_model_data_cols].astype(nnet_category_casts)
test_X = test_data[nnet_model_data_cols].astype(nnet_category_casts)

# Labels: distinct (rallyid, ServerWinsPoint) pairs with a fresh 0..n-1 index
train_y = train_data[["rallyid", "ServerWinsPoint"]].drop_duplicates().reset_index(drop=True)
test_y = test_data[["rallyid", "ServerWinsPoint"]].drop_duplicates().reset_index(drop=True)

In [None]:
def perform_ohc_transformation(data, cols_to_encode, one_hot_encoder = None):
    """
    Purpose: With raw dataset, transform input columns with one-hot encoder

    Input(s): 
        data (pd.DataFrame): Raw dataset
        cols_to_encode (list): contains names of column in input
        one_hot_encoder (NoneType or Sklearn object): default None, creates or uses preexisting one-hot encoder object

    Output(s):
        preprocessed_data (pd.DataFrame): `data` without `cols_to_encode`, followed by the
            one-hot encoded columns; keeps the row index of `data`
        one_hot_encoder (Sklearn object): Newly created or existing OHC object
    """

    # Check if OHC object exists
    if one_hot_encoder is None:
        # Initialize OHC Object. Newer scikit-learn releases renamed `sparse`
        # to `sparse_output`; fall back to the old keyword on older releases.
        encoder_options = {"drop": "first", "handle_unknown": "ignore"}
        try:
            one_hot_encoder = OneHotEncoder(sparse_output=False, **encoder_options)
        except TypeError:
            one_hot_encoder = OneHotEncoder(sparse=False, **encoder_options)

        # Fit the encoder to the specified columns and encode them
        encoded_columns = one_hot_encoder.fit_transform(data[cols_to_encode])

    else:
        # Encode with the already-fitted encoder (no re-fitting)
        encoded_columns = one_hot_encoder.transform(data[cols_to_encode])

    # Convert encoded columns to DataFrame. Reusing data.index keeps the rows
    # aligned in the concat below; with a default RangeIndex the concat would
    # misalign (and introduce NaN rows) whenever `data` has any other index.
    encoded_cols = pd.DataFrame(
        encoded_columns,
        columns=one_hot_encoder.get_feature_names_out(cols_to_encode),
        index=data.index,
    )

    # Concatenate encoded columns with the original DataFrame
    preprocessed_data = pd.concat([data.drop(columns=cols_to_encode), encoded_cols], axis=1)

    return (preprocessed_data, one_hot_encoder)

In [None]:
def prepare_nnet_data(x_train, y_train, ohc_cols, fit_ohc=None):
    """
    Purpose: Prepare the data for training an LSTM model via OHC and segmenting data into lists of arrays

    Input(s):
        x_train (pd.DataFrame): Event-level data; must contain "rallyid" and "strokeid" columns
        y_train (pd.DataFrame): Labels with "rallyid" and "ServerWinsPoint" columns
        ohc_cols (list): List of columns to be one-hot encoded
        fit_ohc (OneHotEncoder): Pre-fitted OneHotEncoder. Default None, creates a new encoder if None

    Output(s):
        tuple: (X, y, encoder) where X is a list with one 2-D array per rally
            (rows = events, in their original order; rallies in order of first
            appearance), y is an array with one label per rally in the same
            order, and encoder is the fitted OneHotEncoder. Sequences are NOT
            padded here.

    Raises:
        ValueError: if a rally present in x_train has no row in y_train.
    """

    # Perform one-hot encoding (the helper fits a new encoder when given None)
    X_tr, encoder = perform_ohc_transformation(x_train, ohc_cols, one_hot_encoder=fit_ohc)

    # One label per rally; if a rally appears more than once the first row wins
    labels_by_rally = (
        y_train.drop_duplicates(subset="rallyid")
        .set_index("rallyid")["ServerWinsPoint"]
    )

    # Split the events into one array per rally in a single pass.
    # sort=False keeps rallies in order of first appearance, and rows inside a
    # group keep their original order.
    X = []
    rally_ids = []
    for rally, event_data in X_tr.groupby("rallyid", sort=False):
        X.append(event_data.drop(["rallyid", "strokeid"], axis=1).values)
        rally_ids.append(rally)

    # Fail loudly when a rally has events but no outcome
    unlabeled = pd.Index(rally_ids).difference(labels_by_rally.index)
    if len(unlabeled) > 0:
        raise ValueError(f"No ServerWinsPoint label found for rallyid(s): {list(unlabeled)}")

    y = labels_by_rally.loc[rally_ids].to_numpy()
    return (X, y, encoder)

In [None]:
# Categorical columns that get one-hot encoded before sequencing
columns_to_encode = ['type_of_shot', 'stroke']

# Build per-rally training sequences and labels; keep the fitted encoder for the test set
X_nnet, y_nnet, ohc = prepare_nnet_data(
    x_train=train_X,
    y_train=train_y,
    ohc_cols=columns_to_encode,
)

In [None]:
# Number of timesteps = most strokes counted in any single training rally
strokes_per_rally = train_X.groupby("rallyid")["strokeid"].count()
max_strokes_per_rally = strokes_per_rally.max()

# Number of features = width of one event row in the first rally's array
num_features = len(X_nnet[0][0])

# Pad every rally at the end with -100 up to max_strokes_per_rally timesteps
X_nnet_padded = pad_sequences(
    X_nnet,
    maxlen=max_strokes_per_rally,
    dtype='float32',
    padding='post',
    value=-100,
)

In [None]:
# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=909)
fold = 1

log_loss_scores = []

# Iterating through each fold
for train_index, test_index in kf.split(X_nnet_padded):
    print(f'Fold {fold}')
    X_train, X_test = X_nnet_padded[train_index], X_nnet_padded[test_index]
    y_train, y_test = y_nnet[train_index], y_nnet[test_index]

    # Defining the LSTM Sequential model (simple, heavily regularized).
    # The Masking layer makes the LSTM skip timesteps whose features all equal
    # the padding value (-100). Without it the post-padded -100 rows are fed
    # through the LSTM as if they were real strokes.
    model = Sequential()
    model.add(Masking(mask_value=-100, input_shape = (max_strokes_per_rally, num_features)))
    model.add(LSTM(5, activation='relu', recurrent_dropout=0.1, 
              kernel_regularizer=l2(0.01)))
    model.add(Dropout(0.1))
    model.add(Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01)))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Early stopping callback
    # NOTE(review): min_delta=0.5 is large relative to typical binary
    # cross-entropy values; if the loss never improves by that much, training
    # stops after `patience` epochs and the earliest weights are restored.
    # Confirm this is intended.
    early_stopping = EarlyStopping(monitor='loss', patience=5, min_delta = 0.5,restore_best_weights=True)
    
    # Train the model with callback
    model.fit(X_train, y_train, epochs=25, batch_size=16, callbacks=[early_stopping])

    # Evaluate the model (the Keras loss is not reused: it includes the L2 penalties)
    _, accuracy = model.evaluate(X_test, y_test)
    print(f'Test Accuracy for fold {fold}: {accuracy * 100:.2f}%')
    
    # Predict the outcome
    y_prob = model.predict(X_test).ravel().astype("float64")
    y_pred = (y_prob > 0.5).astype("int32")
    print(f'Predicted outcomes for fold {fold}: {y_pred}')

    # Score the fold on predicted probabilities so the value is a pure log loss.
    # labels=[0, 1] keeps log_loss defined even if a fold holds a single class.
    # NOTE(review): assumes ServerWinsPoint is encoded as 0/1 -- confirm.
    log_loss_scores.append(log_loss(y_test, y_prob, labels=[0, 1]))
    fold += 1

In [None]:
# Average the per-fold scores collected during cross-validation
mean_cv_log_loss = np.mean(log_loss_scores)
print(f"LSTM Model, Mean CV Log-Loss: {mean_cv_log_loss}")

# Train Final Model, Get Predictions

In [None]:
# Retrain model using all sequences.
# The Masking layer makes the LSTM skip timesteps whose features all equal the
# padding value (-100). Without it the post-padded -100 rows are processed as
# if they were real strokes.
model = Sequential()
model.add(Masking(mask_value=-100, input_shape = (max_strokes_per_rally, num_features)))
model.add(LSTM(5, activation='relu', recurrent_dropout=0.1, 
              kernel_regularizer=l2(0.01)))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01)))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# NOTE(review): min_delta=0.5 is large relative to typical binary cross-entropy
# values; if the loss never improves by that much, training stops after
# `patience` epochs and the earliest weights are restored. Confirm this is intended.
early_stopping = EarlyStopping(monitor='loss', patience=5, min_delta = 0.5, restore_best_weights=True)
    
model.fit(X_nnet_padded, y_nnet, epochs=25, batch_size=16, callbacks=[early_stopping])

In [None]:
# Build test sequences with the encoder fitted on the training data (passed as
# fit_ohc, so it is reused rather than re-fitted), then pad like the training set
test_X_nnet, test_y_nnet, ohc = prepare_nnet_data(
    x_train=test_X,
    y_train=test_y,
    ohc_cols=columns_to_encode,
    fit_ohc=ohc,
)
test_X_nnet_padded = pad_sequences(
    test_X_nnet,
    maxlen=max_strokes_per_rally,
    dtype='float32',
    padding='post',
    value=-100,
)

In [None]:
# Store the model's predicted probability for each test rally, and
# xSPW = actual outcome - predicted probability
test_y["prob"] = model.predict(test_X_nnet_padded)
test_y["xSPW"] = test_y["ServerWinsPoint"] - test_y["prob"]

# Score the probabilities against the observed outcomes
lstm_test_log_loss = log_loss(test_y['ServerWinsPoint'], test_y['prob'])
lstm_test_brier = brier_score_loss(test_y['ServerWinsPoint'], test_y['prob'])

print(f"LSTM Model Log-Loss: {lstm_test_log_loss}")
print(f"LSTM Model Brier Score: {lstm_test_brier}")

In [None]:
# Attach the server of each rally to the LSTM results
player_xSPW = points_data[["rallyid", "server"]].merge(test_y)

# Total xSPW per server
player_summary_final_games = player_xSPW.groupby("server")["xSPW"].sum().reset_index()
player_summary_final_games

In [None]:
# Write the trained LSTM to the output directory
saved_model_file = output_path + 'xspw_prototype.h5'
model.save(saved_model_file)

In [None]:
def create_cumulative_sequences(data, length, n, m):
    """
    Purpose: Create cumulative sequences of event data with padding

    For every event k of every sequence, one output row is produced that holds
    events 0..k of that sequence, padded at the end with -100 up to `length`
    timesteps. If a prefix is longer than `length`, only its most recent
    `length` events are kept.

    Input(s):
        data (list): A list of sequences containing event data, each of shape (events, m)
        length (int): The maximum length of the sequences after padding
        n (int): The total number of events across all sequences (= number of output rows)
        m (int): The number of features in each event

    Output(s):
        events_component (np.ndarray): float32 array of padded cumulative event
            sequences with the shape (n, length, m).

    Raises:
        ValueError: if `n` does not equal the total number of events in `data`,
            or a sequence does not have `m` features per event.
    """

    # Same padding value that is used for the padded training/test sequences
    pad_value = -100

    # Validate up front so a mismatch gives a readable message
    total_events = sum(len(sequence) for sequence in data)
    if total_events != n:
        raise ValueError(f"n={n} does not match the {total_events} events found in data")

    # Preallocate the whole output as padding, then copy each prefix in place
    event_components = np.full((n, length, m), pad_value, dtype='float32')

    row = 0
    for sequence in data:
        if len(sequence) == 0:
            continue

        events = np.asarray(sequence, dtype='float32')
        if events.ndim != 2 or events.shape[1] != m:
            raise ValueError(f"expected sequences of shape (events, {m}), got {events.shape}")

        for stop in range(1, len(events) + 1):
            # Keep at most the last `length` events of the prefix
            prefix = events[max(0, stop - length):stop]
            event_components[row, :len(prefix)] = prefix
            row += 1

    return event_components

In [None]:
# One cumulative sequence per test event: the row count comes from the event-level
# frame, the feature count from the first encoded rally array
num_samples = len(test_X)
n_features = test_X_nnet[0].shape[1]

X_test_sequential = create_cumulative_sequences(
    data=test_X_nnet,
    length=max_strokes_per_rally,
    n=num_samples,
    m=n_features,
)

In [None]:
# Predict the server-win probability after each event (one cumulative sequence per event)
# NOTE(review): assumes the rows of X_test_sequential are in the same order as the rows of test_X -- confirm
event_probabilities = model.predict(X_test_sequential)

predict_df = test_X.copy()
predict_df["prob"] = event_probabilities

#predict_df.to_csv("event_predictions.csv", index = False)