In [1]:
# Import dependencies
from utils.features import *
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras import metrics
from keras.models import load_model
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("darkgrid")

In [2]:
# Import Aus open csv with model features
df = pd.read_csv('data/aus_open_features.csv')

df = df.dropna()

# Calculate column for difference in rank between the players
df['diff_rank'] = df['player_1_rank'] - df['player_2_rank']

In [3]:
# List with the features we want from our df for X
df_features_list = [
 'diff_rank',
 'diff_match_win_percent',
 'diff_games_win_percent',
 'diff_5_sets_match_win_percent',
 'diff_close_sets_percent',
 'diff_match_win_percent_hard',
 'diff_games_win_percent_hard',
 'diff_5_sets_match_win_percent_hard',
 'diff_close_sets_percent_hard',
 'diff_match_win_percent_60',
 'diff_games_win_percent_60',
 'diff_5_sets_match_win_percent_60',
 'diff_close_sets_percent_60',
 'diff_match_win_percent_hard_100',
 'diff_games_win_percent_hard_100',
 'diff_5_sets_match_win_percent_hard_100',
 'diff_close_sets_percent_hard_100',
 'diff_match_win_percent_h2h',
 'diff_games_win_percent_h2h',
 'diff_match_win_percent_hard_h2h',
 'diff_games_win_percent_hard_h2h']

In [4]:
# Adjust test_size to 0.20 as online guides suggest it is best - don't need to preprocess - this is already done
X = df[df_features_list]
y = df.result

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [5]:
# Create model and add layers - note that sigmoid is the output layer as it is best at handling probabilities between 0 and 1
model = Sequential()
model.add(Dense(units=64, activation='relu', input_shape=(len(X.columns),)))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

In [6]:
# Compile and fit the model
model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['accuracy'])

In [7]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                1408      
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 3,521
Trainable params: 3,521
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Adding Keras Callbacks to the model

# Early stopping monitoring value loss is added to try to ensure the model doesn't overfit - will stop after it doesn't improve after 500 iterations
early_stopping = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=300)

# Saves model
model_checkpoint = ModelCheckpoint('ml-model-and-predictions/model.h5', monitor='val_loss', mode='min', verbose=2, save_best_only=True)

# CSV logger to save the results of the training into a CSV file
csv_log = CSVLogger("ml-model-and-predictions/training-data.csv")

# Fit the model - verbose is 0 as recording info in csv log anyway - and only best model is being saved
history = model.fit(X_train, y_train, 
            epochs=1000, verbose=0, batch_size=128, 
            validation_data=(X_test, y_test), callbacks=[early_stopping, model_checkpoint, csv_log]) 

# Load the saved model
saved_model = load_model('ml-model-and-predictions/model.h5')


Epoch 00001: val_loss improved from inf to 0.83009, saving model to ml-model-and-predictions\model.h5

Epoch 00002: val_loss did not improve from 0.83009

Epoch 00003: val_loss improved from 0.83009 to 0.72470, saving model to ml-model-and-predictions\model.h5

Epoch 00004: val_loss improved from 0.72470 to 0.64403, saving model to ml-model-and-predictions\model.h5

Epoch 00005: val_loss improved from 0.64403 to 0.55696, saving model to ml-model-and-predictions\model.h5

Epoch 00006: val_loss did not improve from 0.55696

Epoch 00007: val_loss improved from 0.55696 to 0.53114, saving model to ml-model-and-predictions\model.h5

Epoch 00008: val_loss improved from 0.53114 to 0.52693, saving model to ml-model-and-predictions\model.h5

Epoch 00009: val_loss improved from 0.52693 to 0.51298, saving model to ml-model-and-predictions\model.h5

Epoch 00010: val_loss improved from 0.51298 to 0.50868, saving model to ml-model-and-predictions\model.h5

Epoch 00011: val_loss improved from 0.50868

In [9]:
# Accuracy of the best model
training_loss, training_accuracy = saved_model.evaluate(X_train, y_train, verbose=0)
test_loss, test_accuracy = saved_model.evaluate(X_test, y_test, verbose=0)

print('Training Loss: %.3f, Training Accuracy: %.3f, Test Loss: %.3f, Test Accuracy: %.3f' % (training_loss, training_accuracy, test_loss, test_accuracy))



Training Loss: 0.503, Training Accuracy: 0.762, Test Loss: 0.489, Test Accuracy: 0.789


In [13]:
# Read in the 2021 Australian Open data

df_2021 = pd.read_csv('data/ausopen2021.csv')
df_raw = pd.read_csv('data/combined_data.csv', low_memory=False)

df_2021['Date'] = '2021/02/17'
df_2021['Surface'] = 'Hard'
df_2021['diff_rank'] = df_2021['player_1_rank'] - df_2021['player_2_rank']

In [14]:
df_2021 = add_features(df_2021, df_raw)

Loading Player Career Stats on All Surfaces
Loading Player Career Stats on Hard Courts
Loading Player Career Stats on All Surfaces in the Last 60 Weeks
Loading Player Career Stats on Hard Court in the last 100 Weeks
Loading Player H2H Career Stats on All Surfaces
Loading Player H2H Career Stats On Hard Court
Loading variables for difference in player stats


In [15]:
# Model Predictions (0 means player_1 will win)
features_qtr = df_2021[df_features_list]

df_2021['prediction'] = saved_model.predict_classes(features_qtr)
df_2021['probability'] = 1 - np.abs(df_2021.prediction - saved_model.predict_proba(features_qtr).flatten())

aus_prediction = df_2021[['Round', 'player_1', 'player_2', 'prediction', 'probability']]



In [16]:
aus_prediction.to_csv("ml-model-and-predictions/predictions.csv", index=False)