<img src="https://cdn.freelogovectors.net/wp-content/uploads/2020/06/australian_open_logo.png" style="float: left; margin: 25px; height: 55px">

# Predicting the 2021 Australian Open
_Modelling_

**Data Dictionary**

- `year`: Year the match took place
- `score`: Set scores
- `round`: Round within tournament (QF, SF, F)
- `minutes`: How many minutes the match lasted
- `p1name`: Player 1 name
- `p1age`: Player 1 age
- `p1ace`: Number of aces scored by player 1
- `p1rank`: Player 1's rank at the time of the match 
- `p2name`: Player 2 name
- `p2age`: Player 2 age
- `p2ace`: Number of aces scored by player 2
- `p2rank`: Player 2's rank at the time of the match 
- `p1_win_pct`: Player 1 win percentage over the past 3 years
- `p1_win_pct_hsf`: Player 1 win percentage on hard surface over the past 3 years
- `p2_win_pct`: Player 2 win percentage over the past 3 years 
- `p2_win_pct_hsf`: Player 2 win percentage on hard surface over the past 3 years 
- `p1won`: Whether player 1 won or not (1,0)
- `p2won`: Whether player 2 won or not (1,0)

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
from datetime import datetime
import seaborn as sns
import numpy as np 

%matplotlib inline

# setting default figure and font sizes
plt.rcParams['figure.figsize'] = (8, 6)
plt.rcParams['font.size'] = 14

In [2]:
ao = pd.read_csv('./data/final_ao.csv')
ao.head()

Unnamed: 0.1,Unnamed: 0,year,score,round,minutes,p1name,p1age,p1ace,p1rank,p1_win_pct,p1_win_pct_hsf,p2name,p2age,p2ace,p2rank,p2_win_pct,p2_win_pct_hsf,p1won,p2won
0,0,2014,6-4 7-6(5) 6-7(9) 6-2,R128,228.0,Carlos Berlocq,30.94319,9.0,41.0,0.111111,0.47,Edouard Roger Vasselin,30.12731,6.0,40.0,0.55,0.56,0,1
1,1,2016,7-5 6-3 6-2,R32,104.0,Stephane Robert,35.671458,3.0,225.0,0.25,0.47,Gael Monfils,29.379877,10.0,25.0,0.642857,0.710843,0,1
2,2,2018,6-3 6-2 6-1,R128,98.0,Dennis Novak,24.383299,3.0,226.0,0.394737,0.318182,Grigor Dimitrov,26.669405,7.0,3.0,0.550459,0.571429,0,1
3,3,2013,6-4 6-2 6-4,R128,106.0,Grigor Dimitrov,21.667351,5.0,41.0,0.550459,0.571429,Julien Benneteau,31.069131,6.0,38.0,0.421053,0.454545,0,1
4,4,2015,6-3 7-6(6) 6-1,R32,118.0,Malek Jaziri,30.997947,6.0,75.0,0.405797,0.388889,Nick Kyrgios,19.731691,27.0,53.0,0.642857,0.671875,0,1


In [3]:
ao.year.value_counts()

2019    127
2018    127
2017    127
2016    127
2015    127
2013    127
2011    127
2020    126
2014    126
2012    126
2010    126
Name: year, dtype: int64

In [4]:
ao.drop(['Unnamed: 0'], axis=1, inplace=True)

In [5]:
ao.head()

Unnamed: 0,year,score,round,minutes,p1name,p1age,p1ace,p1rank,p1_win_pct,p1_win_pct_hsf,p2name,p2age,p2ace,p2rank,p2_win_pct,p2_win_pct_hsf,p1won,p2won
0,2014,6-4 7-6(5) 6-7(9) 6-2,R128,228.0,Carlos Berlocq,30.94319,9.0,41.0,0.111111,0.47,Edouard Roger Vasselin,30.12731,6.0,40.0,0.55,0.56,0,1
1,2016,7-5 6-3 6-2,R32,104.0,Stephane Robert,35.671458,3.0,225.0,0.25,0.47,Gael Monfils,29.379877,10.0,25.0,0.642857,0.710843,0,1
2,2018,6-3 6-2 6-1,R128,98.0,Dennis Novak,24.383299,3.0,226.0,0.394737,0.318182,Grigor Dimitrov,26.669405,7.0,3.0,0.550459,0.571429,0,1
3,2013,6-4 6-2 6-4,R128,106.0,Grigor Dimitrov,21.667351,5.0,41.0,0.550459,0.571429,Julien Benneteau,31.069131,6.0,38.0,0.421053,0.454545,0,1
4,2015,6-3 7-6(6) 6-1,R32,118.0,Malek Jaziri,30.997947,6.0,75.0,0.405797,0.388889,Nick Kyrgios,19.731691,27.0,53.0,0.642857,0.671875,0,1


In [6]:
ao.columns

Index(['year', 'score', 'round', 'minutes', 'p1name', 'p1age', 'p1ace',
       'p1rank', 'p1_win_pct', 'p1_win_pct_hsf', 'p2name', 'p2age', 'p2ace',
       'p2rank', 'p2_win_pct', 'p2_win_pct_hsf', 'p1won', 'p2won'],
      dtype='object')

## 1. KNN classification

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

X = ao[['p1rank', 'p2rank', 'p1_win_pct', 'p2_win_pct', 'p1_win_pct_hsf', 'p2_win_pct_hsf']]
y = ao['p1won']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=123)

In [8]:
for i in [1, 5, 15, 50]:
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    print ("the score for", i, "neighbors:")
    print (knn.score(X_test, y_test))
    print ()
    print ("---"*20)

the score for 1 neighbors:
0.509325681492109

------------------------------------------------------------
the score for 5 neighbors:
0.509325681492109

------------------------------------------------------------
the score for 15 neighbors:
0.509325681492109

------------------------------------------------------------
the score for 50 neighbors:
0.509325681492109

------------------------------------------------------------


In [9]:
# Instantiating the model (using the value K=5).
knn = KNeighborsClassifier(n_neighbors=5)

# Fit the model with data.
knn.fit(X_train, y_train)

# Store the predicted response values.
y_pred_class = knn.predict(X_test)

In [10]:
knn.score(X_test, y_test)

0.509325681492109

In [11]:
knn.predict_proba(X)

array([[0.6, 0.4],
       [0.4, 0.6],
       [1. , 0. ],
       ...,
       [0.4, 0.6],
       [0. , 1. ],
       [0.4, 0.6]])

In [12]:
# Storing the prediction for player 1 wins
ao['p1_win_pred_knn'] = knn.predict(X)

In [13]:
ao.head()

Unnamed: 0,year,score,round,minutes,p1name,p1age,p1ace,p1rank,p1_win_pct,p1_win_pct_hsf,p2name,p2age,p2ace,p2rank,p2_win_pct,p2_win_pct_hsf,p1won,p2won,p1_win_pred_knn
0,2014,6-4 7-6(5) 6-7(9) 6-2,R128,228.0,Carlos Berlocq,30.94319,9.0,41.0,0.111111,0.47,Edouard Roger Vasselin,30.12731,6.0,40.0,0.55,0.56,0,1,0
1,2016,7-5 6-3 6-2,R32,104.0,Stephane Robert,35.671458,3.0,225.0,0.25,0.47,Gael Monfils,29.379877,10.0,25.0,0.642857,0.710843,0,1,1
2,2018,6-3 6-2 6-1,R128,98.0,Dennis Novak,24.383299,3.0,226.0,0.394737,0.318182,Grigor Dimitrov,26.669405,7.0,3.0,0.550459,0.571429,0,1,0
3,2013,6-4 6-2 6-4,R128,106.0,Grigor Dimitrov,21.667351,5.0,41.0,0.550459,0.571429,Julien Benneteau,31.069131,6.0,38.0,0.421053,0.454545,0,1,0
4,2015,6-3 7-6(6) 6-1,R32,118.0,Malek Jaziri,30.997947,6.0,75.0,0.405797,0.388889,Nick Kyrgios,19.731691,27.0,53.0,0.642857,0.671875,0,1,0


## 2. Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

#instantiating the model
logreg = LogisticRegression()

#defining the features and the y 
X = ao[['p1rank', 'p2rank', 'p1_win_pct', 'p2_win_pct', 'p1_win_pct_hsf', 'p2_win_pct_hsf']]
y = ao.p1won

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=123)

#fitting the model with data for training
logreg.fit(X_train, y_train)

#storing predictions
pred = logreg.predict(X_test)

#scoring the model
logreg.score(X_test, y_test)

0.5107604017216643

In [15]:
logreg.predict_proba(X_test)[0:10]

array([[0.55338533, 0.44661467],
       [0.46343782, 0.53656218],
       [0.50797374, 0.49202626],
       [0.46491336, 0.53508664],
       [0.40692692, 0.59307308],
       [0.52519997, 0.47480003],
       [0.43597979, 0.56402021],
       [0.53718635, 0.46281365],
       [0.49875655, 0.50124345],
       [0.4571768 , 0.5428232 ]])

In [16]:
# Storing the prediction for player 1 wins
ao['p1_win_pred_log'] = logreg.predict(X)

In [17]:
# Store the predicted probabilities of player 1 winning.
ao['p1_win_pred_prob_log'] = logreg.predict_proba(X)[:,1]

In [18]:
ao.head()

Unnamed: 0,year,score,round,minutes,p1name,p1age,p1ace,p1rank,p1_win_pct,p1_win_pct_hsf,...,p2age,p2ace,p2rank,p2_win_pct,p2_win_pct_hsf,p1won,p2won,p1_win_pred_knn,p1_win_pred_log,p1_win_pred_prob_log
0,2014,6-4 7-6(5) 6-7(9) 6-2,R128,228.0,Carlos Berlocq,30.94319,9.0,41.0,0.111111,0.47,...,30.12731,6.0,40.0,0.55,0.56,0,1,0,0,0.488739
1,2016,7-5 6-3 6-2,R32,104.0,Stephane Robert,35.671458,3.0,225.0,0.25,0.47,...,29.379877,10.0,25.0,0.642857,0.710843,0,1,1,1,0.534095
2,2018,6-3 6-2 6-1,R128,98.0,Dennis Novak,24.383299,3.0,226.0,0.394737,0.318182,...,26.669405,7.0,3.0,0.550459,0.571429,0,1,0,1,0.515815
3,2013,6-4 6-2 6-4,R128,106.0,Grigor Dimitrov,21.667351,5.0,41.0,0.550459,0.571429,...,31.069131,6.0,38.0,0.421053,0.454545,0,1,0,1,0.520832
4,2015,6-3 7-6(6) 6-1,R32,118.0,Malek Jaziri,30.997947,6.0,75.0,0.405797,0.388889,...,19.731691,27.0,53.0,0.642857,0.671875,0,1,0,1,0.504559


In [19]:
ao.p1won.value_counts()

1    699
0    694
Name: p1won, dtype: int64

In [20]:
ao.p1_win_pred_log.value_counts()

1    948
0    445
Name: p1_win_pred_log, dtype: int64

In [21]:
ao.p1_win_pred_knn.value_counts()

1    749
0    644
Name: p1_win_pred_knn, dtype: int64

In [22]:
logreg.score(X,y)

0.5183058147882269