In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid')
from utils import *

In [2]:
df_tennis=pd.read_csv('../data/prepared_data.csv')
df_tennis.head()

Unnamed: 0,player_id,player_name,player_hand,player_ht,player_age,ace,df,svpt,1stIn,1stWon,...,SvGms,bpSaved,bpFaced,rank,opponent_rank,surface,tourney_level,best_of,target,month
0,100644,Alexander Zverev,R,198,26,10,1,66,47,33,...,10,3,4,7,5,Hard,A,3,1,11
1,100644,Alexander Zverev,R,198,25,5,1,44,27,16,...,8,3,7,15,97,Hard,M,3,0,3
2,100644,Alexander Zverev,R,198,26,5,5,90,66,50,...,15,3,6,19,81,Clay,A,3,1,7
3,100644,Alexander Zverev,R,198,26,3,1,41,33,26,...,7,2,2,19,122,Clay,A,3,1,7
4,100644,Alexander Zverev,R,198,26,7,0,55,42,20,...,8,2,6,10,50,Hard,M,3,0,10


In [3]:
df_tennis.columns

Index(['player_id', 'player_name', 'player_hand', 'player_ht', 'player_age',
       'ace', 'df', 'svpt', '1stIn', '1stWon', '2ndWon', 'SvGms', 'bpSaved',
       'bpFaced', 'rank', 'opponent_rank', 'surface', 'tourney_level',
       'best_of', 'target', 'month'],
      dtype='object')

In [4]:
df_tennis.drop('player_id',axis=1,inplace=True)
df_tennis.head()

Unnamed: 0,player_name,player_hand,player_ht,player_age,ace,df,svpt,1stIn,1stWon,2ndWon,SvGms,bpSaved,bpFaced,rank,opponent_rank,surface,tourney_level,best_of,target,month
0,Alexander Zverev,R,198,26,10,1,66,47,33,11,10,3,4,7,5,Hard,A,3,1,11
1,Alexander Zverev,R,198,25,5,1,44,27,16,7,8,3,7,15,97,Hard,M,3,0,3
2,Alexander Zverev,R,198,26,5,5,90,66,50,8,15,3,6,19,81,Clay,A,3,1,7
3,Alexander Zverev,R,198,26,3,1,41,33,26,4,7,2,2,19,122,Clay,A,3,1,7
4,Alexander Zverev,R,198,26,7,0,55,42,20,8,8,2,6,10,50,Hard,M,3,0,10


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

2023-12-12 03:52:09.154737: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
#label encoding
le=LabelEncoder()
df_tennis['player_hand_encoded']=le.fit_transform(df_tennis['player_hand'])

#hot encoding
df_tennis=pd.get_dummies(df_tennis, columns=['surface', 'tourney_level'], drop_first=True)

#standardization numeric features
scaler=StandardScaler()
numerical_cols=['player_ht', 'player_age', 'ace', 'df', 'svpt', '1stIn', '1stWon', '2ndWon', 'SvGms', 'bpSaved', 'bpFaced', 'rank', 'opponent_rank']

df_tennis[numerical_cols]=scaler.fit_transform(df_tennis[numerical_cols])

#splitting data
X=df_tennis.drop(['target', 'player_name', 'player_hand'], axis=1)
y=df_tennis['target']

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=42)





In [7]:
#building model
model=Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid')) #sigmoid because we have binary classification

#compiling model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [8]:
#training model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x13279a020>

In [9]:
#evaluating model
y_pred=model.predict(X_test)
y_pred=(y_pred>0.5).astype(int).reshape(X_test.shape[0])

print('Accuracy: {:.2f}'.format(accuracy_score(y_test, y_pred)))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))
print('Classification Report: \n', classification_report(y_test, y_pred))

Accuracy: 0.75
Confusion Matrix: 
 [[474  77]
 [197 352]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.71      0.86      0.78       551
           1       0.82      0.64      0.72       549

    accuracy                           0.75      1100
   macro avg       0.76      0.75      0.75      1100
weighted avg       0.76      0.75      0.75      1100



<h2>Let's hypertuning gridsearch </h2>

In [11]:
from tensorflow.keras.layers import Dropout

def create_model(optimizer='adam', activation='relu', neurons=64, dropout_rate=0.0):
    model=Sequential()
    model.add(Dense(neurons, input_dim=X_train.shape[1], activation=activation))
    model.add(Dropout(dropout_rate))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid')) 

    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [12]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [13]:
model=KerasClassifier(build_fn=create_model, epochs=10, batch_size=32, verbose=0)


  model=KerasClassifier(build_fn=create_model, epochs=10, batch_size=32, verbose=0)


In [14]:
param_Grid={
    'optimizer':['adam', 'sgd'],
    'dropout_rate':[0.0, 0.1, 0.2],
}

grid_search=GridSearchCV(estimator=model, param_grid=param_Grid, scoring='precision', cv=3)

#fit the model
grid_result=grid_search.fit(X_train, y_train)

print('Best parameters: ', grid_result.best_params_)

Best parameters:  {'dropout_rate': 0.2, 'optimizer': 'adam'}


In [15]:
def create_tuned_model(dropout_rate=0.2, optimizer='adam'):
    model=Sequential()
    model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid')) 

    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [16]:
tuned_model=create_tuned_model()

tuned_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred=tuned_model.predict(X_test)
y_pred=(y_pred>0.5).astype(int).reshape(X_test.shape[0])

print('Accuracy: {:.2f}'.format(accuracy_score(y_test, y_pred)))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))
print('Classification Report: \n', classification_report(y_test, y_pred))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.76
Confusion Matrix: 
 [[398 153]
 [106 443]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.79      0.72      0.75       551
           1       0.74      0.81      0.77       549

    accuracy                           0.76      1100
   macro avg       0.77      0.76      0.76      1100
weighted avg       0.77      0.76      0.76      1100



<h2>Let's hypertuning learning rate </h2>

In [19]:
from tensorflow.keras.optimizers import Adam, SGD
def create_model_learning_rate( dropout_rate=0.2, learning_rate=0.001):
    optimizer=Adam(learning_rate=learning_rate)
    model=Sequential()
    model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [20]:
model=KerasClassifier(build_fn=create_model_learning_rate, epochs=10, batch_size=32, verbose=0)

param_grid={
    'learning_rate':[0.001, 0.01, 0.1],
}

grid_search=GridSearchCV(estimator=model, param_grid=param_grid, scoring='precision', cv=3)

grid_result=grid_search.fit(X_train, y_train)

print('Best parameters: ', grid_result.best_params_)

  model=KerasClassifier(build_fn=create_model_learning_rate, epochs=10, batch_size=32, verbose=0)




  _warn_prf(average, modifier, msg_start, len(result))


Best parameters:  {'learning_rate': 0.01}


In [21]:
model_tuned=create_model_learning_rate(learning_rate=0.01)

In [22]:
model_tuned.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred=model_tuned.predict(X_test)
y_pred=(y_pred>0.5).astype(int).reshape(X_test.shape[0])

print('Accuracy: {:.2f}'.format(accuracy_score(y_test, y_pred)))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))
print('Classification Report: \n', classification_report(y_test, y_pred))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.77
Confusion Matrix: 
 [[404 147]
 [106 443]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.79      0.73      0.76       551
           1       0.75      0.81      0.78       549

    accuracy                           0.77      1100
   macro avg       0.77      0.77      0.77      1100
weighted avg       0.77      0.77      0.77      1100



<h2>Let's hypertuning model architecture </h2>

In [23]:
def create_model_architecture(neurons_layer=64, neurons_layer2=32):
    model=Sequential()
    model.add(Dense(neurons_layer, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dense(neurons_layer2, activation='relu'))
    model.add(Dense(1, activation='sigmoid')) 

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model



In [24]:
model=KerasClassifier(build_fn=create_model_architecture, epochs=10, batch_size=32, verbose=0)

param_grid={
    'neurons_layer':[32, 64, 128],
    'neurons_layer2':[16, 32, 64]
}

grid_search=GridSearchCV(estimator=model, param_grid=param_grid, scoring='precision', cv=3)
grid_result=grid_search.fit(X_train, y_train)

print('Best parameters: ', grid_result.best_params_)

  model=KerasClassifier(build_fn=create_model_architecture, epochs=10, batch_size=32, verbose=0)


Best parameters:  {'neurons_layer': 128, 'neurons_layer2': 16}


In [25]:
def create_optimized_model(dropout_rate=0.2, neurons_layer=128, neurons_layer2=16, learning_rate=0.01):
    optimizer=Adam(learning_rate=learning_rate)
    model=Sequential()
    model.add(Dense(neurons_layer, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(neurons_layer2, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid')) 

    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [26]:
optimized_model=create_optimized_model()

optimized_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred=optimized_model.predict(X_test)
y_pred=(y_pred>0.5).astype(int).reshape(X_test.shape[0])

print('Accuracy: {:.2f}'.format(accuracy_score(y_test, y_pred)))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))
print('Classification Report: \n', classification_report(y_test, y_pred))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.78
Confusion Matrix: 
 [[419 132]
 [115 434]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.78      0.76      0.77       551
           1       0.77      0.79      0.78       549

    accuracy                           0.78      1100
   macro avg       0.78      0.78      0.78      1100
weighted avg       0.78      0.78      0.78      1100



In [27]:
threshold=0.5
test_set_probability_of_wining=optimized_model.predict(X_test)
test_set_prediction=(test_set_probability_of_wining>threshold).astype(int).reshape(X_test.shape[0])

