In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid')

In [49]:
df_tennis=pd.read_csv('../data/prepared_data.csv')
df_tennis.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_name,winner_hand,...,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,loser_rank
0,2023-9900,United Cup,Hard,18,A,20230102,300,126203,Taylor Fritz,R,...,2.0,97.0,62.0,47.0,15.0,12.0,9.0,9.0,9.0,16.0
1,2023-9900,United Cup,Hard,18,A,20230102,299,126207,Frances Tiafoe,R,...,0.0,21.0,12.0,8.0,3.0,4.0,1.0,3.0,19.0,23.0
2,2023-9900,United Cup,Hard,18,A,20230102,296,126203,Taylor Fritz,R,...,1.0,82.0,62.0,51.0,7.0,12.0,2.0,2.0,9.0,10.0
3,2023-9900,United Cup,Hard,18,A,20230102,295,126207,Frances Tiafoe,R,...,3.0,68.0,41.0,26.0,12.0,9.0,6.0,9.0,19.0,245.0
4,2023-9900,United Cup,Hard,18,A,20230102,292,126774,Stefanos Tsitsipas,R,...,2.0,89.0,58.0,48.0,18.0,16.0,1.0,2.0,4.0,16.0


In [50]:
df_tennis.columns

Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_name', 'winner_hand',
       'winner_ht', 'winner_age', 'loser_id', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_age', 'score', 'best_of', 'round', 'minutes',
       'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms',
       'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt', 'l_1stIn',
       'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'loser_rank'],
      dtype='object')

In [51]:
df_tennis.isna().sum()

tourney_id       0
tourney_name     0
surface          0
draw_size        0
tourney_level    0
tourney_date     0
match_num        0
winner_id        0
winner_name      0
winner_hand      0
winner_ht        0
winner_age       0
loser_id         0
loser_name       0
loser_hand       0
loser_ht         0
loser_age        0
score            0
best_of          0
round            0
minutes          0
w_ace            0
w_df             0
w_svpt           0
w_1stIn          0
w_1stWon         0
w_2ndWon         0
w_SvGms          0
w_bpSaved        0
w_bpFaced        0
l_ace            0
l_df             0
l_svpt           0
l_1stIn          0
l_1stWon         0
l_2ndWon         0
l_SvGms          0
l_bpSaved        0
l_bpFaced        0
winner_rank      0
loser_rank       0
dtype: int64

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [52]:
columns_to_drop=['tourney_id', 'tourney_name', 'winner_id', 'loser_id', 'score', 'tourney_date']
df_tennis.drop(columns_to_drop, axis=1, inplace=True)

In [54]:
X=df_tennis.drop('winner_name', axis=1)
y=df_tennis['winner_name']

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=42)

<h2>Let's create a pipeline </h2>

In [61]:
#preprocessing
numeric_features=X.select_dtypes(include=['int64', 'float64']).columns
categorical_features=X.select_dtypes(include=['object']).columns

numeric_transformer=Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer=Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')) #handle_unknown='ignore' to avoid errors when new categories are found in the test set
])

preprocessor=ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

X_train_transformed=preprocessor.fit_transform(X_train)

In [57]:
from sklearn.neural_network import MLPClassifier    
from sklearn.metrics import accuracy_score

In [62]:
nn_model=MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)

pipeline=Pipeline([
    ('preprocessor', preprocessor),
    ('model', nn_model)
])

pipeline.fit(X_train, y_train)

y_pred=pipeline.predict(X_test)

accuracy=accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')

Accuracy: 0.6381818181818182


In [65]:
X_test.columns

Index(['surface', 'draw_size', 'tourney_level', 'match_num', 'winner_hand',
       'winner_ht', 'winner_age', 'loser_name', 'loser_hand', 'loser_ht',
       'loser_age', 'best_of', 'round', 'minutes', 'w_ace', 'w_df', 'w_svpt',
       'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced',
       'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms',
       'l_bpSaved', 'l_bpFaced', 'winner_rank', 'loser_rank'],
      dtype='object')

In [73]:




data = {'winner_name': ['Novak Djokovic'], 'loser_name': ['Carlos Alcaraz'], 'surface': ['Hard']}
novak_vs_alcaraz = pd.DataFrame(data)

X_predict_subset=preprocessor.transform(novak_vs_alcaraz)

predictions=pipeline.predict(X_predict_subset)

print(f'Predictions: {predictions}')



KeyError: "None of [Index(['draw_size', 'match_num', 'winner_ht', 'winner_age', 'loser_ht',\n       'loser_age', 'best_of', 'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn',\n       'w_1stWon', 'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace',\n       'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms',\n       'l_bpSaved', 'l_bpFaced', 'winner_rank', 'loser_rank'],\n      dtype='object')] are in the [columns]"