In [26]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from feature_engineering.difference_features import create_difference_features
from feature_engineering.encode_features import encode_categorical_features

MATCHES_PATH = 'data/matches/matches.csv'

df = pd.read_csv(MATCHES_PATH)

cols_to_keep = [
    'surface', 'tourney_level', 'round', 'best_of',
    'winner_hand', 'loser_hand', 'winner_ht', 'loser_ht',
    'winner_age', 'loser_age', 'winner_rank', 'loser_rank',
    'winner_rank_points', 'loser_rank_points',
    'winner_elo', 'loser_elo', 'winner_surface_elo', 'loser_surface_elo'
]

df = df[cols_to_keep]
df['target'] = 1

In [None]:
print(df['surface'].value_counts())
print(df['tourney_level'].value_counts())
print(df['best_of'].value_counts())

surface
Hard      62762
Clay      43661
Grass     12658
Carpet    10425
Name: count, dtype: int64
tourney_level
A    76350
M    21056
G    20002
D    11500
F      587
O       64
Name: count, dtype: int64
['R32' 'R32' 'R32' ... 'SF' 'SF' 'F']
best_of
3    101279
5     28280
Name: count, dtype: int64


In [28]:
median_ht = pd.concat([df['winner_ht'], df['loser_ht']]).median()
print(f"\nCalculated overall median height: {median_ht:.2f}")
df['winner_ht'].fillna(median_ht, inplace=True)
df['loser_ht'].fillna(median_ht, inplace=True)

median_age = pd.concat([df['winner_age'], df['loser_age']]).median()
print(f"Calculated overall median age: {median_age:.2f}")
df['winner_age'].fillna(median_age, inplace=True)
df['loser_age'].fillna(median_age, inplace=True)


Calculated overall median height: 185.00
Calculated overall median age: 25.30


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['winner_ht'].fillna(median_ht, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['loser_ht'].fillna(median_ht, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting value

In [29]:
cols_to_drop = ['loser_rank', 'winner_rank', 'surface', 'loser_hand', 'winner_hand']
df.dropna(subset=cols_to_drop, inplace=True)

In [30]:
player_col_suffixes = ['hand', 'ht', 'age', 'rank', 'rank_points', 'elo', 'surface_elo']
rename_map_p1 = {f'winner_{suffix}': f'p1_{suffix}' for suffix in player_col_suffixes if f'winner_{suffix}' in df.columns}
rename_map_p1.update({f'loser_{suffix}': f'p2_{suffix}' for suffix in player_col_suffixes if f'loser_{suffix}' in df.columns})
rename_map_p2 = {f'winner_{suffix}': f'p2_{suffix}' for suffix in player_col_suffixes if f'winner_{suffix}' in df.columns}
rename_map_p2.update({f'loser_{suffix}': f'p1_{suffix}' for suffix in player_col_suffixes if f'loser_{suffix}' in df.columns})

df_p1 = df.rename(columns=rename_map_p1)
df_p1['target'] = 1

df_p2 = df.rename(columns=rename_map_p2)
df_p2['target'] = 0

df = pd.concat([df_p1, df_p2], ignore_index=True)

In [31]:
df.columns

Index(['surface', 'tourney_level', 'round', 'best_of', 'p1_hand', 'p2_hand',
       'p1_ht', 'p2_ht', 'p1_age', 'p2_age', 'p1_rank', 'p2_rank',
       'p1_rank_points', 'p2_rank_points', 'p1_elo', 'p2_elo',
       'p1_surface_elo', 'p2_surface_elo', 'target'],
      dtype='object')

In [32]:
df = create_difference_features(df)
df = encode_categorical_features(df)

In [33]:
df.columns

Index(['best_of', 'target', 'ht_diff', 'age_diff', 'rank_diff',
       'rank_points_diff', 'elo_diff', 'surface_elo_diff', 'surface_Carpet',
       'surface_Clay', 'surface_Grass', 'surface_Hard', 'tourney_level_A',
       'tourney_level_D', 'tourney_level_F', 'tourney_level_G',
       'tourney_level_M', 'tourney_level_O', 'round_BR', 'round_ER', 'round_F',
       'round_QF', 'round_R128', 'round_R16', 'round_R32', 'round_R64',
       'round_RR', 'round_SF', 'p1_hand_A', 'p1_hand_L', 'p1_hand_R',
       'p1_hand_U', 'p2_hand_A', 'p2_hand_L', 'p2_hand_R', 'p2_hand_U'],
      dtype='object')

In [34]:
X = df.drop('target', axis=1)
y = df['target']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)
xgb_model.fit(X_train, y_train)


y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"Accuracy: {accuracy:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': xgb_model.feature_importances_
}).sort_values(by='Importance', ascending=False)
print(feature_importances.head(15))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.6717
ROC AUC Score: 0.7381

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.67      0.67     25092
           1       0.67      0.67      0.67     25092

    accuracy                           0.67     50184
   macro avg       0.67      0.67      0.67     50184
weighted avg       0.67      0.67      0.67     50184


Confusion Matrix:
[[16837  8255]
 [ 8220 16872]]
             Feature  Importance
5           elo_diff    0.296588
3          rank_diff    0.071232
6   surface_elo_diff    0.059271
0            best_of    0.052289
14   tourney_level_G    0.030584
2           age_diff    0.029978
9      surface_Grass    0.028674
25          round_RR    0.026010
33         p2_hand_R    0.022559
12   tourney_level_D    0.022128
8       surface_Clay    0.020615
7     surface_Carpet    0.020213
32         p2_hand_L    0.018749
28         p1_hand_L    0.018167
4   rank_points_diff    0.018014


In [36]:
xgb_model.save_model('data/models/xgboost_tennis_model.ubj')

In [37]:
df.columns

Index(['best_of', 'target', 'ht_diff', 'age_diff', 'rank_diff',
       'rank_points_diff', 'elo_diff', 'surface_elo_diff', 'surface_Carpet',
       'surface_Clay', 'surface_Grass', 'surface_Hard', 'tourney_level_A',
       'tourney_level_D', 'tourney_level_F', 'tourney_level_G',
       'tourney_level_M', 'tourney_level_O', 'round_BR', 'round_ER', 'round_F',
       'round_QF', 'round_R128', 'round_R16', 'round_R32', 'round_R64',
       'round_RR', 'round_SF', 'p1_hand_A', 'p1_hand_L', 'p1_hand_R',
       'p1_hand_U', 'p2_hand_A', 'p2_hand_L', 'p2_hand_R', 'p2_hand_U'],
      dtype='object')