In [23]:
import numpy as np
import pandas as pd
import glob
import os
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

# First merge score and stats for appropriate years
# Second, append from 1991-2017


# Machine Learning Model
# Mock-up:
## Use only stats and score from 2017
## Features: Height(cm), Weight(kg), First Serve Percentage In%, % Second Serve Won, % Break Points Won(Saved), % Service Game Won(Take Total Service Games Won/Total Service Game Total), % Break Points Converted, Ace-to-DoubleFault Ratio(Take Aces/DoubleFaults), FlagCode(Encoding? PyCountry?) 
## Get rid of total points won for one of the players as they are the same
## 

In [None]:
stats_df = pd.read_csv("../original_data/Stats/match_stats_2017.csv")
stats_df.head()

In [24]:
stats_files = glob.glob("../original_data/Stats/*.csv")
stats_df = pd.concat((pd.read_csv(f) for f in stats_files))
stats_df.head()

Unnamed: 0,tourney_order,match_id,match_stats_url_suffix,match_time,match_duration,winner_aces,winner_double_faults,winner_first_serves_in,winner_first_serves_total,winner_first_serve_points_won,...,loser_second_serve_return_won,loser_second_serve_return_total,loser_break_points_converted,loser_break_points_return_total,loser_service_games_played,loser_return_games_played,loser_return_points_won,loser_return_points_total,loser_total_points_won,loser_total_points_total
0,0,2017-339-r975-n409,/en/scores/2017/339/MS004/match-stats,02:21:00,141.0,23.0,3.0,62.0,97.0,50.0,...,19.0,35.0,1.0,7.0,14.0,15.0,31.0,97.0,84.0,181.0
1,0,2017-339-d875-n552,/en/scores/2017/339/MS001/match-stats,01:49:00,109.0,7.0,2.0,52.0,77.0,41.0,...,13.0,25.0,2.0,7.0,12.0,13.0,24.0,77.0,69.0,146.0
2,0,2017-339-d875-r975,/en/scores/2017/339/MS002/match-stats,01:28:00,88.0,4.0,1.0,36.0,58.0,27.0,...,4.0,22.0,0.0,0.0,10.0,10.0,13.0,58.0,53.0,119.0
3,0,2017-339-n552-w367,/en/scores/2017/339/MS003/match-stats,01:42:00,102.0,1.0,1.0,56.0,77.0,37.0,...,7.0,21.0,1.0,5.0,10.0,11.0,26.0,77.0,63.0,138.0
4,0,2017-339-d875-tb69,/en/scores/2017/339/MS005/match-stats,02:05:00,125.0,3.0,3.0,52.0,94.0,42.0,...,19.0,42.0,1.0,14.0,14.0,14.0,29.0,94.0,82.0,176.0


In [25]:
stats_df = stats_df.dropna()

In [26]:
stats_df = stats_df.drop(columns=['loser_total_points_total'])

In [27]:
stats_df = stats_df.rename(columns={"winner_total_points_total": "total_points_played"})

In [28]:
stats_df["outcome"] = np.where(stats_df["winner_total_points_won"] > stats_df["loser_total_points_won"], 1.0, 2.0)

In [29]:
stats_df = stats_df.drop(columns=['tourney_order','match_id','match_stats_url_suffix','match_time'])

In [30]:
stats_df.head()

Unnamed: 0,match_duration,winner_aces,winner_double_faults,winner_first_serves_in,winner_first_serves_total,winner_first_serve_points_won,winner_first_serve_points_total,winner_second_serve_points_won,winner_second_serve_points_total,winner_break_points_saved,...,loser_second_serve_return_won,loser_second_serve_return_total,loser_break_points_converted,loser_break_points_return_total,loser_service_games_played,loser_return_games_played,loser_return_points_won,loser_return_points_total,loser_total_points_won,outcome
0,141.0,23.0,3.0,62.0,97.0,50.0,62.0,16.0,35.0,6.0,...,19.0,35.0,1.0,7.0,14.0,15.0,31.0,97.0,84.0,1.0
1,109.0,7.0,2.0,52.0,77.0,41.0,52.0,12.0,25.0,5.0,...,13.0,25.0,2.0,7.0,12.0,13.0,24.0,77.0,69.0,1.0
2,88.0,4.0,1.0,36.0,58.0,27.0,36.0,18.0,22.0,0.0,...,4.0,22.0,0.0,0.0,10.0,10.0,13.0,58.0,53.0,1.0
3,102.0,1.0,1.0,56.0,77.0,37.0,56.0,14.0,21.0,4.0,...,7.0,21.0,1.0,5.0,10.0,11.0,26.0,77.0,63.0,1.0
4,125.0,3.0,3.0,52.0,94.0,42.0,52.0,23.0,42.0,13.0,...,19.0,42.0,1.0,14.0,14.0,14.0,29.0,94.0,82.0,1.0


In [31]:
stats_df.dtypes

match_duration                      float64
winner_aces                         float64
winner_double_faults                float64
winner_first_serves_in              float64
winner_first_serves_total           float64
winner_first_serve_points_won       float64
winner_first_serve_points_total     float64
winner_second_serve_points_won      float64
winner_second_serve_points_total    float64
winner_break_points_saved           float64
winner_break_points_serve_total     float64
winner_service_points_won           float64
winner_service_points_total         float64
winner_first_serve_return_won       float64
winner_first_serve_return_total     float64
winner_second_serve_return_won      float64
winner_second_serve_return_total    float64
winner_break_points_converted       float64
winner_break_points_return_total    float64
winner_service_games_played         float64
winner_return_games_played          float64
winner_return_points_won            float64
winner_return_points_total      

In [32]:
X = stats_df.drop(['outcome'], axis=1)
#X = pd.get_dummies(X)
y = stats_df["outcome"]

In [33]:
from sklearn.model_selection import train_test_split
# Shuffle and split the dataset into training and testing set.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 1,
                                                    stratify = y)

In [34]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [45]:
classifier = LogisticRegression(solver='lbfgs', max_iter=2500, random_state=1)
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=2500, random_state=1)

In [46]:
y_pred = classifier.predict(X_test)

In [47]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = classifier.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9996639132437556

In [48]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test,y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        1.0       1.00      1.00      1.00      1.00      1.00      1.00     22311
        2.0       1.00      1.00      1.00      1.00      1.00      1.00      1594

avg / total       1.00      1.00      1.00      1.00      1.00      1.00     23905



In [49]:
# Display the confusion matrix
confusion_matrix(y_test,y_pred)

array([[22310,     1],
       [    1,  1593]])