# Machine Learning to predict wins in season 2001

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("data/ml_season_2001.csv")

In [3]:
data

Unnamed: 0,GAME_ID,GAME_DATE,HOME,AWAY,Height-1,Weight-1,Age-1,Mpg-1,Ppg-1,Fg-1,...,Fg-10,Fg3-10,Ft-10,Rpg-10,Apg-10,Spg-10,Bp-10,Tpg-10,Fpg-10,WL
0,20001020,2001-03-27,ATL,BOS,-0.12,-0.13,-0.33,0.14,0.32,0.06,...,-0.10,0.47,0.18,-0.08,0.03,-0.08,-0.24,0.08,-0.10,0
1,20000444,2001-01-02,ATL,BOS,-0.12,-0.13,-0.33,0.14,0.32,0.06,...,-0.10,0.47,0.18,-0.08,0.03,-0.08,-0.24,0.08,-0.10,1
2,20000840,2001-03-02,ATL,CHH,-0.07,-0.14,0.05,-0.02,0.19,0.02,...,0.00,0.47,-0.02,-0.04,0.02,0.08,0.05,0.15,0.07,0
3,20000004,2000-10-31,ATL,CHH,-0.07,-0.14,0.05,-0.02,0.19,0.02,...,0.00,0.47,-0.02,-0.04,0.02,0.08,0.05,0.15,0.07,0
4,20000903,2001-03-10,ATL,CHI,-0.12,-0.13,-0.24,0.19,0.34,0.00,...,0.02,0.47,0.46,-0.22,-0.01,-0.12,-0.14,0.08,-0.10,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1184,20000502,2001-01-10,WAS,SEA,0.04,0.16,0.19,-0.20,-0.22,-0.09,...,-0.07,0.00,0.20,0.22,0.05,0.16,-0.05,-0.07,0.00,0
1185,20001182,2001-04-18,WAS,TOR,-0.03,0.00,0.57,-0.16,-0.37,-0.09,...,0.08,-0.24,-0.29,0.02,0.07,0.00,-0.05,-0.07,-0.19,0
1186,20000794,2001-02-24,WAS,TOR,-0.03,0.00,0.57,-0.16,-0.37,-0.09,...,0.08,-0.24,-0.29,0.02,0.07,0.00,-0.05,-0.07,-0.19,0
1187,20000854,2001-03-04,WAS,UTA,-0.07,-0.02,0.29,0.03,0.13,-0.06,...,-0.02,-0.42,-0.23,0.28,0.02,0.04,0.03,-0.09,-0.17,0


In [4]:
X = data.iloc[:,4:-1]

In [5]:
y = data["WL"]

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=3)

In [7]:
from sklearn.metrics import accuracy_score

### ML Models

In [94]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=26)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
acc = accuracy_score(y_test, y_pred)
y_pred2 = knn.predict(X_train)
acc2 = accuracy_score(y_train, y_pred2)
print(f"For KNN, train accuracy = {acc2}.")
print(f"For KNN, test accuracy = {acc}.")

For KNN, train accuracy = 0.6992639327024185.
For KNN, test accuracy = 0.6764705882352942.


In [20]:
from sklearn.linear_model import LogisticRegression
logr = LogisticRegression(C=0.011)
logr.fit(X_train, y_train)
y_pred = logr.predict(X_test)
acc = accuracy_score(y_test, y_pred)
y_pred2 = logr.predict(X_train)
acc2 = accuracy_score(y_train, y_pred2)
print(f"For Logistic regression, train accuracy = {acc2}.")
print(f"For Logistic regression, accuracy = {acc}.")

For Logistic regression, train accuracy = 0.676130389064143.
For Logistic regression, accuracy = 0.6596638655462185.


In [32]:
from sklearn.svm import SVC
svc = SVC(C=0.25)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
acc = accuracy_score(y_test, y_pred)
y_pred2 = svc.predict(X_train)
acc2 = accuracy_score(y_train, y_pred2)
print(f"For SVC, train accuracy = {acc2}.")
print(f"For SVC, test accuracy = {acc}.")

For SVC, train accuracy = 0.7150368033648791.
For SVC, test accuracy = 0.6764705882352942.


In [91]:
import xgboost as xgb

xg = xgb.XGBClassifier(eta = 0.007, max_depth = 5, colsample_bytree = 0.5, subsample = 0.2, min_child_weight = 4)
xg.fit(X_train,y_train)
y_pred = xg.predict(X_test)
acc = accuracy_score(y_test, y_pred)
y_pred2 = xg.predict(X_train)
acc2 = accuracy_score(y_train, y_pred2)
print(f"For XGboost, train accuracy = {acc2}.")
print(f"For XGboost, accuracy = {acc}.")

For XGboost, train accuracy = 0.7350157728706624.
For XGboost, accuracy = 0.6722689075630253.


In [88]:
grid.best_params_

{'colsample_bytree': 0.5,
 'eta': 0.011,
 'max_depth': 6,
 'min_child_weight': 4,
 'subsample': 0.2}

### ML Optimization

In [63]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.01, 0.1, 1, 10, 100], 
              'gamma': [0.001, 0.01, 0.1, 1, 10]}

clf = SVC()
grid = GridSearchCV(clf,
                   param_grid,
                   cv=3, 
                   verbose=1,  
                  )
grid.fit(X_train, y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:    3.6s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 10, 100],
                         'gamma': [0.001, 0.01, 0.1, 1, 10]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [64]:
grid.best_params_

{'C': 10, 'gamma': 0.001}

In [92]:
acc_max = 0
k= 0

for i in range(5,100):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    if acc > acc_max:
        acc_max = acc
        k = i
    else : 
        continue
 
print(f"For KNN, test accuracy = {acc_max}.")
print(f"For KNN, k = {k}.")

For KNN, test accuracy = 0.6764705882352942.
For KNN, k = 26.


In [95]:
acc_max = 0
c_best = 0
C = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100]

for i in C:
    logr = LogisticRegression(C=i)
    logr.fit(X_train, y_train)
    y_pred = logr.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    if acc > acc_max:
        acc_max = acc
        c_best = i
    else : 
        continue
 
print(f"For LogR, test accuracy = {acc_max}.")
print(f"For LogR, C = {c_best}.")

For LogR, test accuracy = 0.6596638655462185.
For LogR, C = 0.01.


In [87]:
from sklearn.model_selection import GridSearchCV

param_grid = {'eta':np.arange(0.004, 0.012, 0.001), 
              'max_depth':range(3,8),
              'colsample_bytree': np.arange(0.1, 1, 0.1), 
              'subsample': np.arange(0.1, 1, 0.1),
              'min_child_weight':range(3,8)}

clf = xgb.XGBClassifier(eta = 0.007, max_depth = 4, min_child_weight = 4)
grid = GridSearchCV(clf,
                   param_grid,
                   cv=3, # In order to test the different hyperparameters (on the train set), 
                         # we use the `cross validation` technique.
                         # 3 represents the number of folds of the cross-val.
                   verbose=1,  # Setting Verbose adds some "prints" (logs) detailing
                   n_jobs=-1    # what is happening in backend
                                # The higher the setting, the higher the nb of logs printed
                  )
grid.fit(X_train, y_train)


Fitting 3 folds for each of 16200 candidates, totalling 48600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   17.1s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   30.7s
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:   49.0s
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 8442 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 9792 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 11242 tasks      |

GridSearchCV(cv=3, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, eta=0.007,
                                     gamma=None, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=4, min_child_weight=4,
                                     missing=nan, monotone_constraints=None,
                                     n_e...
             iid='deprecated', n_jobs=-1,
             param_grid={'colsample_bytree': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                         'eta': array([0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01 , 0.011]),
       

array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])