In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# which also hides any warnings raised by the model fits below.
# Consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")
# Allow up to 50 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 55.6 ms


In [2]:
def preprocess_data(df, numeric_features, categorical_features, label, random_state=7):
    """Split a labelled frame into CV / validation sets and build a preprocessor.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every feature column and the label column.
    numeric_features : list of str
        Columns to standardize with ``StandardScaler``.
    categorical_features : list of str
        Columns to one-hot encode.
    label : str
        Name of the target column; its values are integer-encoded with
        ``LabelEncoder``.
    random_state : int or None, default 7
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_cv, X_validation, y_cv, y_validation, preprocessor)`` where the
        validation part is a stratified 20% hold-out and ``preprocessor`` is an
        unfitted ``ColumnTransformer``.
    """
    numeric_features = list(numeric_features)
    categorical_features = list(categorical_features)

    X = df[numeric_features + categorical_features]
    y = LabelEncoder().fit_transform(df[label])

    # The transformer is returned unfitted so that it can be fitted inside
    # each cross-validation fold. handle_unknown='ignore' keeps a fold from
    # failing when a category appears only in the held-out part of the data.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

    X_cv, X_validation, y_cv, y_validation = train_test_split(
        X, y, test_size=0.2, random_state=random_state, stratify=y)

    return X_cv, X_validation, y_cv, y_validation, preprocessor

time: 4.44 ms


In [9]:
def cross_validate_model(model, param_grid, preprocessor, X_cv, y_cv, n_splits=5, n_repeats=3, random_state=7, preprocess=True, verbose=5):
    """Grid-search ``model`` with repeated stratified k-fold cross-validation.

    Parameters
    ----------
    model : estimator
        Classifier placed in the pipeline under the step name ``'classifier'``.
    param_grid : dict
        Mapping of the model's own parameter names to candidate values. The
        dict is not modified; a prefixed copy is handed to the search.
    preprocessor : transformer
        Placed in front of the classifier when ``preprocess`` is true.
    X_cv, y_cv
        Features and labels used for the search.
    n_splits, n_repeats : int
        Forwarded to ``RepeatedStratifiedKFold``.
    random_state : int
        The fold generator is seeded with ``random_state ** 2``.
    preprocess : bool
        When false the pipeline contains only the classifier.
    verbose : int
        Forwarded to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The fitted search, refit on the parameters with the best accuracy.
    """
    # Build a fresh dict instead of renaming keys on the caller's object, so
    # the same param_grid can be reused without its keys gaining a second
    # 'classifier__' prefix.
    prefixed_grid = {'classifier__' + name: values for name, values in param_grid.items()}

    steps = [('classifier', model)]
    if preprocess:
        steps.insert(0, ('preprocessor', preprocessor))
    pipe = Pipeline(steps=steps)

    # The squared seed is kept as-is so that the folds stay the same.
    folds = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state**2)

    grid = GridSearchCV(pipe, prefixed_grid, cv=folds, n_jobs=-1, return_train_score=True,
                        scoring=['accuracy', 'neg_log_loss'], refit='accuracy', verbose=verbose)

    grid.fit(X_cv, y_cv)

    return grid

time: 4.19 ms


In [10]:
def get_cross_validation_results(grid, X_validation, y_validation):
    """Print a summary of a fitted grid search and its validation accuracy.

    Parameters
    ----------
    grid : fitted GridSearchCV
        Must expose ``cv_results_`` with ``rank_test_accuracy`` and
        ``mean_test_neg_log_loss``, plus ``best_score_``, ``best_params_``
        and ``score``.
    X_validation, y_validation
        Held-out data passed to ``grid.score``.

    Returns
    -------
    None
        The summary is printed, not returned.
    """
    best_param_idx = grid.cv_results_['rank_test_accuracy'].argmin()
    # The search stores the negated log loss; flip the sign for reporting.
    cv_log_loss = -grid.cv_results_['mean_test_neg_log_loss'][best_param_idx]
    cv_accuracy = grid.best_score_

    # Strip the leading pipeline-step prefix into a new dict. grid.best_params_
    # is left untouched so this function can be called more than once on the
    # same search, and nested parameter names keep everything after the prefix.
    cv_params = {key.split("__", 1)[-1]: value for key, value in grid.best_params_.items()}

    validation_accuracy = grid.score(X_validation, y_validation)

    print("Cross Validation Results:\n\tAccuracy: {}\n\tLog Loss: {}\n\tBest Parameters: {}\nValidation Accuracy: {}".format(
        cv_accuracy, cv_log_loss, cv_params, validation_accuracy))

time: 2.38 ms


In [11]:
# Load the dataset from a CSV file; the path is relative to the notebook's
# working directory.
df = pd.read_csv("../data/all_preprocessed_data.csv")

time: 60.8 ms


In [12]:
# Column names treated as numeric inputs.
numeric_features = [
    'Age', 'G', 'GS', 'MP',
    'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA',
    'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
]
# Column names treated as categorical inputs.
categorical_features = ['Tm']
# Name of the target column.
label = 'Pos'

time: 1.11 ms


In [13]:
# Build the cross-validation / validation split and the unfitted preprocessor
# using the default seed of preprocess_data.
X_cv, X_validation, y_cv, y_validation, preprocessor = preprocess_data(
    df=df,
    numeric_features=numeric_features,
    categorical_features=categorical_features,
    label=label,
)

time: 23.5 ms


In [34]:
# Share of the validation labels taken by the most frequent class.
classes, counts = np.unique(y_validation, return_counts=True)
print('balance:', np.max(counts) / len(y_validation))

balance: 0.20885729907053036
time: 19.1 ms


In [18]:
# Logistic regression: search over the regularization parameter C with a
# single repeat of the cross-validation.
model = LogisticRegression(random_state=123)
param_grid = {
    'C': [.1, .25, .5, .75, 1],
}
grid = cross_validate_model(
    model=model,
    param_grid=param_grid,
    preprocessor=preprocessor,
    X_cv=X_cv,
    y_cv=y_cv,
    n_repeats=1,
    verbose=0,
)
get_cross_validation_results(grid=grid, X_validation=X_validation, y_validation=y_validation)

Cross Validation Results:
	Accuracy: 0.5986329460013671
	Log Loss: 0.9850961930205352
	Best Parameters: {'C': 0.75}
Validation Accuracy: 0.5893931109896118
time: 8.56 s


In [24]:
# Random forest: search over max_depth and min_samples_split with a single
# repeat of the cross-validation.
model = RandomForestClassifier(random_state=123)
param_grid = {
    'max_depth': [5, 10, 15, 20, 25],
    'min_samples_split': [3, 5, 7, 9, 11],
}
grid = cross_validate_model(
    model=model,
    param_grid=param_grid,
    preprocessor=preprocessor,
    X_cv=X_cv,
    y_cv=y_cv,
    n_repeats=1,
    verbose=0,
)
get_cross_validation_results(grid=grid, X_validation=X_validation, y_validation=y_validation)

Cross Validation Results:
	Accuracy: 0.6057416267942584
	Log Loss: 1.0578500772910315
	Best Parameters: {'max_depth': 20, 'min_samples_split': 9}
Validation Accuracy: 0.5986878075451066
time: 13.5 s


In [28]:
# Support vector classifier: search over C and gamma. probability=True is set
# so the model can produce the class probabilities that the neg_log_loss
# scorer in cross_validate_model needs.
model = SVC(random_state=123, probability=True)
param_grid = {
    'C': [150, 200, 250],
    'gamma': [1e-3, 1e-2, 1e-1],
}
grid = cross_validate_model(
    model=model,
    param_grid=param_grid,
    preprocessor=preprocessor,
    X_cv=X_cv,
    y_cv=y_cv,
    n_repeats=1,
    verbose=10,
)
get_cross_validation_results(grid=grid, X_validation=X_validation, y_validation=y_validation)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done  43 out of  45 | elapsed: 10.6min remaining:   29.5s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed: 11.0min finished


Cross Validation Results:
	Accuracy: 0.6642515379357484
	Log Loss: 0.7886814864811635
	Best Parameters: {'C': 200, 'gamma': 0.01}
Validation Accuracy: 0.6632039365773646
time: 11min 25s


In [33]:
# k-nearest neighbours: search over the neighbourhood size with a single
# repeat of the cross-validation.
model = KNeighborsClassifier()
param_grid = {
    'n_neighbors': [25, 50, 75],
}
grid = cross_validate_model(
    model=model,
    param_grid=param_grid,
    preprocessor=preprocessor,
    X_cv=X_cv,
    y_cv=y_cv,
    n_repeats=1,
    verbose=0,
)
get_cross_validation_results(grid=grid, X_validation=X_validation, y_validation=y_validation)

Cross Validation Results:
	Accuracy: 0.557758031442242
	Log Loss: 1.066151010006486
	Best Parameters: {'n_neighbors': 50}
Validation Accuracy: 0.5538545653362493
time: 59.3 s
