## Imports

In [54]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector 
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

from sklearn.model_selection import GridSearchCV, cross_val_score

import pickle

## Dataset

- Each observation represents a player
- Each column represents a characteristic of a player's performance

The target defines whether the player lasted less than 5 years (`0`) vs. 5 years or more (`1`) as a professional.

In [3]:
# Load the raw player statistics; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="NBA_players_data.csv")

# Preview the first five rows to check the columns and spot missing values.
data.head(n=5)

Unnamed: 0,games played,minutes played,points per game,field goals made,field goal attempts,field goal percent,3 point made,3 point attempt,3 point %,free throw made,free throw attempts,free throw %,offensive rebounds,defensive rebounds,rebounds,assists,steals,blocks,turnovers,target_5y
0,36.0,27.4,7.4,2.6,7.6,,0.5,2.1,25.0,1.6,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0
1,35.0,26.9,,2.0,6.7,29.6,0.7,2.8,23.5,2.6,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0
2,,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,0.9,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0
3,58.0,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,0.9,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1
4,48.0,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,1.3,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1


In [4]:
# Work on an independent (deep) copy so the frame loaded above stays untouched.
df = data.copy(deep=True)

In [5]:
# Split the frame into the feature matrix and the binary target.
# `columns=` already implies the column axis, so no separate `axis` argument is needed.
X = df.drop(columns=['target_5y'])
y = df['target_5y']

## Pipeline

In [42]:


# Column selectors: columns are picked by dtype when the transformer is fitted,
# so the same preprocessor adapts to whatever frame it is fitted on.
num_columns = make_column_selector(dtype_include=['float64', 'int64'])
cat_columns = make_column_selector(dtype_include=['object', 'bool'])

# Numerical branch: fill missing values (SimpleImputer defaults to the mean),
# then rescale each feature to the [0, 1] range.
num_transformer = make_pipeline(SimpleImputer(), MinMaxScaler())

# Categorical branch: dense one-hot encoding; categories unseen during fit are
# encoded as all zeros instead of raising.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the current name first and fall back for older releases.
try:
    cat_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    cat_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Route each group of columns to its branch; any column matched by neither
# selector is passed through unchanged.
preprocessor = make_column_transformer(
    (num_transformer, num_columns),
    (cat_transformer, cat_columns),
    remainder='passthrough'
)

# Sanity check: fit on the features and display the transformed matrix
# labelled with the generated feature names.
pd.DataFrame(preprocessor.fit_transform(X),
             columns=preprocessor.get_feature_names_out())

Unnamed: 0,pipeline__games played,pipeline__minutes played,pipeline__points per game,pipeline__field goals made,pipeline__field goal attempts,pipeline__field goal percent,pipeline__3 point made,pipeline__3 point attempt,pipeline__3 point %,pipeline__free throw made,pipeline__free throw attempts,pipeline__free throw %,pipeline__offensive rebounds,pipeline__defensive rebounds,pipeline__rebounds,pipeline__assists,pipeline__steals,pipeline__blocks,pipeline__turnovers
0,0.591549,0.247283,0.146520,0.181818,0.163158,0.593186,0.000000,0.000000,0.000000,0.092105,0.100000,0.683473,0.309524,0.202128,0.222222,0.096774,0.16,0.057143,0.170732
1,0.619718,0.616848,0.227106,0.252525,0.336842,0.292585,0.043478,0.061538,0.238000,0.184211,0.188889,0.742297,0.166667,0.244681,0.207407,0.698925,0.44,0.028571,0.585366
2,0.901408,0.296196,0.172161,0.181818,0.210526,0.384770,0.000000,0.000000,0.000000,0.171053,0.211111,0.589636,0.166667,0.212766,0.185185,0.096774,0.20,0.257143,0.365854
3,0.802817,0.331522,0.106227,0.131313,0.184211,0.292585,0.086957,0.169231,0.192000,0.026316,0.033333,0.500000,0.261905,0.191489,0.200000,0.096774,0.12,0.114286,0.146341
4,1.000000,0.758152,0.637363,0.737374,0.700000,0.609218,0.173913,0.169231,0.341000,0.342105,0.366667,0.735294,0.666667,0.797872,0.748148,0.215054,0.28,0.485714,0.512195
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
933,0.408451,0.086957,0.047619,0.050505,0.078947,0.206413,0.000000,0.046154,0.083000,0.078947,0.100000,0.557423,0.047619,0.031915,0.022222,0.053763,0.12,0.028571,0.146341
934,0.661972,0.448370,0.238095,0.282828,0.305263,0.456914,0.000000,0.030769,0.222000,0.157895,0.177778,0.693277,0.476190,0.414894,0.422222,0.129032,0.20,0.171429,0.292683
935,0.859155,0.211957,0.073260,0.090909,0.078947,0.543086,0.000000,0.000000,0.195199,0.052632,0.122222,0.242297,0.238095,0.138298,0.155556,0.064516,0.12,0.200000,0.146341
936,0.352113,0.279891,0.179487,0.202020,0.242105,0.386774,0.000000,0.015385,0.500000,0.131579,0.144444,0.747899,0.261905,0.127660,0.148148,0.096774,0.12,0.057143,0.195122


In [None]:
# Chain the preprocessing step and a default-configured SVC into a single estimator.
final_pipeline = make_pipeline(
    preprocessor,
    SVC(),
)

In [None]:
# List every tunable parameter (nested steps included); these names are the
# keys that a grid-search parameter grid must use.
final_pipeline.get_params(deep=True)

## Adjusting the Pipeline

We will search for the best hyperparameters for the model and the best imputation strategy for missing values.
<br>
We use precision as the scoring metric because we want to limit false positives (false alarms).

In [43]:
# Full estimator: preprocessing followed by a support vector classifier.
final_pipeline = make_pipeline(preprocessor, SVC())

# Search space. Keys use scikit-learn's nested-parameter syntax:
# <step name> + double underscore + <parameter name>, repeated per nesting level.
grid_params = dict(
    # Nine regularisation strengths from 0.1 to 0.9 (the stop value 1 is excluded).
    svc__C=np.arange(0.1, 1, 0.1),
    svc__kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    # Imputation strategy of the SimpleImputer inside the numerical branch.
    columntransformer__pipeline__simpleimputer__strategy=['mean', 'median', 'most_frequent'],
)

# Exhaustive search over the grid, scored by precision with 5-fold
# cross-validation, using every available CPU core.
grid = GridSearchCV(
    estimator=final_pipeline,
    param_grid=grid_params,
    scoring="precision",
    cv=5,
    n_jobs=-1,
)

In [44]:
# Run the search on the whole dataset (every parameter combination x 5 folds).
grid.fit(X, y)

# Report the winning parameter combination, then its mean cross-validated precision.
for best_result in (grid.best_params_, grid.best_score_):
    print(best_result)

{'columntransformer__pipeline__simpleimputer__strategy': 'mean', 'svc__C': 0.4, 'svc__kernel': 'poly'}
0.7630538676291507


Storing the best pipeline found by the grid search in a variable

In [None]:
# Keep the best estimator found by the search: the full pipeline configured
# with the winning parameters (not just the parameter dict). The search above
# does not override `refit`, so with GridSearchCV's default this pipeline has
# already been refitted on all of X, y.
best_pipeline = grid.best_estimator_

In [52]:
# Re-evaluate the selected pipeline with 5-fold cross-validated precision.
# NOTE(review): this runs on the same data the grid search used for model
# selection, and the mean printed below equals grid.best_score_ — it is not an
# independent (held-out) estimate.
scores = cross_val_score(estimator=best_pipeline, X=X, y=y, scoring="precision", cv=5)
score = scores.mean()

print(scores)
print(f'\nThe mean score is: {score}')

[0.74358974 0.76315789 0.79285714 0.75316456 0.7625    ]

The mean score is: 0.7630538676291507


## Export

Once the best parameters are found, we export the fitted pipeline as a pickle file.

In [55]:
# Serialise the fitted pipeline to disk so it can be reloaded for prediction.
# Only unpickle this file from a trusted source: loading a pickle executes code.
with open("pipeline.pkl", mode="wb") as pickle_file:
    pickle.dump(best_pipeline, pickle_file)