In [1]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SequentialFeatureSelector, SelectFromModel
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer, make_column_selector

from sklearn import set_config
set_config(display="diagram")

import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load data on player stats and salaries
df = pd.read_csv('datasets/nba_2022-23_all_stats_with_salary.csv', index_col=0)

# Fill columns with NaN values with zeros
columns_to_fill = ['FT%', '3P%', '2P%', 'eFG%', 'FG%',  '3PAr', 'FTr', 'TOV%', 'TS%']
df[columns_to_fill] = df[columns_to_fill].fillna(0)

# Display first five rows of dataframe
df.head()

Unnamed: 0,Player Name,Salary,Position,Age,Team,GP,GS,MP,FG,FGA,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,Stephen Curry,48070014,PG,34,GSW,56,56,34.7,10.0,20.2,...,12.5,31.0,5.8,2.0,7.8,0.192,7.5,0.1,7.5,4.7
1,John Wall,47345760,PG,32,LAC,34,3,22.2,4.1,9.9,...,17.1,27.0,-0.4,0.7,0.3,0.02,-0.8,-0.4,-1.2,0.1
2,Russell Westbrook,47080179,PG,34,LAL/LAC,73,24,29.1,5.9,13.6,...,18.4,27.7,-0.6,2.6,1.9,0.044,0.3,-0.1,0.2,1.2
3,LeBron James,44474988,PF,38,LAL,55,54,35.5,11.1,22.2,...,11.6,33.3,3.2,2.4,5.6,0.138,5.5,0.6,6.1,4.0
4,Kevin Durant,44119845,PF,34,BRK/PHO,47,47,35.6,10.3,18.3,...,13.4,30.7,4.7,2.1,6.8,0.194,6.0,1.2,7.1,3.9


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 467 entries, 0 to 466
Data columns (total 51 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Player Name    467 non-null    object 
 1   Salary         467 non-null    int64  
 2   Position       467 non-null    object 
 3   Age            467 non-null    int64  
 4   Team           467 non-null    object 
 5   GP             467 non-null    int64  
 6   GS             467 non-null    int64  
 7   MP             467 non-null    float64
 8   FG             467 non-null    float64
 9   FGA            467 non-null    float64
 10  FG%            467 non-null    float64
 11  3P             467 non-null    float64
 12  3PA            467 non-null    float64
 13  3P%            467 non-null    float64
 14  2P             467 non-null    float64
 15  2PA            467 non-null    float64
 16  2P%            467 non-null    float64
 17  eFG%           467 non-null    float64
 18  FT             

In [138]:
players = df.drop(['Player Name', 'Position', 'Team'], axis = 1)
players = players[players['Total Minutes'] >= 120].reset_index(drop = True)

correlation_matrix = players.corr()

corr_threshold = 0.80
correlated_features = set()

for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > corr_threshold:
            colname_i = correlation_matrix.columns[i]
            colname_j = correlation_matrix.columns[j]
            if colname_i not in correlated_features:
                correlated_features.add(colname_j)

players.drop(columns=correlated_features, inplace=True)

X = players.drop('Salary', axis = 1)
y = players['Salary']

print(X.shape)
X.head()

(398, 19)


Unnamed: 0,Age,3PA,3P%,2P%,FT%,TRB,STL,PF,TS%,3PAr,FTr,TRB%,AST%,STL%,BLK%,TOV%,USG%,DBPM,VORP
0,34,11.4,0.427,0.579,0.915,6.1,0.9,2.1,0.656,0.564,0.248,9.7,30.0,1.3,0.9,12.5,31.0,0.1,4.7
1,32,3.2,0.303,0.459,0.681,2.7,0.8,1.7,0.498,0.322,0.334,6.8,35.3,1.8,1.4,17.1,27.0,-0.4,0.1
2,34,3.9,0.311,0.487,0.656,5.8,1.0,2.2,0.513,0.289,0.317,10.8,38.6,1.7,1.3,18.4,27.7,-0.1,1.2
3,38,6.9,0.321,0.58,0.768,8.3,0.9,1.6,0.583,0.309,0.268,12.5,33.5,1.2,1.4,11.6,33.3,0.6,4.0
4,34,4.9,0.404,0.617,0.919,6.7,0.7,2.1,0.677,0.267,0.387,10.5,24.5,1.0,3.4,13.4,30.7,1.2,3.9


In [139]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state=42)

In [140]:
pipeline = Pipeline([
    ('poly_features', PolynomialFeatures(degree=1, include_bias=False)),
    ('scaler', StandardScaler()),
    ('selector', SelectFromModel(Ridge(alpha=0.01))),
    ('model', Ridge())
])

In [151]:
def do_search(estimator, param_grid, cv, verbose = 4):
    grid_search = GridSearchCV(estimator = estimator, param_grid = param_grid, cv = cv, verbose = verbose)
    grid_search.fit(X, y)
    best_estimator = grid_search.best_estimator_

    train_mse = mean_squared_error(y_train, best_estimator.predict(X_train))
    test_mse = mean_squared_error(y_test, best_estimator.predict(X_test))
    print(f'{train_mse:.4e}')
    print(f'{test_mse:.4e}')
    print(grid_search.best_params_)

    # f, ax = plt.subplots(figsize=(8,5))
    # ax.scatter(y_test, best_estimator.predict(X_test))
    # plt.show()

    return grid_search


In [159]:
# Holdout cross validation
param_grid = {
    'selector__estimator': [Ridge(alpha=i) for i in 10**np.arange(1, 12)],
    'model__alpha': 10**np.arange(1, 12),
    'poly_features__degree': [2, 3],
}
hocv = do_search(pipeline, param_grid, cv = [[list(X_train.index), list(X_test.index)]], verbose = 1)

Fitting 1 folds for each of 242 candidates, totalling 242 fits
2.0810e+13
2.7944e+13
{'model__alpha': 100, 'poly_features__degree': 3, 'selector__estimator': Ridge(alpha=100000)}


In [160]:
# 5-fold cross validation
param_grid = {
    'selector__estimator': [Ridge(alpha=i) for i in 10**np.arange(1, 12)],
    'model__alpha': 10**np.arange(1, 12),
    'poly_features__degree': [2, 3],
}
kfcv = do_search(pipeline, param_grid, cv = 5, verbose = 1)

Fitting 5 folds for each of 242 candidates, totalling 1210 fits
1.4585e+13
2.0112e+13
{'model__alpha': 10, 'poly_features__degree': 3, 'selector__estimator': Ridge(alpha=10000)}


In [168]:
# One out
param_grid = {
    'selector__estimator': [Ridge(alpha=i) for i in 10**np.arange(1, 12)],
    'model__alpha': 10**np.arange(1, 12),
    'poly_features__degree': [2, 3],
}
loocv = do_search(pipeline, param_grid, cv = 15, verbose = 1)

Fitting 15 folds for each of 242 candidates, totalling 3630 fits
1.4585e+13
2.0112e+13
{'model__alpha': 10, 'poly_features__degree': 3, 'selector__estimator': Ridge(alpha=10000)}


In [96]:
pipeline2 = Pipeline([
    ('poly_features', PolynomialFeatures(degree=1, include_bias=False)),
    ('scaler', StandardScaler()),
    ('selector', SequentialFeatureSelector(LinearRegression(), n_features_to_select=12)),
    ('ridge', Ridge())
])

In [None]:
# Holdout cross validation
param_grid2 = {
    'selector__n_features_to_select': np.array([5, 15, 25]),
    'ridge__alpha': 10**np.array([1, 2, 3, 4, 5, 6]),
    'poly_features__degree': [2, 3],
}
do_search(pipeline2, param_grid2, cv = [[list(X_train.index), list(X_test.index)]])


In [None]:
# 5-fold cross validation
param_grid2 = {
    'selector__n_features_to_select': np.array([5, 15, 25]),
    'ridge__alpha': 10**np.array([1, 2, 3, 4, 5, 6]),
    'poly_features__degree': [2, 3],
}
do_search(pipeline2, param_grid2, cv = 5)


In [None]:
# LOO cross validation
param_grid2 = {
    'selector__n_features_to_select': np.array([5, 15, 25]),
    'ridge__alpha': 10**np.array([1, 2, 3, 4, 5, 6]),
    'poly_features__degree': [2, 3],
}
do_search(pipeline2, param_grid2, cv = len(X.columns))
