In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn import neighbors
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
%matplotlib inline

In [2]:
# Read the NBA team box-score CSV (six seasons of games) into a DataFrame.

box_score_csv = '2012-18_teamBoxScore.csv'
df = pd.read_csv(box_score_csv)

In [3]:
# Derive "net" columns (team value minus opponent value) for each counting stat.
# net_points is the regression target (net point differential); the rest are features.

net_column_suffixes = [
    ('net_points', 'PTS'),
    ('net_assists', 'AST'),
    ('net_turnovers', 'TO'),
    ('net_steals', 'STL'),
    ('net_blocks', 'BLK'),
    ('net_fouls', 'PF'),
    ('net_rebounds', 'TRB'),
    ('net_fourth', 'PTS4'),
]
for net_name, stat_suffix in net_column_suffixes:
    # Each source stat exists twice in the frame: 'team<STAT>' and 'oppt<STAT>'.
    df[net_name] = df['team' + stat_suffix] - df['oppt' + stat_suffix]

In [4]:
# Baseline: fit ordinary least squares on every row (no hold-out) to see how
# much of net_points the net_* features explain.

feature_cols = ['net_assists', 'net_turnovers', 'net_steals', 'net_blocks', 'net_fouls', 'net_rebounds', 'net_fourth']

Y = df['net_points']
# add_constant prepends the intercept column that shows up as 'const' in the summary.
X = sm.add_constant(df[feature_cols])

ols_model = sm.OLS(Y, X)
results = ols_model.fit()

results.summary()

  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,net_points,R-squared:,0.652
Model:,OLS,Adj. R-squared:,0.652
Method:,Least Squares,F-statistic:,3954.0
Date:,"Wed, 07 Aug 2019",Prob (F-statistic):,0.0
Time:,21:18:56,Log-Likelihood:,-51770.0
No. Observations:,14758,AIC:,103600.0
Df Residuals:,14750,BIC:,103600.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-6.939e-18,0.067,-1.04e-16,1.000,-0.130,0.130
net_assists,0.8972,0.011,83.936,0.000,0.876,0.918
net_turnovers,-0.5903,0.021,-28.181,0.000,-0.631,-0.549
net_steals,0.4137,0.025,16.821,0.000,0.365,0.462
net_blocks,0.5534,0.019,28.830,0.000,0.516,0.591
net_fouls,-0.2423,0.013,-18.570,0.000,-0.268,-0.217
net_rebounds,0.6756,0.009,79.168,0.000,0.659,0.692
net_fourth,0.2718,0.009,28.914,0.000,0.253,0.290

0,1,2,3
Omnibus:,5.689,Durbin-Watson:,2.996
Prob(Omnibus):,0.058,Jarque-Bera (JB):,4.967
Skew:,0.0,Prob(JB):,0.0835
Kurtosis:,3.09,Cond. No.,9.63


In [5]:
# Hold out 20% of the rows as a test set and fit OLS on the remaining 80%.

X = df[['net_assists', 'net_turnovers', 'net_steals', 'net_blocks', 'net_fouls', 'net_rebounds', 'net_fourth']]
Y = df['net_points']

X = sm.add_constant(X)

# Seed the shuffle so the split, and therefore the summary shown here, is the
# same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

results = sm.OLS(y_train, X_train).fit()

results.summary()

0,1,2,3
Dep. Variable:,net_points,R-squared:,0.65
Model:,OLS,Adj. R-squared:,0.65
Method:,Least Squares,F-statistic:,3133.0
Date:,"Wed, 07 Aug 2019",Prob (F-statistic):,0.0
Time:,21:18:56,Log-Likelihood:,-41412.0
No. Observations:,11806,AIC:,82840.0
Df Residuals:,11798,BIC:,82900.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0468,0.074,-0.630,0.529,-0.193,0.099
net_assists,0.8938,0.012,74.617,0.000,0.870,0.917
net_turnovers,-0.6025,0.023,-25.657,0.000,-0.649,-0.556
net_steals,0.4060,0.027,14.778,0.000,0.352,0.460
net_blocks,0.5469,0.022,25.362,0.000,0.505,0.589
net_fouls,-0.2468,0.015,-16.956,0.000,-0.275,-0.218
net_rebounds,0.6727,0.010,70.078,0.000,0.654,0.692
net_fourth,0.2764,0.011,26.280,0.000,0.256,0.297

0,1,2,3
Omnibus:,4.222,Durbin-Watson:,2.027
Prob(Omnibus):,0.121,Jarque-Bera (JB):,4.427
Skew:,-0.013,Prob(JB):,0.109
Kurtosis:,3.091,Cond. No.,9.6


In [6]:
# Shared setup for the scikit-learn linear models in the following cells
# (plain least squares, lasso, ridge, elastic net): features, target, a
# train/test split, and the grid of regularisation strengths to search.

X = df[['net_assists', 'net_turnovers', 'net_steals', 'net_blocks', 'net_fouls', 'net_rebounds', 'net_fourth']]
Y = df['net_points']

# No sm.add_constant here: the scikit-learn estimators fit their own intercept
# by default, so an extra all-ones column carries no information for them.

# Seed the shuffle so every model below is scored on the same, reproducible split.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Candidate alphas: 1e-5, 1e-4, ..., 1e5 (powers of ten). Vanishingly small
# alphas leave the solvers poorly conditioned and astronomically large ones just
# shrink every coefficient to zero, so the search stays in a practical range.
alphas = np.logspace(-5, 5, 11)

In [7]:
# Plain least squares via scikit-learn, evaluated on the held-out split.
lrm = LinearRegression()
lrm.fit(X_train, y_train)

y_preds_train = lrm.predict(X_train)
y_preds_test = lrm.predict(X_test)

# NOTE(review): the percentage error divides by y_test (net points), so games
# decided by only a point or two dominate it; read that figure with caution.
abs_pct_error = np.abs((y_test - y_preds_test) / y_test)

# (format string, value) pairs, printed in order.
report = [
    ("R-squared of the model in training set is: {}", lrm.score(X_train, y_train)),
    ("-----Test set statistics-----", None),
    ("R-squared of the model in test set is: {}", lrm.score(X_test, y_test)),
    ("Mean absolute error of the prediction is: {}", mean_absolute_error(y_test, y_preds_test)),
    ("Mean squared error of the prediction is: {}", mse(y_test, y_preds_test)),
    ("Root mean squared error of the prediction is: {}", rmse(y_test, y_preds_test)),
    ("Mean absolute percentage error of the prediction is: {}", np.mean(abs_pct_error) * 100),
]
for template, value in report:
    print(template.format(value))

R-squared of the model in training set is: 0.6509743544097176
-----Test set statistics-----
R-squared of the model in test set is: 0.6572757151443493
Mean absolute error of the prediction is: 6.4297060450247105
Mean squared error of the prediction is: 64.52294365389263
Root mean squared error of the prediction is: 8.03261748459944
Mean absolute percentage error of the prediction is: 96.44565146624794


In [8]:
# Lasso with the penalty strength chosen by 5-fold CV over `alphas`,
# then evaluated on the held-out split.
lasso_cv = LassoCV(alphas=alphas, cv=5)
lasso_cv.fit(X_train, y_train)

y_preds_train = lasso_cv.predict(X_train)
y_preds_test = lasso_cv.predict(X_test)

# NOTE(review): the percentage error divides by y_test (net points), so games
# decided by only a point or two dominate it; read that figure with caution.
abs_pct_error = np.abs((y_test - y_preds_test) / y_test)

# (format string, value) pairs, printed in order.
report = [
    ("Best alpha value is: {}", lasso_cv.alpha_),
    ("R-squared of the model in training set is: {}", lasso_cv.score(X_train, y_train)),
    ("-----Test set statistics-----", None),
    ("R-squared of the model in test set is: {}", lasso_cv.score(X_test, y_test)),
    ("Mean absolute error of the prediction is: {}", mean_absolute_error(y_test, y_preds_test)),
    ("Mean squared error of the prediction is: {}", mse(y_test, y_preds_test)),
    ("Root mean squared error of the prediction is: {}", rmse(y_test, y_preds_test)),
    ("Mean absolute percentage error of the prediction is: {}", np.mean(abs_pct_error) * 100),
]
for template, value in report:
    print(template.format(value))

  tol, rng, random, positive)
  tol, rng, random, positive)


Best alpha value is: 0.01
R-squared of the model in training set is: 0.6509742473856758
-----Test set statistics-----
R-squared of the model in test set is: 0.6572778063181777
Mean absolute error of the prediction is: 6.42968676960871
Mean squared error of the prediction is: 64.5225499593193
Root mean squared error of the prediction is: 8.032592978566717
Mean absolute percentage error of the prediction is: 96.43110347142402


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


In [9]:
# Ridge with the penalty strength chosen by 5-fold CV over `alphas`,
# then evaluated on the held-out split.
ridge_cv = RidgeCV(alphas=alphas, cv=5)
ridge_cv.fit(X_train, y_train)

y_preds_train = ridge_cv.predict(X_train)
y_preds_test = ridge_cv.predict(X_test)

# NOTE(review): the percentage error divides by y_test (net points), so games
# decided by only a point or two dominate it; read that figure with caution.
abs_pct_error = np.abs((y_test - y_preds_test) / y_test)

# (format string, value) pairs, printed in order.
report = [
    ("Best alpha value is: {}", ridge_cv.alpha_),
    ("R-squared of the model in training set is: {}", ridge_cv.score(X_train, y_train)),
    ("-----Test set statistics-----", None),
    ("R-squared of the model in test set is: {}", ridge_cv.score(X_test, y_test)),
    ("Mean absolute error of the prediction is: {}", mean_absolute_error(y_test, y_preds_test)),
    ("Mean squared error of the prediction is: {}", mse(y_test, y_preds_test)),
    ("Root mean squared error of the prediction is: {}", rmse(y_test, y_preds_test)),
    ("Mean absolute percentage error of the prediction is: {}", np.mean(abs_pct_error) * 100),
]
for template, value in report:
    print(template.format(value))

  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


Best alpha value is: 1000.0
R-squared of the model in training set is: 0.6509721284242674
-----Test set statistics-----
R-squared of the model in test set is: 0.6572651482619425
Mean absolute error of the prediction is: 6.429708884694992
Mean squared error of the prediction is: 64.52493302665746
Root mean squared error of the prediction is: 8.032741314561141
Mean absolute percentage error of the prediction is: 96.37304464308406


In [10]:
# Elastic net with the penalty strength chosen by 5-fold CV over `alphas`,
# then evaluated on the held-out split.
elasticnet_cv = ElasticNetCV(alphas=alphas, cv=5)
elasticnet_cv.fit(X_train, y_train)

y_preds_train = elasticnet_cv.predict(X_train)
y_preds_test = elasticnet_cv.predict(X_test)

# NOTE(review): the percentage error divides by y_test (net points), so games
# decided by only a point or two dominate it; read that figure with caution.
abs_pct_error = np.abs((y_test - y_preds_test) / y_test)

# (format string, value) pairs, printed in order.
report = [
    ("Best alpha value is: {}", elasticnet_cv.alpha_),
    ("R-squared of the model in training set is: {}", elasticnet_cv.score(X_train, y_train)),
    ("-----Test set statistics-----", None),
    ("R-squared of the model in test set is: {}", elasticnet_cv.score(X_test, y_test)),
    ("Mean absolute error of the prediction is: {}", mean_absolute_error(y_test, y_preds_test)),
    ("Mean squared error of the prediction is: {}", mse(y_test, y_preds_test)),
    ("Root mean squared error of the prediction is: {}", rmse(y_test, y_preds_test)),
    ("Mean absolute percentage error of the prediction is: {}", np.mean(abs_pct_error) * 100),
]
for template, value in report:
    print(template.format(value))

Best alpha value is: 0.01
R-squared of the model in training set is: 0.6509742943239358
-----Test set statistics-----
R-squared of the model in test set is: 0.6572762629971989
Mean absolute error of the prediction is: 6.429695563284882
Mean squared error of the prediction is: 64.522840512446
Root mean squared error of the prediction is: 8.032611064432661
Mean absolute percentage error of the prediction is: 96.43406915564965


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


Despite Python's many convergence warnings, the linear models (plain OLS plus its lasso, ridge, and elastic-net variants) all performed very similarly to each other, explaining about 2/3 of the variance with just a few features! Let's move on to KNN and see if it performs any better...

In [11]:
# K-nearest-neighbours regression (k=25) on the same net_* features,
# comparing uniform and distance-weighted neighbour averaging.
X = df[['net_assists', 'net_turnovers', 'net_steals', 'net_blocks', 'net_fouls', 'net_rebounds', 'net_fourth']]
Y = df['net_points']

# unweighted
knn = neighbors.KNeighborsRegressor(n_neighbors=25)
knn.fit(X, Y)

# weighted
knn_w = neighbors.KNeighborsRegressor(n_neighbors=25, weights='distance')
knn_w.fit(X, Y)

# 5-fold cross-validation. With no `scoring` argument, cross_val_score uses the
# estimator's own score method, which for a regressor is R-squared (not
# classification accuracy), so the printed labels say R-squared.
score = cross_val_score(knn, X, Y, cv=5)
print("Unweighted R-squared: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))
score_w = cross_val_score(knn_w, X, Y, cv=5)
print("Weighted R-squared: %0.2f (+/- %0.2f)" % (score_w.mean(), score_w.std() * 2))

Unweighted Accuracy: 0.63 (+/- 0.03)
Weighted Accuracy: 0.63 (+/- 0.03)


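The cross-validation scores of the KNN models are fairly similar to the R-squared of the linear models. For a regressor, the cross-validation score is itself R-squared, so the two are comparable; they are not computed in exactly the same way (an average over 5 folds versus a single hold-out split), but for purposes of this exercise it's a good baseline. That being the case, there's no particularly compelling reason to prefer one model to another given this basic feature set.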

Adding in some intuition leads me to believe that KNN is probably better than OLS for this particular application, given the likelihood of multicollinearity between some of the chosen features.