In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Dataset Preparation

In [2]:
# Read both source files in one pass:
#   nba   - the historical 2000s file (paired seasons, per the columns used below)
#   nba22 - the 2021-22 file, used later for the 2022-23 predictions
nba, nba22 = (
    pd.read_csv(csv_path)
    for csv_path in ("historical-data/nba2000s.csv", "historical-data/nba22.csv")
)

In [3]:
# Minimum games played for a player-season to be kept.
# NOTE(review): 41 is presumably half of an 82-game regular season - confirm.
MIN_GP = 41

# Keep only rows that meet the GP minimum. The historical frame must meet it in
# both seasons (GP and GP2); the 2021-22 frame is checked on GP only.
# .copy() makes each filtered result an independent frame, so the column
# assignments in later cells (PRED_RF, d_PRED, RES_RF) do not trigger
# SettingWithCopyWarning on pandas versions without copy-on-write.
nba = nba[(nba.GP >= MIN_GP) & (nba.GP2 >= MIN_GP)].copy()
nba22 = nba22[nba22.GP >= MIN_GP].copy()

# Regression Model

#### Points Per Game in Year 2 should depend on Year 1 attributes:

* Age
* Minutes
* Points
* FG%
* 3P%
* FT%
* USG%

In [4]:
# Year 1 predictor columns, in the order they are passed to the model
cols = [
    'AGE',
    'MIN',
    'PPG',
    'FG%',
    '3P%',
    'FT%',
    'USG%',
]

### Training Set

In [5]:
# Training rows: every season pair whose second year is not 2022,
# restricted to the predictor columns plus the PPG2 target
is_training_row = nba.YEAR2 != 2022
train_set = nba.loc[is_training_row, cols + ['PPG2']]

train_set.head()

Unnamed: 0,AGE,MIN,PPG,FG%,3P%,FT%,USG%,PPG2
0,36,31.1,7.6,41.9,33.8,77.7,14.0,8.4
1,29,28.5,10.2,45.7,0.0,78.7,17.7,7.2
2,26,29.2,9.8,43.0,30.6,75.2,17.2,11.8
3,25,34.7,15.8,45.2,26.6,83.3,22.8,16.5
4,25,41.4,24.9,44.2,30.2,71.5,27.6,19.7


In [6]:
# Split the training frame into the Year 2 scoring target (Y)
# and the Year 1 feature matrix (X)
Y = train_set['PPG2']
X = train_set[cols]

### Random Forest

In [7]:
# Fit a random forest regressor on the training data (seeded with random_state=0)
random_forest = RandomForestRegressor(random_state=0).fit(X, Y)
random_forest

RandomForestRegressor(random_state=0)

In [8]:
# Predict Year 2 PPG for every row with a single vectorized call.
# Passing the DataFrame slice (rather than one bare array per row via apply)
# avoids a separate predict() call per player and keeps the column names the
# model was fitted with, so scikit-learn does not warn about feature names.
nba['PRED_RF'] = random_forest.predict(nba[cols])

# Predicted change in scoring relative to Year 1
nba['d_PRED'] = nba.PRED_RF - nba.PPG

# Residual: actual Year 2 PPG minus the prediction
# (positive means the player scored more than the model predicted)
nba['RES_RF'] = nba.PPG2 - nba.PRED_RF

# Model Results

In [9]:
# Order rows from the largest to the smallest random forest residual,
# then renumber the index to match the new order
nba = nba.sort_values(by=['RES_RF'], ascending=False).reset_index(drop=True)

# Show the first five rows (largest residuals) among rows with YEAR2 == 2022
display_cols = ['PLAYER', 'TEAM', 'PPG', 'PPG2', 'd_PPG', 'd_PRED', 'PRED_RF', 'RES_RF']
nba2022 = nba.loc[nba.YEAR2 == 2022].round(1).reset_index(drop=True)
nba2022[display_cols].head()

Unnamed: 0,PLAYER,TEAM,PPG,PPG2,d_PPG,d_PRED,PRED_RF,RES_RF
0,DeMar DeRozan,SAS,21.6,27.9,6.3,-1.6,20.0,7.9
1,Reggie Jackson,LAC,10.7,16.8,6.1,-1.7,9.0,7.8
2,Desmond Bane,MEM,9.2,18.2,9.0,1.4,10.6,7.6
3,Anfernee Simons,POR,7.8,17.3,9.5,2.0,9.8,7.5
4,Tyrese Maxey,PHI,8.0,17.5,9.5,2.1,10.1,7.4


# Model Validation

In [10]:
# Evaluation rows: the season pairs with YEAR2 == 2022, which were excluded from training
test_set = nba[nba.YEAR2 == 2022]

# Sum of squared residuals, and total sum of squares around the mean actual PPG2
ss_res = np.sum(test_set.RES_RF ** 2)
ss_tot = np.sum((np.mean(test_set.PPG2) - test_set.PPG2) ** 2)

# R-squared = 1 - SS_res / SS_tot; RMSE = sqrt(SS_res / n)
r_squared = 1 - (ss_res / ss_tot)
rmse = np.sqrt(ss_res / len(test_set))

print("R-Squared:", round(r_squared, 2))
print("RMSE:", round(rmse, 2))

R-Squared: 0.79
RMSE: 2.86


# 2022-23 Predictions

In [None]:
# Predict next-season PPG for every 2021-22 row with a single vectorized call.
# Passing the DataFrame slice (rather than one bare array per row via apply)
# avoids a separate predict() call per player and keeps the column names the
# model was fitted with, so scikit-learn does not warn about feature names.
nba22['PRED_RF'] = random_forest.predict(nba22[cols])

# Predicted change in scoring relative to the 2021-22 PPG
nba22['d_PRED'] = nba22.PRED_RF - nba22.PPG

In [None]:
# Order players by predicted change in PPG (d_PRED), largest first,
# then renumber the index to match the new order
nba22 = nba22.sort_values(by=['d_PRED'], ascending=False).reset_index(drop=True)

# Keep only the columns of interest, rounded to one decimal place
results_cols = ['PLAYER', 'TEAM', 'GP', 'PPG', 'PRED_RF', 'd_PRED']
results22 = nba22[results_cols].round(1)
results22.head()

# Export Data

In [None]:
# Build the export table from the rows with YEAR2 == 2022.
# The index is renumbered from 0 so RANK is the 1-based row position; the rows
# keep whatever order nba currently has (sorted by RES_RF in an earlier cell).
export_cols = ['RANK', 'PLAYER', 'TEAM', 'PPG', 'PPG2', 'PRED_RF', 'RES_RF']
predictions = (
    nba[nba.YEAR2 == 2022]
    .reset_index(drop=True)
    .assign(RANK=lambda frame: frame.index + 1)
    .loc[:, export_cols]
    .round(1)
)

In [None]:
# Write the 2022 predictions table to a CSV file at a relative path.
# to_csv is called with its defaults, so the DataFrame index is written as well.
export_path = "nba_predictions_2022.csv"
predictions.to_csv(export_path)