In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Dataset Preparation

In [2]:
# Read both CSV inputs in order:
#   nba   - historical data covering 2000 through 2022
#   nba22 - data for the 2021-22 season only
nba, nba22 = (
    pd.read_csv(csv_path)
    for csv_path in ("historical-data/nba2000s.csv", "historical-data/nba22.csv")
)

In [3]:
# Keep only rows with at least 41 games played. The historical table must
# meet the minimum in both GP and GP2; the 2021-22 table only in GP.
nba = nba.loc[nba.GP.ge(41) & nba.GP2.ge(41)]
nba22 = nba22.loc[nba22.GP.ge(41)]

# Regression Model

#### Points Per Game in Year 2 should depend on Year 1 attributes:

* Age
* Minutes
* Points
* FG%
* 3P%
* FT%
* USG%

In [10]:
# Year 1 feature columns used as model inputs
cols = [
    'AGE',
    'MIN',
    'PPG',
    'FG%',
    '3P%',
    'FT%',
    'USG%',
]

### Training Set

In [25]:
# Training rows: every row whose YEAR2 is not 2022, restricted to the
# predictor columns plus the target column PPG2
is_training_row = nba.YEAR2 != 2022
train_set = nba.loc[is_training_row, cols + ['PPG2']]

train_set.head()

Unnamed: 0,AGE,MIN,PPG,FG%,3P%,FT%,USG%,PPG2
12,25,22.7,9.0,38.7,35.3,78.5,18.1,18.0
14,23,15.7,6.8,43.6,39.6,69.9,19.8,20.8
16,22,37.6,19.6,39.2,37.5,74.8,26.5,25.5
18,24,20.7,7.3,50.8,22.0,62.1,15.4,16.9
20,21,16.9,8.4,51.3,0.0,75.8,22.1,20.1


In [12]:
# Split the training frame into the feature matrix (X) and the target vector (Y)
X, Y = train_set[cols], train_set['PPG2']

### Random Forest

In [13]:
# Build the regressor with a fixed seed, then fit it on the training data.
# The fit call stays as the last expression so the cell displays the estimator.
random_forest = RandomForestRegressor(random_state=0)
random_forest.fit(X=X, y=Y)

RandomForestRegressor(random_state=0)

In [14]:
# Score every row with a single batched predict call rather than one call per
# row. Passing the feature DataFrame directly also keeps the column names the
# model was fitted with, instead of handing it an unnamed list.
nba['PRED_RF'] = random_forest.predict(nba[cols])

# Predicted change: predicted Year 2 PPG minus Year 1 PPG
nba['d_PRED'] = nba.PRED_RF - nba.PPG

# Residual: actual Year 2 PPG minus predicted Year 2 PPG
nba['RES_RF'] = nba.PPG2 - nba.PRED_RF

# Model Results

In [33]:
# Order all rows by random forest residual, largest positive residual first,
# and renumber the index to match the new order
nba = nba.sort_values(by='RES_RF', ascending=False).reset_index(drop=True)

# Preview the leading 2021-22 rows (head() shows the first five), rounded to one decimal
season_2022 = nba.YEAR2 == 2022
nba2022 = nba.loc[season_2022].round(1).reset_index(drop=True)
preview_cols = ['PLAYER', 'TEAM', 'PPG', 'PPG2', 'd_PPG', 'd_PRED', 'PRED_RF', 'RES_RF']
nba2022[preview_cols].head()

Unnamed: 0,PLAYER,TEAM,PPG,PPG2,d_PPG,d_PRED,PRED_RF,RES_RF
0,DeMar DeRozan,SAS,21.6,27.9,6.3,-1.6,20.0,7.9
1,Reggie Jackson,LAC,10.7,16.8,6.1,-1.7,9.0,7.8
2,Desmond Bane,MEM,9.2,18.2,9.0,1.4,10.6,7.6
3,Anfernee Simons,POR,7.8,17.3,9.5,2.0,9.8,7.5
4,Tyrese Maxey,PHI,8.0,17.5,9.5,2.1,10.1,7.4


# Model Validation

In [34]:
# Hold-out rows: those whose YEAR2 is 2022
test_set = nba.loc[nba.YEAR2 == 2022]

# Sum of squared residuals and total sum of squares around the mean of PPG2
sse = np.sum(test_set.RES_RF ** 2)
sst = np.sum((test_set.PPG2 - np.mean(test_set.PPG2)) ** 2)

# R-squared = 1 - SSE / SST; RMSE = sqrt(SSE / number of rows)
r_squared = 1 - (sse / sst)
rmse = np.sqrt(sse / len(test_set))

print("R-Squared:", round(r_squared, 2))
print("RMSE:", round(rmse, 2))

R-Squared: 0.79
RMSE: 2.86


# 2022-23 Predictions

In [20]:
# Score every 2021-22 row with a single batched predict call rather than one
# call per row. Passing the feature DataFrame directly also keeps the column
# names the model was fitted with, instead of handing it an unnamed list.
nba22['PRED_RF'] = random_forest.predict(nba22[cols])

# Predicted change: predicted next-season PPG minus current PPG
nba22['d_PRED'] = nba22.PRED_RF - nba22.PPG

In [39]:
# Order rows by predicted PPG change (d_PRED), largest predicted increase first,
# and renumber the index to match the new order
nba22 = nba22.sort_values(by='d_PRED', ascending=False).reset_index(drop=True)

# Keep only the reporting columns, rounded to one decimal, and preview the first rows
report_cols = ['PLAYER', 'TEAM', 'GP', 'PPG', 'PRED_RF', 'd_PRED']
results22 = nba22[report_cols].round(1)
results22.head()

Unnamed: 0,PLAYER,TEAM,GP,PPG,PRED_RF,d_PRED
0,Jonathan Kuminga,GSW,70,9.3,14.4,5.1
1,Naz Reid,MIN,77,8.3,12.5,4.2
2,Sandro Mamukelashvili,MIL,41,3.8,7.6,3.8
3,Isaiah Joe,PHI,55,3.6,7.2,3.6
4,Frank Ntilikina,DAL,58,4.1,7.7,3.6


# Export data

In [22]:
# Build the export table for the rows whose YEAR2 is 2022.
# RANK is derived from row position, so sort by residual here instead of relying
# on `nba` having been sorted in an earlier cell. The stable sort leaves the
# order untouched when the frame is already sorted by RES_RF descending.
predictions = (
    nba[nba.YEAR2 == 2022]
    .sort_values(by='RES_RF', ascending=False, kind='stable')
    .reset_index(drop=True)
)

# 1-based rank taken from the fresh 0-based index
predictions['RANK'] = predictions.index + 1

# Keep the export columns only, rounded to one decimal
predictions = predictions[['RANK', 'PLAYER', 'TEAM', 'PPG', 'PPG2', 'PRED_RF', 'RES_RF']].round(1)

In [23]:
# Write the ranked 2021-22 table to a CSV file at this relative path
predictions.to_csv(path_or_buf="nba_predictions_2022.csv")