In [148]:
import pandas as pd

pd.set_option('display.max_columns', None)
df = pd.read_csv('Nba_stats.csv' ,encoding= "utf-8", index_col=0 )
df['Player'] = df['Player'].str.encode('latin1').str.decode('utf-8')
df['FG%'] = df['FG%'].fillna(0)
df['FT%'] = df['FT%'].fillna(0)
df['3P%'] = df['3P%'].fillna(0)
df['2P%'] = df['2P%'].fillna(0)
df['eFG%'] = df['eFG%'].fillna(0)
df['Age'] = df['Age'].fillna(0)
df['Age'] = df['Age'].astype(int)
df['G'] = df['G'].fillna(0)
df['GS'] = df['GS'].fillna(0)
df['G'] = df['G'].astype(int)
df['GS'] = df['GS'].astype(int)

df['team_code'] = df['Team'].astype("category").cat.codes
df['pos_code'] = df['Pos'].astype('category').cat.codes
df['player_id'] = df['id'].astype('category').cat.codes

df = df.sort_values(['Player', 'Season']).reset_index(drop=True)
df = df.drop_duplicates(subset=['Season', 'Player'], keep='first')





In [149]:
def next_season_stats(player):
   
    player[['Next-PPG', 'Next-APG', 'Next-RPG', 'Next-STL','Next-BLK' ]] = player[['PTS', 'AST', 'TRB', 'STL' , 'BLK']].shift(-1)
    return player

df = df.groupby('Player', group_keys=False).apply(next_season_stats)
#used to filter out inactive players by finding their last season played
df['last_season'] = df.groupby('player_id')['Season'].transform('max')
#dropping any player that did not play last season
df = df.drop(df[df['last_season'] != 2025].index)


  df = df.groupby('Player', group_keys=False).apply(next_season_stats)


In [150]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split


In [151]:
base_model = RandomForestRegressor()
model = MultiOutputRegressor(base_model)


In [152]:
features = ['G', 'GS','PTS' ,'AST', 'TRB', 'Age', 'STL', 'BLK', 'MP', '3PA', 'team_code', 'FGA', 'pos_code','player_id', 'PER', 'USG%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'WS/48', 'VORP']



In [153]:
years = sorted(df["Season"].unique())
years

[np.int64(2015),
 np.int64(2016),
 np.int64(2017),
 np.int64(2018),
 np.int64(2019),
 np.int64(2020),
 np.int64(2021),
 np.int64(2022),
 np.int64(2023),
 np.int64(2024),
 np.int64(2025)]

In [167]:
def backtest(data, model, features, start=6, step=1):
    all_predictions = []
    years = sorted(data["Season"].unique())


    prediction = ['P-PPG', 'P-APG', 'P-RBG', 'P-STL', 'P-BLK']
    
    actual_targets = ['Next-PPG','Next-APG','Next-RPG','Next-STL','Next-BLK']

    for i in range(start, len(years), step):
        current_year = years[i]
        
        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]
        
        model.fit(train[features], train[actual_targets])
        preds = model.predict(test[features]) 
        
        preds_df = pd.DataFrame(preds, columns=prediction ,index=test.index)
        preds_df = preds_df[prediction].round(1)
        combined = pd.concat([test[['Player', 'Season']], test[actual_targets], preds_df], axis= 1)
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)
   


In [155]:
predictions = backtest(df, model, features)



In [156]:
predictions

Unnamed: 0,Player,Season,Next-PPG,Next-APG,Next-RPG,Next-STL,Next-BLK,P-PPG,P-APG,P-RBG,P-STL,P-BLK
51,Aaron Gordon,2020,12.4,3.2,5.7,0.7,0.7,15.8,3.4,7.4,0.8,0.6
69,Aaron Holiday,2020,7.2,1.9,1.3,0.7,0.2,11.1,3.3,2.6,0.9,0.2
127,Al Horford,2020,14.2,3.4,6.7,0.9,0.9,11.8,3.5,6.6,0.7,0.8
179,Alec Burks,2020,12.7,2.2,4.6,0.6,0.3,16.9,3.4,4.7,0.9,0.3
219,Alex Caruso,2020,6.4,2.8,2.9,1.1,0.3,7.5,2.5,2.8,1.0,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...
12361,Zach LaVine,2025,,,,,,20.3,4.1,4.3,0.8,0.2
12395,Zeke Nnaji,2025,,,,,,4.8,0.8,3.0,0.4,0.8
12403,Ziaire Williams,2025,,,,,,9.1,1.5,3.9,0.9,0.4
12408,Zion Williamson,2025,,,,,,26.4,5.3,7.3,1.1,0.8


In [157]:
from sklearn.metrics import mean_squared_error
mask = predictions['Season'] < 2025
mse = mean_squared_error(predictions.loc[mask , ['Next-PPG','Next-APG','Next-RPG','Next-STL','Next-BLK']], predictions.loc[mask, ['P-PPG', 'P-APG', 'P-RBG', 'P-STL', 'P-BLK']])
print(mse) 



2.474912883435583


In [185]:
def rolling_averages(group, cols, new_cols):
    
    rolling_stats = group[cols].shift(1).rolling(3).mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [186]:
cols = ['PTS', 'AST', 'TRB', 'STL', 'BLK', 'PER','TRB%', 'AST%', 'STL%', 'BLK%']
new_cols = [f'{c}_rolling' for c in cols]


In [187]:
rolling_df = df.groupby("Player").apply(lambda x: rolling_averages(x, cols, new_cols))
rolling_df = rolling_df.droplevel('Player')



  rolling_df = df.groupby("Player").apply(lambda x: rolling_averages(x, cols, new_cols))


In [188]:
new_combined = backtest(rolling_df, model, features + new_cols)
new_combined

Unnamed: 0,Player,Season,Next-PPG,Next-APG,Next-RPG,Next-STL,Next-BLK,P-PPG,P-APG,P-RBG,P-STL,P-BLK
63,Aaron Gordon,2024,14.7,3.2,4.8,0.5,0.3,12.3,2.9,6.2,0.8,0.6
81,Aaron Holiday,2024,5.5,1.3,1.3,0.3,0.2,6.4,2.0,1.5,0.6,0.1
87,Aaron Nesmith,2024,12.0,1.2,4.0,0.8,0.4,14.2,1.6,3.9,0.8,0.5
131,Al Horford,2024,9.0,2.1,6.2,0.6,0.9,8.0,2.5,6.2,0.6,1.0
191,Alec Burks,2024,7.3,1.1,2.5,0.6,0.1,9.8,1.7,2.4,0.5,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...
12341,Zach Collins,2025,,,,,,7.1,2.1,4.7,0.5,0.6
12361,Zach LaVine,2025,,,,,,21.8,4.6,4.5,0.8,0.3
12395,Zeke Nnaji,2025,,,,,,5.0,0.8,2.8,0.4,0.6
12403,Ziaire Williams,2025,,,,,,9.1,1.6,4.0,0.9,0.4


In [189]:
mask = predictions['Season'] < 2025
mse = mean_squared_error(new_combined.loc[mask , ['Next-PPG','Next-APG','Next-RPG','Next-STL','Next-BLK']], new_combined.loc[mask, ['P-PPG', 'P-APG', 'P-RBG', 'P-STL', 'P-BLK']])
print(mse)

2.0378972332015812
