In [57]:
import pandas as pd

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Load the raw stats table; the first CSV column becomes the index.
df = pd.read_csv('Nba_stats.csv', encoding="utf-8", index_col=0)

# Round-trip player names through latin1 -> utf-8
# (presumably to repair mis-decoded accented characters -- TODO confirm).
df['Player'] = df['Player'].str.encode('latin1').str.decode('utf-8')

# Missing shooting percentages are replaced with 0.
for pct_col in ('FG%', 'FT%', '3P%', '2P%', 'eFG%'):
    df[pct_col] = df[pct_col].fillna(0)

# Age / games / games-started: fill gaps with 0, then store as integers.
for count_col in ('Age', 'G', 'GS'):
    df[count_col] = df[count_col].fillna(0).astype(int)

# Integer category codes for the columns used as model features.
for code_col, source_col in (
    ('team_code', 'Team'),
    ('pos_code', 'Pos'),
    ('player_id', 'id'),
):
    df[code_col] = df[source_col].astype('category').cat.codes

# Order rows by player then season and keep the first row of every
# (Season, Player) pair.
df = (
    df.sort_values(['Player', 'Season'])
    .reset_index(drop=True)
    .drop_duplicates(subset=['Season', 'Player'], keep='first')
)





In [58]:
def next_season_stats(player):
    """Add next-row target columns to one player's rows.

    Each ``Next-*`` column holds the value of the corresponding stat
    (``PTS``, ``AST``, ``TRB``, ``STL``, ``BLK``) taken from the following
    row, so the last row receives NaN. Rows are used in the order given;
    the caller is responsible for sorting them by season beforehand.

    Parameters
    ----------
    player : pandas.DataFrame
        Rows containing the columns ``PTS``, ``AST``, ``TRB``, ``STL`` and
        ``BLK``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``player`` with ``Next-PPG``, ``Next-APG``, ``Next-RPG``,
        ``Next-STL`` and ``Next-BLK`` added (or overwritten if present).
        The input frame itself is left unmodified.
    """
    stat_cols = ['PTS', 'AST', 'TRB', 'STL', 'BLK']
    next_cols = ['Next-PPG', 'Next-APG', 'Next-RPG', 'Next-STL', 'Next-BLK']

    # Work on a copy: writing into the frame handed in would mutate the
    # caller's object (and can trigger SettingWithCopy warnings on slices).
    result = player.copy()
    result[next_cols] = result[stat_cols].shift(-1)
    return result

# Attach next-season targets player by player. The columns (grouping column
# included) are selected explicitly after the groupby so that 'Player' stays
# in the result and pandas does not warn about apply operating on the
# grouping columns.
df = df.groupby('Player', group_keys=False)[list(df.columns)].apply(next_season_stats)

# Used to filter out inactive players by finding their last season played.
df['last_season'] = df.groupby('player_id')['Season'].transform('max')

# Keep only players whose most recent recorded season is 2025.
df = df[df['last_season'] == 2025].copy()


In [59]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split


In [60]:
# A fixed seed makes the forest (and therefore the reported error) repeatable
# across kernel restarts; without it every run yields a different score.
base_model = RandomForestRegressor(random_state=42)

# One forest is fitted per target column.
model = MultiOutputRegressor(base_model)


In [61]:
# Input columns handed to the model, in the order they are passed to fit/predict.
features = [
    # playing time and per-season box-score numbers
    'G', 'GS', 'PTS', 'AST', 'TRB', 'Age', 'STL', 'BLK', 'MP', '3PA',
    # encoded categoricals and shot volume
    'team_code', 'FGA', 'pos_code', 'player_id',
    # advanced metrics
    'PER', 'USG%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'WS/48', 'VORP',
]



In [62]:
# Distinct seasons present in the data, in ascending order.
years = list(df["Season"].unique())
years.sort()


In [63]:
def backtest(data, model, features, start=3, step=1):
    """Walk-forward evaluation: refit on past seasons, predict each later one.

    Seasons are taken from ``data["Season"]`` in ascending order. For every
    season at position ``start``, ``start + step``, ... the model is fitted on
    all rows from strictly earlier seasons and then used to predict the rows
    of that season.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Player``, ``Season``, every column in ``features`` and
        the target columns ``Next-PPG``, ``Next-APG``, ``Next-RPG``,
        ``Next-STL`` and ``Next-BLK``.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; ``predict`` must
        return one column per target. It is refitted in place on every fold.
    features : list of str
        Columns used as model inputs.
    start : int, default 3
        Position (in the sorted list of seasons) of the first season to
        predict.
    step : int, default 1
        Stride between predicted seasons.

    Returns
    -------
    pandas.DataFrame
        One row per predicted input row with ``Player``, ``Season``, the five
        actual target columns and the predictions ``P-PPG``, ``P-APG``,
        ``P-RBG``, ``P-STL`` and ``P-BLK`` rounded to one decimal. Empty (but
        with those columns) when no fold could be evaluated.
    """
    prediction = ['P-PPG', 'P-APG', 'P-RBG', 'P-STL', 'P-BLK']
    actual_targets = ['Next-PPG', 'Next-APG', 'Next-RPG', 'Next-STL', 'Next-BLK']
    id_cols = ['Player', 'Season']

    years = sorted(data["Season"].unique())
    all_predictions = []

    for i in range(start, len(years), step):
        current_year = years[i]

        train = data[data["Season"] < current_year]
        # Rows without a known outcome cannot serve as training labels, and
        # NaN targets would make fitting fail.
        train = train.dropna(subset=actual_targets)
        test = data[data["Season"] == current_year]

        # Nothing to learn from or nothing to predict: skip this fold.
        if train.empty or test.empty:
            continue

        model.fit(train[features], train[actual_targets])
        preds = model.predict(test[features])

        # Re-use the test index so predictions line up with their input rows.
        preds_df = pd.DataFrame(preds, columns=prediction, index=test.index).round(1)
        combined = pd.concat([test[id_cols + actual_targets], preds_df], axis=1)

        all_predictions.append(combined)

    # pd.concat raises on an empty list (e.g. fewer seasons than `start`).
    if not all_predictions:
        return pd.DataFrame(columns=id_cols + actual_targets + prediction)
    return pd.concat(all_predictions)
   


In [68]:
# Run the walk-forward backtest with the default start position and stride.
predictions = backtest(data=df, model=model, features=features)



In [65]:
from sklearn.metrics import mean_squared_error
# Actual outcome columns and the prediction columns they are compared with.
actual_cols = ['Next-PPG', 'Next-APG', 'Next-RPG', 'Next-STL', 'Next-BLK']
predicted_cols = ['P-PPG', 'P-APG', 'P-RBG', 'P-STL', 'P-BLK']

# Only rows from seasons before 2025 are scored.
mask = predictions['Season'] < 2025
scored = predictions.loc[mask]

mse = mean_squared_error(scored[actual_cols], scored[predicted_cols])
print(mse)



2.284655421686747
