## Setup

In [158]:
from nba_api.stats.endpoints import leaguedashplayerstats
from nba_api.stats.endpoints import leaguedashplayerbiostats
import os
import pandas as pd

In [147]:
base_outdir = 'E:/data/nba/sophomore_breakout'
sophomore_outdir = f"{base_outdir}/sophomore_season"
rookie_outdir = f"{base_outdir}/rookie_season"
rookie_bio_outdir = f"{base_outdir}/rookie_bio"
last_ngames_list = [10, 20, 30]

## Data Retrieval

In [178]:
def get_player_season_data(start_year: int, end_year: int, outdir: str, player_experience: str, last_n_games: int = None, per_mode_detailed: str = 'PerGame'):
    os.makedirs(outdir, exist_ok=True)  # Create parent directory if it doesn't exist
    for season_year in range(start_year, end_year + 1):
        season = f"{season_year}-{str(season_year + 1)[-2:]}"
        params = {
            "season": season,
            "player_experience_nullable": player_experience,
            "per_mode_detailed": per_mode_detailed
        }
        if last_n_games:
            params["last_n_games"] = last_n_games
        res = leaguedashplayerstats.LeagueDashPlayerStats(**params)
        df = res.get_data_frames()[0]
        outfile = f"{outdir}/{player_experience}_{season}.csv"
        df.to_csv(outfile, index=False)
        print(f"Saved {season} to {outfile}")

def get_player_bio_data(start_year: int, end_year: int, outdir: str, player_experience: str):
    os.makedirs(outdir, exist_ok=True)  # Create parent directory if it doesn't exist
    for season_year in range(start_year, end_year + 1):
        season = f"{season_year}-{str(season_year + 1)[-2:]}"
        params = {
            "season": season,
            "player_experience_nullable": player_experience
        }
        res = leaguedashplayerbiostats.LeagueDashPlayerBioStats(**params)
        df = res.get_data_frames()[0]
        
        # filter only to rookies
        tmp_season_df = pd.read_csv(f'{rookie_outdir}/Rookie_{season}.csv')
        df = df[df.PLAYER_ID.isin(tmp_season_df.PLAYER_ID)]

        outfile = f"{outdir}/{player_experience}_{season}.csv"
        df.to_csv(outfile, index=False)
        print(f"Saved {season} to {outfile}")

In [179]:
# collect sophomore data
# get_player_season_data(2000, 2023, sophomore_outdir, 'Sophomore')
# collect rookie data
# get_player_season_data(2000, 2023, rookie_outdir, 'Rookie')
# collect rookie bio data
get_player_bio_data(2000, 2023, rookie_bio_outdir, 'Rookie')

Saved 2000-01 to E:/data/nba/sophomore_breakout/rookie_bio/Rookie_2000-01.csv
Saved 2001-02 to E:/data/nba/sophomore_breakout/rookie_bio/Rookie_2001-02.csv
Saved 2002-03 to E:/data/nba/sophomore_breakout/rookie_bio/Rookie_2002-03.csv
Saved 2003-04 to E:/data/nba/sophomore_breakout/rookie_bio/Rookie_2003-04.csv
Saved 2004-05 to E:/data/nba/sophomore_breakout/rookie_bio/Rookie_2004-05.csv
Saved 2005-06 to E:/data/nba/sophomore_breakout/rookie_bio/Rookie_2005-06.csv
Saved 2006-07 to E:/data/nba/sophomore_breakout/rookie_bio/Rookie_2006-07.csv
Saved 2007-08 to E:/data/nba/sophomore_breakout/rookie_bio/Rookie_2007-08.csv
Saved 2008-09 to E:/data/nba/sophomore_breakout/rookie_bio/Rookie_2008-09.csv
Saved 2009-10 to E:/data/nba/sophomore_breakout/rookie_bio/Rookie_2009-10.csv
Saved 2010-11 to E:/data/nba/sophomore_breakout/rookie_bio/Rookie_2010-11.csv
Saved 2011-12 to E:/data/nba/sophomore_breakout/rookie_bio/Rookie_2011-12.csv
Saved 2012-13 to E:/data/nba/sophomore_breakout/rookie_bio/Rooki

In [28]:
# collect last N games data for rookies
for n in last_ngames_list:
    get_player_season_data(2000, 2023, f'{base_outdir}/rookie_last_{n}_games', 'Rookie', n)

Saved 2000-01 to E:/data/nba/sophomore_breakout/rookie_last_10_games/Rookie_2000-01.csv
Saved 2001-02 to E:/data/nba/sophomore_breakout/rookie_last_10_games/Rookie_2001-02.csv
Saved 2002-03 to E:/data/nba/sophomore_breakout/rookie_last_10_games/Rookie_2002-03.csv
Saved 2003-04 to E:/data/nba/sophomore_breakout/rookie_last_10_games/Rookie_2003-04.csv
Saved 2004-05 to E:/data/nba/sophomore_breakout/rookie_last_10_games/Rookie_2004-05.csv
Saved 2005-06 to E:/data/nba/sophomore_breakout/rookie_last_10_games/Rookie_2005-06.csv
Saved 2006-07 to E:/data/nba/sophomore_breakout/rookie_last_10_games/Rookie_2006-07.csv
Saved 2007-08 to E:/data/nba/sophomore_breakout/rookie_last_10_games/Rookie_2007-08.csv
Saved 2008-09 to E:/data/nba/sophomore_breakout/rookie_last_10_games/Rookie_2008-09.csv
Saved 2009-10 to E:/data/nba/sophomore_breakout/rookie_last_10_games/Rookie_2009-10.csv
Saved 2010-11 to E:/data/nba/sophomore_breakout/rookie_last_10_games/Rookie_2010-11.csv
Saved 2011-12 to E:/data/nba/sop

## Preparing the data

In [199]:
# columns to keep for season data
season_cols = ['PLAYER_ID','PLAYER_NAME','AGE','GP','MIN','FGA','FG_PCT','FG3A','FG3_PCT','FTA','FT_PCT','REB','AST','TOV','STL','BLK','PF','PTS','PLUS_MINUS','NBA_FANTASY_PTS']
# xdf will store all of our input data
season_df = pd.concat([pd.read_csv(f'{rookie_outdir}/{f}')[season_cols] for f in os.listdir(rookie_outdir)], ignore_index=True)
season_df['FPTS_PER_MIN'] = season_df['NBA_FANTASY_PTS'] / season_df['MIN']
# xdf = season_df.copy()

In [200]:
# columns to keep for bio data
bio_cols = ['PLAYER_ID','PLAYER_HEIGHT_INCHES','PLAYER_WEIGHT','DRAFT_NUMBER','NET_RATING','USG_PCT']
# read in all bio data
bio_df = pd.concat([pd.read_csv(f'{rookie_bio_outdir}/{f}')[bio_cols] for f in os.listdir(rookie_bio_outdir)], ignore_index=True)

In [201]:
# merge bio data with season data
xdf = pd.merge(season_df, bio_df, on='PLAYER_ID', how='inner')

In [202]:
xdf

Unnamed: 0,PLAYER_ID,PLAYER_NAME,AGE,GP,MIN,FGA,FG_PCT,FG3A,FG3_PCT,FTA,...,PF,PTS,PLUS_MINUS,NBA_FANTASY_PTS,FPTS_PER_MIN,PLAYER_HEIGHT_INCHES,PLAYER_WEIGHT,DRAFT_NUMBER,NET_RATING,USG_PCT
0,2062,A.J. Guyton,23.0,33,19.0,5.8,0.406,2.1,0.391,0.5,...,1.1,6.0,-4.4,10.8,0.568421,73.0,180.0,32,-11.0,0.167
1,1950,Andy Panko,23.0,1,0.6,0.0,0.000,0.0,0.000,0.0,...,0.0,0.0,0.0,0.0,0.000000,81.0,245.0,Undrafted,0.0,0.000
2,1609,Art Long,28.0,9,2.1,0.4,0.000,0.0,0.000,0.2,...,0.6,0.0,-1.2,2.0,0.952381,81.0,240.0,Undrafted,-25.2,0.135
3,2073,Brian Cardinal,24.0,15,8.3,2.1,0.323,0.3,0.000,1.2,...,1.8,2.1,-2.3,5.4,0.650602,80.0,245.0,44,-13.6,0.153
4,2036,Chris Mihm,21.0,59,19.7,6.6,0.442,0.0,0.000,2.1,...,2.6,7.6,-1.3,16.0,0.812183,85.0,265.0,7,-2.7,0.202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1970,1631218,Trayce Jackson-Davis,24.0,68,16.6,4.9,0.702,0.0,0.000,1.9,...,1.6,7.9,0.5,19.7,1.186747,81.0,245.0,57,1.1,0.157
1971,1641998,Trey Jemison,24.0,25,23.0,5.4,0.551,0.0,0.000,1.0,...,2.8,6.8,-4.4,18.3,0.795652,82.0,260.0,Undrafted,-7.3,0.132
1972,1641774,Tristan Vukcevic,21.0,10,15.3,6.7,0.433,3.6,0.278,2.2,...,2.8,8.5,3.2,17.4,1.137255,82.0,220.0,42,10.4,0.233
1973,203995,Vasilije Micić,30.0,60,19.6,6.2,0.430,2.6,0.279,1.3,...,1.1,7.0,-1.2,15.6,0.795918,75.0,188.0,52,-3.1,0.189


In [203]:
# columns to keep for last_n_games data
last_n_games_cols = ['PLAYER_ID','MIN','FGA','FG_PCT','FG3A','FG3_PCT','FTA','FT_PCT','PTS','PLUS_MINUS','NBA_FANTASY_PTS']

# for n in last_ngames_list:
n = 10
for n in last_ngames_list:
    last_n_df = pd.concat([pd.read_csv(f'{base_outdir}/rookie_last_{n}_games/{f}')[last_n_games_cols] for f in os.listdir(f'{base_outdir}/rookie_last_{n}_games')], ignore_index=True)
    last_n_df['FPTS_PER_MIN'] = last_n_df['NBA_FANTASY_PTS'] / last_n_df['MIN']

    # Rename columns of last_n_df by prepending 'last_{n}_'
    last_n_df = last_n_df.rename(columns={col: f"last_{n}_{col}" for col in last_n_df.columns if col != 'PLAYER_ID'})

    # Merge the dataframes on 'PLAYER_ID'
    xdf= pd.merge(xdf, last_n_df, on='PLAYER_ID', how='left')

In [204]:
ycols = ['PLAYER_ID', 'NBA_FANTASY_PTS']
ydf = pd.concat([pd.read_csv(f'{sophomore_outdir}/{f}')[ycols] for f in os.listdir(sophomore_outdir)], ignore_index=True)
ydf = ydf.rename(columns={'NBA_FANTASY_PTS': 'sophomore_NBA_FANTASY_PTS'})

In [205]:
alldf = pd.merge(xdf, ydf, on='PLAYER_ID', how='left')
alldf = alldf.dropna()
alldf = alldf[alldf.DRAFT_NUMBER != 'Undrafted']

In [206]:
X = alldf.drop(columns=['PLAYER_ID', 'PLAYER_NAME','sophomore_NBA_FANTASY_PTS'])
# X.dropna(inplace=True, )
y = alldf['sophomore_NBA_FANTASY_PTS']

In [207]:
X

Unnamed: 0,AGE,GP,MIN,FGA,FG_PCT,FG3A,FG3_PCT,FTA,FT_PCT,REB,...,last_30_FGA,last_30_FG_PCT,last_30_FG3A,last_30_FG3_PCT,last_30_FTA,last_30_FT_PCT,last_30_PTS,last_30_PLUS_MINUS,last_30_NBA_FANTASY_PTS,last_30_FPTS_PER_MIN
0,23.0,33,19.0,5.8,0.406,2.1,0.391,0.5,0.833,1.1,...,6.2,0.436,2.5,0.417,0.8,0.875,7.2,-2.9,13.2,0.613953
3,24.0,15,8.3,2.1,0.323,0.3,0.000,1.2,0.611,1.5,...,2.3,0.333,0.4,0.000,1.6,0.714,2.7,-2.2,5.7,0.662791
4,21.0,59,19.7,6.6,0.442,0.0,0.000,2.1,0.794,4.7,...,7.8,0.427,0.0,0.000,2.7,0.775,8.7,-1.7,18.8,0.776860
6,24.0,65,21.1,8.8,0.417,0.7,0.370,2.3,0.820,2.2,...,14.6,0.448,1.3,0.389,3.9,0.857,17.0,-6.9,24.2,0.718101
7,20.0,35,7.4,1.9,0.262,0.0,0.000,0.8,0.464,1.6,...,2.2,0.293,0.0,0.000,0.9,0.500,1.7,-1.6,5.4,0.683544
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1866,22.0,82,21.6,8.0,0.448,2.1,0.343,1.8,0.752,6.0,...,8.8,0.462,2.3,0.324,2.3,0.714,10.5,-2.7,24.4,0.953125
1869,21.0,31,14.0,5.2,0.363,2.6,0.238,0.6,0.556,1.5,...,7.0,0.385,3.6,0.298,0.8,0.600,6.9,-3.0,13.6,0.768362
1871,22.0,15,7.0,2.7,0.300,1.9,0.143,0.1,1.000,1.0,...,4.5,0.333,3.2,0.158,0.3,1.000,3.8,-1.2,6.5,0.698925
1872,21.0,74,23.0,5.6,0.720,0.0,0.333,2.1,0.516,8.4,...,7.4,0.726,0.1,0.333,2.6,0.484,12.1,-0.3,36.2,1.265734


## Building the model

In [217]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# Assuming X and y are your feature and target DataFrames
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with StandardScaler and RandomForestRegressor
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Feature scaling
    ('model', RandomForestRegressor(random_state=42))
])

# Set up the hyperparameter grid for tuning
param_grid = {
    # 'model__n_estimators': [50, 100, 200],
    # 'model__max_depth': [None, 10, 20, 30],
    # 'model__min_samples_split': [2, 5, 10],
    # 'model__min_samples_leaf': [1, 2, 4]
    'model__n_estimators': [300, 400],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2],
    'model__min_samples_leaf': [4,5]
}

# Initialize GridSearchCV to search for the best hyperparameters
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_error', verbose=2)

# Train the model with hyperparameter tuning
grid_search.fit(X_train, y_train.values.ravel())

# Get the best model from grid search
best_model = grid_search.best_estimator_

# Predict on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model using mean absolute error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Mean Absolute Error: {mae}")


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END model__max_depth=None, model__min_samples_leaf=4, model__min_samples_split=2, model__n_estimators=300; total time=   3.8s
[CV] END model__max_depth=None, model__min_samples_leaf=4, model__min_samples_split=2, model__n_estimators=300; total time=   3.7s
[CV] END model__max_depth=None, model__min_samples_leaf=4, model__min_samples_split=2, model__n_estimators=300; total time=   3.7s
[CV] END model__max_depth=None, model__min_samples_leaf=4, model__min_samples_split=2, model__n_estimators=300; total time=   3.8s
[CV] END model__max_depth=None, model__min_samples_leaf=4, model__min_samples_split=2, model__n_estimators=300; total time=   3.7s
[CV] END model__max_depth=None, model__min_samples_leaf=4, model__min_samples_split=2, model__n_estimators=400; total time=   5.0s
[CV] END model__max_depth=None, model__min_samples_leaf=4, model__min_samples_split=2, model__n_estimators=400; total time=   5.0s
[CV] END model__max_de

In [218]:
# After fitting the RandomForest model
importances = best_model.named_steps['model'].feature_importances_

# Create a DataFrame for better visualization
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print(feature_importances)

                    Feature  Importance
55  last_30_NBA_FANTASY_PTS    0.497727
17          NBA_FANTASY_PTS    0.177248
1                        GP    0.025541
21             DRAFT_NUMBER    0.021536
15                      PTS    0.017219
18             FPTS_PER_MIN    0.012795
44  last_20_NBA_FANTASY_PTS    0.012693
34     last_10_FPTS_PER_MIN    0.011890
32       last_10_PLUS_MINUS    0.011117
37           last_20_FG_PCT    0.007657
7                       FTA    0.006638
45     last_20_FPTS_PER_MIN    0.006615
9                       REB    0.006389
26           last_10_FG_PCT    0.006386
54       last_30_PLUS_MINUS    0.006166
48           last_30_FG_PCT    0.006141
36              last_20_FGA    0.005999
23                  USG_PCT    0.005993
22               NET_RATING    0.005843
4                    FG_PCT    0.005814
33  last_10_NBA_FANTASY_PTS    0.005772
46              last_30_MIN    0.005654
53              last_30_PTS    0.005573
43       last_20_PLUS_MINUS    0.005417


In [219]:
import joblib

# Save the best model to a file
outfile = f'{base_outdir}/best_random_forest_model.pkl'
joblib.dump(best_model, outfile)

# Later, you can load the model using this:
# loaded_model = joblib.load('best_random_forest_model.pkl')

print("Model saved as outfile")

Model saved as outfile


## Run for next season's sophomores

In [221]:
res = leaguedashplayerstats.LeagueDashPlayerStats(season='2023-24', player_experience_nullable='Rookie')
recent_rookies_df = res.get_data_frames()[0]
recent_rookies_df = xdf[xdf['PLAYER_ID'].isin(recent_rookies_df['PLAYER_ID'])]
recent_rookies_df = recent_rookies_df.dropna()
recent_rookies_df.head()

Unnamed: 0,PLAYER_ID,PLAYER_NAME,AGE,GP,MIN,FGA,FG_PCT,FG3A,FG3_PCT,FTA,...,last_30_FGA,last_30_FG_PCT,last_30_FG3A,last_30_FG3_PCT,last_30_FTA,last_30_FT_PCT,last_30_PTS,last_30_PLUS_MINUS,last_30_NBA_FANTASY_PTS,last_30_FPTS_PER_MIN
1874,1641745,Adam Flagler,24.0,2,7.0,3.5,0.143,3.0,0.167,0.0,...,3.5,0.143,3.0,0.167,0.0,0.0,1.5,0.0,4.5,0.642857
1875,1641766,Adama Sanogo,22.0,9,7.3,3.0,0.519,0.0,0.0,1.3,...,4.0,0.458,0.0,0.0,1.5,0.667,4.7,2.8,10.5,1.141304
1876,1641788,Alex Fudge,21.0,6,6.7,2.5,0.4,0.8,0.2,0.3,...,4.5,0.556,1.5,0.333,0.0,0.0,5.5,-9.5,11.3,0.875969
1877,1641735,Amari Bailey,20.0,10,6.5,2.4,0.333,0.8,0.125,0.7,...,2.4,0.353,0.7,0.2,0.7,1.0,2.6,0.7,5.9,0.830986
1878,1641708,Amen Thompson,21.0,62,22.4,7.2,0.536,0.9,0.138,2.5,...,8.9,0.56,0.7,0.1,3.1,0.697,12.2,3.2,32.1,1.180147


In [233]:
recent_season_df = pd.read_csv(f'{rookie_outdir}/Rookie_2023-24.csv')[season_cols]
recent_season_df['FPTS_PER_MIN'] = recent_season_df['NBA_FANTASY_PTS'] / recent_season_df['MIN']
# read in all bio data
recent_bio_df = pd.read_csv(f'{rookie_bio_outdir}/Rookie_2023-24.csv')[bio_cols]
recent_rookies_df = pd.merge(recent_season_df, recent_bio_df, on='PLAYER_ID', how='inner')

for n in last_ngames_list:
    last_n_df = pd.read_csv(f'{base_outdir}/rookie_last_{n}_games/Rookie_2023-24.csv')[last_n_games_cols]
    last_n_df['FPTS_PER_MIN'] = last_n_df['NBA_FANTASY_PTS'] / last_n_df['MIN']

    # Rename columns of last_n_df by prepending 'last_{n}_'
    last_n_df = last_n_df.rename(columns={col: f"last_{n}_{col}" for col in last_n_df.columns if col != 'PLAYER_ID'})

    # Merge the dataframes on 'PLAYER_ID'
    recent_rookies_df = pd.merge(recent_rookies_df, last_n_df, on='PLAYER_ID', how='left')

recent_rookies_df = recent_rookies_df.dropna()
recent_rookies_df = recent_rookies_df[recent_rookies_df.DRAFT_NUMBER != 'Undrafted']

In [234]:
X_recent_rookies = recent_rookies_df.drop(columns=['PLAYER_ID', 'PLAYER_NAME'])
y_pred = best_model.predict(X_recent_rookies)
recent_rookies_df['predicted_NBA_FANTASY_PTS'] = y_pred

In [235]:
view_df = recent_rookies_df[['PLAYER_NAME', 'NBA_FANTASY_PTS','predicted_NBA_FANTASY_PTS']]
view_df['diff'] = recent_rookies_df.predicted_NBA_FANTASY_PTS - recent_rookies_df.NBA_FANTASY_PTS

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  view_df['diff'] = recent_rookies_df.predicted_NBA_FANTASY_PTS - recent_rookies_df.NBA_FANTASY_PTS
