In [115]:
import pandas as pd
import numpy as np
from scraper import GameLogs

In [116]:
gamelogs = GameLogs("Stephen Curry")
dat = gamelogs.fetch_data()

In [117]:
df = GameLogs("Stephen Curry").fetch_data()

df = df.fillna(0)

# Remove rows where 'GS' is 'Inactive' or 'Did Not Dress'
df = df[~df['GS'].isin(['Inactive', 'Did Not Dress'])]
    
# Make home away column: 1 if away, 0 if home
if 'Unnamed: 5' in df.columns:
    df = df.rename(columns={'Unnamed: 5': 'HomeAway'})
    df['HomeAway'] = df['HomeAway'].replace('@', '1')

# Convert 'date' to datetime then to a numeric difference in days (days rest)
df['Date'] = pd.to_datetime(df['Date'])
df['Rest'] = df['Date'].diff().dt.days
df['Rest'] = df['Rest'].fillna(0)
df.reset_index(drop=True, inplace=True)

# Drop unnecessary columns if they exist
columns_to_drop = ['Rk', 'Age', 'Unnamed: 7', 'MP', 'G', '+/-', 'GS', 'BLK', 'STL', 'ORB', 'Date']
for column in columns_to_drop:
    if column in df.columns:
        df = df.drop(column, axis=1)

# Map opponent teams to numeric values
team_to_number = {
    'ATL': 1, 'BOS': 2, 'BRK': 3, 'CHI': 4, 'CHO': 5, 'CLE': 6, 'DAL': 7,
    'DEN': 8, 'DET': 9, 'GSW': 10, 'HOU': 11, 'IND': 12, 'LAC': 13, 'LAL': 14,
    'MEM': 15, 'MIA': 16, 'MIL': 17, 'MIN': 18, 'NOP': 19, 'NYK': 20, 'OKC': 21,
    'ORL': 22, 'PHI': 23, 'PHO': 24, 'POR': 25, 'SAC': 26, 'SAS': 27, 'TOR': 28,
    'UTA': 29, 'WAS': 30,
}
df['Opp'] = df['Opp'].apply(lambda x: team_to_number.get(x, x))
df['Tm'] = df['Tm'].apply(lambda x: team_to_number.get(x, x))


In [118]:
df

Unnamed: 0,Tm,HomeAway,Opp,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,DRB,TRB,AST,TOV,PF,PTS,GmSc,Rest
0,10,1,25,4,10,0.4,3,7,0.429,6,6,1.0,9,9,10,2,0,17,21.3,0.0
1,10,1,29,7,20,0.35,4,13,0.308,2,2,1.0,3,3,4,3,3,20,10.3,2.0
2,10,0,13,6,11,0.545,4,7,0.571,2,2,1.0,4,4,6,6,1,18,14.4,2.0
3,10,1,30,7,15,0.467,4,9,0.444,6,6,1.0,2,3,6,2,1,24,19.4,8.0
4,10,1,2,8,17,0.471,4,9,0.444,7,7,1.0,7,7,9,3,2,27,27.6,2.0
5,10,1,6,5,10,0.5,1,4,0.25,1,1,1.0,1,1,2,6,0,12,4.7,2.0
6,10,1,21,13,23,0.565,7,13,0.538,3,4,0.75,4,5,7,3,2,36,29.4,2.0
7,10,0,7,14,27,0.519,5,12,0.417,4,5,0.8,6,6,9,4,2,37,29.0,2.0
8,10,0,15,4,9,0.444,3,7,0.429,2,2,1.0,7,8,5,3,2,13,14.8,3.0
9,10,1,13,10,21,0.476,6,15,0.4,0,0,0.0,6,7,6,3,1,26,19.3,3.0


In [119]:
for column in df.columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

In [120]:
valid_columns = df.columns
df = df[valid_columns].copy()

In [121]:
# Naive Prediction
pts_mean = df['PTS'].mean()
pts_mean

np.float64(24.517857142857142)

In [122]:
def roll_column(df, col, winsize):
    t_col = df[col].rolling(winsize, closed='left').sum().to_numpy()
    
    t_col[:winsize] = np.concatenate(([0],df[col].iloc[:(winsize)].cumsum().to_numpy()[:-1]))

    return(t_col)

In [123]:
def load_and_process_df(df):
    cols_to_agg = ['FG', '3P', 'FT', 'PTS']
    winsizes = [3, 5, 10]  # Lookback windows (in games)
    for winsize in winsizes:
        for raw_col in cols_to_agg:
            new_colname = 'rollsum_' + raw_col + '_' + str(winsize)
            df[new_colname] = roll_column(df, raw_col, winsize)
    return df


In [124]:
df = load_and_process_df(df)

In [125]:
# Create target: next game points. This shifts PTS column by -1.
df['target'] = df['PTS'].shift(-1)
df = df.dropna().reset_index(drop=True)

In [126]:
predictors = [
    'rollsum_PTS_3', 'rollsum_PTS_5', 'rollsum_PTS_10',
    'rollsum_FG_3', 'rollsum_FG_5', 'rollsum_FG_10',
    'rollsum_FT_3', 'rollsum_FT_5', 'rollsum_FT_10',
    'rollsum_3P_3', 'rollsum_3P_5', 'rollsum_3P_10',
    'Rest', 'HomeAway', 'Opp'
]

In [127]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[predictors] = scaler.fit_transform(df[predictors])

In [128]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['auto', 'sqrt']
}

tscv = TimeSeriesSplit(n_splits=5)
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    scoring='neg_mean_squared_error',
    cv=tscv,
    n_jobs=-1
)

In [129]:
# Use the first 70% of the data for tuning
split_idx = int(0.7 * len(df))
train_data = df.iloc[:split_idx]
grid_search.fit(train_data[predictors], train_data['target'])
best_params = grid_search.best_params_
print("Best parameters from grid search:", best_params)

Best parameters from grid search: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}


120 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
88 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/shargo/DSC540-Project/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shargo/DSC540-Project/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/Users/shargo/DSC540-Project/.venv/lib/python3.12/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/Users/shargo/DSC540-Project/.venv/lib/python3.12/site-packages/sklearn/utils/_param

In [130]:
model = RandomForestRegressor(**best_params, random_state=42)

In [131]:
def backtest(data, model, predictors, start=3):
    all_predictions = []
    for i in range(start, len(data) - 1):
        train = data.iloc[:i+1]
        test = data.iloc[i+1:i+2]
        model.fit(train[predictors], train['target'])
        pred = model.predict(test[predictors])[0]
        record = pd.DataFrame({
            'actual': test['target'].values,
            'prediction': [pred]
        }, index=test.index)
        all_predictions.append(record)
    return pd.concat(all_predictions)

In [132]:
predictions = backtest(df.copy(), model, predictors)

In [133]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(predictions['actual'], predictions['prediction'])
print("Mean Squared Error:", mse)

Mean Squared Error: 102.91124035910575


In [134]:
print(predictions.head(10))

    actual  prediction
4     12.0   22.323750
5     36.0   19.995333
6     37.0   24.438333
7     13.0   29.486607
8     26.0   25.424708
9     23.0   24.297167
10    19.0   22.599476
11    14.0   23.179714
12    28.0   19.869331
13    23.0   21.383488
