In [2]:
import pandas as pd
import numpy as np
import sklearn
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import optuna

MASTER = pd.read_csv("MASTER_DATA_103123.csv")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
MASTER.shape
MASTER.columns
MASTER.isna().sum()

MASTER["season"].value_counts()

# remove 2023 and remove minimum year
MASTER = MASTER[MASTER["season"] != 2023]
d2023 = 
MASTER = MASTER[MASTER["season"] < min(MASTER["season"])]

season
2021    359
2020    347
2022    346
2019    343
2018    339
2016    322
2017    320
2015    318
2012    311
2013    308
2011    305
2014    305
2010    298
2023    284
Name: count, dtype: int64

In [9]:
features = [
    "points_ly",
    "targets_pg_ly",
    "wopr_pg_ly",
    "pick",
    "air_yards_pg_ly",
    "total_games_ly",
    "pass_attempt_difference",
    # "position", - one hot encode soon
    "total_positional_investment",
    "target_dropoff",
    "points_2y",
    "total_games_2y",
    "points_3y",
    "total_games_3y",
    "targets_pg_2y",
    "targets_pg_3y",
    "epa_pg_ly",
    "years_pro",
    "wopr_pg_2y",
    "wopr_pg_3y",
    "targets_added_this_year",
    "is_on_new_team",
    # "combine_cluster", to do -- onehot encode
]
target = ["points_per_game"]

X = MASTER[features]
y = MASTER[target]

# Perform a 70/30 train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


# define an obj function to be mnimuized
def objective(trial):
    # suggest values for hyperparamters using "trial obect"
    regression_name = trial.suggest_categorical("regression", ["XGBRegressor"])

    if regression_name == "XGBRegressor":
        xgb_max_depth = trial.suggest_int("max_depth", 1, 15)
        xgb_eta = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        xgb_gamma = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        xgb_colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
        xgb_min_child_weight = trial.suggest_int("min_child_weight", 1, 100, log=True)

        reg_object = xgb.XGBRegressor(
            max_depth=xgb_max_depth,
            eta=xgb_eta,
            gamma=xgb_gamma,
            colsample_bytree=xgb_colsample_bytree,
        )

    # Use cross_val_score to get negative mean squared errors
    mse_scores = -cross_val_score(
        reg_object, X_train, y_train, n_jobs=-1, cv=5, scoring="neg_mean_squared_error"
    )

    # Calculate the mean RMSE
    rmse_scores = np.sqrt(mse_scores)
    mean_rmse = rmse_scores.mean()

    accuracy = mean_rmse

    #   reg_object.fit(X_train, y_train)
    #   y_pred = reg_object.predict(X_test)

    #  accuracy = sklearn.metrics.mean_squared_error(y_test, y_pred)
    #  accuracy = np.sqrt(accuracy)
    return accuracy


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=1000)

print(study.best_trial)

# so far best:
# 3.21 avg cross validation rmse
# FrozenTrial(number=668, state=1, values=[3.2190158650035583], datetime_start=datetime.datetime(2023, 11, 1, 13, 23, 13, 196716), datetime_complete=datetime.datetime(2023, 11, 1, 13, 23, 13, 260960), params={'regression': 'XGBRegressor', 'max_depth': 4, 'eta': 0.06948461324750728, 'gamma': 8.813726740937882e-07, 'colsample_bytree': 0.3444873951445566, 'min_child_weight': 3}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'regression': CategoricalDistribution(choices=('XGBRegressor',)), 'max_depth': IntDistribution(high=15, log=False, low=1, step=1), 'eta': FloatDistribution(high=1.0, log=True, low=1e-08, step=None), 'gamma': FloatDistribution(high=1.0, log=True, low=1e-08, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.1, step=None), 'min_child_weight': IntDistribution(high=100, log=True, low=1, step=1)}, trial_id=668, value=None)

[I 2023-11-01 13:21:41,312] A new study created in memory with name: no-name-b05458bd-840b-43df-afd2-944939a1b599
[I 2023-11-01 13:21:41,353] Trial 0 finished with value: 4.719809066543347 and parameters: {'regression': 'XGBRegressor', 'max_depth': 1, 'eta': 8.934268105456128e-06, 'gamma': 0.33400416813277173, 'colsample_bytree': 0.579296805616044, 'min_child_weight': 99}. Best is trial 0 with value: 4.719809066543347.
[I 2023-11-01 13:21:42,597] Trial 1 finished with value: 4.692836288229758 and parameters: {'regression': 'XGBRegressor', 'max_depth': 12, 'eta': 0.0001239414295883694, 'gamma': 3.747280477520585e-08, 'colsample_bytree': 0.8524693320608583, 'min_child_weight': 2}. Best is trial 1 with value: 4.692836288229758.
[I 2023-11-01 13:21:43,725] Trial 2 finished with value: 3.9611812652260356 and parameters: {'regression': 'XGBRegressor', 'max_depth': 14, 'eta': 0.0049682163242022095, 'gamma': 1.8500451742689103e-07, 'colsample_bytree': 0.5042626312129104, 'min_child_weight': 90

FrozenTrial(number=668, state=1, values=[3.2190158650035583], datetime_start=datetime.datetime(2023, 11, 1, 13, 23, 13, 196716), datetime_complete=datetime.datetime(2023, 11, 1, 13, 23, 13, 260960), params={'regression': 'XGBRegressor', 'max_depth': 4, 'eta': 0.06948461324750728, 'gamma': 8.813726740937882e-07, 'colsample_bytree': 0.3444873951445566, 'min_child_weight': 3}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'regression': CategoricalDistribution(choices=('XGBRegressor',)), 'max_depth': IntDistribution(high=15, log=False, low=1, step=1), 'eta': FloatDistribution(high=1.0, log=True, low=1e-08, step=None), 'gamma': FloatDistribution(high=1.0, log=True, low=1e-08, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.1, step=None), 'min_child_weight': IntDistribution(high=100, log=True, low=1, step=1)}, trial_id=668, value=None)
