In [2]:
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
import plotly.express as px
import json

import plotly.io as pio
import re
from sklearn.linear_model import LinearRegression

pio.templates.default = "plotly_dark"
pio.renderers.default = "browser"

In [3]:
with open('C:/Users/jrnas/Downloads/BBGM_data_2094_playoffs.json', encoding='latin') as f:
    r_json = json.load(f)

In [4]:
# Initialize an empty list to store the data
data = []

# Iterate over the list of players
for player in tqdm(r_json['players']):
    # Iterate over the ratings of the current player
    for rating in player['stats']:
        # Create a new dictionary that includes 'pid', 'firstName', 'lastName' and the rating
        row = {
            'pid': player['pid']
        }
        row.update(rating)
        # Append the dictionary to the list
        data.append(row)

# Convert the list of dictionaries to a DataFrame
stats_df = (
    pd.DataFrame(data)
    .convert_dtypes(dtype_backend='pyarrow')
)

stats_df = stats_df[(stats_df.season.between(2026, 2220)) & (stats_df.playoffs == False)].reset_index(drop=True)

100%|██████████| 5754/5754 [00:00<00:00, 29090.32it/s]


In [6]:
# Initialize an empty list to store the data
data = []

# Iterate over the list of players
for player in tqdm(r_json['players']):
    # Iterate over the ratings of the current player
    for rating in player['ratings']:
        # Create a new dictionary that includes 'pid', 'firstName', 'lastName' and the rating
        row = {
            'pid': player['pid'],
            'firstName': player['firstName'],
            'lastName': player['lastName'],
            'born': player['born']['year'],
        }
        row.update(rating)
        # Append the dictionary to the list
        data.append(row)

# Convert the list of dictionaries to a DataFrame
ratings_df = (
    pd.DataFrame(data)
    .convert_dtypes(dtype_backend='pyarrow')
    .astype({'skills': 'string[pyarrow]'})
    .assign(
        age=lambda x: x.season - x.born,
    )
)

ratings_df = ratings_df[ratings_df.season.between(2026, 2220)].reset_index(drop=True)

100%|██████████| 5754/5754 [00:00<00:00, 91321.64it/s]


In [15]:
import polars as pl
df = pl.DataFrame(ratings_df.merge(
    stats_df[['pid', 'season', 'tid', 'gp', 'gs', 'min', 'usgp', 'ortg', 'drtg', 'obpm', 'dbpm', 'ows', 'dws', 'vorp',
              'ewa']],
    on=['pid', 'season'], how='left')).with_columns(
        pl.col('ows').truediv('min').mul(48).alias('ows_rate'),
        pl.col('dws').truediv('min').mul(48).alias('dws_rate'),
    )

In [14]:
print(df.columns)

['pid', 'firstName', 'lastName', 'born', 'season', 'hgt', 'stre', 'spd', 'jmp', 'endu', 'ins', 'dnk', 'ft', 'fg', 'tp', 'diq', 'oiq', 'drb', 'pss', 'reb', 'pos', 'fuzz', 'skills', 'ovr', 'pot', 'injuryIndex', 'age', 'tid', 'gp', 'gs', 'min', 'usgp', 'ortg', 'drtg', 'obpm', 'dbpm', 'ows', 'dws', 'vorp', 'ewa']


In [31]:
info = ['firstName', 'lastName', 'season', 'age', 'ovr' ,'min']
features = ['hgt', 'stre', 'spd', 'jmp', 'endu', 'ins', 'dnk', 'ft', 'fg', 'tp', 'diq', 'oiq', 'drb', 'pss', 'reb']
targets = ['obpm','dbpm','ows_rate','dws_rate']

model_df = df.filter(pl.col('min').ge(10)).select(*info, *features, *targets).drop_nulls().drop_nans()

In [33]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import optuna
import numpy as np

# Prepare data
X = model_df.select(features).to_numpy()
sample_weights = model_df.select('min').to_numpy().flatten()

# Dictionary to store trained models and their performance
models = {}
results = {}

# Train a separate model for each target
for target in targets:
    print(f"\n{'='*60}")
    print(f"Training model for: {target}")
    print(f"{'='*60}")
    
    # Get target values
    y = model_df.select(target).to_numpy().flatten()
    
    # Split data (stratified by sample weights to ensure representative distribution)
    X_train, X_temp, y_train, y_temp, sw_train, sw_temp = train_test_split(
        X, y, sample_weights, test_size=0.3, random_state=42
    )
    
    X_val, X_test, y_val, y_test, sw_val, sw_test = train_test_split(
        X_temp, y_temp, sw_temp, test_size=0.5, random_state=42
    )
    
    # Create DMatrix objects with sample weights
    dtrain = xgb.DMatrix(X_train, label=y_train, weight=sw_train)
    dval = xgb.DMatrix(X_val, label=y_val, weight=sw_val)
    dtest = xgb.DMatrix(X_test, label=y_test, weight=sw_test)
    
    # Hyperparameter tuning with Optuna
    def objective(trial):
        params = {
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'tree_method': 'hist',  # Fast histogram-based algorithm
            'device': 'cuda',  # Use GPU if available, fallback to CPU automatically
            
            # Tunable hyperparameters
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),  # L1 regularization
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),  # L2 regularization
            'random_state': 42
        }
        
        n_estimators = trial.suggest_int('n_estimators', 100, 1000)
        
        # Train with early stopping
        evals = [(dtrain, 'train'), (dval, 'validation')]
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=n_estimators,
            evals=evals,
            early_stopping_rounds=50,
            verbose_eval=False
        )
        
        # Evaluate on validation set
        y_pred = model.predict(dval)
        mse = mean_squared_error(y_val, y_pred, sample_weight=sw_val)
        rmse = np.sqrt(mse)
        
        return rmse
    
    # Run Optuna optimization
    print(f"Starting hyperparameter optimization...")
    study = optuna.create_study(direction='minimize', study_name=f'{target}_optimization')
    study.optimize(objective, n_trials=25, show_progress_bar=True, n_jobs=1)
    
    print(f"\nBest trial: {study.best_trial.number}")
    print(f"Best RMSE: {study.best_value:.4f}")
    print(f"Best parameters: {study.best_params}")
    
    # Train final model with best parameters
    best_params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'tree_method': 'hist',
        'device': 'cuda',
        'random_state': 42,
        **{k: v for k, v in study.best_params.items() if k != 'n_estimators'}
    }
    
    n_estimators = study.best_params['n_estimators']
    
    evals = [(dtrain, 'train'), (dval, 'validation')]
    final_model = xgb.train(
        best_params,
        dtrain,
        num_boost_round=n_estimators,
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=False
    )
    
    # Evaluate on test set
    y_pred_test = final_model.predict(dtest)
    test_mse = mean_squared_error(y_test, y_pred_test, sample_weight=sw_test)
    test_rmse = np.sqrt(test_mse)
    test_mae = mean_absolute_error(y_test, y_pred_test, sample_weight=sw_test)
    test_r2 = r2_score(y_test, y_pred_test, sample_weight=sw_test)
    
    # Store results
    models[target] = final_model
    results[target] = {
        'test_rmse': test_rmse,
        'test_mae': test_mae,
        'test_r2': test_r2,
        'best_params': best_params,
        'best_iteration': final_model.best_iteration
    }
    
    print(f"\n{'='*60}")
    print(f"Final Model Performance for {target}:")
    print(f"{'='*60}")
    print(f"Test RMSE: {test_rmse:.4f}")
    print(f"Test MAE: {test_mae:.4f}")
    print(f"Test R²: {test_r2:.4f}")
    print(f"Best iteration: {final_model.best_iteration}")
    
    # Feature importance
    importance = final_model.get_score(importance_type='gain')
    print(f"\nTop 5 Most Important Features:")
    sorted_importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)[:5]
    for feat, score in sorted_importance:
        feature_name = features[int(feat.replace('f', ''))]
        print(f"  {feature_name}: {score:.2f}")

print(f"\n{'='*60}")
print("Training Complete!")
print(f"{'='*60}")
print("\nSummary of all models:")
for target, result in results.items():
    print(f"\n{target}:")
    print(f"  RMSE: {result['test_rmse']:.4f}")
    print(f"  MAE: {result['test_mae']:.4f}")
    print(f"  R²: {result['test_r2']:.4f}")

[I 2026-01-02 21:50:24,283] A new study created in memory with name: obpm_optimization



Training model for: obpm
Starting hyperparameter optimization...


  0%|          | 0/25 [00:00<?, ?it/s]

[I 2026-01-02 21:50:24,921] Trial 0 finished with value: 1.3115385971935143 and parameters: {'max_depth': 3, 'learning_rate': 0.037559686168312435, 'min_child_weight': 6, 'subsample': 0.7951735390299896, 'colsample_bytree': 0.9962814447507236, 'gamma': 0.7430767984783598, 'reg_alpha': 1.3729176791144904, 'reg_lambda': 7.324380783645969, 'n_estimators': 261}. Best is trial 0 with value: 1.3115385971935143.
[I 2026-01-02 21:50:26,579] Trial 1 finished with value: 1.1994859119975647 and parameters: {'max_depth': 5, 'learning_rate': 0.03410661566567974, 'min_child_weight': 1, 'subsample': 0.6216740249100112, 'colsample_bytree': 0.7504766172457746, 'gamma': 4.240891352899624, 'reg_alpha': 1.1849618871766288, 'reg_lambda': 8.127366056981874, 'n_estimators': 758}. Best is trial 1 with value: 1.1994859119975647.
[I 2026-01-02 21:50:27,853] Trial 2 finished with value: 1.186176076170607 and parameters: {'max_depth': 6, 'learning_rate': 0.043509856980023756, 'min_child_weight': 6, 'subsample': 0

[I 2026-01-02 21:51:35,905] A new study created in memory with name: dbpm_optimization



Final Model Performance for obpm:
Test RMSE: 1.1974
Test MAE: 0.9220
Test R²: 0.8851
Best iteration: 933

Top 5 Most Important Features:
  oiq: 440947.38
  pss: 137485.67
  drb: 135540.39
  tp: 96166.20
  fg: 64781.32

Training model for: dbpm
Starting hyperparameter optimization...


  0%|          | 0/25 [00:00<?, ?it/s]

[I 2026-01-02 21:51:37,765] Trial 0 finished with value: 0.6993137837208608 and parameters: {'max_depth': 4, 'learning_rate': 0.07378921643769566, 'min_child_weight': 4, 'subsample': 0.7972100781084036, 'colsample_bytree': 0.6122453619677665, 'gamma': 0.6818827117729415, 'reg_alpha': 7.9227789054661155, 'reg_lambda': 7.582264891140149, 'n_estimators': 839}. Best is trial 0 with value: 0.6993137837208608.
[I 2026-01-02 21:51:38,348] Trial 1 finished with value: 0.7070077169355333 and parameters: {'max_depth': 7, 'learning_rate': 0.17676377032687238, 'min_child_weight': 4, 'subsample': 0.8331153468066999, 'colsample_bytree': 0.6408799406396329, 'gamma': 0.34755335352192995, 'reg_alpha': 0.14117136888236792, 'reg_lambda': 7.881796059589345, 'n_estimators': 294}. Best is trial 0 with value: 0.6993137837208608.
[I 2026-01-02 21:51:39,453] Trial 2 finished with value: 0.693604840914721 and parameters: {'max_depth': 8, 'learning_rate': 0.098368832676415, 'min_child_weight': 7, 'subsample': 0.

[I 2026-01-02 21:53:44,320] A new study created in memory with name: ows_rate_optimization



Final Model Performance for dbpm:
Test RMSE: 0.6827
Test MAE: 0.5226
Test R²: 0.7339
Best iteration: 864

Top 5 Most Important Features:
  pss: 12490.72
  oiq: 6912.64
  diq: 6151.46
  drb: 5303.67
  jmp: 3468.63

Training model for: ows_rate
Starting hyperparameter optimization...


  0%|          | 0/25 [00:00<?, ?it/s]

[I 2026-01-02 21:53:48,439] Trial 0 finished with value: 0.02698618204714345 and parameters: {'max_depth': 10, 'learning_rate': 0.020599852341048334, 'min_child_weight': 6, 'subsample': 0.7020350591931467, 'colsample_bytree': 0.7757071517867852, 'gamma': 0.501582004056893, 'reg_alpha': 8.415195506331393, 'reg_lambda': 1.6652035133043164, 'n_estimators': 259}. Best is trial 0 with value: 0.02698618204714345.
[I 2026-01-02 21:53:51,146] Trial 1 finished with value: 0.027781130651645503 and parameters: {'max_depth': 4, 'learning_rate': 0.023407988256049814, 'min_child_weight': 8, 'subsample': 0.8609853837828967, 'colsample_bytree': 0.9761291145383544, 'gamma': 4.976194521270005, 'reg_alpha': 1.8667092944430752, 'reg_lambda': 4.86473535828741, 'n_estimators': 647}. Best is trial 0 with value: 0.02698618204714345.
[I 2026-01-02 21:53:53,003] Trial 2 finished with value: 0.028170582254249135 and parameters: {'max_depth': 3, 'learning_rate': 0.026871674192647514, 'min_child_weight': 4, 'subsa

[I 2026-01-02 21:54:42,654] A new study created in memory with name: dws_rate_optimization



Final Model Performance for ows_rate:
Test RMSE: 0.0275
Test MAE: 0.0209
Test R²: 0.8152
Best iteration: 421

Top 5 Most Important Features:
  oiq: 233.36
  drb: 70.89
  hgt: 35.07
  ft: 29.26
  tp: 26.93

Training model for: dws_rate
Starting hyperparameter optimization...


  0%|          | 0/25 [00:00<?, ?it/s]

[I 2026-01-02 21:54:43,239] Trial 0 finished with value: 0.011510398960383583 and parameters: {'max_depth': 4, 'learning_rate': 0.08476394123371024, 'min_child_weight': 10, 'subsample': 0.8681896383863411, 'colsample_bytree': 0.7599475155791604, 'gamma': 3.5970961587788786, 'reg_alpha': 3.80323222034906, 'reg_lambda': 1.7876325788946246, 'n_estimators': 530}. Best is trial 0 with value: 0.011510398960383583.
[I 2026-01-02 21:54:44,443] Trial 1 finished with value: 0.011289504808111604 and parameters: {'max_depth': 7, 'learning_rate': 0.02544078119929736, 'min_child_weight': 5, 'subsample': 0.8586075243814306, 'colsample_bytree': 0.7830737184278799, 'gamma': 2.286067659744471, 'reg_alpha': 2.6231919651504354, 'reg_lambda': 4.852739140366622, 'n_estimators': 642}. Best is trial 1 with value: 0.011289504808111604.
[I 2026-01-02 21:54:44,814] Trial 2 finished with value: 0.011356593894531518 and parameters: {'max_depth': 5, 'learning_rate': 0.05464328699412867, 'min_child_weight': 5, 'subs

In [34]:
models

{'obpm': <xgboost.core.Booster at 0x269f4f24740>,
 'dbpm': <xgboost.core.Booster at 0x26a50a2a660>,
 'ows_rate': <xgboost.core.Booster at 0x26a4ad02210>,
 'dws_rate': <xgboost.core.Booster at 0x26a5056de80>}

In [39]:
df[features]

hgt,stre,spd,jmp,endu,ins,dnk,ft,fg,tp,diq,oiq,drb,pss,reb
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
67,69,56,63,60,64,80,46,81,41,52,73,47,53,73
67,66,55,60,57,61,78,44,80,40,50,70,45,51,71
67,67,56,59,57,62,78,45,81,40,51,71,46,51,72
67,67,54,56,57,62,78,46,82,41,52,71,46,51,73
67,68,54,54,58,63,79,48,83,44,55,74,49,52,75
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
51,49,50,56,35,36,43,43,36,36,45,39,56,52,47
40,48,47,40,19,28,37,17,19,17,34,33,31,33,49
37,41,39,33,38,25,42,37,33,36,41,41,27,33,48
58,34,47,48,22,23,38,37,30,29,21,29,42,34,29


In [None]:
import pickle
from pathlib import Path

# Create directory if it doesn't exist
models_dir = Path('../models/ovr')
models_dir.mkdir(parents=True, exist_ok=True)

# Save each model
for target, model in models.items():
    model_path = models_dir / f'{target}_model.pkl'
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)
    print(f"Saved {target} model to {model_path}")

# Optionally, save the results dictionary as well
results_path = models_dir / 'model_results.pkl'
with open(results_path, 'wb') as f:
    pickle.dump(results, f)
print(f"\nSaved model results to {results_path}")

# Also save feature names for reference
metadata = {
    'features': features,
    'targets': targets
}
metadata_path = models_dir / 'metadata.pkl'
with open(metadata_path, 'wb') as f:
    pickle.dump(metadata, f)
print(f"Saved metadata to {metadata_path}")

In [47]:
# Convert features to numpy array for prediction
X_predict = df.select(features).to_numpy()

# Create predictions for all targets
predictions = {}
for target, model in models.items():
    # Create DMatrix for prediction
    dpred = xgb.DMatrix(X_predict)
    predictions[target] = model.predict(dpred)

# Add all predictions to the dataframe
df_with_predictions = df.with_columns([
    pl.Series(f'pred_{target}', predictions[target])
    for target in targets
])

In [70]:
# Calculate ovr statistics
ovr_mean = df.select(pl.col('ovr').mean()).item()
ovr_std = df.select(pl.col('ovr').std()).item()

# Add scaled metrics and new_ovr
result = (
    df_with_predictions
    .select(*info, *[f'pred_{target}' for target in targets])
    .with_columns([
        # Calculate raw metrics
        pl.sum_horizontal('pred_obpm', 'pred_dbpm').alias('bpm'),
        pl.sum_horizontal('pred_ows_rate', 'pred_dws_rate').alias('ws')
    ])
    .with_columns([
        # Scale bpm to ovr's distribution
        (
            (pl.col('bpm') - pl.col('bpm').mean()) / pl.col('bpm').std() * ovr_std + ovr_mean
        ).alias('bpm_scaled'),
        # Scale ws to ovr's distribution
        (
            (pl.col('ws') - pl.col('ws').mean()) / pl.col('ws').std() * ovr_std + ovr_mean
        ).alias('ws_scaled')
    ])
    .with_columns([
        # Average the scaled metrics
        pl.mean_horizontal('bpm_scaled', 'ws_scaled','ws_scaled','ws_scaled','ws_scaled').alias('new_ovr')
    ])
    .select(
        'firstName', 'lastName', 'season', 'age', 'ovr', 'new_ovr',
        'bpm', 'ws'
    )
    .filter(pl.col('season').eq(2026))
    #.filter(pl.col('age').lt(22))
    .sort('new_ovr', descending=True)
)

result

firstName,lastName,season,age,ovr,new_ovr,bpm,ws
str,str,i64,i64,i64,f32,f32,f32
"""Shai""","""Gilgeous-Alexander""",2026,27,74,88.655296,11.581886,0.27995
"""Victor""","""Wembanyama""",2026,22,73,87.761887,9.295696,0.282994
"""Giannis""","""Antetokounmpo""",2026,31,72,84.896217,10.410711,0.260003
"""Jimmy""","""Butler""",2026,36,65,82.54335,7.633878,0.255442
"""Nikola""","""Jokic""",2026,31,76,81.647095,10.341479,0.239106
…,…,…,…,…,…,…,…
"""Cayden""","""Boozer""",2026,19,33,26.51849,-8.910167,-0.045378
"""Chaney""","""Johnson""",2026,24,37,26.460052,-7.533512,-0.051096
"""Judah""","""Mintz""",2026,23,36,25.358093,-9.540882,-0.050492
"""Joseph""","""Tugler""",2026,21,30,25.215307,-8.513227,-0.055406
