In [2]:
import polars as pl
import pickle

from hgm.config import MODELS_DIR, DATA_DIR

[32m2024-07-25 14:04:38.435[0m | [1mINFO    [0m | [36mhgm.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\jrnas\Projects\Personal\hgm[0m


In [12]:
players_raw = pl.scan_parquet(DATA_DIR / 'raw' / 'players.parquet')

with open(DATA_DIR / 'raw' / 'game_settings.pkl', 'rb') as file:
    game_settings = pickle.load(file)

In [13]:
player_ratings = (
    players_raw
    .with_columns(
        pl.concat_str(pl.col('firstName'), pl.lit(' '), pl.col('lastName')).alias('player'),
    )
    .explode('ratings')
    .unnest('ratings')
    .with_columns(
        age=pl.col('season') - pl.col('born').struct.field('year')
    )
    .unique(['pid', 'season'])
    .sort(['pid', 'season'])
    .select('player', 'pid', 'tid', 'season', 'age', 'pos', 'ovr')
    .collect()
)

player_salaries = (
    players_raw
    .select('pid', 'salaries')
    .explode('salaries')
    .unnest('salaries')
    .group_by(['pid', 'season'])
    .agg(salary=pl.col('amount').last())
    .sort('pid', 'season')
    .collect()
)

In [65]:
new_contracts = (
    player_salaries
    .with_columns(
        contract_year=pl.col('salary') != pl.col('salary').shift(-1).over('pid'),
    )
    .filter(pl.col('contract_year'))
    .select(
        pl.col('pid'),
        pl.col('season'),
        pl.col('salary').alias('current_salary') / 1000,
        pl.col('salary').shift(-1).over('pid').alias('next_salary') / 1000,
    )
    .drop_nulls()
)

In [66]:
contract_data = (
    new_contracts
    .join(player_ratings, on=['pid', 'season'], how='left')
    .select(
        'pid', 'age', 'ovr', 'current_salary', 'next_salary',
        pl.col('pos').replace({
            'C': 1,
            'W': 2,
            'D': 3,
            'G': 4
        }).cast(pl.Int32).alias('pos_map')
    )
)

In [68]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import pickle

In [77]:
features = ['pos_map', 'age', 'ovr', 'current_salary']
target = 'next_salary'

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = (
    train_test_split(
        contract_data.select(features),
        contract_data.select(target),
        test_size=0.2,
        random_state=42
    )
)

model = XGBRegressor(
    early_stopping_rounds=10,
    learning_rate=0.0621803
)

# Fit the model using the training data and validate using the validation data
model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

with open(MODELS_DIR / 'salary' / 'xgboost_salary.pkl', 'wb') as file:
    pickle.dump(model, file)

[0]	validation_0-rmse:3.30932
[1]	validation_0-rmse:3.14592
[2]	validation_0-rmse:2.99429
[3]	validation_0-rmse:2.85365
[4]	validation_0-rmse:2.72448
[5]	validation_0-rmse:2.60415
[6]	validation_0-rmse:2.49447
[7]	validation_0-rmse:2.39310
[8]	validation_0-rmse:2.30091
[9]	validation_0-rmse:2.21624
[10]	validation_0-rmse:2.13859
[11]	validation_0-rmse:2.06782
[12]	validation_0-rmse:2.00295
[13]	validation_0-rmse:1.94477
[14]	validation_0-rmse:1.89168
[15]	validation_0-rmse:1.84343
[16]	validation_0-rmse:1.80050
[17]	validation_0-rmse:1.76152
[18]	validation_0-rmse:1.72626
[19]	validation_0-rmse:1.69459
[20]	validation_0-rmse:1.66653
[21]	validation_0-rmse:1.64148
[22]	validation_0-rmse:1.61881
[23]	validation_0-rmse:1.59873
[24]	validation_0-rmse:1.58014
[25]	validation_0-rmse:1.56442
[26]	validation_0-rmse:1.55016
[27]	validation_0-rmse:1.53767
[28]	validation_0-rmse:1.52641
[29]	validation_0-rmse:1.51631
[30]	validation_0-rmse:1.50711
[31]	validation_0-rmse:1.49923
[32]	validation_0-

In [74]:
(
    contract_data
    .with_columns(
        predicted_salary=model.predict(contract_data.select(features))
    )
    .filter(
        (pl.col('next_salary') >= 6) &
        (pl.col('predicted_salary') <= 2)
    )
)