In [1]:
import polars as pl
import pickle

from hgm.config import MODELS_DIR, DATA_DIR

In [2]:
players_raw = pl.scan_parquet(DATA_DIR / 'raw' / 'players.parquet')

with open(DATA_DIR / 'raw' / 'game_settings.pkl', 'rb') as file:
    game_settings = pickle.load(file)

In [3]:
player_ratings = (
    players_raw
    .with_columns(
        pl.concat_str(pl.col('firstName'), pl.lit(' '), pl.col('lastName')).alias('player'),
    )
    .explode('ratings')
    .unnest('ratings')
    .with_columns(
        age=pl.col('season') - pl.col('born').struct.field('year')
    )
    .unique(['pid', 'season'])
    .sort(['pid', 'season'])
    .select('player', 'pid', 'tid', 'season', 'age', 'pos', 'ovr')
    .collect()
)

player_salaries = (
    players_raw
    .select('pid', 'salaries')
    .explode('salaries')
    .unnest('salaries')
    .group_by(['pid', 'season'])
    .agg(salary=pl.col('amount').last())
    .sort('pid', 'season')
    .collect()
)

In [4]:
new_contracts = (
    player_salaries
    .with_columns(
        contract_year=pl.col('salary') != pl.col('salary').shift(-1).over('pid'),
    )
    .filter(pl.col('contract_year'))
    .select(
        pl.col('pid'),
        pl.col('season'),
        pl.col('salary').alias('current_salary') / 1000,
        pl.col('salary').shift(-1).over('pid').alias('next_salary') / 1000,
    )
    .drop_nulls()
)

In [5]:
contract_data = (
    new_contracts
    .join(player_ratings, on=['pid', 'season'], how='left')
    .select(
        'pid', 'age', 'ovr', 'current_salary', 'next_salary',
        pl.col('pos').replace({
            'C': 1,
            'W': 2,
            'D': 3,
            'G': 4
        }).cast(pl.Int32).alias('pos_map')
    )
)

In [6]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import pickle

In [7]:
features = ['pos_map', 'age', 'ovr', 'current_salary']
target = 'next_salary'

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = (
    train_test_split(
        contract_data.select(features),
        contract_data.select(target),
        test_size=0.2,
        random_state=42
    )
)

model = XGBRegressor(
    early_stopping_rounds=5,
    learning_rate=0.1,
    random_state=42,
)

# Fit the model using the training data and validate using the validation data
model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

[0]	validation_0-rmse:3.43862
[1]	validation_0-rmse:3.19225
[2]	validation_0-rmse:2.97630
[3]	validation_0-rmse:2.78831
[4]	validation_0-rmse:2.62629
[5]	validation_0-rmse:2.48717
[6]	validation_0-rmse:2.36899
[7]	validation_0-rmse:2.26858
[8]	validation_0-rmse:2.18419
[9]	validation_0-rmse:2.11371
[10]	validation_0-rmse:2.05476
[11]	validation_0-rmse:2.00528
[12]	validation_0-rmse:1.96497
[13]	validation_0-rmse:1.93147
[14]	validation_0-rmse:1.90434
[15]	validation_0-rmse:1.88181
[16]	validation_0-rmse:1.86339
[17]	validation_0-rmse:1.84906
[18]	validation_0-rmse:1.83756
[19]	validation_0-rmse:1.82864
[20]	validation_0-rmse:1.82141
[21]	validation_0-rmse:1.81484
[22]	validation_0-rmse:1.80945
[23]	validation_0-rmse:1.80620
[24]	validation_0-rmse:1.80234
[25]	validation_0-rmse:1.80032
[26]	validation_0-rmse:1.79767
[27]	validation_0-rmse:1.79625
[28]	validation_0-rmse:1.79483
[29]	validation_0-rmse:1.79414
[30]	validation_0-rmse:1.79365
[31]	validation_0-rmse:1.79442
[32]	validation_0-

In [8]:
with open(MODELS_DIR / 'salary' / 'xgboost_salary.pkl', 'wb') as file:
    pickle.dump(model, file)