In [102]:
import polars as pl
import pickle
from sklearn.linear_model import LinearRegression, RidgeCV

from hgm.config import MODELS_DIR, DATA_DIR

In [103]:
# Players are scanned lazily (each derived table below collects only what it
# needs); teams are read eagerly into memory.
players_raw = pl.scan_parquet(DATA_DIR / 'raw' / 'players.parquet')
teams_raw = pl.read_parquet(DATA_DIR / 'raw' / 'teams.parquet')
# NOTE: unpickling can execute arbitrary code, so only load a settings file
# that was produced by this project.
with (DATA_DIR / 'raw' / 'game_settings.pkl').open('rb') as file:
    game_settings = pickle.load(file)

In [104]:
def adjust_for_replacement(df, replacements):
    """Add rating-above-replacement columns to ``df``.

    Args:
        df: Frame with at least ``pos``, ``ovr`` and ``min`` columns.
        replacements: Frame with one ``ovr_rep`` value per ``pos``.

    Returns:
        ``df`` with two extra columns and without ``ovr_rep``:
        ``ovr_added`` -- ``ovr`` minus the position's ``ovr_rep``, floored at
        zero and divided by 100; ``ovr_added_minutes`` -- ``ovr_added``
        multiplied by ``min``.
    """
    # Left join keeps every row of ``df``; positions missing from
    # ``replacements`` end up with a null ``ovr_rep``.
    with_baseline = df.join(replacements, on='pos', how='left')

    surplus = (pl.col('ovr') - pl.col('ovr_rep')).clip(0) / 100
    with_surplus = with_baseline.with_columns(ovr_added=surplus)

    # Separate step because it reads the ``ovr_added`` column created above.
    with_minutes = with_surplus.with_columns(
        ovr_added_minutes=pl.col('ovr_added') * pl.col('min'),
    )
    return with_minutes.drop(['ovr_rep'])

In [105]:
# One row per (player, season) with name, age, position and overall rating.
player_ratings = (
    players_raw
    .select(
        'pid',
        pl.concat_str([pl.col('firstName'), pl.col('lastName')], separator=' ').alias('player'),
        'born',
        'ratings',
    )
    # `ratings` is a list of structs: one row per entry, then one column per field.
    .explode('ratings')
    .unnest('ratings')
    .with_columns(
        (pl.col('season') - pl.col('born').struct.field('year')).alias('age')
    )
    # Collapse repeated (pid, season) entries to a single row.
    .unique(subset=['pid', 'season'])
    .sort('pid', 'season')
    .select(['pid', 'player', 'season', 'age', 'pos', 'ovr'])
    .collect()
)

# Non-playoff stat lines per player, with `ps` = ops + dps + gps.
player_stats = (
    players_raw
    .select(['pid', 'stats'])
    # `stats` is a list of structs: one row per entry, then one column per field.
    .explode('stats')
    .unnest('stats')
    .filter(pl.col('playoffs').eq(False))
    .with_columns(
        (pl.col('ops') + pl.col('dps') + pl.col('gps')).alias('ps')
    )
    .sort(['pid', 'season', 'tid'])
    .select(['pid', 'season', 'tid', 'gp', 'min', 'ops', 'dps', 'gps', 'ps'])
    .collect()
)

# Totals per player, season and team (every non-key column is summed).
player_stats_by_team_season = (
    player_stats
    .group_by('pid', 'season', 'tid')
    .agg(pl.all().sum())
    .sort(['pid', 'season', 'tid'])
)

# Totals per player and season across all teams.
# NOTE(review): `tid` is not a grouping key here, so it is summed along with
# the stat columns; the resulting `tid` value is not a real team id.
player_stats_by_season = (
    player_stats
    .group_by('pid', 'season')
    .agg(pl.all().sum())
    .sort(['pid', 'season'])
)

# One salary per (player, season): the last `amount` listed for that season.
player_salaries = (
    players_raw
    .select(['pid', 'salaries'])
    .explode('salaries')
    .unnest('salaries')
    .group_by('pid', 'season')
    .agg(pl.col('amount').last().alias('salary'))
    .sort(['pid', 'season'])
    .collect()
)

# Ratings are the base table; season stats and salary are attached where
# available (left joins keep player-seasons that have neither).
players = player_ratings.join(player_stats_by_season, how='left', on=['pid', 'season'])
players = players.join(player_salaries, how='left', on=['pid', 'season'])
players = players.sort(['pid', 'season'])

# Replacement level per position: minutes-weighted mean `ovr` of players on
# the minimum contract, aged 25 or older, who actually played.
replacement_df = (
    players
    .filter(
        pl.col('salary') == game_settings['minContract'],
        pl.col('age') >= 25,
        pl.col('min') > 0,
    )
    .group_by('pos')
    .agg(
        total_minutes=pl.col('min').sum(),
        total_ovr=(pl.col('ovr') * pl.col('min')).sum(),
    )
    .select(
        pl.col('pos'),
        (pl.col('total_ovr') / pl.col('total_minutes')).alias('ovr_rep'),
    )
)

# Attach `ovr_added` / `ovr_added_minutes` to every player-season.
players = adjust_for_replacement(players, replacement_df)

# Per team-season, the summed `ovr_added_minutes` of its players, spread into
# one column per position.
teams_with_ovr = adjust_for_replacement(
    player_stats_by_team_season.join(
        players.select(['pid', 'season', 'pos', 'ovr']),
        how='left',
        on=['pid', 'season'],
    ),
    replacement_df,
)
teams_with_ovr = (
    teams_with_ovr
    .group_by(['season', 'tid', 'pos'])
    .agg(pl.col('ovr_added_minutes').sum())
    # The only remaining value column is `ovr_added_minutes`, so the pivoted
    # columns are named after the `pos` values themselves.
    .pivot(on='pos', index=['tid', 'season'])
    .sort(['season', 'tid'])
)

# Team-season totals of the three point-share components.
teams_with_ps = (
    player_stats_by_team_season
    .group_by(['tid', 'season'])
    .agg(pl.col('ops', 'dps', 'gps').sum())
)

# Positional rating columns plus point-share totals per team-season.
team_player_stats = teams_with_ovr.join(teams_with_ps, how='left', on=['tid', 'season'])

# Per team-season results: points = 2 * won + otl, wins = won.
# (`otl` is presumably overtime losses -- confirm against the export schema.)
team_standings = (
    teams_raw
    .select('seasons')
    .explode('seasons')
    .unnest('seasons')
    .select(
        'tid',
        'season',
        points=2 * pl.col('won') + pl.col('otl'),
        wins=pl.col('won'),
    )
)

# Non-playoff goal differential (pts - oppPts) per team-season.
team_goals = (
    teams_raw
    .select('stats')
    .explode('stats')
    .unnest('stats')
    .filter(pl.col('playoffs').eq(False))
    .select(
        'tid',
        'season',
        goal_diff=pl.col('pts') - pl.col('oppPts'),
    )
)

# Modelling table: standings, goal differential and roster-derived features.
teams = team_standings.join(team_goals, how='left', on=['tid', 'season'])
teams = teams.join(team_player_stats, how='left', on=['tid', 'season'])

In [106]:
# Regress team goal differential on the four positional rating columns.
position_features = teams.select('C', 'W', 'D', 'G')
goal_diff_target = teams.select('goal_diff')

ridge_model = RidgeCV()
ridge_model.fit(position_features, goal_diff_target)

# The target is passed as a one-column frame, so the coefficients for that
# single target are read from the first row of `coef_` (order: C, W, D, G).
model_coefs = ridge_model.coef_[0]

In [107]:
# Per-position contribution to goal differential: feature column times its
# ridge coefficient, keyed by the output column name (goals_C, goals_W, ...).
position_goal_terms = {
    f'goals_{pos}': pl.col(pos) * coef
    for pos, coef in zip(['C', 'W', 'D', 'G'], model_coefs)
}

teams_with_preds = (
    teams
    .with_columns(
        **position_goal_terms,
        # Sum of the four contributions, without the intercept.
        goals_added=(
            position_goal_terms['goals_C']
            + position_goal_terms['goals_W']
            + position_goal_terms['goals_D']
            + position_goal_terms['goals_G']
        ),
    )
    # Full model prediction: contributions plus the fitted intercept.
    .with_columns(
        (pl.col('goals_added') + ridge_model.intercept_).alias('goals_pred')
    )
)

In [108]:
# League-wide totals per season, then averaged over seasons.
season_totals = teams_with_preds.group_by('season').agg(
    pl.col('goals_added').sum(),
    pl.col('goals_G').sum(),
)
goals_for_grabs = season_totals.select(pl.col('goals_added').mean()).item()
goalie_goals_for_grabs = season_totals.select(pl.col('goals_G').mean()).item()
skater_goals_for_grabs = goals_for_grabs - goalie_goals_for_grabs

# NOTE(review): 32, 80 and 25 are hard-coded; they look like team count,
# per-team cap and roster size, with minContract / 1000 putting the minimum
# contract on the same scale as the cap -- confirm against game_settings.
min_contract_scaled = game_settings['minContract'] / 1000
salary_after_replacement = 32 * (80 - 25 * min_contract_scaled)

# 15% of the above-replacement pool is assigned to goalies, the rest to skaters.
salary_for_goalies = salary_after_replacement * 0.15
salary_for_skaters = salary_after_replacement - salary_for_goalies

In [109]:
from xgboost import XGBRegressor

# Training data for the minutes model: position, rating and the share of
# 60 * 82 minutes the player actually played (0 when no minutes are recorded).
minute_data = (
    players
    .select(
        'pos',
        'ovr',
        pl.col('min').truediv(60 * 82).fill_null(0).alias('min_pct')
    )
    .with_columns(
        # `Expr.replace` keeps the column's original (string) dtype, which
        # would leave the codes as text and out of step with the integer
        # `pos_map` used for the lookup grid. `replace_strict` with an
        # explicit return dtype yields real integer codes; positions outside
        # the mapping become null instead of raising.
        pl.col('pos')
        .replace_strict(
            {'C': 1, 'W': 2, 'D': 3, 'G': 4},
            default=None,
            return_dtype=pl.Int64,
        )
        .alias('pos_map'),
    )
)

# Predict share of minutes played from the position code and overall rating.
minute_features = minute_data.select('pos_map', 'ovr')
minute_target = minute_data.select('min_pct')

model = XGBRegressor()
model.fit(minute_features, minute_target)

# Grid of every position crossed with every integer rating 0..100.
lookup_data = (
    pl.DataFrame({'pos': ['C', 'W', 'D', 'G'], 'pos_map': [1, 2, 3, 4]})
    # A single list value is broadcast to all four rows, then exploded.
    .with_columns(ovr=pl.Series([list(range(101))]))
    .explode('ovr')
)

# Raw model predictions for the grid, floored at zero.
predicted_min_pct = model.predict(lookup_data.select('pos_map', 'ovr')).clip(0)

# Smooth the predictions within each position by averaging over a 10-wide
# window of ratings that starts 5 below the current rating.
minutes_lookup = (
    lookup_data
    .with_columns(pl.Series('min_pred', predicted_min_pct))
    .rolling(index_column='ovr', period='10i', offset='-5i', group_by='pos')
    .agg(pl.col('min_pred').mean())
    .select(
        'pos',
        pl.col('ovr').alias('ovr_round'),
        'min_pred',
    )
)

In [125]:
def _value_by_position(c, w, d, g):
    """Return an expression that picks c/w/d/g by `pos` (null for any other value)."""
    pos = pl.col('pos')
    return (
        pl.when(pos == 'C').then(pl.lit(c))
        .when(pos == 'W').then(pl.lit(w))
        .when(pos == 'D').then(pl.lit(d))
        .when(pos == 'G').then(pl.lit(g))
    )


# Minimum contract divided by 1000, added back as the floor of every cap value.
min_contract_scaled = game_settings['minContract'] / 1000

# Convert each player-season's rating into expected goals added and then into
# a cap value (overall, goalie-pool based, skater-pool based).
players_final = (
    players
    .with_columns(pl.col('ovr').round(0).alias('ovr_round'))
    # Attach the smoothed predicted share of minutes for this position/rating.
    .join(minutes_lookup, how='left', on=['pos', 'ovr_round'])
    .with_columns(
        # Fixed per-position shares; not read again in the steps below.
        pos_mod=_value_by_position(0.275, 0.275, 0.3, 0.65),
        pos_mod_new=pl.col('min_pred'),
        # Ridge coefficient of the player's position.
        ovr_mod=_value_by_position(
            model_coefs[0], model_coefs[1], model_coefs[2], model_coefs[3]
        ),
    )
    .with_columns(pred_minutes=(60 * 82) * pl.col('pos_mod_new'))
    .with_columns(
        (pl.col('ovr_added') * pl.col('pred_minutes')).alias('ovr_added_pred_minutes')
    )
    .with_columns(
        (pl.col('ovr_added_pred_minutes') * pl.col('ovr_mod')).alias('goals_added')
    )
    # Player's share of each goal pool.
    .with_columns(
        goals_added_pct=pl.col('goals_added') / goals_for_grabs,
        goals_added_goalie_pct=pl.col('goals_added') / goalie_goals_for_grabs,
        goals_added_skater_pct=pl.col('goals_added') / skater_goals_for_grabs,
    )
    # Share of the matching salary pool, on top of the minimum contract.
    .with_columns(
        cap_value=salary_after_replacement * pl.col('goals_added_pct') + min_contract_scaled,
        cap_value_goalie=salary_for_goalies * pl.col('goals_added_goalie_pct') + min_contract_scaled,
        cap_value_skater=salary_for_skaters * pl.col('goals_added_skater_pct') + min_contract_scaled,
    )
    # Goalies are valued from the goalie pool, everyone else from the skater pool.
    .with_columns(
        pl.when(pl.col('pos') == 'G')
        .then(pl.col('cap_value_goalie'))
        .otherwise(pl.col('cap_value_skater'))
        .alias('cap_value_pos')
    )
)

In [126]:
# Fit and persist one linear ovr -> cap-value model per position.
output_dir = MODELS_DIR / 'ovr_to_cap'
# Create the target folder up front so the first dump does not fail with
# FileNotFoundError on a fresh checkout; a no-op when it already exists.
output_dir.mkdir(parents=True, exist_ok=True)

for position in ['C', 'W', 'D', 'G']:
    # One row per distinct rating of 60 or higher at this position.
    model_data = (
        players_final
        .filter(pl.col('pos') == position)
        .filter(pl.col('ovr') >= 60)
        .unique('ovr')
        .select('ovr', 'cap_value', 'cap_value_pos')
    )
    model = LinearRegression()
    model.fit(model_data.select('ovr'), model_data.select('cap_value_pos'))
    filepath = output_dir / f'{position}.pkl'
    with open(filepath, 'wb') as file:
        pickle.dump(model, file)