In [1]:
import polars as pl
from scipy.stats import gaussian_kde
from scipy import signal
import numpy as np
from tqdm import tqdm
import pandas as pd
import pickle

from hgm.config import DATA_DIR, MODELS_DIR

In [2]:
players_raw = pl.scan_parquet(DATA_DIR / 'raw' / 'players.parquet')

model_dict = {
    position: pickle.load(open(MODELS_DIR / 'ovr_to_cap' / f'{position}.pkl', 'rb'))
    for position in ['C', 'W', 'D', 'G']
}

player_ratings = (
    players_raw
    .select(
        'pid',
        'born',
        'ratings')
    .explode('ratings')
    .unnest('ratings')
    .with_columns(
        age=pl.col('season') - pl.col('born').struct.field('year')
    )
    .unique(['pid', 'season'])
    .sort(['pid', 'season'])
    .select('pid', 'season', 'age', 'pos', 'ovr')
)

In [3]:
plot_data = (
    player_ratings
    .sort(['pid', 'season'])
    .with_columns(
        ovr_shift=pl.col('ovr').shift(-1).over('pid') - pl.col('ovr')
    )
    .select('pos', 'age', 'ovr_shift')
    .drop_nulls()
    .collect()
)

In [20]:
(
    plot_data
    .filter(pl.col('pos') == 'G')
    .filter(pl.col('age') == 18)
    .select(
        pl.quantile('ovr_shift', 0.5).alias('median'),
        pl.quantile('ovr_shift', 0.9).alias('upper')
    )
    .with_columns(diff=pl.col('upper') - pl.col('median'))
)

median,upper,diff
f64,f64,f64
2.0,11.0,9.0


In [4]:
kde_dict = dict()

for position in ['C', 'W', 'D', 'G']:
    kde_dict[position] = {}
    for age in range(18, 36):
        kde_dict[position][age] = {}
        kde_dict[position][age]['data'] = plot_data.filter((pl.col('pos') == position) & (pl.col('age') == age)).select(
            'ovr_shift').to_series().to_list()
        kde_dict[position][age]['kde'] = gaussian_kde(kde_dict[position][age]['data'])

for position in ['C', 'W', 'D', 'G']:
    for age in range(36, 60):
        kde_dict[position][age] = kde_dict[position][35]

In [5]:
def convolve_distributions(kdes):
    # Generate x values that cover the range of all KDEs
    x = np.linspace(-100, 100, 1000)

    # Initialize the convolved density as the density of the first KDE
    y_convolved = kdes[0](x)

    # Iterate over the rest of the KDEs
    for kde in kdes[1:]:
        # Calculate the density of the current KDE
        y = kde(x)

        # Perform the convolution
        y_convolved = signal.convolve(y_convolved, y, mode='same')

        # Normalize the result
        y_convolved /= np.trapezoid(y_convolved, x)  # Use trapezoidal rule to approximate the integral

    return x, y_convolved

In [6]:
prog_dict = {}
for position in ['C', 'W', 'D', 'G']:
    prog_dict[position] = {}
    for age in tqdm(range(18, 45)):
        prog_dict[position][age] = {}
        for years_in_adv in range(1, 10):
            dicts_to_compile = [kde_dict[position][age]['kde'] for age in range(age, age + years_in_adv)]
            prog_dict[position][age][years_in_adv] = {}
            prog_dict[position][age][years_in_adv]['x'], prog_dict[position][age][years_in_adv][
                'y'] = convolve_distributions(dicts_to_compile)

100%|██████████| 27/27 [00:34<00:00,  1.30s/it]
100%|██████████| 27/27 [00:45<00:00,  1.68s/it]
100%|██████████| 27/27 [00:44<00:00,  1.66s/it]
100%|██████████| 27/27 [00:26<00:00,  1.03it/s]


In [7]:
df_list = []
for position in ['C', 'W', 'D', 'G']:
    for age in tqdm(range(18, 45)):
        prog_df = pd.DataFrame()
        for years_in_adv in range(1, 10):
            temp_df = pd.DataFrame({
                'x': prog_dict[position][age][years_in_adv]['x'],
                f'y_{years_in_adv}': prog_dict[position][age][years_in_adv]['y']
            })
            if years_in_adv > 1:
                temp_df = temp_df.drop('x', axis=1)
            prog_df = pd.concat([prog_df, temp_df], axis=1)
        df_list.append(prog_df.assign(position=position, age=age))

prog_df = pl.DataFrame(pd.concat(df_list, axis=0).reset_index(drop=True))
prog_df.write_parquet('../data/constants/progression.parquet')

100%|██████████| 27/27 [00:00<00:00, 319.10it/s]
100%|██████████| 27/27 [00:00<00:00, 326.10it/s]
100%|██████████| 27/27 [00:00<00:00, 310.55it/s]
100%|██████████| 27/27 [00:00<00:00, 321.60it/s]


In [8]:
prog_df = pl.scan_parquet('../data/constants/progression.parquet')

In [9]:
calculated_progs = (
    prog_df
    .rename({'position': 'pos'})
    .with_columns(
        ovr=pl.Series([range(0, 101)], dtype=pl.List(pl.Int64)),
    )
    .explode('ovr')
    .rename({'x': 'exp_growth'})
    .with_columns(
        exp_ovr=pl.col('exp_growth').add(pl.col('ovr')),
    )
    .with_columns(
        exp_value=(
            pl.when(pl.col('pos') == 'C').then(
                pl.col('exp_ovr') * model_dict['C'].coef_[0] + model_dict['C'].intercept_)
            .when(pl.col('pos') == 'W').then(pl.col('exp_ovr') * model_dict['W'].coef_[0] + model_dict['W'].intercept_)
            .when(pl.col('pos') == 'D').then(pl.col('exp_ovr') * model_dict['D'].coef_[0] + model_dict['D'].intercept_)
            .when(pl.col('pos') == 'G').then(pl.col('exp_ovr') * model_dict['G'].coef_[0] + model_dict['G'].intercept_)
            .clip(0)
        )
    )
    .with_columns(
        [(pl.col('exp_ovr') * pl.col(f'y_{i}')).alias(f'exp_ovr_{i}') for i in range(1, 10)] +
        [(pl.col('exp_value') * pl.col(f'y_{i}')).alias(f'exp_value_{i}') for i in range(1, 10)]
    )
    .group_by(['pos', 'age', 'ovr'])
    .agg(
        [pl.sum(f'exp_ovr_{i}').alias(f'exp_ovr_product_{i}') for i in range(1, 10)] +
        [pl.sum(f'exp_value_{i}').alias(f'exp_value_product_{i}') for i in range(1, 10)] +
        [pl.sum(f'y_{i}').alias(f'y_{i}') for i in range(1, 10)]
    )
    .select(
        ['pos', 'age', 'ovr'] +
        [pl.col(f'exp_ovr_product_{i}').truediv(pl.col(f'y_{i}')).alias(f'exp_ovr_{i}') for i in range(1, 10)] +
        [pl.col(f'exp_value_product_{i}').truediv(pl.col(f'y_{i}')).alias(f'exp_value_{i}') for i in range(1, 10)]
    )
    .sort(['age', 'ovr'])
    .unpivot(index=['pos', 'age', 'ovr'])
    .collect()
)

In [10]:
calculated_progs.write_parquet(DATA_DIR / 'constants' / 'calculated_progs.parquet')

In [11]:
(
    calculated_progs
    .filter(pl.col('pos') == 'C')
    .filter(pl.col('age') == 18)
    .filter(pl.col('ovr') == 50)
    .to_pandas()
)

Unnamed: 0,pos,age,ovr,variable,value
0,C,18,50,exp_ovr_1,52.409274
1,C,18,50,exp_ovr_2,54.840981
2,C,18,50,exp_ovr_3,57.361169
3,C,18,50,exp_ovr_4,58.716925
4,C,18,50,exp_ovr_5,60.036757
5,C,18,50,exp_ovr_6,61.260064
6,C,18,50,exp_ovr_7,62.470018
7,C,18,50,exp_ovr_8,62.232078
8,C,18,50,exp_ovr_9,62.023121
9,C,18,50,exp_value_1,1.365423
