In [2]:
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
import plotly.express as px
import json

import plotly.io as pio
import re
from sklearn.linear_model import LinearRegression

pio.templates.default = "plotly_dark"
pio.renderers.default = "browser"

In [3]:
def get_salary_cap_events():
    ## Load json file from downloads
    with open('C:/Users/jrnas/Downloads/BBGM_League_1_2220_free_agency(1).json', encoding='latin') as f:
        r_json = json.load(f)

    return pd.DataFrame([(x['season'], x['text']) for x in r_json['events'] if 'An inflation rate of' in x['text']],
                        columns=['season', 'text'])


def extract_values(text):
    # Pattern for inflation rate
    inflation_pattern = r"(\d+(\.\d+)?)%"
    # Pattern for salary cap
    salary_cap_pattern = r"\$(\d+(\.\d+)?[MB]?)"

    # Search for the patterns
    inflation_match = re.search(inflation_pattern, text)
    salary_cap_match = re.search(salary_cap_pattern, text)

    # Extract the matched values
    inflation_rate = float(inflation_match.group(1)) if inflation_match else None
    salary_cap = salary_cap_match.group(1) if salary_cap_match else None

    return inflation_rate, salary_cap


def convert_salary_cap(salary_cap):
    if salary_cap.endswith('M'):
        return float(salary_cap[:-1])
    elif salary_cap.endswith('B'):
        return float(salary_cap[:-1]) * 1000
    else:
        return None


def get_salary_cap():
    inf_df = get_salary_cap_events()
    inf_df['inf_rate'], inf_df['salary_cap'] = zip(*inf_df['text'].apply(extract_values))
    inf_df['salary_cap'] = inf_df['salary_cap'].apply(convert_salary_cap)
    inf_df = inf_df[['season', 'inf_rate', 'salary_cap']]
    ## Add a row for 2023
    inf_df = pd.concat(
        [pd.DataFrame({'season': 2023, 'inf_rate': 0, 'salary_cap': 136}, index=[0]), inf_df]).reset_index(
        drop=True)
    ## Set up dictionary
    return inf_df.set_index('season').to_dict()['salary_cap']

In [4]:
salary_cap = get_salary_cap()

In [5]:
with open('C:/Users/jrnas/Downloads/BBGM_League_1_2220_free_agency.json', encoding='latin') as f:
    r_json = json.load(f)

In [6]:
# Initialize an empty list to store the data
data = []

# Iterate over the list of players
for player in tqdm(r_json['players']):
    # Iterate over the ratings of the current player
    for rating in player['stats']:
        # Create a new dictionary that includes 'pid', 'firstName', 'lastName' and the rating
        row = {
            'pid': player['pid']
        }
        row.update(rating)
        # Append the dictionary to the list
        data.append(row)

# Convert the list of dictionaries to a DataFrame
stats_df = (
    pd.DataFrame(data)
    .convert_dtypes(dtype_backend='pyarrow')
)

stats_df = stats_df[(stats_df.season.between(2024, 2220)) & (stats_df.playoffs == False)].reset_index(drop=True)

100%|██████████| 14724/14724 [00:00<00:00, 27900.24it/s]


In [7]:
# Initialize an empty list to store the data
data = []

# Iterate over the list of players
for player in tqdm(r_json['players']):
    # Iterate over the ratings of the current player
    for rating in player['ratings']:
        # Create a new dictionary that includes 'pid', 'firstName', 'lastName' and the rating
        row = {
            'pid': player['pid'],
            'firstName': player['firstName'],
            'lastName': player['lastName'],
            'born': player['born']['year'],
        }
        row.update(rating)
        # Append the dictionary to the list
        data.append(row)

# Convert the list of dictionaries to a DataFrame
ratings_df = (
    pd.DataFrame(data)
    .convert_dtypes(dtype_backend='pyarrow')
    .astype({'skills': 'string[pyarrow]'})
    .assign(
        age=lambda x: x.season - x.born,
    )
)

ratings_df = ratings_df[ratings_df.season.between(2024, 2220)].reset_index(drop=True)

100%|██████████| 14724/14724 [00:00<00:00, 70834.27it/s]


In [8]:
# Initialize an empty list to store the data
data = []

# Iterate over the list of players
for player in tqdm(r_json['players']):
    # Iterate over the ratings of the current player
    for rating in player['salaries']:
        # Create a new dictionary that includes 'pid', 'firstName', 'lastName' and the rating
        row = {
            'pid': player['pid'],
        }
        row.update(rating)
        # Append the dictionary to the list
        data.append(row)

# Convert the list of dictionaries to a DataFrame
salaries_df = (
    pd.DataFrame(data)
    .convert_dtypes(dtype_backend='pyarrow')
)

salaries_df = salaries_df[salaries_df.season.between(2024, 2220)].reset_index(drop=True)

100%|██████████| 14724/14724 [00:00<00:00, 340502.47it/s]


In [9]:
df = ratings_df.merge(
    stats_df[['pid', 'season', 'tid', 'gp', 'gs', 'min', 'usgp', 'ortg', 'drtg', 'obpm', 'dbpm', 'ows', 'dws', 'vorp',
              'ewa']],
    on=['pid', 'season'], how='left').merge(
    salaries_df[['pid', 'season', 'amount']].rename(columns={'amount': 'salary'}), on=['pid', 'season'], how='left')

In [10]:
df['vorp_norm'] = (df['vorp'] / df['min']) * (32 * 82)

In [11]:
test_df = df[(df['min'] > 10) & (~df['vorp_norm'].isna())].reset_index(drop=True)
test_df['vorp_norm_wt'] = test_df['vorp_norm'] * test_df['min']
agg_df = test_df.groupby('ovr')[['vorp_norm_wt', 'min']].sum().reset_index()
agg_df['vorp_norm'] = agg_df['vorp_norm_wt'] / agg_df['min']

In [12]:
under_over = 56.064

model_df_under = agg_df[agg_df['ovr'] <= under_over].reset_index(drop=True)
poly_under = np.polyfit(model_df_under['ovr'], model_df_under['vorp_norm'], 1)

model_df_over = agg_df[agg_df['ovr'] > under_over].reset_index(drop=True)
poly_over = np.polyfit(model_df_over['ovr'], model_df_over['vorp_norm'], 1)

df['vorp_under'] = np.polyval(poly_under, df['ovr'])
df['vorp_over'] = np.polyval(poly_over, df['ovr'])
df['vorp_pred'] = np.where(
    df['ovr'] <= under_over,
    df['vorp_under'],
    df['vorp_over']
)

In [13]:
df['cvorp'] = df['vorp'].clip(0, )
df['vorp_pct'] = df['vorp_pred'].clip(0, ) / df.groupby('season').cvorp.sum().mean()
df['vorp_pct_cap'] = df['vorp_pct'] * 30

In [14]:
### Growth

In [15]:
df = df.drop_duplicates(['pid', 'season'], keep='first').reset_index(drop=True)

In [16]:
df['ovr+'] = df.groupby('pid')['ovr'].shift(-1) - df['ovr']

In [17]:
from scipy.stats import gaussian_kde
kde_dict = dict()

for age in range(19, 36):
    kde_dict[age] = dict()
    kde_dict[age]['data'] = df[df.age == age]['ovr+'].dropna().values
    kde_dict[age]['kde'] = gaussian_kde(kde_dict[age]['data'])
    
kde_dict[18] = kde_dict[19]

for age in range(36, 60):
    kde_dict[age] = kde_dict[35]

In [18]:
from scipy import signal
import numpy as np

def convolve_distributions(kdes):
    # Generate x values that cover the range of all KDEs
    x = np.linspace(-100, 100, 1000)

    # Initialize the convolved density as the density of the first KDE
    y_convolved = kdes[0](x)

    # Iterate over the rest of the KDEs
    for kde in kdes[1:]:
        # Calculate the density of the current KDE
        y = kde(x)

        # Perform the convolution
        y_convolved = signal.convolve(y_convolved, y, mode='same')

        # Normalize the result
        y_convolved /= np.trapz(y_convolved, x)  # Use trapezoidal rule to approximate the integral

    return x, y_convolved

In [19]:
kde_dict.keys()

dict_keys([19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 18, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59])

In [20]:
# Use the function with two KDEs from your dictionary
x_new, y_new = convolve_distributions([
    kde_dict[19]['kde'], 
    kde_dict[20]['kde'],
    kde_dict[21]['kde'],
    kde_dict[22]['kde'],
])
# Normalize the result
y_new /= np.trapz(y_new, x_new)  # Use trapezoidal rule to approximate the integral

In [21]:
prog_dict = {}
for age in tqdm(range(df.age.min(), df.age.max() + 1)):
    prog_dict[age] = {}
    for years_in_adv in range(1, 10):
        dicts_to_compile = [kde_dict[age]['kde'] for age in range(age, age + years_in_adv)]
        prog_dict[age][years_in_adv] = {}
        prog_dict[age][years_in_adv]['x'], prog_dict[age][years_in_adv]['y'] = convolve_distributions(dicts_to_compile)

 11%|█         | 3/28 [00:21<03:02,  7.30s/it]


KeyboardInterrupt: 

In [None]:
prog_df_list = []
for age in range(df.age.min(), df.age.max() + 1):
    ## Create a df for each age, a column with values ranging from -100 to 100 and a column with the density for each years_in_adv
    prog_df = pd.DataFrame()
    for years_in_adv in range(1, 10):
        temp_df = pd.DataFrame({'x': prog_dict[age][years_in_adv]['x'], f'y_{years_in_adv}': prog_dict[age][years_in_adv]['y']})
        if years_in_adv > 1:
            temp_df = temp_df.drop('x', axis=1)
        prog_df = pd.concat([prog_df, temp_df], axis=1)
    prog_df_list.append(prog_df.assign(age=age))
prog_df = pd.concat(prog_df_list, axis=0).reset_index(drop=True)
prog_df.to_parquet('../constants/progression.parquet')

In [22]:
test_df = pd.read_parquet('../constants/progression.parquet')

In [24]:
np.dot(
    test_df[test_df.age == 26]['x'].values,
    test_df[test_df.age == 26]['y_1'].values
) / np.sum(test_df[test_df.age == 26]['y_1'].values)

-0.6694764862466687

In [1]:
import polars as pl
import pandas as pd

In [4]:
pdf = pd.read_parquet('../constants/progression.parquet')

In [2]:
df = pl.read_parquet('../constants/progression.parquet')

In [5]:
df

x,y_1,y_2,y_3,y_4,y_5,y_6,y_7,y_8,y_9,age
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64
-100.0,0.0,0.0,0.0,0.0,0.0,8.2658e-232,1.0921e-142,1.5024e-84,1.6019e-52,18
-99.7998,0.0,0.0,0.0,0.0,0.0,2.4190e-230,1.1467e-141,7.1024e-84,3.9753e-52,18
-99.5996,0.0,0.0,0.0,0.0,0.0,6.9950e-229,1.1914e-140,3.3244e-83,9.7836e-52,18
-99.399399,0.0,0.0,0.0,0.0,0.0,1.9986e-227,1.2247e-139,1.5407e-82,2.3881e-51,18
-99.199199,0.0,0.0,0.0,0.0,0.0,5.6425e-226,1.2458e-138,7.0700e-82,5.7811e-51,18
…,…,…,…,…,…,…,…,…,…,…
99.199199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45
99.399399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45
99.5996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45
99.7998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45


In [21]:
(
    df
    .with_columns(y1_exp=pl.col('x') * pl.col('y_1'))
    .group_by('age')
    .agg(pl.sum('y1_exp','y_1'))
    .select(
        'age',
        (pl.col('y1_exp') / pl.col('y_1')).alias('exp')
    )
)

age,exp
i64,f64
22,2.764452
36,-5.196691
38,-5.196691
42,-5.196691
32,-3.701707
…,…
26,-0.669476
45,-5.196691
37,-5.196691
25,-0.320724


In [25]:
(
    df
    .with_columns(y1_exp=pl.col('x') * pl.col('y_1'))
    .group_by('age')
    .agg(pl.sum('y1_exp','y_1'))
    .select(
        'age',
        (pl.col('y1_exp') / pl.col('y_1')).alias('exp')
    )
    .sort('age')
    .to_pandas()
)

Unnamed: 0,age,exp
0,18,4.160076
1,19,4.160076
2,20,4.527958
3,21,2.904228
4,22,2.764452
5,23,0.98246
6,24,0.952395
7,25,-0.320724
8,26,-0.669476
9,27,-1.723478
