### Load libraries

In [None]:
import pandas as pd
import lightgbm as lgb
import numpy as np

### Define some helpers

In [None]:
def get_stats(all_stats, year_change):
    """Return lagged stat columns keyed by the id of a later season.

    A copy of ``all_stats`` has its ``year`` shifted forward by
    ``year_change`` and its ``id`` rebuilt as ``player_url___year``, so a
    join on ``id`` attaches stats from ``year_change`` seasons earlier.
    The input frame is not modified.

    Parameters
    ----------
    all_stats : DataFrame
        Must contain ``player_url``, ``year`` and every stat column listed
        in the body.
    year_change : int
        Number of seasons to shift by; also used as the column suffix.

    Returns
    -------
    DataFrame
        ``id`` plus each stat column renamed to ``<col>___<year_change>``.
    """
    stat_cols = [
        'mp', 'g', 'bpm', 'ts_pct', 'per', 'usg_pct', 'obpm', 'dbpm',
        'fg3a_per_fga_pct', 'fta_per_fga_pct', 'orb_pct', 'drb_pct',
        'trb_pct', 'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct', 'ws', 'ows', 'dws',
        'win_loss_pct', 'srs', 'pace_rel', 'off_rtg_rel', 'def_rtg_rel', 'made_playoffs',
    ]
    # Work on a copy so the caller's frame keeps its original year / id.
    shifted = all_stats.copy()
    shifted['year'] = shifted['year'] + year_change
    shifted['id'] = shifted['player_url'] + '___' + shifted['year'].astype(str)

    suffixed = {name: '{}___{}'.format(name, year_change) for name in stat_cols}
    lagged = shifted[['id'] + stat_cols]
    return lagged.rename(columns=suffixed)

In [None]:
def merge_dfs(target_df, *args):
    """Left-join each frame in ``args`` onto ``target_df`` using ``id``.

    Frames are merged in the order given. Before each merge the incoming
    frame must share exactly one column with the accumulated result, and
    that column must be ``id``; this guards against silently suffixed
    duplicate columns.

    Parameters
    ----------
    target_df : DataFrame
        Left-hand frame; its rows are preserved by the left joins.
    *args : DataFrame
        Frames to attach, each containing an ``id`` column.

    Returns
    -------
    DataFrame
        The merged frame (``target_df`` itself when ``args`` is empty).

    Raises
    ------
    ValueError
        If a frame shares no ``id`` column, or shares any other column,
        with the result built so far.
    """
    res = target_df
    for position, frame in enumerate(args):
        shared = set(res.columns).intersection(frame.columns)
        if shared != {'id'}:
            # Say which frame and which columns clash so the caller can fix it.
            raise ValueError(
                "frame at position {} must share exactly the 'id' column with "
                "the merged result; shared columns: {}".format(
                    position, sorted(map(str, shared))
                )
            )
        res = res.merge(frame, how='left', on='id')
    return res

### Load data

In [None]:
# Load the raw stats table from a msgpack file.
# NOTE(review): pd.read_msgpack was deprecated in pandas 0.25 and removed in
# pandas 1.0, so this cell needs an older pandas — confirm the pinned version.
all_stats = pd.read_msgpack('all_stats.mp')

In [None]:
# Cast the 'g' column to float (presumably it is loaded with a non-float
# dtype — TODO confirm).
all_stats['g'] = all_stats['g'].astype(float)

### Define a target df

In [None]:
# Base frame that the feature frames are later left-joined onto via merge_dfs:
# player/year keys and 'id', plus the ws, bpm and mp columns.
target = all_stats[['player_url', 'year', 'ws', 'id', 'bpm', 'mp']]

### Create features that are known ahead of that year (age, draft pick, position, missing-college flag)

In [None]:
# Column-wise concatenation of three pieces that share all_stats' index:
#   * id, age and overall draft pick, taken as-is
#   * one indicator column per distinct value of 'pos'
#   * 'college_name' turned into a boolean "is missing" flag (the column keeps
#     its original name)
X_df1 = pd.concat(
    [
        all_stats[['id', 'age', 'pick_overall']],
        pd.get_dummies(all_stats['pos']),
        all_stats[['college_name']].isnull(),
    ],
    axis=1,
)

### Create feature for years in league

In [None]:
# years_pro = seasons elapsed since the earliest 'year' seen for the player.
# The per-player minimum is joined back on 'player_url' (the only shared
# column), then only id / years_pro are kept.
year_in_league = (
    target
    .merge(target.groupby('player_url')['year'].min().rename('min_year').reset_index())
    .assign(years_pro=lambda df: df['year'] - df['min_year'])
    [['id', 'years_pro']]
)

### Same team

In [None]:
# Prior-season snapshot: shift each row forward one year and rebuild 'id' from
# player_url and the shifted year, so that joining on 'id' pairs a season with
# the same player's previous season (assumes all_stats['id'] uses the same
# player_url___year format — TODO confirm).
# .copy() makes this an independent frame; without it the in-place edits below
# are chained assignment on a selection of all_stats (SettingWithCopyWarning).
_x = all_stats[['player_url', 'started_team', 'year', 'mp', 'age', 'team_id']].copy()
_x['year'] += 1
_x['id'] = _x['player_url'] + '___' + _x['year'].astype(str)

In [None]:
# Attach the prior-season snapshot to every current row; columns present on
# both sides keep their name on the left and get an '_old' suffix on the right.
_x = all_stats.merge(_x, how='left', on='id', suffixes=('', '_old'))
# 1 when the starting team matches last season's, else 0 (a missing prior
# season compares unequal and therefore yields 0).
_x['same_team'] = _x['started_team'].eq(_x['started_team_old']).astype(int)

In [None]:
# Prior-season minutes, zeroed out for players whose same_team flag is 0.
_x['old_mp_same'] = _x['mp_old'] * _x['same_team']

In [None]:
# Per (year, current starting team): fraction of the roster's prior-season
# minutes that were logged by players who were on the same team last season.
consistency = (
    _x.groupby(['year', 'started_team'])[['old_mp_same', 'mp_old']].sum()
    .pipe(lambda sums: sums['old_mp_same'] / sums['mp_old'])
    .to_frame('consistency')
    .reset_index()
)

In [None]:
# Share of prior-season minutes that stayed put, measured per *previous* team.
# Grouping by 'started_team_old' (rather than 'started_team') is what makes
# this differ from `consistency`; it also produces the 'started_team_old'
# column that the later merge renames to 'started_team'. Grouping by
# 'started_team' here would just duplicate `consistency`.
consistency1 = (_x.groupby(['year', 'started_team_old'])['old_mp_same'].sum() / _x.groupby(['year', 'started_team_old'])['mp_old'].sum()).to_frame('consistency1').reset_index()

In [None]:
# Broadcast the team-level 'consistency' value back onto every player row.
_x = _x.merge(consistency, how='left', on=['year', 'started_team'])

In [None]:
# Broadcast 'consistency1' onto every player row, matching on the current
# (year, started_team). The rename only has an effect when consistency1
# carries a 'started_team_old' column; otherwise it is a no-op.
_x = _x.merge(consistency1.rename(columns={'started_team_old': 'started_team'}), how='left', on=['year', 'started_team'])

In [None]:
# Mean prior-season age ('age_old') of each (year, started_team) roster,
# attached to every player row as 'team_age_old'.
_x = _x.merge(
    _x.groupby(['year', 'started_team'])['age_old'].mean().rename('team_age_old').reset_index(),
    how='left',
    on=['year', 'started_team'],
)

In [None]:
# Flag rows whose prior-season team_id is 'TOT' — presumably the combined row
# for a player who changed teams mid-season; TODO confirm against the data.
_x['traded'] = _x['team_id_old'] == 'TOT'

In [None]:
# For rows flagged as traded, blank out the three team-continuity features
# instead of keeping the values computed above.
_x.loc[_x['traded'], ['same_team', 'consistency', 'consistency1']] = np.nan

In [None]:
# Team-context features keyed by 'id', ready to be merged onto the target.
team_df = _x[['id', 'same_team', 'team_age_old', 'consistency', 'consistency1', 'traded']]

### Merge the above dfs (on `id`) with stats from the previous 3 years

In [None]:
# Left-join every feature frame onto the target by 'id': the known-in-advance
# features, the team-context features, years in the league, and the stat
# columns lagged by one, two and three seasons.
all_df = merge_dfs(
    target,
    X_df1,
    team_df,
    year_in_league,
    *(get_stats(all_stats, lag) for lag in (1, 2, 3))
)
# Keep rows that have a bpm value, plus every row for 2020 regardless
# (presumably the season to be predicted — TODO confirm).
all_df = all_df.loc[all_df['bpm'].notnull() | all_df['year'].eq(2020)]

### Create some new features/interactions

In [None]:
# True when minutes over the previous two seasons total more than 1000;
# a missing season counts as 0 minutes.
all_df['lot_of_min'] = (all_df['mp___1'].fillna(0) + all_df['mp___2'].fillna(0)) > 1000

In [None]:
# Season-over-season changes between lag 1 and lag 2 (NaN if either is missing).
all_df['ws_diff'] = all_df['ws___1'] - all_df['ws___2']
all_df['bpm_diff'] = all_df['bpm___1'] - all_df['bpm___2']
all_df['mp_diff'] = all_df['mp___1'] - all_df['mp___2']
# Product of the two most recent bpm values.
all_df['bpm_int'] = all_df['bpm___1'] * all_df['bpm___2']
# Change in bpm one step further back (lag 2 minus lag 3).
all_df['bpm_diff_2'] = all_df['bpm___2'] - all_df['bpm___3']
# Interaction of the most recent bpm with age.
all_df['bpm_age_int'] = all_df['bpm___1'] * all_df['age']

### Create (new?) target, change in BPM

In [None]:
# Target candidate: this season's bpm minus the previous season's bpm
# (NaN when either value is missing).
all_df['diff'] = all_df['bpm'] - all_df['bpm___1']

In [None]:
# Persist the feature frame.
# NOTE(review): DataFrame.to_msgpack was deprecated in pandas 0.25 and removed
# in pandas 1.0 — confirm the pinned pandas version.
all_df.to_msgpack('feature_df.mp')

In [None]:
# Display the latest year present in the feature frame.
all_df['year'].max()

#### New feature: last season's minutes on each team's roster, split into returning and new players

In [None]:
# Reload the feature frame written above and display its shape.
# NOTE(review): pd.read_msgpack was removed in pandas 1.0 — confirm the
# pinned pandas version.
all_df = pd.read_msgpack("feature_df.mp")
all_df.shape

In [None]:
# Player rows with last season's minutes/games and the same_team flag attached
# by 'id'; rows whose starting team is "TOT" and rows from 1999 are excluded.
team_mp = (
    all_stats[['id', 'team_id', 'year', 'player_url', 'mp', 'bpm', 'started_team', 'pos']]
    .merge(all_df[["id", "mp___1", "g___1", "same_team"]], how="left", on="id")
    .loc[lambda df: (df['started_team'] != "TOT") & (df['year'] != 1999)]
)

# Minutes per game in the previous season.
team_mp['g___1'] = team_mp['g___1'].astype(float)
team_mp['mpg___1'] = team_mp['mp___1'].div(team_mp['g___1'])

In [None]:
def agg_mp___1(cols, frame=None):
    """Sum previous-season minutes (``mp___1``) within each group.

    Parameters
    ----------
    cols : list of str
        Column names to group by.
    frame : DataFrame, optional
        Frame to aggregate; must contain ``cols`` and ``mp___1``. Defaults
        to the module-level ``team_mp`` so existing one-argument calls
        behave as before.

    Returns
    -------
    Series
        ``mp___1`` totals indexed by the grouping columns.
    """
    # Resolve the global lazily so the default keeps tracking the current
    # value of ``team_mp`` at call time.
    source = team_mp if frame is None else frame
    return source.groupby(cols)['mp___1'].sum()

In [None]:
# Total previous-season minutes per (team, year) roster ...
mp_last = agg_mp___1(['started_team', 'year'])
# ... and the same total split into one column per same_team value.
# Rows with a missing same_team are left out of this split, since pandas
# groupby drops NaN keys by default.
mp_last_new_old = agg_mp___1(['started_team', 'year', 'same_team']).unstack('same_team')

# Same two aggregates, additionally broken down by position.
pos_mp_last = agg_mp___1(['started_team', 'year', 'pos'])
pos_mp_last_new_old = agg_mp___1(['started_team', 'year', 'pos', 'same_team']).unstack('same_team')

In [None]:
# Display whether either frame has repeated ids; duplicates in all_stats would
# multiply rows in the id-merge that follows.
all_stats["id"].duplicated().any(), all_df["id"].duplicated().any()

In [None]:
# Attach the team-minutes aggregates to the feature frame. Every step is a
# left join, so all_df keeps its rows as long as the right-hand keys are unique.
all_df = (
    all_df
    # Bring in the join keys (starting team and position) by id.
    .merge(
        all_stats[["id", "started_team", "pos"]], on=["id"], how="left"
    )
    # Team-level total of previous-season minutes.
    .merge(
        mp_last.to_frame("team_mp___1").reset_index(), on=["started_team", "year"], how="left"
    )
    # Team-level total split by same_team: column labels below 0.5 become
    # "..._new", the others "..._returning".
    .merge(
        mp_last_new_old
        .rename(columns=lambda x: "team_mp___1_{}".format("new" if x < 0.5 else "returning"))
        .reset_index(),
        on=["started_team", "year"],
        how="left"
    )
    # Position-within-team total of previous-season minutes.
    .merge(
        pos_mp_last.to_frame("pos_team_mp___1").reset_index(), 
        on=["pos", "started_team", "year"], 
        how="left"
    )
    # Position-within-team total split by same_team, labelled as above.
    .merge(
        pos_mp_last_new_old
        .rename(columns=lambda x: "pos_team_mp___1_{}".format("new" if x < 0.5 else "returning"))
        .reset_index(),
        on=["pos", "started_team", "year"],
        how="left"
    )
)

In [None]:
# Display the shape after the merges (the row count can be compared with the
# shape shown right after reloading).
all_df.shape

In [None]:
# Persist the feature frame including the team-minutes columns.
# NOTE(review): DataFrame.to_msgpack was removed in pandas 1.0 — confirm the
# pinned pandas version.
all_df.to_msgpack('feature_df___w_mp.mp')