### Load libraries

In [253]:
import pandas as pd
import lightgbm as lgb
import numpy as np

### Define some helpers

In [254]:
def get_stats(all_stats, year_change):
    all_prior_stats = all_stats.copy()
    all_prior_stats['year'] += year_change
    all_prior_stats['id'] = all_prior_stats['player_url'] + '___' + all_prior_stats['year'].astype(str)
    cols = ['mp', 'bpm', 'ts_pct', 'per', 'usg_pct', 'obpm', 'dbpm',
            'fg3a_per_fga_pct', 'fta_per_fga_pct', 'orb_pct', 'drb_pct',
             'trb_pct', 'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct', 'ws', 'ows', 'dws',
           'win_loss_pct', 'srs', 'pace_rel', 'off_rtg_rel', 'def_rtg_rel', 'made_playoffs']
    return all_prior_stats[['id'] + cols].rename(columns={col: '{}___{}'.format(col, year_change) for col in cols})

In [255]:
def merge_dfs(target_df, *args):
    res = target_df
    for arg in args:
        cols = arg.columns
        intersection = set(res.columns).intersection(cols)
        if intersection != {'id'}:
            raise ValueError
        else:
            res = res.merge(arg, how='left', on='id')
    return res

### Load data

In [256]:
all_stats = pd.read_msgpack('all_stats.mp')

### Define a target df

In [257]:
target = all_stats[['player_url', 'year', 'ws', 'id', 'bpm', 'mp']]

### Create some features that we will know for that year (age, position)

In [258]:
X_df1 = pd.concat([all_stats[['id', 'age', 'pick_overall']], pd.get_dummies(all_stats['pos']), all_stats[['college_name']].isnull()], axis=1)

### Create feature for years in league

In [259]:
year_in_league = target.merge(target.groupby('player_url')['year'].min().to_frame('min_year').reset_index())
year_in_league['years_pro'] = year_in_league['year'] - year_in_league['min_year']
year_in_league = year_in_league[['id', 'years_pro']]

### Same team

In [260]:
_x = all_stats[['player_url', 'started_team', 'year', 'mp', 'age', 'team_id']]
_x['year'] += 1
_x['id'] = _x['player_url'] + '___' + _x['year'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [261]:
_x = all_stats.merge(_x, on='id', suffixes=('', '_old'), how='left')
_x['same_team'] = (_x['started_team'] == _x['started_team_old']).astype(int)

In [262]:
_x['old_mp_same'] = _x['mp_old'] * _x['same_team']

In [263]:
consistency = (_x.groupby(['year', 'started_team'])['old_mp_same'].sum() / _x.groupby(['year', 'started_team'])['mp_old'].sum()).to_frame('consistency').reset_index()

In [264]:
consistency1 = (_x.groupby(['year', 'started_team'])['old_mp_same'].sum() / _x.groupby(['year', 'started_team'])['mp_old'].sum()).to_frame('consistency1').reset_index()

In [265]:
_x = _x.merge(consistency, how='left', on=['year', 'started_team'])

In [266]:
_x = _x.merge(consistency1.rename(columns={'started_team_old': 'started_team'}), how='left', on=['year', 'started_team'])

In [267]:
_x = _x.merge(_x.groupby(['year', 'started_team'])[['age_old']].mean().reset_index().rename(columns={'age_old': 'team_age_old'}), how='left', on=['year', 'started_team'])

In [268]:
_x['traded'] = _x['team_id_old'] == 'TOT'

In [269]:
_x.loc[_x['traded'], 'same_team'] = np.nan
_x.loc[_x['traded'], 'consistency'] = np.nan
_x.loc[_x['traded'], 'consistency1'] = np.nan

In [270]:
team_df = _x[['id', 'same_team', 'team_age_old', 'consistency', 'consistency1', 'traded', 'year_coaching']]

### Concat above dfs with stats from previous 3 years

In [271]:
all_df = merge_dfs(
    target, 
    X_df1, 
    team_df,
    year_in_league,
    get_stats(all_stats, 1),
    get_stats(all_stats, 2),
    get_stats(all_stats, 3),
)
all_df = all_df[all_df['bpm'].notnull() | (all_df['year'] == 2020)]

### Create some new features/interactions

In [272]:
all_df['lot_of_min'] = (all_df['mp___1'].fillna(0) + all_df['mp___2'].fillna(0)) > 1000

In [273]:
all_df['ws_diff'] = all_df['ws___1'] - all_df['ws___2']
all_df['bpm_diff'] = all_df['bpm___1'] - all_df['bpm___2']
all_df['mp_diff'] = all_df['mp___1'] - all_df['mp___2']
all_df['bpm_int'] = all_df['bpm___1'] * all_df['bpm___2']
all_df['bpm_diff_2'] = all_df['bpm___2'] - all_df['bpm___3']
all_df['bpm_age_int'] = all_df['bpm___1'] * all_df['age']

### Create (new?) target, change in BPM

In [274]:
all_df['diff'] = all_df['bpm'] - all_df['bpm___1']

In [275]:
all_df.to_msgpack('feature_df.mp')

In [276]:
all_df['year'].max()

2020