### Load libraries

In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd

### Define some helpers

In [None]:
def get_stats(all_stats, year_change):
    """Return the stat columns of ``all_stats`` re-keyed ``year_change`` years forward.

    A copy of ``all_stats`` has ``year_change`` added to its ``year`` column and
    its ``id`` rebuilt as ``player_url + '___' + year``, so each row's ``id``
    refers to the season ``year_change`` years after the one the stats belong to.

    Args:
        all_stats: DataFrame with ``player_url``, ``year`` and every stat
            column listed below. It is not modified.
        year_change: Offset added to ``year``; also used as the column suffix.

    Returns:
        DataFrame with ``id`` followed by the stat columns, each renamed to
        ``<stat>___<year_change>``.
    """
    stat_names = [
        'mp', 'bpm', 'ts_pct', 'per', 'usg_pct', 'obpm', 'dbpm',
        'fg3a_per_fga_pct', 'fta_per_fga_pct', 'orb_pct', 'drb_pct',
        'trb_pct', 'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct',
        'ws', 'ows', 'dws',
    ]

    # Work on a copy so the caller's frame keeps its original years and ids.
    shifted = all_stats.copy()
    shifted['year'] += year_change
    shifted['id'] = shifted['player_url'] + '___' + shifted['year'].astype(str)

    # Suffix every stat with the offset so several lags can sit side by side.
    suffixed = {}
    for stat in stat_names:
        suffixed[stat] = '{}___{}'.format(stat, year_change)

    selected = shifted[['id'] + stat_names]
    return selected.rename(columns=suffixed)

In [None]:
def merge_dfs(target_df, *args):
    """Left-merge every frame in ``args`` onto ``target_df`` using the ``id`` column.

    Frames are merged in the order given. Before each merge, the columns shared
    between the result so far and the incoming frame must be exactly ``{'id'}``,
    which prevents pandas from silently creating ``_x`` / ``_y`` suffixed
    duplicates.

    Args:
        target_df: DataFrame whose rows are kept (left side of every merge).
        *args: DataFrames, each with an ``id`` column and otherwise new columns.

    Returns:
        The merged DataFrame, or ``target_df`` itself when ``args`` is empty.

    Raises:
        ValueError: if a frame shares any column other than ``id`` with the
            result built so far, or does not share ``id`` at all.
    """
    res = target_df
    for position, arg in enumerate(args):
        shared = set(res.columns).intersection(arg.columns)
        if shared != {'id'}:
            # Name the offending frame and columns so the clash can be diagnosed.
            raise ValueError(
                "frame #{} must share exactly the 'id' column with the merged "
                "result, but the shared columns are {}".format(
                    position, sorted(map(str, shared))
                )
            )
        res = res.merge(arg, how='left', on='id')
    return res

### Load data

# Load the stats table (presumably one row per player and season) from a
# msgpack file in the working directory.
# NOTE(review): pandas deprecated read_msgpack in 0.25 and removed it in 1.0,
# so this line only runs on pandas < 1.0. Consider re-exporting the data to a
# supported format (e.g. parquet) - confirm with the data owner.
all_stats = pd.read_msgpack('all_stats.mp')

### Define a target df

In [None]:
# Key columns plus the same-season outcomes (ws, bpm, mp) that the later cells
# use as the prediction target and for sample weights.
target = all_stats.loc[
    :,
    ['player_url', 'year', 'ws', 'id', 'bpm', 'mp'],
]

### Create some features that we will know for that year (age, position)

In [None]:
# Same-season features: the id key, the player's age, and one indicator column
# per distinct value of `pos`, placed side by side.
X_df1 = pd.concat(
    objs=[
        all_stats[['id', 'age']],
        pd.get_dummies(all_stats['pos']),
    ],
    axis='columns',
)

### Create feature for years in league

In [None]:
# years_pro = season year minus the earliest year this player_url appears in
# the data (0 for that first season).
year_in_league = (
    target
    .merge(
        target.groupby('player_url')['year'].min().rename('min_year').reset_index(),
        on='player_url',
    )
    .assign(years_pro=lambda merged: merged['year'] - merged['min_year'])
    [['id', 'years_pro']]
)

### Merge the above dfs with stats from the previous 3 years

In [None]:
# Left-merge the same-season features, years_pro, and the stat frames lagged by
# 1, 2 and 3 years onto the target rows, all keyed on `id`.
all_df = merge_dfs(
    target,
    X_df1,
    year_in_league,
    *[get_stats(all_stats, years_back) for years_back in (1, 2, 3)]
)
# Rows with no BPM for the season itself are dropped.
all_df = all_df.dropna(subset=['bpm'])

### Create some new features/interactions

In [None]:
# Flag rows whose minutes over the previous two seasons total more than 1000;
# a missing season counts as 0 minutes.
all_df['lot_of_min'] = (
    all_df['mp___1'].fillna(0)
    .add(all_df['mp___2'].fillna(0))
    .gt(1000)
)

In [None]:
# Year-over-year changes between the 1- and 2-year lags.
all_df['ws_diff'] = all_df['ws___1'].sub(all_df['ws___2'])
all_df['bpm_diff'] = all_df['bpm___1'].sub(all_df['bpm___2'])
all_df['mp_diff'] = all_df['mp___1'].sub(all_df['mp___2'])
# Product of the two most recent lagged BPM values.
all_df['bpm_int'] = all_df['bpm___1'].mul(all_df['bpm___2'])
# Change in BPM one step further back (2-year lag minus 3-year lag).
all_df['bpm_diff_2'] = all_df['bpm___2'].sub(all_df['bpm___3'])
# Interaction of the most recent lagged BPM with age.
all_df['bpm_age_int'] = all_df['bpm___1'].mul(all_df['age'])

### Create the target: change in BPM from the previous season

In [None]:
# Season BPM minus the 1-year-lagged BPM; NaN when either value is missing.
all_df['diff'] = all_df['bpm'].sub(all_df['bpm___1'])

### Subset train df

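Keep only seasons within the chosen year range, from players who are not in their first season, where the change in BPM is not null (the last condition is only needed when predicting the delta).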

In [None]:
# Training rows: year strictly between 1970 and 2019, at least one year after
# the player's first recorded season, and a defined BPM change.
train_df = all_df[
    all_df['year'].gt(1970)
    & all_df['year'].lt(2019)
    & all_df['years_pro'].gt(0)
    & all_df['diff'].notna()
]

### Some LGB params

In [None]:

# Parameters passed to both lgb.cv and LGBMRegressor.
# Commented-out candidates kept for reference (currently left at defaults):
#   num_leaves=20, feature_fraction=0.6, bagging_fraction=0.6,
#   bagging_freq=1 or 5, colsample_bytree=.4, min_data_in_leaf=2,
#   reg_alpha=1, reg_lambda=1, max_depth=4
lgb_params = dict(
    boosting_type='gbdt',
    metric=['rmse'],
    learning_rate=0.05,
    verbose=0,
)

### Define X, y, drop any lookahead features

In [None]:
# Target: change in BPM relative to the previous season.
y = train_df['diff']
# Identifiers and same-season values (including the target itself) must not
# reach the model as features.
drop_cols = ['id', 'ws', 'bpm', 'player_url', 'year', 'mp', 'diff']
# Use `columns=` rather than a positional axis: the positional form was
# deprecated in pandas 1.3 and removed in pandas 2.0.
X = train_df.drop(columns=drop_cols)
X_all = all_df.drop(columns=drop_cols)

### When training, care more about players with more minutes

The more minutes a player logged, the more reliable their measured BPM is as an estimate of their true level.

In [None]:
# Sample weight grows with the log of minutes played that season. The +1 keeps
# zero-minute rows finite (their weight is log10(1) = 0).
# For uniform weights instead, use `weights * 0 + 1`.
# Requires numpy to be imported as `np` with the other libraries.
weights = np.log10(train_df['mp'] + 1)

### CV for num rounds

In [None]:
# Wrap features, target and per-row weights for LightGBM's native API.
lgb_data = lgb.Dataset(X, y, weight=weights)
# 5-fold, non-stratified CV with up to 10000 rounds, stopping once the metric
# has not improved for 10 rounds.
# NOTE(review): LightGBM 4.0 removed the `early_stopping_rounds` argument of
# lgb.cv in favour of `callbacks=[lgb.early_stopping(10)]` - confirm the
# installed version before upgrading.
out = lgb.cv(lgb_params, lgb_data, num_boost_round=10000, nfold=5,
             early_stopping_rounds=10,stratified=False)
# Last recorded mean RMSE across folds, and how many rounds were recorded.
# NOTE(review): newer LightGBM releases appear to prefix this key
# ('valid rmse-mean') - verify against the installed version.
cv_loss = out['rmse-mean'][-1]
cv_num_rounds = len(out['rmse-mean'])

# Bare expression so the notebook displays both values.
cv_loss, cv_num_rounds

### Fit model

In [None]:
# Final model: same parameters as the CV run, with the number of trees fixed to
# the round count found by CV, fitted on all training rows with the same weights.
bst = lgb.LGBMRegressor(
    **lgb_params,
    n_estimators=cv_num_rounds
)
bst.fit(X, y, sample_weight=weights)

### Predict and transform back to BPM prediction

In [None]:
# Predicted BPM change for every row of all_df, not only the training subset.
preds = bst.predict(X_all)

In [None]:
# Add the predicted change back onto last season's BPM to get a predicted BPM;
# the result is NaN wherever the 1-year-lagged BPM is missing.
pred_df = all_df.assign(pred=preds + all_df['bpm___1'])

### Look at predictions for recent year

In [None]:
# Rows with year strictly between 2018 and 2020, ordered by predicted BPM
# (ascending), then restricted to players past their first recorded season.
recent = (
    pred_df[pred_df['year'].gt(2018) & pred_df['year'].lt(2020)]
    .sort_values('pred')
    .loc[lambda ranked: ranked['years_pro'] > 0]
)

### Look into data!!