### Load libraries

In [1]:
import pandas as pd
import lightgbm as lgb
import numpy as np

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


### Define some helpers

In [2]:
def get_stats(all_stats, year_change):
    all_prior_stats = all_stats.copy()
    all_prior_stats['year'] += year_change
    all_prior_stats['id'] = all_prior_stats['player_url'] + '___' + all_prior_stats['year'].astype(str)
    cols = ['mp', 'g', 'bpm', 'ts_pct', 'per', 'usg_pct', 'obpm', 'dbpm',
            'fg3a_per_fga_pct', 'fta_per_fga_pct', 'orb_pct', 'drb_pct',
             'trb_pct', 'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct', 'ws', 'ows', 'dws',
           'win_loss_pct', 'srs', 'pace_rel', 'off_rtg_rel', 'def_rtg_rel', 'made_playoffs',
           'playoff_mp', 'playoff_bpm']
    return all_prior_stats[['id'] + cols].rename(columns={col: '{}___{}'.format(col, year_change) for col in cols})

In [3]:
def merge_dfs(target_df, *args):
    res = target_df
    for arg in args:
        cols = arg.columns
        intersection = set(res.columns).intersection(cols)
        if intersection != {'id'}:
            raise ValueError
        else:
            res = res.merge(arg, how='left', on='id')
    return res

### Load data

In [4]:
all_stats = pd.read_msgpack('all_stats.mp')

In [5]:
all_stats['g'] = all_stats['g'].astype(float)

### Define a target df

In [6]:
target = all_stats[['player_url', 'year', 'ws', 'id', 'bpm', 'mp']]

### Create some features that we will know for that year (age, position)

In [7]:
X_df1 = pd.concat([all_stats[['id', 'age', 'pick_overall']], pd.get_dummies(all_stats['pos']), all_stats[['college_name']].isnull()], axis=1)

### Create feature for years in league

In [8]:
year_in_league = target.merge(target.groupby('player_url')['year'].min().to_frame('min_year').reset_index())
year_in_league['years_pro'] = year_in_league['year'] - year_in_league['min_year']
year_in_league = year_in_league[['id', 'years_pro']]

### Same team

In [9]:
_x = all_stats[['player_url', 'started_team', 'year', 'mp', 'age', 'team_id']]
_x['year'] += 1
_x['id'] = _x['player_url'] + '___' + _x['year'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
_x = all_stats.merge(_x, on='id', suffixes=('', '_old'), how='left')
_x['same_team'] = (_x['started_team'] == _x['started_team_old']).astype(int)

In [11]:
_x['old_mp_same'] = _x['mp_old'] * _x['same_team']

In [12]:
consistency = (_x.groupby(['year', 'started_team'])['old_mp_same'].sum() / _x.groupby(['year', 'started_team'])['mp_old'].sum()).to_frame('consistency').reset_index()

In [13]:
consistency1 = (_x.groupby(['year', 'started_team'])['old_mp_same'].sum() / _x.groupby(['year', 'started_team'])['mp_old'].sum()).to_frame('consistency1').reset_index()

In [14]:
_x = _x.merge(consistency, how='left', on=['year', 'started_team'])

In [15]:
_x = _x.merge(consistency1.rename(columns={'started_team_old': 'started_team'}), how='left', on=['year', 'started_team'])

In [16]:
_x = _x.merge(_x.groupby(['year', 'started_team'])[['age_old']].mean().reset_index().rename(columns={'age_old': 'team_age_old'}), how='left', on=['year', 'started_team'])

In [17]:
_x['traded'] = _x['team_id_old'] == 'TOT'

In [18]:
_x.loc[_x['traded'], 'same_team'] = np.nan
_x.loc[_x['traded'], 'consistency'] = np.nan
_x.loc[_x['traded'], 'consistency1'] = np.nan

In [19]:
team_df = _x[['id', 'same_team', 'team_age_old', 'consistency', 'consistency1', 'traded', 'year_coaching']]

### Concat above dfs with stats from previous 3 years

In [20]:
all_df = merge_dfs(
    target, 
    X_df1, 
    team_df,
    year_in_league,
    get_stats(all_stats, 1),
    get_stats(all_stats, 2),
    get_stats(all_stats, 3),
)
all_df = all_df[all_df['bpm'].notnull() | (all_df['year'] == 2020)]

### Create some new features/interactions

In [21]:
all_df['lot_of_min'] = (all_df['mp___1'].fillna(0) + all_df['mp___2'].fillna(0)) > 1000

In [22]:
all_df['ws_diff'] = all_df['ws___1'] - all_df['ws___2']
all_df['bpm_diff'] = all_df['bpm___1'] - all_df['bpm___2']
all_df['mp_diff'] = all_df['mp___1'] - all_df['mp___2']
all_df['bpm_int'] = all_df['bpm___1'] * all_df['bpm___2']
all_df['bpm_diff_2'] = all_df['bpm___2'] - all_df['bpm___3']
all_df['bpm_age_int'] = all_df['bpm___1'] * all_df['age']

### Create (new?) target, change in BPM

In [23]:
all_df['diff'] = all_df['bpm'] - all_df['bpm___1']

In [24]:
all_df.to_msgpack('feature_df.mp')

In [25]:
all_df['year'].max()

2020

#### New feature - minutes played last season that are coming back

In [26]:
all_df = pd.read_msgpack("feature_df.mp")
all_df.shape

(18557, 115)

In [27]:
team_mp = (
    all_stats[['id', 'team_id', 'year', 'player_url', 'mp', 'bpm', 'started_team', 'pos']]
    .merge(all_df[["id", "mp___1", "g___1", "same_team"]], on=["id"], how="left")
    [lambda x: x['started_team'] != "TOT"]
    [lambda x: x['year'] != 1999]
)

team_mp['g___1'] = team_mp['g___1'].astype(float)
team_mp['mpg___1'] = team_mp['mp___1'] / team_mp['g___1']

In [28]:
def agg_mp___1(cols):
    return (
        team_mp
        .groupby(cols)['mp___1'].sum()
    )

In [29]:
mp_last = agg_mp___1(['started_team', 'year'])
mp_last_new_old = agg_mp___1(['started_team', 'year', 'same_team']).unstack('same_team')

pos_mp_last = agg_mp___1(['started_team', 'year', 'pos'])
pos_mp_last_new_old = agg_mp___1(['started_team', 'year', 'pos', 'same_team']).unstack('same_team')

In [30]:
all_stats["id"].duplicated().any(), all_df["id"].duplicated().any()

(False, False)

In [31]:
all_df = (
    all_df
    .merge(
        all_stats[["id", "started_team", "pos"]], on=["id"], how="left"
    )
    .merge(
        mp_last.to_frame("team_mp___1").reset_index(), on=["started_team", "year"], how="left"
    )
    .merge(
        mp_last_new_old
        .rename(columns=lambda x: "team_mp___1_{}".format("new" if x < 0.5 else "returning"))
        .reset_index(),
        on=["started_team", "year"],
        how="left"
    )
    .merge(
        pos_mp_last.to_frame("pos_team_mp___1").reset_index(), 
        on=["pos", "started_team", "year"], 
        how="left"
    )
    .merge(
        pos_mp_last_new_old
        .rename(columns=lambda x: "pos_team_mp___1_{}".format("new" if x < 0.5 else "returning"))
        .reset_index(),
        on=["pos", "started_team", "year"],
        how="left"
    )
)

In [32]:
all_df

Unnamed: 0,player_url,year,ws,id,bpm,mp,age,pick_overall,C,F,...,bpm_age_int,diff,started_team,pos,team_mp___1,team_mp___1_new,team_mp___1_returning,pos_team_mp___1,pos_team_mp___1_new,pos_team_mp___1_returning
0,/players/a/abdulza01.html,1974,6.5,/players/a/abdulza01.html___1974,1.0,2459.0,27,5.0,1,0,...,,,HOU,C,19499.0,0.0,19205.0,3781.0,,3781.0
1,/players/a/abdulka01.html,1974,18.4,/players/a/abdulka01.html___1974,8.5,3548.0,26,1.0,1,0,...,,,MIL,C,20269.0,1016.0,18289.0,3946.0,,3946.0
2,/players/a/adamsdo01.html,1974,3.6,/players/a/adamsdo01.html___1974,-1.1,2298.0,26,120.0,0,0,...,,,DET,SF,20256.0,1853.0,14882.0,3325.0,,1451.0
3,/players/a/adelmri01.html,1974,0.7,/players/a/adelmri01.html___1974,-3.0,618.0,27,79.0,0,0,...,,,CHI,PG,20726.0,3368.0,15671.0,4704.0,1822.0,2882.0
4,/players/a/allenlu01.html,1974,8.2,/players/a/allenlu01.html___1974,3.6,2388.0,26,3.0,0,0,...,,,MIL,PG,20269.0,1016.0,18289.0,6446.0,1016.0,5430.0
5,/players/a/architi01.html,1974,2.3,/players/a/architi01.html___1974,0.6,1272.0,25,19.0,0,0,...,,,SAC,PG,18664.0,1567.0,15056.0,5149.0,1468.0,3681.0
6,/players/a/awtrede01.html,1974,1.4,/players/a/awtrede01.html___1974,0.3,756.0,25,46.0,1,0,...,,,CHI,C,20726.0,3368.0,15671.0,3872.0,,2185.0
7,/players/b/bantomi01.html,1974,0.5,/players/b/bantomi01.html___1974,-2.9,1982.0,22,8.0,0,0,...,,,PHO,PF,20256.0,2679.0,17577.0,2048.0,0.0,2048.0
8,/players/b/barnedi01.html,1974,0.0,/players/b/barnedi01.html___1974,-8.2,58.0,37,4.0,0,0,...,,,NYK,SG,21533.0,1805.0,19728.0,6142.0,1805.0,4337.0
9,/players/b/barneji02.html,1974,4.1,/players/b/barneji02.html___1974,-0.7,1689.0,29,8.0,0,0,...,,,GSW,SG,19312.0,1403.0,17909.0,5220.0,,5220.0


In [33]:
all_df.to_msgpack('feature_df___w_mp.mp')