# Imports

In [1]:
import pandas as pd
import numpy as np
np.random.seed(2121)

In [2]:
# Read batpos_eda.csv (path is relative to this notebook's working directory)
# into the batpos DataFrame that every later cell works from.
batpos = pd.read_csv('../data/batpos_eda.csv')

# Lookbacks

In our dataset, a player's performance is spread across multiple rows, with a separate row for each year. We are interested in players who have a run of consecutive years in the data, so that we can use their historical performance to make inferences.

Below we compare how the size of the dataset (number of rows) changes for lookback windows of 2, 3, and 5 years.

## Calculate lookback loss

In [3]:
# Order each player's rows chronologically, then renumber the index so row
# labels match row positions. Without the reset, the index keeps its pre-sort
# labels, and anything that aligns on the index (e.g. DataFrame.join with a
# freshly built 0..n-1 Series) would pair rows with the wrong values.
batpos = batpos.sort_values(['player_id','year_id']).reset_index(drop=True)

In [4]:
def lookback_counts(yr, data=None):
    '''Count the rows that have `yr` consecutive prior years for the same player.

    For every row, the `year_id` values found in the same player's previous
    `yr` rows (ordered by year) are collected into a `_yr_lookback` list.
    A row is kept only when none of those values is missing and they are
    exactly year_id - 1, year_id - 2, ..., year_id - yr.

    Parameters
    ----------
    yr : int
        Size of the lookback window in years; must be at least 1.
    data : pandas.DataFrame, optional
        Frame with `player_id` and `year_id` columns. Defaults to the
        notebook-level `batpos` frame. The input frame is not modified.

    Returns
    -------
    (int, pandas.DataFrame)
        The number of surviving rows and the surviving rows themselves,
        sorted by player and year, with the extra `_yr_lookback` column.

    Raises
    ------
    ValueError
        If `yr` is smaller than 1.
    '''
    if yr < 1:
        raise ValueError(
            'yr must be a positive number of years, got {!r}'.format(yr))

    source = batpos if data is None else data

    # sort_values returns a new frame, so the caller's frame is left alone;
    # sorting here also means the per-player shifts below do not rely on the
    # caller having ordered the rows first
    df = source.sort_values(['player_id', 'year_id'])

    # column i-1 holds the year found i rows earlier for the same player
    # (NaN when the player has fewer than i earlier rows)
    years_by_player = df.groupby('player_id')['year_id']
    prior_years = pd.concat(
        [years_by_player.shift(i) for i in range(1, yr + 1)],
        axis=1).values

    # build the Series on df's own index: after sorting, the index labels are
    # not 0..n-1, so a default-indexed Series joined on the index would attach
    # each lookback list to the wrong row
    df['_yr_lookback'] = pd.Series(prior_years.tolist(), index=df.index)

    # gap between each row's year and each prior year; a row qualifies only if
    # the gaps are exactly 1, 2, ..., yr. NaN never compares equal, so rows
    # with a missing prior year are dropped by the same test.
    gaps = df['year_id'].values[:, None] - prior_years
    keep = (gaps == np.arange(1, yr + 1)).all(axis=1)

    df = df.loc[keep]
    return df.shape[0], df

## Lookback comparisons

In [5]:
# each result is a (row_count, filtered_frame) tuple; lb3 is reused further
# down to mask batpos, so the three names are kept
lb2 = lookback_counts(2)
lb3 = lookback_counts(3)
lb5 = lookback_counts(5)
rows = batpos.shape[0]

print('Lookback Loss\n')
# one report line per window instead of three copy-pasted print statements;
# the second figure is the fraction of the original rows that remain
for window, result in ((2, lb2), (3, lb3), (5, lb5)):
    print('{} year lookback: {} rows\n{:.2f} of original\n'.format(
        window, result[0], result[0]/rows))

Lookback Loss

2 year lookback: 3627 rows
0.47 of original

3 year lookback: 2638 rows
0.34 of original

5 year lookback: 1411 rows
0.18 of original



## Lookback Feature Extraction

In [6]:
# Show the column names currently in batpos.
batpos.columns

Index(['waa_pg', 'player_id', 'year_id', 'name_common', 'age', 'g', 'team_gp',
       'pa_pg', 'inn_pg', 'runs_bat_pg', 'runs_br_pg', 'runs_dp_pg',
       'runs_defense_pg', 'runs_position_pg', 'teamrpg_avg', 'opprpg',
       'pyth_exponent_avg', 'percent_team_gp', 'pa_per_team_gp',
       'inn_per_team_gp', 'g_c_share', 'g_1b_share', 'g_2b_share',
       'g_3b_share', 'g_ss_share', 'g_cf_share', 'g_dh_share', 'g_cof_share'],
      dtype='object')

In [7]:
# create lookbacks for everything except our player vars, and age
cols = ['waa_pg','g', 'team_gp',
       'pa_pg', 'inn_pg', 'runs_bat_pg', 'runs_br_pg', 'runs_dp_pg',
       'runs_defense_pg', 'runs_position_pg', 'teamrpg_avg', 'opprpg',
       'pyth_exponent_avg', 'percent_team_gp', 'pa_per_team_gp',
       'inn_per_team_gp', 'g_c_share', 'g_1b_share', 'g_2b_share',
       'g_3b_share', 'g_ss_share', 'g_cf_share', 'g_dh_share', 'g_cof_share']

# number of prior rows per player gathered into each *_lookback list; keep it
# in step with the lookback window used to mask batpos afterwards (lb3)
LOOKBACK_YEARS = 3

# this cell removes the source columns, so a second run cannot find them;
# say so explicitly instead of failing somewhere inside groupby
missing = [c for c in cols if c not in batpos.columns]
if missing:
    raise KeyError('columns not found in batpos (was this cell already run?): '
                   '{}'.format(missing))

# group once and reuse it for every column and every shift
by_player = batpos.groupby('player_id')

# for each column: a per-row list [value 1 row back, 2 rows back, ...] within
# the same player, NaN where the player has no such earlier row
lookbacks = {}
for c in cols:
    lookbacks[c] = pd.concat(
        [by_player[c].shift(i) for i in range(1, LOOKBACK_YEARS + 1)],
        axis=1).values.tolist()

# drop the current-year values of everything except waa_pg, then append the
# lookback columns in the same order as cols
batpos = batpos.drop([c for c in cols if c != 'waa_pg'], axis=1)
for c in cols:
    batpos[c+'_lookback'] = lookbacks[c]

In [8]:
# Keep only the player/year rows that survived the 3-year lookback filter:
# merging the surviving keys with batpos (pd.merge's default inner join)
# discards every other row.
keys = ['player_id', 'year_id']
batpos = lb3[1][keys].merge(batpos, on=keys)

# reorder: move the third column in front of the two key columns and leave
# the remaining columns where they are
names = list(batpos.columns)
batpos = batpos[names[2:3] + names[0:2] + names[3:]]

In [9]:
# Preview the first rows of the masked, reordered frame.
batpos.head()

Unnamed: 0,waa_pg,player_id,year_id,name_common,age,waa_pg_lookback,g_lookback,team_gp_lookback,pa_pg_lookback,inn_pg_lookback,...,pa_per_team_gp_lookback,inn_per_team_gp_lookback,g_c_share_lookback,g_1b_share_lookback,g_2b_share_lookback,g_3b_share_lookback,g_ss_share_lookback,g_cf_share_lookback,g_dh_share_lookback,g_cof_share_lookback
0,0.036,aaronha01,1965,Henry Aaron,31.0,"[0.0306206896551724, 0.0398757763975155, 0.038...","[145.0, 161.0, 156.0]","[162.0, 163.0, 162.0]","[4.372413793103449, 4.434782608695652, 4.27564...","[8.50551724137931, 8.98136645962733, 8.5942307...",...,"[3.91358024691358, 4.380368098159509, 4.117283...","[7.612962962962962, 8.871165644171779, 8.27592...","[0.0, 0.0, 0.0]","[0.0, 0.0, 0.0064516129032258]","[0.0733333333333333, 0.0, 0.0]","[0.0, 0.0, 0.0]","[0.0, 0.0, 0.0]","[0.0, 0.0, 0.535483870967742]","[0.0, 0.0, 0.0]","[0.9266666666666666, 1.0, 0.4580645161290322]"
1,0.034367,aaronha01,1966,Henry Aaron,32.0,"[0.036, 0.0306206896551724, 0.0398757763975155]","[150.0, 145.0, 161.0]","[162.0, 162.0, 163.0]","[4.26, 4.372413793103449, 4.434782608695652]","[8.728666666666667, 8.50551724137931, 8.981366...",...,"[3.9444444444444446, 3.91358024691358, 4.38036...","[8.0820987654321, 7.612962962962962, 8.8711656...","[0.0, 0.0, 0.0]","[0.0, 0.0, 0.0]","[0.0, 0.0733333333333333, 0.0]","[0.0, 0.0, 0.0]","[0.0, 0.0, 0.0]","[0.0, 0.0, 0.0]","[0.0, 0.0, 0.0]","[1.0, 0.9266666666666666, 1.0]"
2,0.039613,aaronha01,1967,Henry Aaron,33.0,"[0.0343670886075949, 0.036, 0.0306206896551724]","[158.0, 150.0, 145.0]","[163.0, 162.0, 162.0]","[4.354430379746836, 4.26, 4.372413793103449]","[8.596835443037975, 8.728666666666667, 8.50551...",...,"[4.220858895705521, 3.9444444444444446, 3.9135...","[8.333128834355827, 8.0820987654321, 7.6129629...","[0.0, 0.0, 0.0]","[0.0, 0.0, 0.0]","[0.0121212121212121, 0.0, 0.0733333333333333]","[0.0, 0.0, 0.0]","[0.0, 0.0, 0.0]","[0.0303030303030303, 0.0, 0.0]","[0.0, 0.0, 0.0]","[0.9575757575757576, 1.0, 0.9266666666666666]"
3,0.027437,aaronha01,1968,Henry Aaron,34.0,"[0.0396129032258064, 0.0343670886075949, 0.036]","[155.0, 158.0, 150.0]","[162.0, 163.0, 162.0]","[4.316129032258065, 4.354430379746836, 4.26]","[8.703225806451613, 8.596835443037975, 8.72866...",...,"[4.12962962962963, 4.220858895705521, 3.944444...","[8.32716049382716, 8.333128834355827, 8.082098...","[0.0, 0.0, 0.0]","[0.0, 0.0, 0.0]","[0.0064935064935064, 0.0121212121212121, 0.0]","[0.0, 0.0, 0.0]","[0.0, 0.0, 0.0]","[0.0714285714285714, 0.0303030303030303, 0.0]","[0.0, 0.0, 0.0]","[0.922077922077922, 0.9575757575757576, 1.0]"
4,0.040136,aaronha01,1969,Henry Aaron,35.0,"[0.0274374999999999, 0.0396129032258064, 0.034...","[160.0, 155.0, 158.0]","[163.0, 162.0, 163.0]","[4.225, 4.316129032258065, 4.354430379746836]","[8.714375, 8.703225806451613, 8.596835443037975]",...,"[4.147239263803681, 4.12962962962963, 4.220858...","[8.553987730061351, 8.32716049382716, 8.333128...","[0.0, 0.0, 0.0]","[0.0848484848484848, 0.0, 0.0]","[0.0, 0.0064935064935064, 0.0121212121212121]","[0.0, 0.0, 0.0]","[0.0, 0.0, 0.0]","[0.0, 0.0714285714285714, 0.0303030303030303]","[0.0, 0.0, 0.0]","[0.9151515151515152, 0.922077922077922, 0.9575..."
