### Predictions with our best models

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
# To plot matplotlib figures inline on the notebook
%matplotlib inline

from sklearn.model_selection import train_test_split
#from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.ensemble import RandomForestRegressor

from sklearn.cross_validation import cross_val_score, train_test_split, KFold
from sklearn.grid_search import GridSearchCV



In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [4]:
from luther_common import *

In [5]:
# categories to predict
pred_categories = ['pts_per_g',
         'fg_per_g','fga_per_g',
         'fg3_per_g','fg3a_per_g',
         'ft_per_g','fta_per_g',
         'trb_per_g','blk_per_g',
         'stl_per_g','ast_per_g',
         'tov_per_g'
        ]

In [6]:
# load our predictive and standardization models
from sklearn.externals import joblib
estimators = dict()
standardizers = dict()
predictions = dict()

for category in pred_categories:
    estimators[category]=joblib.load('best_linreg_predictor_'+category+'.pkl')
    standardizers[category]=joblib.load('best_linreg_standardizer_'+category+'.pkl')

In [7]:
# load our data:
X_df = pd.read_csv('LEBRON_data_feng.csv', index_col=0)
y_df = pd.read_csv('LEBRON_target.csv', index_col=0)

In [8]:
# best prediction using 'D' mask:
level = 'D'
for category in pred_categories:
    #piggyback off the existing mask function to mask the X and y
    ready_X, ready_y = mask_data(category, level, X_df, y_df)
    std_ready_X = standardizers[category].transform(ready_X)
    predictions[category] = estimators[category].predict(std_ready_X)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  ready_X.drop(excluded_columns, axis=1, inplace=True)


In [9]:
from sklearn.metrics import mean_absolute_error
for category in pred_categories:
    print(category)
    print(mean_absolute_error(predictions[category], y_df[category]))

pts_per_g
2.16106937219
fg_per_g
0.816117450206
fga_per_g
1.67254534001
fg3_per_g
0.215503174381
fg3a_per_g
0.52232031868
ft_per_g
0.519309684101
fta_per_g
0.641199021875
trb_per_g
0.869022845894
blk_per_g
0.15165832259
stl_per_g
0.188236839746
ast_per_g
0.555098977011
tov_per_g
0.310396935841


In [12]:
#first set up players back to 2015
player_seasons_df = pd.read_csv('player_seasons_list_processed.csv')
player_seasons_df = player_seasons_df[player_seasons_df['season_year'] >= 2015]

In [13]:
%%time
# 5. replace the season rows with weighted averages where relevant
# Define lambda functions to:
# compute the weighted average (by g)
# keep the value of non-averaged rows
mean_wt_by_g = lambda x: np.average(x, weights=player_seasons_df.loc[x.index, 'g'])
keep = lambda x: x.iloc[0]
# Define a dictionary with the functions to apply for each column:
f = {
'age':keep,
'ast_per_g':mean_wt_by_g,
'blk_per_g':mean_wt_by_g,
'canonical':keep,
'drb_per_g':mean_wt_by_g,
'eff_raw':sum,
'eff_ratio':sum,
'fg2_per_g':mean_wt_by_g,
'fg2a_per_g':mean_wt_by_g,
'fg3_per_g':mean_wt_by_g,
'fg3a_per_g':mean_wt_by_g,
'fg_per_g':mean_wt_by_g,
'fga_per_g':mean_wt_by_g,
'ft_per_g':mean_wt_by_g,
'fta_per_g':mean_wt_by_g,
'g':np.sum,
'mp_per_g':mean_wt_by_g,
'name':keep,
'orb_per_g':mean_wt_by_g,
'pf_per_g':mean_wt_by_g,
'pts_per_g':mean_wt_by_g,
'season':keep,
'stl_per_g':mean_wt_by_g,
'team_id':keep,
'tov_per_g':mean_wt_by_g,
'trb_per_g':mean_wt_by_g,
'season_year':keep,
'pos_C':mean_wt_by_g,
'pos_PF':mean_wt_by_g,
'pos_PG':mean_wt_by_g,
'pos_SF':mean_wt_by_g,
'pos_SG':mean_wt_by_g,
'poscat':mean_wt_by_g,
'team_pace':mean_wt_by_g
}
# Groupby and aggregate with the dictionary:
final_player_seasons_df = player_seasons_df.groupby(['season_year', 'canonical']).agg(f)

CPU times: user 14.9 s, sys: 119 ms, total: 15 s
Wall time: 15 s


In [14]:
final_player_seasons_df_2018 = final_player_seasons_df[final_player_seasons_df['season_year'] == 2017]

In [15]:
final_player_seasons_df_2018.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 486 entries, (2017.0, abrinal01) to (2017.0, zubaciv01)
Data columns (total 34 columns):
trb_per_g      486 non-null float64
pos_PF         486 non-null float64
drb_per_g      486 non-null float64
eff_ratio      486 non-null float64
canonical      486 non-null object
name           486 non-null object
fg2_per_g      486 non-null float64
pts_per_g      486 non-null float64
ast_per_g      486 non-null float64
stl_per_g      486 non-null float64
tov_per_g      486 non-null float64
fg2a_per_g     486 non-null float64
pos_SF         486 non-null float64
blk_per_g      486 non-null float64
ft_per_g       486 non-null float64
fga_per_g      486 non-null float64
season         486 non-null object
g              486 non-null int64
pos_PG         486 non-null float64
season_year    486 non-null float64
eff_raw        486 non-null float64
fta_per_g      486 non-null float64
pf_per_g       486 non-null float64
poscat         486 non-null float64
fg

In [16]:
# we are trying to create 2018 players
final_player_seasons_df_2018['age'] = final_player_seasons_df_2018['age'].apply(lambda x: x + 1)
final_player_seasons_df_2018['season_year'] = final_player_seasons_df_2018['season_year'].apply(lambda x: x + 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [17]:
final_player_seasons_df = final_player_seasons_df.append(final_player_seasons_df_2018, ignore_index=True)
df = final_player_seasons_df

In [18]:
# read in the player individual stats
players_df = pd.read_csv('player_list.csv', index_col=0)
# calculate player's individual stats 
# height and weight for the player (seems like those are the useful values)
# rookie year
players_df['height'] = players_df['height'].fillna('6-6')
players_df['weight'] = players_df['weight'].fillna(230)
def height_str_to_height_inches(height_str):
    ft_str, in_str = (height_str.split('-'))
    return float(ft_str) * 12 + float(in_str)
players_df['height_inches'] = players_df['height'].apply(height_str_to_height_inches)
players_df['rookie_yr'] = players_df['year_min']

In [19]:
%%time
# process: iterate through each row in the dataset, 
# if the previous num_yrs_back years stats exist:
# append prev num_yrs_back year stats to X (break multiple rows into cols)
# append ppg to y
num_yrs_back = 3
X_df = pd.DataFrame()
y_df = pd.DataFrame()
# for each row, count it as a row if we can get something from the year before that row
for _ , row in df.iterrows():
    temp_df = df[(df['season_year'] >= row['season_year']-num_yrs_back) &
                 (df['season_year'] <= row['season_year']) &
                 (df['canonical'] == row['canonical'])]
    if temp_df.shape[0] == num_yrs_back + 1:
        year_row_wide = pd.DataFrame()
        years_ago = num_yrs_back
        for _ , year_row in temp_df.iterrows():
            #need to go across the rows and append with 'years_ago'
            for col_name , year_row_item in year_row.iteritems():
                year_row_wide[col_name + '_{}_ya'.format(years_ago)] = [year_row_item]
            years_ago -= 1 #not used yet, put into column name
        X_df = X_df.append(year_row_wide)
        y_df = y_df.append(row)
X_df.reset_index(inplace=True, drop=True)
y_df.reset_index(inplace=True, drop=True)

CPU times: user 25.9 s, sys: 304 ms, total: 26.2 s
Wall time: 27.7 s


In [20]:
# AFTER player_seasons_df is separated, merge player's individual stats into X_df 
temp_players_df = players_df.loc[:,['canonical','height_inches','weight','rookie_yr']]
X_df = pd.merge(X_df, temp_players_df, how='left', left_on='canonical_0_ya', right_on='canonical')


In [21]:
# Calculate number of years in league and drop rookie year
for years_ago in range(0,4):
    suffix = '_{}_ya'.format(years_ago)
    X_df['yrs_in_league'+suffix] = X_df['season_year'+suffix] - X_df['rookie_yr']
X_df.drop('rookie_yr', axis=1, inplace=True)

In [22]:
# our columns got messed up in all that copying and pasting...
X_df = X_df.reindex_axis(sorted(X_df.columns), axis=1)
y_df = y_df.reindex_axis(sorted(y_df.columns), axis=1)

In [23]:
# load our predictive and standardization models
from sklearn.externals import joblib
estimators = dict()
standardizers = dict()

for category in pred_categories:
    estimators[category]=joblib.load('naive_linreg_predictor_'+category+'.pkl')
    standardizers[category]=joblib.load('naive_linreg_standardizer_'+category+'.pkl')

In [24]:
# basic prediction using 'B' mask:
level = 'B'
for category in pred_categories:
    #piggyback off the existing mask function to mask the X and y
    ready_X, ready_y = mask_data(category, level, X_df, y_df)
    std_ready_X = standardizers[category].transform(ready_X)
    y_predict = estimators[category].predict(std_ready_X)
    X_df[category+'_pred'] = y_predict

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  ready_X.drop(excluded_columns, axis=1, inplace=True)


In [25]:
# get a raw efficiency score for the X _pred's and for player_seasons_df:
#    (PTS + REB + AST + STL + BLK − ((FGA − FGM) + (FTA − FTM) + TO)) multiply by g to weight it
X_df['eff_raw_pred'] = (X_df['pts_per_g_pred'] +\
                    X_df['trb_per_g_pred'] +\
                    X_df['ast_per_g_pred'] +\
                    X_df['stl_per_g_pred'] +\
                    X_df['blk_per_g_pred'] -\
                   ((X_df['fga_per_g_pred'] - X_df['fg_per_g_pred']) +\
                    (X_df['fta_per_g_pred'] - X_df['ft_per_g_pred']) +\
                     X_df['tov_per_g_pred'])) * 72#X_df['g_pred'] (assume playing 72 games...)

In [26]:
#Note: we lost a bit of nuance in the data due to row combination, ignore it for the time being
# read in the performance of the player for each season (no predictions, broken down by season)
player_seasons_df = pd.read_csv('player_seasons_list_processed.csv', index_col=0)

In [27]:
%%time

#pre-process; re-label a couple of columns to use
player_seasons_df['season_year_prev'] = player_seasons_df['season_year'].apply(lambda x: x-1)

# now we 1) try to predict a NEW eff_ratio and 2) copy last year's pace over as a prediction of pace (at the end)
for index, player_season in X_df.iterrows():

    #get a df of all the people who played in the position on the same team that year and sum their contribution scores
    teammates_df = player_seasons_df[(player_seasons_df['season_year'] == player_season['season_year_0_ya']) &
                                   (player_seasons_df['team_id'] == player_season['team_id_0_ya']) &
                                   (player_seasons_df['poscat'] == player_season['poscat_0_ya'])
                                  ]

    # merge in raw predictions from X_df
    pred_teammates_df = pd.merge(teammates_df, 
                                 X_df.loc[:,['canonical','season_year_0_ya','eff_raw_pred']], 
                                 how='left', 
                                 left_on=['canonical','season_year'], 
                                 right_on=['canonical','season_year_0_ya'])
    # merge in numbers from last year
    pred_teammates_df = pd.merge(pred_teammates_df,
                                 player_seasons_df.loc[:,['canonical','season_year_prev','eff_raw']],
                                 how='left', 
                                 left_on=['canonical','season_year'], 
                                 right_on=['canonical','season_year_prev'])
    
    #eff_raw_pred (predicted), #eff_raw_x (this year -- should actually not be used), #eff_raw_y (last year)
    # if we didn't have the eff_raw_pred, replace it
    for teammate_index, teammate_season in pred_teammates_df.iterrows():
        if pd.isnull(teammate_season['eff_raw_pred']):
            if not pd.isnull(teammate_season['eff_raw_y']):
                pred_teammates_df.loc[teammate_index, 'eff_raw_pred'] = teammate_season['eff_raw_y']
            else:
                #this player didn't HAVE a previous season..
                pred_teammates_df.loc[teammate_index, 'eff_raw_pred'] = 0
    
    #finally, take contribution score and divide contribution score of position of the team
    num = player_season['eff_raw_pred']
    denom = sum(pred_teammates_df['eff_raw_pred'])
    #careful of division by zero or zero divided by zero...
    
    if denom == 0:
        X_df.loc[index, 'eff_ratio_pred'] = 1
    else:
        X_df.loc[index, 'eff_ratio_pred'] = num/denom
    
#     #some debug code
#     if X_df.loc[index, :].isnull().any():
#         print("NULL:")
#         print(index)
#         print("SEASON:")
#         print(player_season)
#         print("TEAMMATES:")
#         print(pred_teammates_df)
#         print("NUM:")
#         print(num)
#         print("DENOM:")
#         print(denom)



CPU times: user 5.31 s, sys: 136 ms, total: 5.45 s
Wall time: 5.49 s


In [28]:
#copy over pace from 1 year ago
X_df['team_pace_pred'] = X_df['team_pace_1_ya']

In [29]:
X_df.describe()

Unnamed: 0,age_0_ya,age_1_ya,age_2_ya,age_3_ya,ast_per_g_0_ya,ast_per_g_1_ya,ast_per_g_2_ya,ast_per_g_3_ya,blk_per_g_0_ya,blk_per_g_1_ya,...,ft_per_g_pred,fta_per_g_pred,trb_per_g_pred,blk_per_g_pred,stl_per_g_pred,ast_per_g_pred,tov_per_g_pred,eff_raw_pred,eff_ratio_pred,team_pace_pred
count,324.0,324.0,324.0,324.0,324.0,324.0,324.0,324.0,324.0,324.0,...,324.0,324.0,324.0,324.0,324.0,324.0,324.0,324.0,324.0,324.0
mean,29.033951,28.033951,27.033951,26.033951,2.159162,2.159162,2.181653,2.170637,0.426862,0.426862,...,1.535321,1.992695,3.759968,0.39622,0.678319,2.061691,1.185218,742.195457,1.0,96.438797
std,3.940984,3.940984,3.940984,3.940984,1.93428,1.93428,1.903858,1.88398,0.407856,0.407856,...,1.414762,1.749492,2.303447,0.376954,0.375183,1.745379,0.731624,419.333645,0.0,2.294608
min,22.0,21.0,20.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.327445,-0.433349,0.270108,-0.061133,-0.048188,-0.074249,-0.033106,5.941716,1.0,91.6
25%,26.0,25.0,24.0,23.0,0.9,0.9,0.9,0.9,0.1,0.1,...,0.626513,0.839986,2.144569,0.138815,0.411633,0.862549,0.714636,458.774386,1.0,94.9
50%,29.0,28.0,27.0,26.0,1.5,1.5,1.649375,1.6,0.3,0.3,...,1.093019,1.442374,3.222356,0.298028,0.619481,1.524894,1.015964,629.070424,1.0,96.2
75%,32.0,31.0,30.0,29.0,2.8,2.8,2.8,3.0,0.5,0.5,...,1.973662,2.644727,4.801484,0.509754,0.895698,2.683114,1.510795,988.527043,1.0,98.0
max,41.0,40.0,39.0,38.0,11.2,11.2,11.7,10.2,2.6,2.6,...,8.532633,10.189564,13.334094,2.291496,1.827869,9.868914,4.872941,2322.652008,1.0,101.3


In [30]:
X_df.to_csv('LEBRON_data_feng_2018.csv')