### 2018 Predictions with our best models

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
# To plot matplotlib figures inline on the notebook
%matplotlib inline

from sklearn.model_selection import train_test_split
#from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.ensemble import RandomForestRegressor

from sklearn.cross_validation import cross_val_score, train_test_split, KFold
from sklearn.grid_search import GridSearchCV



In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [4]:
from luther_common import *

In [5]:
# Per-game stat categories that get a prediction; the order here is the order
# in which anything iterating over this list will visit them.
pred_categories = [
    'pts_per_g',    # points
    'fg_per_g',     # field goals made
    'fga_per_g',    # field goals attempted
    'fg3_per_g',    # three-pointers made
    'fg3a_per_g',   # three-pointers attempted
    'ft_per_g',     # free throws made
    'fta_per_g',    # free throws attempted
    'trb_per_g',    # total rebounds
    'blk_per_g',    # blocks
    'stl_per_g',    # steals
    'ast_per_g',    # assists
    'tov_per_g',    # turnovers
]

In [6]:
# Load the fitted predictor and its matching standardizer for every category.
#
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone `joblib` package and only fall back to the
# vendored copy on old installs where standalone joblib is missing.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE(review): joblib.load unpickles arbitrary Python objects — only point it
# at .pkl files produced by a trusted training run.
estimators = dict()      # category -> fitted predictor
standardizers = dict()   # category -> fitted standardizer

for category in pred_categories:
    estimators[category] = joblib.load('best_linreg_predictor_' + category + '.pkl')
    standardizers[category] = joblib.load('best_linreg_standardizer_' + category + '.pkl')

In [7]:
# load our data:
# Read the 2018 input table from CSV, using the file's first column as the index.
X_df = pd.read_csv('LEBRON_data_feng_2018.csv', index_col=0)
# Keep X_df untouched; prediction columns are added to this independent copy.
pred_df = X_df.copy()

In [8]:
# Predict every category with its own model, using the 'D' mask level.
level = 'D'
for category in pred_categories:
    scaler = standardizers[category]
    model = estimators[category]
    # Reuse the masking helper: it hands back an (X, y) pair, and since there is
    # no target at prediction time None is passed in and the y slot is discarded.
    masked_X, _ = mask_data(category, level, X_df, None)
    scaled_X = scaler.transform(masked_X)
    # Store the prediction under the plain category name in pred_df.
    pred_df[category] = model.predict(scaled_X)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  ready_X.drop(excluded_columns, axis=1, inplace=True)


In [9]:
# Summarize pred_df column by column; verbose=True requests the full column listing.
pred_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 324 entries, 0 to 323
Data columns (total 170 columns):
age_0_ya              float64
age_1_ya              float64
age_2_ya              float64
age_3_ya              float64
ast_per_g_0_ya        float64
ast_per_g_1_ya        float64
ast_per_g_2_ya        float64
ast_per_g_3_ya        float64
blk_per_g_0_ya        float64
blk_per_g_1_ya        float64
blk_per_g_2_ya        float64
blk_per_g_3_ya        float64
canonical             object
canonical_0_ya        object
canonical_1_ya        object
canonical_2_ya        object
canonical_3_ya        object
drb_per_g_0_ya        float64
drb_per_g_1_ya        float64
drb_per_g_2_ya        float64
drb_per_g_3_ya        float64
eff_ratio_0_ya        float64
eff_ratio_1_ya        float64
eff_ratio_2_ya        float64
eff_ratio_3_ya        float64
eff_raw_0_ya          float64
eff_raw_1_ya          float64
eff_raw_2_ya          float64
eff_raw_3_ya          float64
fg2_per_g_0_ya        float64

In [10]:
# Trim pred_df down to something usable before saving.
# A column is dropped when its name contains any of these substrings ...
exclusion_criteria = ['pos_','_0_ya','_1_ya','_2_ya','_3_ya','eff_raw','_pred']
# ... unless its full name is exactly one of these.
exclusion_exceptions = [
    'age_0_ya','poscat_0_ya','yrs_in_league_0_ya'
]
excluded_columns = [
    col
    for col in pred_df.columns
    if col not in exclusion_exceptions
    and any(excrit in col for excrit in exclusion_criteria)
]
pred_df_save = pred_df.drop(excluded_columns, axis=1)

In [11]:
# Summarize the trimmed frame to check which columns survived the drop.
pred_df_save.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 324 entries, 0 to 323
Data columns (total 18 columns):
age_0_ya              324 non-null float64
canonical             324 non-null object
height_inches         324 non-null float64
poscat_0_ya           324 non-null float64
weight                324 non-null float64
yrs_in_league_0_ya    324 non-null float64
pts_per_g             324 non-null float64
fg_per_g              324 non-null float64
fga_per_g             324 non-null float64
fg3_per_g             324 non-null float64
fg3a_per_g            324 non-null float64
ft_per_g              324 non-null float64
fta_per_g             324 non-null float64
trb_per_g             324 non-null float64
blk_per_g             324 non-null float64
stl_per_g             324 non-null float64
ast_per_g             324 non-null float64
tov_per_g             324 non-null float64
dtypes: float64(17), object(1)
memory usage: 48.1+ KB


In [12]:
# Read the team rosters so every player gets a display name and join keys.
# The roster file's column names are renamed (nothing is added) so they line up
# with the names used in the rest of this notebook.
roster_column_names = {
    'season': 'season_year',
    'player_canonical': 'canonical',
    'initial': 'team_id',
    'player': 'name',
}
team_rosters_df = (
    pd.read_csv('team_rosters_list.csv', index_col=0)
    .rename(columns=roster_column_names)
)

# Keep only the key columns for the 2018 season and renumber the rows from 0.
is_2018_season = team_rosters_df['season_year'] == 2018
team_rosters_keys_df = (
    team_rosters_df
    .loc[is_2018_season, ['season_year', 'team_id', 'canonical', 'name']]
    .reset_index(drop=True)
)
team_rosters_keys_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 537 entries, 0 to 536
Data columns (total 4 columns):
season_year    537 non-null int64
team_id        537 non-null object
canonical      537 non-null object
name           537 non-null object
dtypes: int64(1), object(3)
memory usage: 16.9+ KB


In [13]:
# Attach the predictions to the 2018 roster keys. A left join keeps every
# roster row; players without a matching prediction row get NaN predictions.
final_df = team_rosters_keys_df.merge(pred_df_save, how='left', on='canonical')

In [16]:
# Raw efficiency from the predicted per-game lines:
# (points + rebounds + assists + steals + blocks)
#   - (missed field goals + missed free throws + turnovers), scaled by 72.
# NOTE(review): 72 is presumably an assumed number of games played — confirm.
positive_per_g = (
    final_df['pts_per_g']
    + final_df['trb_per_g']
    + final_df['ast_per_g']
    + final_df['stl_per_g']
    + final_df['blk_per_g']
)
missed_fg_per_g = final_df['fga_per_g'] - final_df['fg_per_g']
missed_ft_per_g = final_df['fta_per_g'] - final_df['ft_per_g']
negative_per_g = missed_fg_per_g + missed_ft_per_g + final_df['tov_per_g']

final_df['eff_raw'] = (positive_per_g - negative_per_g) * 72

In [27]:
# Spot-check one merged row, selected by index label.
# NOTE(review): 177 is a hard-coded label that follows from the roster row
# order, so it will point at a different player if the roster CSV changes —
# selecting by 'canonical' would be sturdier.
final_df.loc[177,:]

season_year                   2018
team_id                        HOU
canonical                hardeja01
name                  James Harden
age_0_ya                        28
height_inches                   77
poscat_0_ya                      1
weight                         220
yrs_in_league_0_ya               8
pts_per_g                  29.4421
fg_per_g                   8.89449
fga_per_g                  20.0287
fg3_per_g                   2.9908
fg3a_per_g                 8.44136
ft_per_g                    8.6012
fta_per_g                   10.281
trb_per_g                  7.65452
blk_per_g                 0.551153
stl_per_g                  1.69029
ast_per_g                  9.94492
tov_per_g                  4.91543
eff_raw                    2271.85
Name: 177, dtype: object

In [24]:
# Show only the ten highest predicted scorers instead of rendering the whole
# sorted frame; rows with a missing pts_per_g sort last and so never appear here.
final_df.sort_values('pts_per_g', ascending=False).head(10)

Unnamed: 0,season_year,team_id,canonical,name,age_0_ya,height_inches,poscat_0_ya,weight,yrs_in_league_0_ya,pts_per_g,...,fg3_per_g,fg3a_per_g,ft_per_g,fta_per_g,trb_per_g,blk_per_g,stl_per_g,ast_per_g,tov_per_g,eff_raw
360,2018,OKC,westbru01,Russell Westbrook,29.0,75.0,1.0,200.0,9.0,30.566640,...,2.254686,6.617798,7.886749,9.565817,9.738158,0.431623,1.875469,9.982305,4.857541,2379.709074
177,2018,HOU,hardeja01,James Harden,28.0,77.0,1.0,220.0,8.0,29.442064,...,2.990802,8.441357,8.601204,10.281031,7.654519,0.551153,1.690285,9.944917,4.915431,2271.846174
321,2018,NOP,davisan02,Anthony Davis,24.0,82.0,5.0,253.0,5.0,27.930258,...,0.758123,2.270098,6.389745,8.166826,11.583239,2.130759,1.422988,2.791606,2.509491,2234.765628
322,2018,NOP,couside01,DeMarcus Cousins,27.0,83.0,5.0,270.0,7.0,26.308361,...,1.521986,4.435801,6.854976,8.948753,10.983758,1.277914,1.490867,4.542337,3.758881,2031.800149
89,2018,CLE,thomais02,Isaiah Thomas,28.0,69.0,1.0,185.0,6.0,26.254514,...,2.822118,7.731268,6.674082,7.587227,3.418943,0.163091,1.097893,6.037119,2.914761,1625.723557
165,2018,GSW,duranke01,Kevin Durant,29.0,81.0,3.0,240.0,10.0,26.132612,...,2.028051,5.333776,5.496369,6.515223,8.459768,1.353976,1.093980,5.080867,2.764017,2088.223624
433,2018,POR,lillada01,Damian Lillard,27.0,75.0,1.0,195.0,5.0,26.059964,...,2.806670,7.576374,5.832765,6.704476,5.131477,0.364892,1.081678,6.170659,2.927807,1731.340651
489,2018,TOR,derozde01,DeMar DeRozan,28.0,79.0,2.0,221.0,8.0,25.892538,...,0.766479,2.300612,6.737942,8.014578,5.377931,0.297811,1.165963,4.285260,2.593760,1616.191040
158,2018,GSW,curryst01,Stephen Curry,29.0,75.0,1.0,190.0,8.0,25.485032,...,3.816289,9.738284,4.313595,4.911051,5.153673,0.293682,1.849761,6.761861,3.037892,1847.188699
469,2018,SAS,leonaka01,Kawhi Leonard,26.0,79.0,3.0,230.0,6.0,25.275385,...,2.001981,5.357027,5.643051,6.650303,6.816068,0.794203,1.870372,4.065692,2.305694,1854.257997


In [51]:
# Pull the merged row(s) whose display name is LeBron James and show them.
is_lebron = final_df['name'] == 'LeBron James'
row = final_df[is_lebron]
row


Unnamed: 0,season_year,team_id,canonical,name,age_0_ya,height_inches,poscat_0_ya,weight,yrs_in_league_0_ya,pts_per_g,...,fg3_per_g,fg3a_per_g,ft_per_g,fta_per_g,trb_per_g,blk_per_g,stl_per_g,ast_per_g,tov_per_g,eff_raw
96,2018,CLE,jamesle01,LeBron James,33.0,80.0,3.0,250.0,14.0,25.042159,...,1.568791,4.447946,4.73829,6.779436,8.086377,0.638663,1.329118,7.777971,3.643547,2018.168269


In [52]:
# Summarize the selected row(s): column names, non-null counts and dtypes.
row.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1 entries, 96 to 96
Data columns (total 22 columns):
season_year           1 non-null int64
team_id               1 non-null object
canonical             1 non-null object
name                  1 non-null object
age_0_ya              1 non-null float64
height_inches         1 non-null float64
poscat_0_ya           1 non-null float64
weight                1 non-null float64
yrs_in_league_0_ya    1 non-null float64
pts_per_g             1 non-null float64
fg_per_g              1 non-null float64
fga_per_g             1 non-null float64
fg3_per_g             1 non-null float64
fg3a_per_g            1 non-null float64
ft_per_g              1 non-null float64
fta_per_g             1 non-null float64
trb_per_g             1 non-null float64
blk_per_g             1 non-null float64
stl_per_g             1 non-null float64
ast_per_g             1 non-null float64
tov_per_g             1 non-null float64
eff_raw               1 non-null float64


In [50]:
# Check whether the value in the first selected row, at column position 4, is missing.
# NOTE(review): position 4 depends on final_df's column order (presumably
# 'age_0_ya' — confirm); indexing by column name would be more robust.
pd.isnull(row.iloc[0,4])

True

In [28]:
# Write the merged roster + prediction table to CSV. The index is written as
# well, since index=False is not passed.
final_df.to_csv('LEBRON_2018_FINAL_PREDICTIONS.csv')