In [1]:
import pandas as pd
import numpy as np
from cross_validation import *
from gradient_boost import predict_xgboost
from read_write import read_merged_data, load_pickled_model
from plotting_functions import scatter_plot_eda
from helper import add_rolling_averages
import matplotlib.pyplot as plt



In [2]:
# Functions
def get_finite_residual(df):
    """Return only the rows of df whose 'Residual' value is not null."""
    has_residual = df['Residual'].notnull()
    return df[has_residual]

def add_exponential_smoothing(df, dictionary):
    """
    Add lagged exponentially weighted averages to df, in place.

    Input:
        df -- DataFrame; new columns are written directly onto it
        dictionary -- maps a column name of df to an iterable of half-lives
    Output:
        None. For every (column, half-life) pair a column named
        'Last<half-life>ExponentialSmoothingAverage<column>' is (re)created.
        Each value is shifted down one row, so a row only sees the
        average of the rows before it.
    """
    # .items() works on both Python 2 and Python 3 dicts (iteritems is Python 2 only)
    for key, value in dictionary.items():
        source = df[key]

        for window in value:
            # Name of new feature
            name = 'Last' + str(window) + 'ExponentialSmoothingAverage' + key

            # Drop column to prevent duplicates
            if name in df.columns:
                df.pop(name)

            # pd.ewma was removed from pandas in favour of Series.ewm(...).mean();
            # fall back to it only on pandas versions that predate .ewm
            if hasattr(source, 'ewm'):
                smoothed = source.ewm(halflife = window).mean()
            else:
                smoothed = pd.ewma(source, halflife = window)

            # Shift by one row so the current row is excluded from its own average
            df[name] = smoothed.shift(1)

def add_residuals(df, element):
    """
    Attach the model's predictions for `element` to df.

    Input:
        df -- DataFrame with at least 'Player', 'Date' and 'Team' columns
        element -- string of element we are getting predictions for
    Output:
        A new DataFrame: df left-merged on ['Player', 'Date', 'Team'] with a
        'Predicted<element>' column. Rows that predict_xgboost filtered out
        get NaN in that column.

    Notes:
        * Reads the notebook-level `data_info`, which must be defined
          before this function is called.
        * No residual column is computed here, despite the function name.
          NOTE(review): confirm where the 'Residual' column is derived.
    """
    # NOTE(review): the loaded model is not passed to predict_xgboost below.
    # The call is kept so behaviour is unchanged; confirm whether it is needed.
    load_pickled_model('{}GradientBoostedRegressor'.format(element))

    # Predict
    predictions, filtered_df = predict_xgboost(df,
                                               element = element,
                                               data_info = data_info)

    # Build the prediction frame positionally. Assigning filtered_df's Series
    # onto a frame that already has a fresh 0..n-1 index would align on index
    # labels, attaching players to the wrong predictions (or NaN) whenever
    # filtered_df keeps the labels of the rows that survived filtering.
    predicted_name = 'Predicted{}'.format(element)
    prediction_df = pd.DataFrame({predicted_name: np.asarray(predictions),
                                  'Player': filtered_df['Player'].values,
                                  'Date': filtered_df['Date'].values,
                                  'Team': filtered_df['Team'].values},
                                 columns = [predicted_name, 'Player', 'Date', 'Team'])

    return df.merge(prediction_df, on = ['Player', 'Date', 'Team'], how = 'left')

In [3]:
# Read the merged data CSV into the notebook's working frame
df = pd.read_csv(filepath_or_buffer = 'data/merged_data.csv')

In [4]:
# Cross-validation / filtering settings; add_residuals reads this as a global
data_info = cv_method(
    method = k_folds_cv,
    splits = 5,
    start_date = '1999-01-01',
    end_date = '2016-09-01',
    minutes_cutoff = 3,
)

In [None]:
# Examine data
# Observations:
#   1. There is a duplicate season column

# Single-argument print(...) behaves the same on Python 2 and Python 3
print("Num rows: {}".format(len(df)))
for column in df.columns:
    print(column)

In [10]:
# Order rows descending by date, team, starter flag and minutes played, in place
df.sort_values(
    by = ['Date', 'Team', 'GS', 'PlayerMP'],
    ascending = False,
    inplace = True,
)

# Blowout prediction
def filter_blowout(df):
    """
    Group predicate for groupby(...).filter.

    Looks at the first three rows of the group and returns True when their
    total 'PlayerMP' plus 9 is still below their total
    'Last10AveragePlayerMP'.
    """
    leading_rows = df[:3]
    minutes_played = np.sum(leading_rows['PlayerMP'])
    minutes_expected = np.sum(leading_rows['Last10AveragePlayerMP'])
    return minutes_expected > (minutes_played + 9)

# Keep the (Team, Date) groups flagged by filter_blowout, then reduce each one to its first row
t = (
    df.groupby(['Team', 'Date'])
    .filter(filter_blowout)
    .loc[:, ['Team', 'Opp', 'Date', 'Result', 'Player', 'PlayerMP', 'GS', 'Last10AveragePlayerMP']]
    .drop_duplicates(subset = ['Team', 'Date'])
)

Unnamed: 0,Team,Opp,Date,Result,Player,PlayerMP,GS,Last10AveragePlayerMP
393493,POR,DEN,2016-04-13,W 107-99,C.J. McCollum,27.0,1.0,34.3
393366,PHO,LAC,2016-04-13,W 114-105,P.J. Tucker,30.0,1.0,33.7
393534,MIL,IND,2016-04-13,L 92-97,Rashad Vaughn,36.0,1.0,26.9
393404,HOU,SAC,2016-04-13,W 116-81,Patrick Beverley,33.0,1.0,31.4
393549,GSW,MEM,2016-04-13,W 125-104,Harrison Barnes,31.0,1.0,34.2
393384,CHO,ORL,2016-04-13,W 117-103,Jeremy Lin,21.0,1.0,29.0
393243,TOR,PHI,2016-04-12,W 122-98,Norman Powell,38.0,1.0,30.5
393294,MEM,LAC,2016-04-12,L 84-110,Jordan Farmar,26.0,1.0,24.4
393266,LAC,MEM,2016-04-12,W 110-84,DeAndre Jordan,28.0,1.0,31.5
393189,OKC,LAL,2016-04-11,W 112-79,Russell Westbrook,28.0,1.0,33.7


In [31]:
# Histogram of game dates as integer timestamps (nanoseconds since the epoch).
# np.int64 is requested explicitly: plain `int` is not 64-bit on every platform,
# and these values do not fit in 32 bits.
plt.hist(pd.to_datetime(df['Date'], format = '%Y-%m-%d').astype(np.int64))


(array([ 47781.,  29154.,  43634.,  35033.,  39855.,  39700.,  34595.,
         40631.,  31402.,  51810.]),
 array([  9.72950400e+17,   1.02170592e+18,   1.07046144e+18,
          1.11921696e+18,   1.16797248e+18,   1.21672800e+18,
          1.26548352e+18,   1.31423904e+18,   1.36299456e+18,
          1.41175008e+18,   1.46050560e+18]),
 <a list of 10 Patch objects>)

In [5]:
# Merge the FanDuelScore model predictions onto the working frame
df = add_residuals(df = df, element = 'FanDuelScore')

In [None]:
# Explore features to predict residuals.
# Passes the 'PlayerPTS' and 'PredictedFanDuelScore' column names to the project's
# scatter_plot_eda helper; the meaning of the trailing 'cat' argument is defined by
# that helper — TODO confirm.
scatter_plot_eda(df, 'PlayerPTS', 'PredictedFanDuelScore', 'cat')

In [None]:
# NOTE(review): '_y' is the suffix DataFrame.merge gives an overlapping right-hand column, so
# 'PredictedFanDuelScore_y' exists only if df already had a 'PredictedFanDuelScore' column
# when add_residuals merged its predictions in (e.g. the cell was run twice) — confirm.
# head() keeps the rendered output small instead of dumping every row.
df[['PredictedFanDuelScore_y','FanDuelScore', 'Player']].head(10)

In [48]:
# First 80 rows, ordered by player then date, of the rows whose BucketedMinutes exceeds 7
(
    df.loc[df.BucketedMinutes > 7,
           ['Player', 'BucketedMinutes', 'PlayerMP', 'Overtime', 'GS', 'TeamMP', 'Date']]
    .sort_values(by = ['Player', 'Date'])
    .iloc[:80, :]
)

Unnamed: 0,Player,BucketedMinutes,PlayerMP,Overtime,GS,TeamMP,Date
23534,A.J. Guyton,8.0,48.0,0.0,1.0,240,2001-04-17
231379,Aaron Brooks,9.0,59.0,3.0,1.0,315,2010-01-13
232127,Aaron Brooks,8.0,50.0,1.0,1.0,265,2010-01-18
238827,Aaron Brooks,8.0,48.0,1.0,1.0,265,2010-03-07
8707,Aaron McKie,8.0,48.0,1.0,1.0,265,2000-12-30
11751,Aaron McKie,8.0,48.0,1.0,1.0,265,2001-01-21
90605,Aaron McKie,8.0,48.0,2.0,1.0,290,2004-03-12
147742,Adam Morrison,8.0,48.0,1.0,1.0,265,2006-11-15
166675,Adam Morrison,8.0,48.0,1.0,1.0,265,2007-03-24
18546,Al Harrington,9.0,54.0,3.0,1.0,315,2001-03-13


In [34]:
# Show a small sample rather than rendering the whole frame
df.head(10)

Unnamed: 0,2P,2PA,3P,3PA,Age,ChangedTeams,Date,FG,FGA,FT,...,Last10AveragePlayerTOVPerPossession,Last20AveragePlayerTOVPerPossession,EstimatedPlayerPossessions,PossessionMinuteAdjustedPlayerAST,PossessionMinuteAdjustedPlayerSTL,PossessionMinuteAdjustedPlayerORB,PossessionMinuteAdjustedPlayerBLK,PossessionMinuteAdjustedPlayerPTS,PossessionMinuteAdjustedPlayerDRB,PossessionMinuteAdjustedPlayerTOV
393430,6.0,11.0,3.0,5.0,30-002,0.0,2016-04-13,9.0,16.0,1.0,...,4.921932e-02,0.040144,67.344266,5.365723,0.809728,0.651245,0.076044,14.520305,3.054667,2.485415
393439,4.0,4.0,1.0,4.0,30-278,0.0,2016-04-13,5.0,8.0,3.0,...,1.916698e-02,0.017831,67.344266,3.546875,1.277899,1.044800,0.300966,9.581855,3.906234,1.344395
393434,8.0,12.0,0.0,0.0,33-213,0.0,2016-04-13,8.0,12.0,3.0,...,4.696427e-02,0.036380,55.099854,2.233887,1.056121,1.529785,0.564400,13.794621,4.847916,1.897736
393437,5.0,10.0,1.0,4.0,28-313,0.0,2016-04-13,6.0,14.0,1.0,...,2.204986e-02,0.019217,55.099854,2.271973,1.076926,1.170288,0.326336,11.625917,2.640915,1.199372
393438,2.0,3.0,0.0,4.0,22-315,0.0,2016-04-13,2.0,7.0,1.0,...,1.178854e-02,0.011580,42.855442,0.915039,0.642908,0.929688,0.331656,7.987375,2.596635,0.742697
393433,3.0,7.0,0.0,1.0,20-126,0.0,2016-04-13,3.0,8.0,0.0,...,6.699897e-02,0.068776,55.099854,0.850098,1.002537,2.925903,0.073606,17.071102,5.422729,2.641197
393440,1.0,2.0,1.0,3.0,24-166,0.0,2016-04-13,2.0,5.0,2.0,...,,,42.855442,,,,,,,
393431,2.0,7.0,0.0,0.0,27-222,0.0,2016-04-13,2.0,7.0,2.0,...,1.873500e-02,0.025352,42.855442,1.208496,0.762167,1.910767,0.135779,7.918176,3.504131,1.056834
393435,2.0,3.0,2.0,4.0,29-341,0.0,2016-04-13,4.0,7.0,2.0,...,1.070836e-02,0.015553,30.611030,1.086426,0.439781,0.344482,0.195327,4.045838,1.656546,0.614994
393436,0.0,1.0,1.0,3.0,33-180,0.0,2016-04-13,1.0,4.0,0.0,...,1.210584e-02,0.014167,30.611030,0.857422,0.381334,0.147095,0.093762,4.913060,1.883112,0.576926


In [None]:
# NOTE(review): filtered_df is not assigned at notebook level in any cell visible above this
# one, and this line feeds it back into itself — confirm where it is first created.
filtered_df = align_players(filtered_df)
print(len(filtered_df))

# Rows that have a StarterLineupOrder, restricted to the matchup columns.
# NOTE(review): this is not the last expression of the cell, so the notebook does not render it.
filtered_df[pd.notnull(filtered_df.StarterLineupOrder)].loc[:,['Player_x','StarterLineupOrder', 'Residual_x', 'BucketedMinutes_x', 'Player_y', 'BucketedMinutes_y', 'Residual_y']]

# Remove all rows with NaN for 'PlayerDefenseMatchupResidual'
filtered_df.dropna(subset = ['PlayerDefenseMatchupResidual'], inplace = True, axis = 0)
print(len(filtered_df))

In [None]:
# Experimentation shows that 30.85 is the optimal half-life decay parameter for predicting
# 'Residual_y' from an exponentially smoothed series. The original note breaks off before
# naming that series; presumably it is 'PlayerDefenseMatchupResidual', the only key below
# — TODO confirm.
# Maps a column name to the half-lives to try for it.
player_exponential_smoothing_dict = {
    'PlayerDefenseMatchupResidual': [30.80, 30.85, 30.90],
}
def add_position_defense_stats(df):
    """
    Input:
        df -- DataFrame with 'PlayerDefenseMatchupResidual' and 'Player' columns
    Output:
        Result of applying GB_apply_player_position to each Player group,
        after rows with NaN 'PlayerDefenseMatchupResidual' have been dropped.
        df itself is not modified by the drop.
    """
    with_matchup_residual = df.dropna(axis = 0, subset = ['PlayerDefenseMatchupResidual'])
    return with_matchup_residual.groupby(['Player']).apply(GB_apply_player_position)
# Replace filtered_df with its per-player processed version
filtered_df = add_position_defense_stats(df = filtered_df)

In [None]:
# One plot per (column, half-life) pair: smoothed player feature vs. the matchup residual.
# .items() works on both Python 2 and Python 3 dicts (iteritems is Python 2 only).
for key, value in player_exponential_smoothing_dict.items():
    for window in value:
        scatter_plot_eda(filtered_df, key + 'ExponentialSmoothing' + str(window), 'PlayerDefenseMatchupResidual', 'cat')

In [None]:
# Sum per (Date, Team, Frontcourt) group, then turn the group keys back into columns
f = (
    filtered_df.groupby(['Date', 'Team', 'Frontcourt'])
    .aggregate(np.sum)
    .reset_index(level=['Team', 'Date', 'Frontcourt'])
)

# Prefix every smoothed feature with 'PositionGroup' so it cannot collide with the
# player-level column of the same name when merged back onto filtered_df.
# .items() works on both Python 2 and Python 3 dicts (iteritems is Python 2 only).
rename_dict = {}
for key, value in player_exponential_smoothing_dict.items():
    for window in value:
        old_name = key + 'ExponentialSmoothing' + str(window)
        rename_dict[old_name] = 'PositionGroup' + old_name

# Single rename over the whole mapping instead of one call per column
f.rename(columns = rename_dict, inplace = True)

In [None]:
# Names of the position-group smoothed columns. The outer clause must bind `value` before
# the inner clause iterates over it; with the clauses the other way round the comprehension
# only works when a `value` variable has leaked from an earlier loop.
features = ['PositionGroup' + key + 'ExponentialSmoothing' + str(window)
            for key, value in player_exponential_smoothing_dict.items()
            for window in value]
features += ['Date', 'Team', 'Frontcourt']

# Attach the group-level features to every player row of the same (Date, Team, Frontcourt).
# NOTE(review): re-running this cell merges the same columns again, which makes merge add
# '_x'/'_y' suffixes to them.
filtered_df = filtered_df.merge(f[features], on = ['Date', 'Team', 'Frontcourt'], how = 'left')

In [None]:
# One plot per (column, half-life) pair: position-group smoothed feature vs. the matchup residual.
# .items() works on both Python 2 and Python 3 dicts (iteritems is Python 2 only).
for key, value in player_exponential_smoothing_dict.items():
    for window in value:
        scatter_plot_eda(filtered_df, 'PositionGroup' + key + 'ExponentialSmoothing' + str(window), 'PlayerDefenseMatchupResidual', 'cat')

In [None]:
# Names of the player-level smoothed columns. The outer clause must bind `value` before the
# inner clause iterates over it; with the clauses the other way round the comprehension only
# works when a `value` variable has leaked from an earlier loop.
features = [key + 'ExponentialSmoothing' + str(window)
            for key, value in player_exponential_smoothing_dict.items()
            for window in value]
features += ['PlayerDefenseMatchup', 'Date', 'Team']

# Match each df row (Player, Date, Opp) to the filtered_df row whose
# (PlayerDefenseMatchup, Date, Team) has the same values.
# NOTE(review): df presumably also has a 'Team' column (add_residuals merges on it); as
# 'Team' is a join key only on the right-hand side, merge will likely emit 'Team_x' and
# 'Team_y' in place of 'Team' — confirm downstream code expects that.
df = df.merge(filtered_df[features], left_on = ['Player', 'Date', 'Opp'], right_on = ['PlayerDefenseMatchup', 'Date', 'Team'], how = 'left')