In [1]:
import pandas as pd
import numpy as np
from cross_validation import *
from gradient_boost import predict_xgboost
from read_write import read_merged_data, load_pickled_model
from plotting_functions import scatter_plot_eda
from helper import add_rolling_averages
import matplotlib.pyplot as plt



In [2]:
# Functions
def get_finite_residual(df):
    """
    Input:
        df -- DataFrame with a 'Residual' column
    Output:
        The rows of df whose 'Residual' is not null, with their original
        index labels; df itself is left untouched
    """
    has_residual = df['Residual'].notnull()
    return df[has_residual]

def add_exponential_smoothing(df, dictionary):
    """
    Add lagged exponentially weighted averages to df, in place.

    Input:
        df -- DataFrame; modified in place
        dictionary -- maps a column name of df to an iterable of half-lives
    Output:
        None. For every (column, half-life) pair, a column named
        'Last<half-life>ExponentialSmoothingAverage<column>' is (re)created.
    """
    # .items() exists on both Python 2 and 3; .iteritems() is Python 2 only
    for column, halflives in dictionary.items():
        for halflife in halflives:
            # Name of new feature
            name = 'Last' + str(halflife) + 'ExponentialSmoothingAverage' + column

            # Drop column to prevent duplicates when the function is re-run
            if name in df.columns:
                df.pop(name)

            # Series.ewm(...).mean() replaces the removed pd.ewma.
            # shift(1) lags the average by one row, so a row's feature is
            # built only from the rows before it.
            df[name] = df[column].ewm(halflife = halflife).mean().shift(1)

def add_residuals(df, element):
    """
    Input:
        df -- DataFrame with 'Player', 'Date' and 'Team' columns
        element -- string of element we are predicting (e.g. 'FanDuelScore')
    Output:
        New DataFrame: df left-merged on ['Player', 'Date', 'Team'] with a
        'Predicted<element>' column. Rows of df with no matching prediction
        get NaN in that column.
    Notes:
        Reads the notebook-level variable data_info, which must be defined
        before this function is called.
        No residual column is computed here, only the prediction.
    """
    merge_keys = ['Player', 'Date', 'Team']
    prediction_column = 'Predicted{}'.format(element)

    # Load model (the loaded object is not used further in this function)
    m = load_pickled_model('{}GradientBoostedRegressor'.format(element))

    # Predict
    predictions, filtered_df = predict_xgboost(df, element = element, data_info = data_info)

    # Pair each prediction with the keys identifying its row
    prediction_df = pd.DataFrame()
    prediction_df[prediction_column] = predictions
    for merge_key in merge_keys:
        prediction_df[merge_key] = filtered_df[merge_key]

    return df.merge(prediction_df, on = merge_keys, how = 'left')

In [3]:
# Read the merged data set from disk (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer = 'data/merged_data.csv')

In [4]:
# Filtering / cross-validation settings.
# add_residuals reads this notebook-level variable, so this cell must run before it is called.
data_info = cv_method(
    method = k_folds_cv,
    splits = 5,
    start_date = '1999-01-01',
    end_date = '2016-09-01',
    minutes_cutoff = 3,
)

In [None]:
# Examine data
# Observations:
#   1. There is a duplicate season column

# print(...) with a single argument behaves the same on Python 2 and Python 3
print("Num rows: {}".format(len(df)))
for column in df.columns:
    print(column)

In [8]:
# Sort df in place, descending on every key. filter_blowout below inspects the
# first rows of each (Team, Date) group, so this ordering decides which rows those are.
df.sort_values(
    by = ['Date', 'Team', 'GS', 'PlayerMP'],
    ascending = False,
    inplace = True,
)

# Blowout prediction
def filter_blowout(df, top_n = 3, minutes_margin = 9):
    """
    Input:
        df -- DataFrame for one group, with 'PlayerMP' and
              'Last10AveragePlayerMP' columns; rows are taken in their
              existing order
        top_n -- number of leading rows to compare (default 3)
        minutes_margin -- minutes added to the 'PlayerMP' total before the
                          comparison (default 9)
    Output:
        True when the summed 'PlayerMP' of the first top_n rows plus
        minutes_margin is strictly less than the summed
        'Last10AveragePlayerMP' of those same rows, False otherwise
    """
    leading_rows = df.iloc[:top_n]
    minutes_played = leading_rows['PlayerMP'].sum()
    recent_average = leading_rows['Last10AveragePlayerMP'].sum()
    return (minutes_played + minutes_margin) < recent_average

# Keep only the (Team, Date) groups that filter_blowout flags, restricted to
# the columns needed to inspect them.
t = df.groupby(['Team', 'Date']).filter(filter_blowout)[[
    'Team', 'Opp', 'Date', 'Result',
    'Player', 'PlayerMP', 'GS', 'Last10AveragePlayerMP',
]]

# First 100 rows with GS == 1.
# NOTE(review): this is not the last expression of the cell, so its value is
# computed and then discarded; nothing is displayed.
t[t.GS == 1].iloc[:100, :]

# Collapse to one row per (Team, Date): the first row of each pair is kept
t.drop_duplicates(subset = ['Team', 'Date'], inplace = True)

Unnamed: 0,Team,Opp,Date,Result,Player,PlayerMP,GS,Last10AveragePlayerMP
393493,POR,DEN,2016-04-13,W 107-99,C.J. McCollum,27.0,1.0,34.3
393484,POR,DEN,2016-04-13,W 107-99,Mason Plumlee,24.0,1.0,25.4
393492,POR,DEN,2016-04-13,W 107-99,Damian Lillard,24.0,1.0,34.9
393485,POR,DEN,2016-04-13,W 107-99,Al-Farouq Aminu,19.0,1.0,30.2
393488,POR,DEN,2016-04-13,W 107-99,Maurice Harkless,18.0,1.0,28.5
393366,PHO,LAC,2016-04-13,W 114-105,P.J. Tucker,30.0,1.0,33.7
393372,PHO,LAC,2016-04-13,W 114-105,Alex Len,28.0,1.0,31.6
393373,PHO,LAC,2016-04-13,W 114-105,Devin Booker,27.0,1.0,36.2
393365,PHO,LAC,2016-04-13,W 114-105,Jon Leuer,20.0,1.0,20.1
393370,PHO,LAC,2016-04-13,W 114-105,Ronnie Price,18.0,1.0,26.7


In [5]:
# Attach the model's 'PredictedFanDuelScore' column to df (see add_residuals)
df = add_residuals(df, element = 'FanDuelScore')

In [None]:
# Explore features to predict residuals: 'PlayerPTS' against the model's prediction.
# NOTE(review): the meaning of the trailing 'cat' argument is defined in plotting_functions -- confirm there.
scatter_plot_eda(
    df,
    'PlayerPTS',
    'PredictedFanDuelScore',
    'cat',
)

In [None]:
# Compare prediction and actual score per player.
# add_residuals names its column 'PredictedFanDuelScore'. A '_y'-suffixed copy
# only appears when the merge is re-run on a frame that already holds the
# column, so the unsuffixed name is the one that exists after a clean run.
df[['PredictedFanDuelScore', 'FanDuelScore', 'Player']]

In [None]:
# NOTE(review): filtered_df is not assigned at notebook level in the cells
# above; it has to exist in the kernel already for this cell to run.
filtered_df = align_players(filtered_df)
# print(...) with a single argument behaves the same on Python 2 and Python 3
print(len(filtered_df))

# Display DataFrame
# NOTE(review): this is not the last expression of the cell, so its value is
# computed and then discarded; nothing is displayed.
filtered_df[pd.notnull(filtered_df.StarterLineupOrder)].loc[:,['Player_x','StarterLineupOrder', 'Residual_x', 'BucketedMinutes_x', 'Player_y', 'BucketedMinutes_y', 'Residual_y']]

# Remove all rows with NaN for 'PlayerDefenseMatchupResidual'
filtered_df.dropna(subset = ['PlayerDefenseMatchupResidual'], inplace = True, axis = 0)
print(len(filtered_df))

In [None]:
# Column -> half-lives to evaluate for its exponential smoothing.
# Experimentation shows that 30.85 is the optimal half-life decay parameter for
# predicting 'Residual_y'; 30.80 and 30.90 bracket it for comparison.
# NOTE(review): the original note ended mid-sentence ("based on the exponential
# smoothing of"); presumably it referred to the column keyed below -- confirm.
player_exponential_smoothing_dict = {
    'PlayerDefenseMatchupResidual': [30.80, 30.85, 30.90],
}
def add_position_defense_stats(df):
    """
    Input:
        df -- DataFrame with 'Player' and 'PlayerDefenseMatchupResidual' columns
    Output:
        Result of applying GB_apply_player_position to each 'Player' group of
        the rows whose 'PlayerDefenseMatchupResidual' is not NaN
    """
    # Rows without a matchup residual are left out before grouping
    rows_with_residual = df.dropna(subset = ['PlayerDefenseMatchupResidual'], axis = 0)
    return rows_with_residual.groupby(['Player']).apply(GB_apply_player_position)
# Rebind filtered_df to the per-player result (rows lacking a matchup residual are dropped)
filtered_df = add_position_defense_stats(df = filtered_df)

In [None]:
# Plot every '<column>ExponentialSmoothing<half-life>' feature against the raw residual.
# .items() exists on both Python 2 and 3; .iteritems() is Python 2 only.
for key, value in player_exponential_smoothing_dict.items():
    for window in value:
        scatter_plot_eda(filtered_df, key + 'ExponentialSmoothing' + str(window), 'PlayerDefenseMatchupResidual', 'cat')

In [None]:
# Sum the columns of filtered_df per (Date, Team, Frontcourt) group and turn
# the group keys back into ordinary columns.
f = filtered_df.groupby(['Date', 'Team', 'Frontcourt']).aggregate(np.sum)
f.reset_index(level=['Team', 'Date', 'Frontcourt'], inplace=True)

# Prefix the aggregated smoothing columns with 'PositionGroup' so they stay
# distinct from the per-player columns of the same base name.
# .items() exists on both Python 2 and 3; .iteritems() is Python 2 only.
for key, value in player_exponential_smoothing_dict.items():
    for window in value:
        old_name = key + 'ExponentialSmoothing' + str(window)
        rename_dict = {old_name : 'PositionGroup' + old_name}
        f.rename(columns = rename_dict, inplace = True)

In [None]:
# Names of the position-group smoothing columns, one per (column, half-life) pair.
# The dictionary loop must come first so that `value` is bound before
# `for window in value` reads it. With the clauses the other way round the
# comprehension picks up whatever `value` was left over from an earlier cell.
# .items() exists on both Python 2 and 3; .iteritems() is Python 2 only.
features = ['PositionGroup' + key + 'ExponentialSmoothing' + str(window)
            for key, value in player_exponential_smoothing_dict.items()
            for window in value]
features += ['Date', 'Team', 'Frontcourt']

# Attach the group-level columns to every player row of the same (Date, Team, Frontcourt)
filtered_df = filtered_df.merge(f[features], on = ['Date', 'Team', 'Frontcourt'], how = 'left')

In [None]:
# Plot every 'PositionGroup<column>ExponentialSmoothing<half-life>' feature against the raw residual.
# .items() exists on both Python 2 and 3; .iteritems() is Python 2 only.
for key, value in player_exponential_smoothing_dict.items():
    for window in value:
        scatter_plot_eda(filtered_df, 'PositionGroup' + key + 'ExponentialSmoothing' + str(window), 'PlayerDefenseMatchupResidual', 'cat')

In [None]:
# Names of the per-player smoothing columns, one per (column, half-life) pair.
# The dictionary loop must come first so that `value` is bound before
# `for window in value` reads it. With the clauses the other way round the
# comprehension picks up whatever `value` was left over from an earlier cell.
# .items() exists on both Python 2 and 3; .iteritems() is Python 2 only.
features = [key + 'ExponentialSmoothing' + str(window)
            for key, value in player_exponential_smoothing_dict.items()
            for window in value]
features += ['PlayerDefenseMatchup', 'Date', 'Team']

# Left-join onto df: a df row (Player, Date, Opp) is matched with the
# filtered_df row whose (PlayerDefenseMatchup, Date, Team) has the same values.
df = df.merge(filtered_df[features], left_on = ['Player', 'Date', 'Opp'], right_on = ['PlayerDefenseMatchup', 'Date', 'Team'], how = 'left')