# Import Settings

In [58]:
import nba_model as model

import os
import path
from datetime import datetime, timedelta
import random
import pickle
 
import numpy as np
import pandas as pd
pd.set_option("display.max_rows", 150)
pd.set_option("display.precision", 5)
pd.set_option("display.max_columns", None)
pd.options.display.float_format = "{:.2f}".format

import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
%matplotlib inline

import seaborn as sns
sns.set(context="notebook", style="darkgrid", rc={"figure.facecolor":"white"}, font_scale= 1.2)
#from yellowbrick.regressor import ResidualsPlot

from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)
warnings.filterwarnings("ignore", category = FutureWarning)

current_season_path = os.getcwd() + '/data/2019-20_raw/'
raw_path = os.getcwd() + '/data/raw/'
fc_path = os.getcwd() + '/data/external/' #fantasycruncher csv files
fc_pickled_path = os.getcwd() + '/data/external/pickled_version/' #pickled versions of FC .csv files
processed_path = os.getcwd() + '/data/processed/' #fuzzy_matched files
model_path = os.getcwd() + '/data/model/' #datasets to be used for modeling

def MAE(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))

def RMSE(actuals, preds): #root mean squared error
    return np.sqrt(np.mean((actuals - preds)**2))

### DraftKings NBA Fantasy Breakdown

#### DraftKings Scoring Breakdown

Metric        | Point Value   
--------------| ------------  
Point         | + 1.00 Pts  
Made 3pt Shot | + 0.50 Pts  
Rebound       | + 1.25 Pts  
Assist        | + 1.50 Pts  
Steal         | + 2.00 Pts  
Block         | + 2.00 Pts  
Turnover      | - 0.50 Pts  
Double-double | + 1.50 Pts  
Triple-double | + 3.00 Pts  

#### DraftKings Lineup Requirements

Position       | Required   
---------------| ------------  
PG             | 1
SG             | 1
SF             | 1
PF             | 1
C              | 1 
G (PG, SG)     | 1  
F (SF, PF)     | 1  
Util (ALL POS) | 1  

# Feature Engineering

In [14]:
df = pd.read_pickle(model_path + '2018-19_season_fc_merged_v2')

### Creating Predicted Label (Fantasy Points Produced)

In [15]:
# Creating to keep track for double-double and triple-doubles
pts_more_10 = [1 if x >= 10 else 0 for x in df['PTS']] 
reb_more_10 = [1 if x >= 10 else 0 for x in df['REB']] 
ast_more_10 = [1 if x >= 10 else 0 for x in df['AST']] 
stl_more_10 = [1 if x >= 10 else 0 for x in df['STL']] 
blk_more_10 = [1 if x >= 10 else 0 for x in df['BLK']] 
dd_count = np.array(pts_more_10) + np.array(reb_more_10) + np.array(ast_more_10) + np.array(stl_more_10) + np.array(blk_more_10)
double_double = [1 if x >= 2 else 0 for x in dd_count]
triple_double = [1 if x >= 3 else 0 for x in dd_count]

# Need to track this calculate the FP value.
df.insert(32, 'TD', triple_double)
df.insert(32, 'DD', double_double)

# Fantasy Points scored. This is the predicted label of the model
df['FP'] = df['PTS'] + df['3PM']*0.5 + df['REB']*1.25 + df['AST']*1.5 + df['STL']*2 + df['BLK']*2 - df['TOV']*.5 + df['DD']*1.5 + df['TD']*3

### Feature Engineering

In [16]:
# Track if team wins or loses. This will be used for team-aggregation to track win-loss record going into each game
df['W'] = df.apply(lambda x: 1 if x['TEAM_SCORE'] > x['OPP_SCORE'] else 0, axis = 1)
df['L'] = df.apply(lambda x: 1 if x['TEAM_SCORE'] < x['OPP_SCORE'] else 0, axis = 1)

# Keep track if team is on Home court or playing Away
df['AWAY'] = [1 if x == 'AWAY' else 0 for x in df['COURT']]
df['HOME'] = [1 if x == 'HOME' else 0 for x in df['COURT']]

# Creating merge columns to aggregate team-level statistics
df['team_merge'] = df['TEAM'] + ' ' + df['DATE'].map(lambda x: datetime.strftime(x, '%Y-%m-%d'))
df['opp_merge'] = df['OPP'] + ' ' + df['DATE'].map(lambda x: datetime.strftime(x, '%Y-%m-%d'))

df.sort_values(by = ['PLAYER','DATE'], inplace = True)


# Some player positions are not set correctly; manual fix
player_position_dict = df.set_index('PLAYER').to_dict()['Pos']
player_position_dict['Kyle Lowry'] = 'PG'
player_position_dict['Frank Ntilikina'] = 'PG/SF'
player_position_dict['Eric Moreland'] = 'PF/C'
player_position_dict['Rodney McGruder'] = 'SG/SF'
player_position_dict['Serge Ibaka'] = 'PF/C'

df['Pos'].fillna(df['PLAYER'].map(player_position_dict), axis = 0, inplace = True)
df.dropna(subset = ['Pos'], axis = 0, how = 'all', inplace = True )

# This tracks position eligiblity 
pg_list = [1 if 'PG' in x.split('/') else 0 for x in df['Pos']]
sg_list = [1 if 'SG' in x.split('/') else 0 for x in df['Pos']]
sf_list = [1 if 'SF' in x.split('/') else 0 for x in df['Pos']]
pf_list = [1 if 'PF' in x.split('/') else 0 for x in df['Pos']]
c_list = [1 if 'C' in x.split('/') else 0 for x in df['Pos']]

df['PG_POS'] = pg_list
df['SG_POS'] = sg_list
df['SF_POS'] = sf_list
df['PF_POS'] = pf_list
df['C_POS'] = c_list

# Point differential for each game. For season average aggregation later on.
df['PT_DIFF'] = df['TEAM_SCORE'] - df['OPP_SCORE']

In [17]:
# Bring in player information such as age, weight, height, school, country, draft pick
player_df = pd.read_csv(fc_path + 'player_info.csv')

# Convert player height to inches
player_df['HGT'] = player_df['HEIGHT'].map(lambda x: (int(x.split(' ')[0]) * 12) + int(x.split(' ')[1])) 

# Clean up the order of columns 
player_df = player_df[['PLAYER', 'AGE', 'WEIGHT', 'HGT', 'COLLEGE', 'COUNTRY', 'DFT_YR', 'DFT_RND', 'DFT_NUM']]

df = df.merge(player_df, how = 'left', on = 'PLAYER')

### Team-level Aggregations

In [18]:
team_list = list(df['TEAM'].unique())
team_list.sort()
opp_list = list(df['OPP'].unique())
opp_list.sort()
# Create groupby object for team-level aggregation
team_df = df.groupby(['team_merge', 'opp_merge','TEAM', 'OPP', 'DATE'])[['MIN', 'FGM', 'FGA', '3PM', '3PA', 'FTM', 'FTA', 'OREB', 'DREB', 'REB',
                             'AST', 'TOV', 'STL', 'BLK', 'PF', 'PTS', 'PT_DIFF']].sum().reset_index().copy()
#opp_df = df.groupby(['opp_merge','OPP', 'DATE'])[['MIN', 'FGM', 'FGA', '3PM', '3PA', 'FTM', 'FTA', 'OREB', 'DREB', 'REB',
                             #'AST', 'TOV', 'STL', 'BLK', 'PF', 'PTS']].sum().reset_index().copy()

In [19]:
# Add in cumulative team stats for each game. This will allow for additional team-level aggregations
# This needs to be done first so custom pace can be calculated.
team_cumul_dfs = []
for team in team_list:
    team_cumul_df = team_df[team_df['TEAM'] == team].groupby('team_merge')[['MIN', 'FGM', 'FGA', '3PM', '3PA', 'FTM', 
                                                                         'FTA', 'OREB', 'DREB', 'REB', 'AST', 
                                                                         'TOV', 'STL', 'BLK', 'PF', 'PTS']].sum().reset_index()
    team_cumul_dfs.append(team_cumul_df)

team_gamelog_df = pd.concat(team_cumul_dfs, axis = 0, ignore_index = True)
team_gamelog_df.rename(columns = {
                                 'MIN' : 'TM_MIN',
                                 'FGM' : 'TM_FGM',
                                 'FGA' : 'TM_FGA',
                                 '3PM' : 'TM_3PM',  
                                 '3PA' : 'TM_3PA',
                                 'FTM' : 'TM_FTM',
                                 'FTA' : 'TM_FTA',
                                 'OREB': 'TM_OREB',
                                 'DREB': 'TM_DREB',
                                 'REB' : 'TM_REB',
                                 'AST' : 'TM_AST',
                                 'TOV' : 'TM_TOV',
                                 'STL' : 'TM_STL',
                                 'BLK' : 'TM_BLK',
                                 'PF'  : 'TM_PF',
                                 'PTS' : 'TM_PTS'}, inplace = True)

df = df.merge(team_gamelog_df, how = 'left', on = 'team_merge')

In [20]:
# Calculate season average point differential for the team going into each game
pt_diff_groupby_df = df.groupby(['team_merge', 'opp_merge','TEAM', 'OPP', 'DATE'])[['PT_DIFF']].first().reset_index().copy()

pt_diff_dfs = []
for team in team_list:
    pt_diff_df = pt_diff_groupby_df[pt_diff_groupby_df['TEAM'] == team].groupby('team_merge')[['PT_DIFF']].first().shift(1).expanding().mean().reset_index()
    
    pt_diff_dfs.append(pt_diff_df)
    
average_point_differential_df = pd.concat(pt_diff_dfs, axis = 0, ignore_index = True) 
average_point_differential_df.rename(columns = {'PT_DIFF': 'AVG_TM_PT_DIFF'}, inplace = True)

df = df.merge(average_point_differential_df, how = 'left', on = 'team_merge')

# Add additional calculation for average point differential for the season going into each game
# This is to capture if a team is going through a cold or hot streak 

In [21]:
# Add in feature tracking the number of days since last game
# Teams that play back-to-backs are more likely to rest their impact players
days_behind_dfs = []
for team in team_list:
    last_game_df = team_df[team_df['TEAM'] == team][['team_merge','DATE']]
    days_between = last_game_df['DATE'] - last_game_df['DATE'].shift(1)
    days_between.fillna(pd.Timedelta('0 days'), axis = 0, inplace = True)
    last_game_df['PREV_GAME'] = days_between.map(lambda x: int(x.days))
    
    days_behind_dfs.append(last_game_df)
team_last_game_df = pd.concat(days_behind_dfs, axis = 0, ignore_index = True)

df = df.merge(team_last_game_df, how = 'left', on = 'team_merge')

# Quick clean-up for the merge. 
df.drop('DATE_y', axis = 1, inplace = True)
df.rename(columns = {'DATE_x': 'DATE'}, inplace = True)

In [22]:
# Calculate team wins and losses going into each game. Win% is calculated based on these two numbers. 
record_groupby = df.groupby(['team_merge', 'TEAM', 'DATE'])[['W', 'L']].first().reset_index()
win_loss_dfs = []
for team in team_list:
    team_record_df = record_groupby[record_groupby['TEAM'] == team].groupby('team_merge')[['W', 'L']].first().shift(1).expanding().sum().reset_index()
    win_loss_dfs.append(team_record_df)

win_loss_df = pd.concat(win_loss_dfs, axis = 0, ignore_index = True)
win_loss_df.rename(columns = {
                              'W' : 'WIN',  
                              'L' : 'LOSS'}, inplace = True)
win_loss_df.fillna(0, inplace = True)

df = df.merge(win_loss_df, how = 'left', on = 'team_merge')

df['WIN%'] = (df['WIN'] / (df['WIN'] + df['LOSS']))

### Player-level aggregations

In [24]:
player_list = list(df['PLAYER'].unique())
rolling_df = df.groupby(['merge','PLAYER','DATE'])[['FP', 'PTS', '3PM', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'DD', 'TD', 'MIN', 'USG%_AD']].sum().reset_index().copy()    

In [25]:
# Usage rate calculation based on basketball-reference formula. Seems to be a little different than the one stats.nba.com uses
regular_usage = (100*((df['FGA']) + (0.44 * df['FTA']) + (df['TOV'])) * (df['TM_MIN']))/ (((df['TM_FGA']) + (0.44 * df['TM_FTA']) + (df['TM_TOV'])) * 5 * (df['MIN']))
df['USG_CALC'] = regular_usage

# Custom usage rate metric. Subtracts for turnovers and adds for assists. This is more in line with DFS scoring
custom_usage = (100*((df['FGA']) + (0.44 * df['FTA']) + (0.5 * df['AST']) - (0.3 * df['TOV'])) * (df['TM_MIN']))/ (((df['TM_FGA']) + (0.44 * df['TM_FTA']) + (0.5 * df['TM_AST']) - (0.3 * df['TM_TOV'])) * 5 * (df['MIN']))
df['USG_CUST'] = custom_usage

In [28]:
def rolling_averages(df, games_rolling, rolling_statistics):
    '''
    Add a lagged statistical feature(s) for rolling averages going into each game.
    YER','DATE'])[['FP', 'PTS', '3PM', 
    Inputs: DataFrame object
          : games_rolling:
          : list of statistics that will be included
    returns: Merged DataFrame
    
    '''
    
    rolling_dfs = []
    rolling_df = df.groupby(['merge', 'PLAYER', 'DATE'])[rolling_statistics].sum().reset_index().copy()
    
    stat_dictionary = {}
    for stat in rolling_statistics:
        stat_dictionary[stat] = 'R{}_AVG_{}'.format(str(games_rolling), stat) 
    
    for player in player_list:
        rolling_averages_df = rolling_df[rolling_df['PLAYER'] == player].groupby('merge')[rolling_statistics].sum().shift(1).rolling(games_rolling).mean().reset_index()
        rolling_dfs.append(rolling_averages_df)
    
    rolling_avg_df = pd.concat(rolling_dfs, axis = 0, ignore_index = True)
    rolling_avg_df.rename(columns = stat_dictionary, inplace = True)
    
    df = df.merge(rolling_avg_df, how = 'left', on = 'merge')
    
    return df

def season_averages(df, rolling_statistics):
    '''
    Calculate season average for given list of statistics going into each game.
    
    Inputs: DataFrame object
          : games_rolling:
          : list of statistics that will be included
    returns: Merged DataFrame
    
    '''
    
    rolling_dfs = []
    rolling_df = df.groupby(['merge', 'PLAYER', 'DATE'])[rolling_statistics].sum().reset_index().copy()
    
    stat_dictionary = {}
    for stat in rolling_statistics:
        stat_dictionary[stat] = 'AVG_{}'.format(stat) 
    
    for player in player_list:
        rolling_averages_df = rolling_df[rolling_df['PLAYER'] == player].groupby('merge')[rolling_statistics].sum().shift(1).expanding().mean().reset_index()
        rolling_dfs.append(rolling_averages_df)
    
    rolling_avg_df = pd.concat(rolling_dfs, axis = 0, ignore_index = True)
    rolling_avg_df.rename(columns = stat_dictionary, inplace = True)
    
    df = df.merge(rolling_avg_df, how = 'left', on = 'merge')
    
    return df

In [29]:
# Initial batch of lagged features for the given stats. 
rolling_statistics = ['FP', 'PTS', '3PM', 'REB', 'AST', 'STL', 'BLK',
                      'TOV', 'MIN', 'DD', 'TD', 'USG%_AD', 'USG_CUST']

df = rolling_averages(df, 4, rolling_statistics)
df = rolling_averages(df, 8, rolling_statistics)
df = rolling_averages(df, 14, rolling_statistics)
df = season_averages(df, rolling_statistics)

In [None]:
# Add additional lagged features here either either rolling or season average
#rolling_statistics = []
#df = rolling_averages(df, 4, rolling_statistics)
#df = season_averages(df, rolling_statistics)

In [31]:
df.to_pickle(model_path + '101819_nans_uncleaned')

# Baseline Model and EDA

In [68]:
df = pd.read_pickle(model_path + '101819_nans_uncleaned')

#### NaN cleanup

In [69]:
'''
Dealing with NaNs. To maintain as many data points for each player, if there are NaN values for the rolling averages, 
the season average will be used instead. The only exemption is for the first game of each season. This line will be dropped. 

Unfortunately, this is hard to write out as a function as this will change depending on the statistics that I had. This will always need
to be run before we can start analyzing the data. 

'''

df['R4_AVG_FP'].fillna(df['AVG_FP'], inplace = True)
df['R8_AVG_FP'].fillna(df['AVG_FP'], inplace = True)
df['R14_AVG_FP'].fillna(df['AVG_FP'], inplace = True)


df['R4_AVG_PTS'].fillna(df['AVG_PTS'], inplace = True)
df['R8_AVG_PTS'].fillna(df['AVG_PTS'], inplace = True)
df['R14_AVG_PTS'].fillna(df['AVG_PTS'], inplace = True)


df['R4_AVG_3PM'].fillna(df['AVG_3PM'], inplace = True)
df['R8_AVG_3PM'].fillna(df['AVG_3PM'], inplace = True)
df['R14_AVG_3PM'].fillna(df['AVG_3PM'], inplace = True)


df['R4_AVG_REB'].fillna(df['AVG_REB'], inplace = True)
df['R8_AVG_REB'].fillna(df['AVG_REB'], inplace = True)
df['R14_AVG_REB'].fillna(df['AVG_REB'], inplace = True)


df['R4_AVG_AST'].fillna(df['AVG_AST'], inplace = True)
df['R8_AVG_AST'].fillna(df['AVG_AST'], inplace = True)
df['R14_AVG_AST'].fillna(df['AVG_AST'], inplace = True)


df['R4_AVG_STL'].fillna(df['AVG_STL'], inplace = True)
df['R8_AVG_STL'].fillna(df['AVG_STL'], inplace = True)
df['R14_AVG_STL'].fillna(df['AVG_STL'], inplace = True)


df['R4_AVG_BLK'].fillna(df['AVG_BLK'], inplace = True)
df['R8_AVG_BLK'].fillna(df['AVG_BLK'], inplace = True)
df['R14_AVG_BLK'].fillna(df['AVG_BLK'], inplace = True)


df['R4_AVG_TOV'].fillna(df['AVG_TOV'], inplace = True)
df['R8_AVG_TOV'].fillna(df['AVG_TOV'], inplace = True)
df['R14_AVG_TOV'].fillna(df['AVG_TOV'], inplace = True)


df['R4_AVG_DD'].fillna(df['AVG_DD'], inplace = True)
df['R8_AVG_DD'].fillna(df['AVG_DD'], inplace = True)
df['R14_AVG_DD'].fillna(df['AVG_DD'], inplace = True)


df['R4_AVG_TD'].fillna(df['AVG_TD'], inplace = True)
df['R8_AVG_TD'].fillna(df['AVG_TD'], inplace = True)
df['R14_AVG_TD'].fillna(df['AVG_TD'], inplace = True)


df['R4_AVG_MIN'].fillna(df['AVG_MIN'], inplace = True)
df['R8_AVG_MIN'].fillna(df['AVG_MIN'], inplace = True)
df['R14_AVG_MIN'].fillna(df['AVG_MIN'], inplace = True)


df['R4_AVG_USG%_AD'].fillna(df['AVG_USG%_AD'], inplace = True)
df['R8_AVG_USG%_AD'].fillna(df['AVG_USG%_AD'], inplace = True)
df['R14_AVG_USG%_AD'].fillna(df['AVG_USG%_AD'], inplace = True)

df['R4_AVG_USG_CUST'].fillna(df['AVG_USG_CUST'], inplace = True)
df['R8_AVG_USG_CUST'].fillna(df['AVG_USG_CUST'], inplace = True)
df['R14_AVG_USG_CUST'].fillna(df['AVG_USG_CUST'], inplace = True)

#df.dropna(axis = 0, how = 'any', subset = ['R4_AVG_PTS'], inplace = True)

df['Index'] = df['PLAYER'] + ' ' + df['DATE'].map(lambda x: datetime.strftime(x, '%Y-%m-%d'))
df.set_index(keys = 'Index', drop = True, inplace = True)

# This needs to be done list the game chronologically. 
df.sort_values(by = ['DATE', 'PLAYER'], axis = 0, inplace = True)


#### Feature Selection 

In [39]:
mask = ((df['AVG_FP'] > 5) & (df['AVG_FP'] < 70))
df = df[mask]

In [70]:
# Feature list is based on custom choices
feature_list = [
'PLAYER', 'DATE', 'FP', 'FC Proj','AGE', 'WEIGHT', 'HGT', 'VegasPts', 'Floor', 'Ceiling', 'Proj Mins', 'Def v Pos',
'AWAY', 'HOME', 'PG_POS', 'SG_POS', 'SF_POS', 'PF_POS', 'C_POS', 'AVG_TM_PT_DIFF', 'PREV_GAME',
'WIN%'#,'FPG','FPPM', 'AVG/36', 'STDV/36', 'STDV', 'ProjSTV', 'USG'
]

rolling_stats = df.loc[:,'R4_AVG_FP': 'AVG_USG_CUST'].columns.to_list()
feature_list.extend(rolling_stats)

stats = df[feature_list].copy()
stats.dropna(axis = 0, how = 'any', inplace = True)
stats['Def v Pos'] = stats['Def v Pos'].copy().astype(int)

### Analysis via Functions

In [None]:
# linear_regression(df, test_size, validation_size, show_test_set, coef_shown):

In [102]:
model.linear_regression(stats, .25, .20, True, 200)

Baseline Linear Regression Model 

Training Set: 
r^2 score: 0.6774558690806636
Mean Absolute Error: 6.621380187491896
Root Mean Squared Error: 8.781073626644446 

Validation Set: 
r^2 score: 0.6835046419910391
Mean Absolute Error: 6.802666093191275
Root Mean Squared Error: 9.024457256576973 

Test Set: 
r^2 score: 0.6467153807200252
Mean Absolute Error: 6.946723396828019
Root Mean Squared Error: 9.34074553163691 

FC Projections r^2 (Test Set): 0.6426951635749032
FC Projections MAE (Test Set): 6.736317679558011 

Model r2 Difference: 0.004020217145122018
Model MAE Difference: 0.21040571727000756

              Feature  Coef  Coef_Abs
29         R4_AVG_TD -3.90      3.90
55        R14_AVG_TD  3.36      3.36
28         R4_AVG_DD  2.40      2.40
50       R14_AVG_STL -1.90      1.90
67            AVG_DD  1.69      1.69
41         R8_AVG_DD -1.42      1.42
25        R4_AVG_BLK  1.35      1.35
24        R4_AVG_STL  1.11      1.11
47       R14_AVG_3PM -1.11      1.11
65           AVG_TOV  1.

In [101]:
#ridge_regression(df, test_size, validation_size, ridge_alpha, tolerance, random_state, show_test_set, coef_shown):
model.ridge_regression(stats, .20, .25, 20, .001, 2, True, 300)

Standard Scaled Ridge Regression Model 

Training Set: 
r^2 score: 0.6774421946802357
Mean Absolute Error: 6.622968892507409
Root Mean Squared Error: 8.78125976341273 

Validation Set: 
r^2 score: 0.6844681841329541
Mean Absolute Error: 6.7606703359724785
Root Mean Squared Error: 8.971735012368237 

Test Set: 
r^2 score: 0.6367356124749854
Mean Absolute Error: 7.030218081338159
Root Mean Squared Error: 9.46542852618361 

FC Projections r^2 (Test Set): 0.6339703244513564
FC Projections MAE (Test Set): 6.807045925414365 

Model r2 Performance: 0.0027652880236290756
Model MAE Performance: -0.2231721559237938

              Feature  Coef  Coef_Abs
7          Proj Mins  4.64      4.64
0            FC Proj  3.89      3.89
66           AVG_MIN -1.98      1.98
59           AVG_PTS  1.83      1.83
27        R4_AVG_MIN -1.26      1.26
58            AVG_FP  1.22      1.22
40        R8_AVG_MIN  1.15      1.15
6            Ceiling  0.99      0.99
20        R4_AVG_PTS  0.97      0.97
60           AV

In [8]:
#polynomial_lasso(df, test_size, validation_size, poly_degree, lasso_alpha, tolerance, max_iterations, random_state, show_test_set, coef_shown):
model.polynomial_lasso(stats, .20, .25, 2, 5, .001, 3000, 92, True, 30)



Lasso Regression Model, Polynomial Features 

Training Set: 
r^2 score: 0.6153902713169896
Mean Absolute Error: 7.226256251842857
Root Mean Squared Error: 9.30305569237666 

Validation Set: 
r^2 score: 0.6471264218735193
Mean Absolute Error: 7.216577235752153
Root Mean Squared Error: 9.325754429872122 

Test Set: 
r^2 score: 0.6082862434218514
Mean Absolute Error: 7.36942774073422
Root Mean Squared Error: 9.700664472868148 

FC Projections r^2 (Test Set): 0.5982590664434331
FC Projections MAE (Test Set): 7.2990412646273874 

Model r2 Performance: 0.010027176978418373
Model MAE Performance: -0.07038647610683224

                         Feature  Coef  Coef_Abs
599            Proj Mins AVG_FP  0.01      0.01
420             Floor Def v Pos  0.00      0.00
522          Ceiling R14_AVG_FP  0.00      0.00
543             Ceiling AVG_MIN -0.00      0.00
586        Proj Mins R14_AVG_FP  0.00      0.00
276              WEIGHT AVG_TOV  0.00      0.00
535              Ceiling AVG_FP  0.00      0

Unnamed: 0,Feature,Coef,Coef_Abs
599,Proj Mins AVG_FP,0.01,0.01
420,Floor Def v Pos,0.00,0.00
522,Ceiling R14_AVG_FP,0.00,0.00
543,Ceiling AVG_MIN,-0.00,0.00
586,Proj Mins R14_AVG_FP,0.00,0.00
276,WEIGHT AVG_TOV,0.00,0.00
535,Ceiling AVG_FP,0.00,0.00
1874,R8_AVG_FP AVG_FP,-0.00,0.00
221,WEIGHT HOME,0.00,0.00
222,WEIGHT PG_POS,0.00,0.00


### Manual Analysis

#### Feature Preparation

In [71]:
X, y, fc_proj = stats.drop(['FP', 'PLAYER', 'DATE'], axis = 1), stats['FP'].values, stats['FC Proj'].values

holdout_df = X[-round(len(X)/5):].copy() # Holdout set predictions
fc_y_test = fc_proj[-round(len(X)/5):].copy()  # FantasyCruncher predictions

X_train = X[:-round(len(X)/5)].copy()
y_train = y[:-round(len(X)/5)].copy()

X_test = X[-round(len(X)/5):].copy()
y_test = y[-round(len(X)/5):].copy()

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size = .25, random_state = 98924)

# Since we are trying to outproject FantasyCruncher, calculate the MAE and r2 for their own projections
#print('FC Projections MAE (Full Season):', MAE(stats['FP'].values, stats['FC Proj'].values))
#print('FC Projections r^2 (Full Season):', r2_score(stats['FP'].values, stats['FC Proj'].values),'\n')
print('FC Projections MAE (Test Set):' , MAE(y_test, fc_y_test))
print('FC Projections r^2 (Test Set):', r2_score(y_test, fc_y_test))

FC Projections MAE (Test Set): 6.807045925414365
FC Projections r^2 (Test Set): 0.6339703244513564


#### Linear Regression (No Scaling)

In [72]:
linear_model = LinearRegression()
linear_model.fit(X_tr, y_tr)

y_tr_predicted = linear_model.predict(X_tr)
y_val_predicted = linear_model.predict(X_val)
y_test_predicted = linear_model.predict(X_test)

print('Baseline Linear Regression Model','\n')
print('Training Set: ')
print('r^2 score: {}'.format(r2_score(y_tr, y_tr_predicted)),)
print('Mean Absolute Error: {}'.format(MAE(y_tr, y_tr_predicted)))
print('Root Mean Squared Error: {}'.format(model.RMSE(y_tr, y_tr_predicted)), '\n')

print('Validation Set: ')
print('r^2 score: {}'.format(r2_score(y_val, y_val_predicted)))
print('Mean Absolute Error: {}'.format(MAE(y_val, y_val_predicted)))
print('Root Mean Squared Error: {}'.format(model.RMSE(y_val, y_val_predicted)), '\n')


print('Test Set: ')
print('r^2 score: {}'.format(r2_score(y_test, y_test_predicted)))
print('Mean Absolute Error: {}'.format(MAE(y_test, y_test_predicted)))
print('Root Mean Squared Error: {}'.format(model.RMSE(y_test, y_test_predicted)),'\n')

print('FC Projections r^2 (Test Set):', r2_score(y_test, fc_y_test))
print('FC Projections MAE (Test Set):' , MAE(y_test, fc_y_test), '\n')


print('Model r2 Score: {}'.format(r2_score(y_test, y_test_predicted) - r2_score(y_test, fc_y_test)))
print('Model MAE Score: {}'.format(MAE(y_test, y_test_predicted) - MAE(y_test, fc_y_test)))

df_coef = pd.DataFrame()
df_coef['Feature'] = X_test.columns.to_list()
df_coef['Coef'] = np.abs(linear_model.coef_)
df_coef.sort_values(by = 'Coef', ascending = False).head(10)

Baseline Linear Regression Model 

Training Set: 
r^2 score: 0.6765856867290243
Mean Absolute Error: 6.673802898338027
Root Mean Squared Error: 8.8546563083444 

Validation Set: 
r^2 score: 0.6886462965940854
Mean Absolute Error: 6.574781202267123
Root Mean Squared Error: 8.736000618617142 

Test Set: 
r^2 score: 0.6398729522987872
Mean Absolute Error: 6.98946079273682
Root Mean Squared Error: 9.424465713201315 

FC Projections r^2 (Test Set): 0.6339703244513564
FC Projections MAE (Test Set): 6.807045925414365 

Model r2 Score: 0.005902627847430786
Model MAE Score: 0.18241486732245527


Unnamed: 0,Feature,Coef
42,R8_AVG_TD,8.21
68,AVG_TD,6.32
38,R8_AVG_BLK,2.93
64,AVG_BLK,2.88
37,R8_AVG_STL,2.66
29,R4_AVG_TD,2.65
63,AVG_STL,2.63
36,R8_AVG_AST,2.34
41,R8_AVG_DD,2.16
35,R8_AVG_REB,2.16


#### Linear Regression (Polynomial Features)

In [51]:
poly = PolynomialFeatures(degree = 2)
poly_features = X_tr.columns.to_list

X_tr_poly = poly.fit_transform(X_tr.values)
X_val_poly = poly.transform(X_val.values)
X_test_poly = poly.transform(X_test)

poly_reg = LinearRegression()
poly_reg.fit(X_tr_poly, y_tr)

y_tr_predicted_linreg_poly = poly_reg.predict(X_tr_poly)
y_val_predicted_linreg_poly = poly_reg.predict(X_val_poly)
y_test_predicted_linreg_poly = poly_reg.predict(X_test_poly)

print('Polynomial Linear Regression Model','\n')
print('Training Set: ')
print('r^2 score: {}'.format(r2_score(y_tr, y_tr_predicted_linreg_poly)))
print('Mean Absolute Error: {}'.format(MAE(y_tr, y_tr_predicted_linreg_poly)))
print('Root Mean Squared Error: {}'.format(RMSE(y_tr, y_tr_predicted_linreg_poly)), '\n')

print('Validation Set: ')
print('r^2 score: {}'.format(r2_score(y_val, y_val_predicted_linreg_poly)))
print('Mean Absolute Error: {}'.format(MAE(y_val, y_val_predicted_linreg_poly)))
print('Root Mean Squared Error: {}'.format(RMSE(y_val, y_val_predicted_linreg_poly)), '\n')

print('Test Set: ')
print('r^2 score: {}'.format(r2_score(y_test, y_test_predicted_linreg_poly)))
print('Mean Absolute Error: {}'.format(MAE(y_test, y_test_predicted_linreg_poly)))
print('Root Mean Squared Error: {}'.format(RMSE(y_test, y_test_predicted_linreg_poly)))

Polynomial Linear Regression Model 

Training Set: 
r^2 score: 0.7362918181402429
Mean Absolute Error: 6.100840585293199
Root Mean Squared Error: 7.995652166595328 

Validation Set: 
r^2 score: -3.946657573485216
Mean Absolute Error: 7.807374956605386
Root Mean Squared Error: 34.82102589654425 

Test Set: 
r^2 score: -2.3413135115409407
Mean Absolute Error: 9.082380501302644
Root Mean Squared Error: 28.706978356823065


In [None]:
poly_features = X_tr.columns.to_list()
df_coef_poly_linreg = pd.DataFrame()
df_coef_poly_linreg['Feature'] = poly.get_feature_names(input_features = 
                                                       poly_features)
df_coef_poly_linreg['Coef'] = poly_reg.coef_
df_coef_poly_linreg['Coef_Abs'] = np.abs(poly_reg.coef_)
df_coef_poly_linreg.sort_values(by = 'Coef_Abs', ascending = False)

#### Ridge Regression (Standard Scaled)

In [73]:
scaler = StandardScaler()
X_tr_scaled = scaler.fit_transform(X_tr.values)
X_val_scaled = scaler.transform(X_val.values)
X_test_scaled = scaler.transform(X_test.values)

ridge = Ridge(alpha = 5, tol = .01, random_state = 5)
ridge.fit(X_tr_scaled, y_tr)

y_tr_predicted_ridge = ridge.predict(X_tr_scaled)
y_val_predicted_ridge = ridge.predict(X_val_scaled)
y_test_predicted_ridge = ridge.predict(X_test_scaled)

print('Standard Scaled Ridge Regression Model','\n')
print('Training Set: ')
print('r^2 score: {}'.format(r2_score(y_tr, y_tr_predicted_ridge)),)
print('Mean Absolute Error: {}'.format(MAE(y_tr, y_tr_predicted_ridge)))
print('Root Mean Squared Error: {}'.format(RMSE(y_tr, y_tr_predicted_ridge)), '\n')

print('Validation Set: ')
print('r^2 score: {}'.format(r2_score(y_val, y_val_predicted_ridge)))
print('Mean Absolute Error: {}'.format(MAE(y_val, y_val_predicted_ridge)))
print('Root Mean Squared Error: {}'.format(RMSE(y_val, y_val_predicted_ridge)), '\n')

print('Test Set: ')
print('r^2 score: {}'.format(r2_score(y_test, y_test_predicted_ridge)))
print('Mean Absolute Error: {}'.format(MAE(y_test, y_test_predicted_ridge)))
print('Root Mean Squared Error: {}'.format(RMSE(y_test, y_test_predicted_ridge)))

df_coef_ridge = pd.DataFrame()
df_coef_ridge['Feature'] = X_test.columns.to_list()
df_coef_ridge['Coef'] = ridge.coef_
df_coef_ridge['Coef_Abs'] = np.abs(ridge.coef_)
df_coef_ridge.sort_values(by = 'Coef_Abs', ascending = False)

Standard Scaled Ridge Regression Model 

Training Set: 
r^2 score: 0.6765846419804742
Mean Absolute Error: 6.674193195748351
Root Mean Squared Error: 8.854670610249864 

Validation Set: 
r^2 score: 0.6886696734859383
Mean Absolute Error: 6.574864133366716
Root Mean Squared Error: 8.735672656603374 

Test Set: 
r^2 score: 0.6399278591258268
Mean Absolute Error: 6.989753170362301
Root Mean Squared Error: 9.423747234489975


Unnamed: 0,Feature,Coef,Coef_Abs
7,Proj Mins,4.92,4.92
0,FC Proj,4.03,4.03
27,R4_AVG_MIN,-1.88,1.88
66,AVG_MIN,-1.57,1.57
59,AVG_PTS,1.53,1.53
20,R4_AVG_PTS,1.48,1.48
70,AVG_USG_CUST,1.14,1.14
57,R14_AVG_USG_CUST,-1.02,1.02
58,AVG_FP,0.93,0.93
40,R8_AVG_MIN,0.89,0.89


#### Lasso Regression (Standard Scaled)

In [None]:
scaler = StandardScaler()
X_tr_scaled = scaler.fit_transform(X_tr.values)
X_val_scaled = scaler.transform(X_val.values)
X_test_scaled = scaler.transform(X_test.values)

lasso = Lasso(alpha = 5, random_state = 5, tol = 0.01)
lasso.fit(X_tr_scaled, y_tr)

y_tr_predicted_lasso = lasso.predict(X_tr_scaled)
y_val_predicted_lasso = lasso.predict(X_val_scaled)
y_test_predicted_lasso = lasso.predict(X_test_scaled)

print('Standard Scaled Lasso Regression Model','\n')
print('Training Set: ')
print('r^2 score: {}'.format(r2_score(y_tr, y_tr_predicted_lasso)))
print('Mean Absolute Error: {}'.format(MAE(y_tr, y_tr_predicted_lasso)))
print('Root Mean Squared Error: {}'.format(RMSE(y_tr, y_tr_predicted_lasso)), '\n')

print('Validation Set: ')
print('r^2 score: {}'.format(r2_score(y_val, y_val_predicted_lasso)))
print('Mean Absolute Error: {}'.format(MAE(y_val, y_val_predicted_lasso)))
print('Root Mean Squared Error: {}'.format(RMSE(y_val, y_val_predicted_lasso)), '\n')

print('Test Set: ')
print('r^2 score: {}'.format(r2_score(y_test, y_test_predicted_lasso)))
print('Mean Absolute Error: {}'.format(MAE(y_test, y_test_predicted_lasso)))
print('Root Mean Squared Error: {}'.format(RMSE(y_test, y_test_predicted_lasso)))


#### Lasso Regression (Polynomial Features)

In [9]:
poly = PolynomialFeatures(degree = 2)
poly_features = X_tr.columns.to_list

X_tr_poly = poly.fit_transform(X_tr.values)
X_val_poly = poly.transform(X_val.values)
X_test_poly = poly.transform(X_test)

lasso_poly = Lasso(alpha = 5, random_state = 5, tol = 0.01, max_iter = 1500)
lasso_poly.fit(X_tr_poly, y_tr)

y_tr_predicted_lasso = lasso_poly.predict(X_tr_poly)
y_val_predicted_lasso = lasso_poly.predict(X_val_poly)
y_test_predicted_lasso = lasso_poly.predict(X_test_poly)

print('Lasso Regression Model, Polynomial Features','\n')
print('Training Set: ')
print('r^2 score: {}'.format(r2_score(y_tr, y_tr_predicted_lasso)))
print('Mean Absolute Error: {}'.format(MAE(y_tr, y_tr_predicted_lasso)))
print('Root Mean Squared Error: {}'.format(model.RMSE(y_tr, y_tr_predicted_lasso)), '\n')

print('Validation Set: ')
print('r^2 score: {}'.format(r2_score(y_val, y_val_predicted_lasso)))
print('Mean Absolute Error: {}'.format(MAE(y_val, y_val_predicted_lasso)))
print('Root Mean Squared Error: {}'.format(model.RMSE(y_val, y_val_predicted_lasso)), '\n')

print('Test Set: ')
print('r^2 score: {}'.format(r2_score(y_test, y_test_predicted_lasso)))
print('Mean Absolute Error: {}'.format(MAE(y_test, y_test_predicted_lasso)))
print('Root Mean Squared Error: {}'.format(model.RMSE(y_test, y_test_predicted_lasso)), '\n')

print('FC Projections r^2 (Test Set):', r2_score(y_test, fc_y_test))
print('FC Projections MAE (Test Set):' , MAE(y_test, fc_y_test), '\n')

print('Model r2 Performance: {}'.format(r2_score(y_test, y_test_predicted_lasso) - r2_score(y_test, fc_y_test)))
print('Model MAE Performance: {}'.format(-(MAE(y_test, y_test_predicted_lasso) - MAE(y_test, fc_y_test))))


Lasso Regression Model, Polynomial Features 

Training Set: 
r^2 score: 0.683076876944293
Mean Absolute Error: 6.584021937201559
Root Mean Squared Error: 8.719143566013184 

Validation Set: 
r^2 score: 0.685030730525306
Mean Absolute Error: 6.673788357505516
Root Mean Squared Error: 8.912252139598365 

Test Set: 
r^2 score: 0.646816331758457
Mean Absolute Error: 6.885569041943503
Root Mean Squared Error: 9.337330522885287 

FC Projections r^2 (Test Set): 0.6348062597973789
FC Projections MAE (Test Set): 6.807570967193609 

Model r2 Performance: 0.012010071961078062
Model MAE Performance: -0.07799807474989429




In [18]:
poly_features = X_tr.columns.to_list()
df_coef_poly_lasso = pd.DataFrame()
df_coef_poly_lasso['Feature'] = poly.get_feature_names(input_features = 
                                                       poly_features)
df_coef_poly_lasso['Coef'] = lasso_poly.coef_
df_coef_poly_lasso['Coef_Abs'] = np.abs(lasso_poly.coef_)
df_coef_poly_lasso.sort_values(by = 'Coef_Abs', ascending = False)

Unnamed: 0,Feature,Coef,Coef_Abs
586,Proj Mins R14_AVG_FP,0.01,0.01
599,Proj Mins AVG_FP,0.00,0.00
117,FC Proj R14_AVG_FP,0.00,0.00
2616,AVG_MIN AVG_USG%_AD,0.00,0.00
504,Ceiling R4_AVG_MIN,-0.00,0.00
1848,R8_AVG_FP^2,-0.00,0.00
276,WEIGHT AVG_TOV,0.00,0.00
549,Proj Mins Def v Pos,0.00,0.00
224,WEIGHT SF_POS,-0.00,0.00
1677,R4_AVG_MIN AVG_MIN,-0.00,0.00


#### Regression Tree

In [103]:
regression_tree = DecisionTreeRegressor(max_depth = None, max_features = 'auto', random_state = 10, max_leaf_nodes = 10)
regression_tree.fit(X_tr, y_tr)

tree_y_tr_predict = regression_tree.predict(X_tr)
tree_y_val_predict = regression_tree.predict(X_val)
tree_y_test_predict = regression_tree.predict(X_test)

print('Regression Tree Model','\n')
print('Training Set: ')
print('r^2 score: {}'.format(r2_score(y_tr, tree_y_tr_predict)),)
print('Mean Absolute Error: {}'.format(MAE(y_tr, tree_y_tr_predict)))
print('Root Mean Squared Error: {}'.format(model.RMSE(y_tr, tree_y_tr_predict)), '\n')

print('Validation Set: ')
print('r^2 score: {}'.format(r2_score(y_val, tree_y_val_predict)))
print('Mean Absolute Error: {}'.format(MAE(y_val, tree_y_val_predict)))
print('Root Mean Squared Error: {}'.format(model.RMSE(y_val, tree_y_val_predict)), '\n')

print('Test Set: ')
print('r^2 score: {}'.format(r2_score(y_test, tree_y_test_predict)))
print('Mean Absolute Error: {}'.format(MAE(y_test, tree_y_test_predict)))
print('Root Mean Squared Error: {}'.format(model.RMSE(y_test, tree_y_test_predict)),'\n')

print('FC Projections r^2 (Test Set):', r2_score(y_test, fc_y_test))
print('FC Projections MAE (Test Set):' , MAE(y_test, fc_y_test), '\n')

print('Model r2 Score: {}'.format(r2_score(y_test, tree_y_test_predict) - r2_score(y_test, fc_y_test)))
print('Model MAE Score: {}'.format(MAE(y_test, tree_y_test_predict) - MAE(y_test, fc_y_test)))

Regression Tree Model 

Training Set: 
r^2 score: 0.6592901687044648
Mean Absolute Error: 6.87868983931522
Root Mean Squared Error: 9.088337016376093 

Validation Set: 
r^2 score: 0.6591141434042453
Mean Absolute Error: 6.9074092992199985
Root Mean Squared Error: 9.14092466121778 

Test Set: 
r^2 score: 0.6247486358466301
Mean Absolute Error: 7.162832251165195
Root Mean Squared Error: 9.620330856454503 

FC Projections r^2 (Test Set): 0.6339703244513564
FC Projections MAE (Test Set): 6.807045925414365 

Model r2 Score: -0.009221688604726275
Model MAE Score: 0.35578632575083
