## Dataset Evaluation

As a sort of feature engineering 2.0, we want to do a quick evaluation of which weighting system is best at predicting future performance. This is important to figure out now so that, later on, we can test all the models on just one weighting method rather than on all of them.

In [1]:
import math
import os
import utils
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from constants import PUNT_TYPES
from constants import DATA_DIR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

warnings.filterwarnings('ignore')

Get a baseline MAE and RMSE for a given punt type by predicting the average training score for every player in the test split. The baseline is always computed on the `base` weighted data.

In [2]:
def avg_errors(punt_type):
    """Return the baseline (MAE, RMSE) of a constant mean prediction.

    Loads the data under ``DATA_DIR/<punt_type>/Weighted/base`` through
    ``utils.csv_concatenate``, performs a seeded train/test split, and scores
    a constant prediction (the mean of the training ``VALUE`` entries)
    against the held-out ``VALUE`` entries.

    Parameters
    ----------
    punt_type : str
        Name of the punt sub-directory inside ``DATA_DIR``.

    Returns
    -------
    tuple
        ``(mae, rmse)`` of the constant prediction on the test split.
    """
    base_dir = os.path.join(DATA_DIR, punt_type, 'Weighted', 'base')
    data = utils.csv_concatenate(base_dir)
    feature_names = utils.get_punt_columns([])
    features = data.loc[:, feature_names]
    target = data['VALUE']
    # The feature half of the split is discarded: the baseline only needs the
    # target, but the features are still passed so the split is unchanged.
    _, _, y_train, y_test = train_test_split(
        features, target, random_state=42, stratify=None
    )
    # Same constant for every test row: the mean of the training targets.
    constant_guess = np.mean(y_train) * np.ones(y_test.shape)
    abs_error = mae(y_test, constant_guess)
    root_sq_error = math.sqrt(mse(y_test, constant_guess))
    return abs_error, root_sq_error

Next, we'll define a generic function that uses linear regression to predict scores for all players for a given punt type and weighting system, evaluated with 3-fold cross-validation.

In [3]:
def reg_errors(punt, weight):
    """Return the mean validation (MAE, RMSE) of a linear regression.

    Loads the data under ``DATA_DIR/<punt>/Weighted/<weight>``, removes the
    ``PLAYER``, ``TEAM`` and ``SEASON`` columns, one-hot encodes what is left
    with ``pd.get_dummies``, and hands every column except ``VALUE`` to
    ``utils.cross_val`` as features with ``VALUE`` as the target.

    Parameters
    ----------
    punt : str
        Name of the punt sub-directory inside ``DATA_DIR``.
    weight : str
        Name of the weighting sub-directory inside ``Weighted``.

    Returns
    -------
    tuple
        ``(mae, rmse)``: the means of the ``'valid'`` entries stored under
        the ``'MAE'`` and ``'RMSE'`` keys of the ``utils.cross_val`` result.
    """
    source_dir = os.path.join(DATA_DIR, punt, 'Weighted', weight)
    frame = utils.csv_concatenate(source_dir)
    frame.drop(columns=['PLAYER','TEAM','SEASON'], inplace=True)
    frame = pd.get_dummies(frame)
    # Everything that is not the target column is treated as a feature.
    is_feature = frame.columns != 'VALUE'
    X = frame.loc[:, is_feature]
    # Flat 1-D copy of the target values.
    y = frame['VALUE'].values.flatten()
    # NOTE(review): utils.cross_val presumably returns a nested mapping of
    # metric -> split -> per-fold scores; only the two lookups below are
    # visible here -- confirm against utils.
    scores = utils.cross_val(LinearRegression(), X, y, n_folds=3, verbose=0)
    valid_mae = np.mean(scores['MAE']['valid'])
    valid_rmse = np.mean(scores['RMSE']['valid'])
    return valid_mae, valid_rmse

Across all punting strategies, using the quad dataset seems to yield the smallest RMSE and MAE. As such, this dataset will be used in the next step, which is to select the best algorithm for each punting strategy.

In [4]:
# Analyze which weighting is best for each punt type.
#
# Both tables have one row per punt name and one column per entry in
# `weights`. The 'average' column holds the constant-mean baseline from
# avg_errors; every other column holds the linear-regression score from
# reg_errors for that weighting.
weights = ['average','base','sqrt','linear','quad']

# Collect one list of scores per punt and build each table once at the end,
# instead of enlarging an empty DataFrame one scalar at a time with .loc.
# Keying by punt name keeps one row per name, as the .loc assignment did.
rmse_rows = {}
mae_rows = {}
for punt in PUNT_TYPES:
    # An empty punt is labelled 'Base'; otherwise its entries are joined
    # with '+' to form the name.
    punt_name = '+'.join(punt) if punt else 'Base'
    punt_rmse = []
    punt_mae = []
    for weight in weights:
        if weight == 'average':
            mae_value, rmse_value = avg_errors(punt_name)
        else:
            mae_value, rmse_value = reg_errors(punt_name, weight)
        punt_rmse.append(rmse_value)
        punt_mae.append(mae_value)
    rmse_rows[punt_name] = punt_rmse
    mae_rows[punt_name] = punt_mae

rmse_table = pd.DataFrame.from_dict(rmse_rows, orient='index', columns=weights)
mae_table = pd.DataFrame.from_dict(mae_rows, orient='index', columns=weights)
print(rmse_table)
print(mae_table)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


          average      base      sqrt    linear      quad
Base     4.554577  1.954075  1.745552  1.570686  1.324874
FG%      3.779172  1.685420  1.510998  1.364161  1.157095
FT%      3.752739  1.689935  1.505564  1.350984  1.134753
PTS      3.189849  1.584244  1.420491  1.284359  1.095474
TRB      3.438141  1.634721  1.460911  1.317014  1.119045
AST      3.511815  1.712135  1.531025  1.381595  1.176833
STL      3.374491  1.613566  1.445645  1.307155  1.117064
BLK      3.560252  1.695209  1.516528  1.368121  1.162745
FG%+TRB  3.253613  1.437589  1.289658  1.165626  0.992359
BLK+FG%  3.396806  1.504075  1.349512  1.219324  1.035781
AST+STL  2.980106  1.470513  1.318611  1.194542  1.026974
PTS+FT%  2.935076  1.391951  1.244009  1.119583  0.944919
          average      base      sqrt    linear      quad
Base     3.638107  1.466071  1.287948  1.130732  0.887857
FG%      3.006359  1.258832  1.110090  0.974600  0.767228
FT%      3.019512  1.261361  1.106508  0.969847  0.766891
PTS      2.46

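Create an ABT for each punt type from the `quad`-weighted data, saved as one CSV file per punt in the `ABT` folder of the data directory.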

In [5]:
# Write one ABT CSV per punt type, built from the 'quad' weighted data.
#
# The output directory does not depend on the punt, so it is resolved and
# created once up front. exist_ok=True makes this safe to re-run and avoids
# the check-then-create race of os.path.exists followed by os.makedirs.
directory = os.path.join(DATA_DIR,'ABT')
os.makedirs(directory, exist_ok=True)

for punt in PUNT_TYPES:
    # An empty punt is labelled 'Base'; otherwise its entries are joined
    # with '+' to form the name.
    punt_name = '+'.join(punt) if punt else 'Base'
    df = utils.csv_concatenate(os.path.join(DATA_DIR,punt_name,'Weighted','quad'))
    # Drop the PLAYER/TEAM/SEASON columns, then one-hot encode whatever
    # non-numeric columns remain.
    df = df.drop(columns=['PLAYER','TEAM','SEASON'])
    df = pd.get_dummies(df)
    df.to_csv(os.path.join(directory, punt_name+'.csv'), index=False)