## Dataset Evaluation

As a sort of feature engineering 2.0, we want to do a quick evaluation of which weighting system is best at predicting future performance. This is important to figure out now so that we can later test all the models on just one weighting method rather than on all of them.

In [1]:
import math
import os
import utils
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from constants import PUNT_TYPES
from constants import DATA_DIR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Ignore all warnings raised from this point on.
warnings.filterwarnings('ignore')

Get the baseline MAE and RMSE for a given punt type by predicting the average score across all players. This baseline serves as the `average` column when the weighting systems are compared below.

In [2]:
def avg_errors(punt_type):
    """Score a constant-mean baseline for one punt type.

    Reads the data under ``DATA_DIR/<punt_type>/Weighted/base`` through
    ``utils.csv_concatenate``, splits it into train and test partitions, and
    predicts the mean training ``VALUE`` for every test row.

    Args:
        punt_type: Directory name of the punt strategy (e.g. ``'Base'``).

    Returns:
        A ``(MAE, RMSE)`` tuple measured on the test partition.
    """
    data_path = os.path.join(DATA_DIR, punt_type, 'Weighted', 'base')
    frame = utils.csv_concatenate(data_path)
    feature_names = utils.get_punt_columns([])
    features = frame.loc[:, feature_names]
    target = frame['VALUE']

    # Only the target halves of the split are needed for a constant baseline;
    # the feature halves are discarded.
    _, _, target_train, target_test = train_test_split(
        features, target, random_state=42, stratify=None
    )

    # Same prediction (the training mean) repeated for every test row.
    constant_guess = np.mean(target_train) * np.ones(target_test.shape)

    baseline_mae = mae(target_test, constant_guess)
    baseline_rmse = math.sqrt(mse(target_test, constant_guess))
    return baseline_mae, baseline_rmse

Next we'll define a generic function that uses linear regression to predict scores for all players for a given punt/weight.

In [3]:
def reg_errors(punt, weight):
    """Cross-validate a linear regression for one punt/weight combination.

    Reads the data under ``DATA_DIR/<punt>/Weighted/<weight>`` through
    ``utils.csv_concatenate``, drops the ``PLAYER``, ``TEAM`` and ``SEASON``
    columns, one-hot encodes what remains with ``pd.get_dummies`` and runs
    ``utils.cross_val`` with three folds.

    Args:
        punt: Directory name of the punt strategy.
        weight: Directory name of the weighting scheme.

    Returns:
        A ``(MAE, RMSE)`` tuple: the means of the ``'valid'`` entries that
        ``utils.cross_val`` reports under ``'MAE'`` and ``'RMSE'``.
    """
    source_dir = os.path.join(DATA_DIR, punt, 'Weighted', weight)
    frame = utils.csv_concatenate(source_dir)
    frame.drop(columns=['PLAYER', 'TEAM', 'SEASON'], inplace=True)
    encoded = pd.get_dummies(frame)

    # Every column except the target is a predictor.
    is_feature = encoded.columns != 'VALUE'
    features = encoded.loc[:, is_feature]
    target = encoded['VALUE'].values.flatten()

    model = LinearRegression()
    scores = utils.cross_val(model, features, target, n_folds=3, verbose=0)

    valid_mae = np.mean(scores['MAE']['valid'])
    valid_rmse = np.mean(scores['RMSE']['valid'])
    return valid_mae, valid_rmse

Across all punting strategies, using the quad dataset seems to yield the smallest RMSE and MAE. As such, this dataset will be used in the next step, which is to select the best algorithm for each punting strategy.

In [4]:
# Compare every weighting scheme for every punt type. Rows of each table are
# punt labels, columns are weighting schemes.
rmse_table = pd.DataFrame()
mae_table = pd.DataFrame()
weights = ['average', 'base', 'sqrt', 'linear', 'quad']
for punt in PUNT_TYPES:
    # An empty punt list is labelled 'Base'; otherwise the punted
    # categories are joined with '+'.
    punt = 'Base' if punt == [] else '+'.join(punt)
    for weight in weights:
        if weight != 'average':
            mae_value, rmse_value = reg_errors(punt, weight)
        else:
            # 'average' is the constant-mean baseline, not a regression fit.
            mae_value, rmse_value = avg_errors(punt)
        mae_table.loc[punt, weight] = mae_value
        rmse_table.loc[punt, weight] = rmse_value
print(rmse_table)
print(mae_table)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

          average      base      sqrt    linear      quad
FG%      4.520541  2.693742  2.630471  2.588626  2.551013
FT%      4.448247  2.764350  2.701690  2.660382  2.623586
PTS      4.069522  2.528208  2.471565  2.434229  2.401121
TRB      4.395504  2.653984  2.596743  2.559652  2.528187
AST      4.436189  2.747983  2.684297  2.641655  2.602720
STL      4.301335  2.612119  2.553075  2.514172  2.479196
BLK      4.627029  2.741989  2.680671  2.640446  2.604305
FG%+TRB  3.845354  2.276777  2.228812  2.198202  2.173251
BLK+FG%  4.073944  2.357082  2.304798  2.270801  2.240823
AST+STL  3.745995  2.311106  2.260251  2.226558  2.196426
PTS+FT%  3.457323  2.245463  2.198375  2.167819  2.141827
Base     4.533830  2.729065  2.664717  2.621930  2.582999
          average      base      sqrt    linear      quad
FG%      3.553158  2.059476  2.005970  1.968155  1.930998
FT%      3.547790  2.115966  2.059686  2.023190  1.986221
PTS      3.170360  1.909548  1.861259  1.826987  1.794002
TRB      3.430

Create ABTs for each punt.

In [5]:
# Write one encoded table per punt type to DATA_DIR/ABT/<punt>.csv, built from
# the 'quad' weighted data.
directory = os.path.join(DATA_DIR, 'ABT')
# The output directory is the same for every punt, so create it once up front.
# exist_ok=True avoids the check-then-create race of os.path.exists().
os.makedirs(directory, exist_ok=True)
for punt in PUNT_TYPES:
    # An empty punt list is labelled 'Base'; otherwise the punted
    # categories are joined with '+'.
    if punt == []:
        punt = 'Base'
    else:
        punt = '+'.join(punt)
    df = utils.csv_concatenate(os.path.join(DATA_DIR, punt, 'Weighted', 'quad'))
    df.drop(columns=['PLAYER', 'TEAM', 'SEASON'], inplace=True)
    df = pd.get_dummies(df)
    df.to_csv(os.path.join(directory, punt + '.csv'), index=False)