In [None]:
import os
import glob
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm_notebook as tqdm
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import utils
from constants import DATA_DIR

In [None]:
# Lasso regression on all numeric columns (originally: linear regression with the basic 9 variables counted for fantasy points)
def pred(df, weight, random_state=None):
    """Fit a cross-validated Lasso on the numeric predictors and print hold-out errors.

    Every numeric column of ``df`` other than ``'VALUE'`` is a predictor and
    ``'VALUE'`` is the regression target. The rows are split into a train and
    a test part, the predictors are min-max scaled, ``LassoCV`` chooses
    ``alpha`` by repeated 10-fold cross-validation (3 repeats) on the training
    part, and the test-set MAE, RMSE and R^2 are printed after ``weight``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'VALUE'`` column. NaNs are replaced by 0 **in place**,
        so the caller's frame is modified; pass ``df.copy()`` to avoid that.
    weight : object
        Label printed in front of the metrics.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. ``None`` keeps the previous
        behaviour of drawing a different split on every call.

    Returns
    -------
    None
        Results are only printed.
    """
    df.fillna(0, inplace=True)
    # All numeric columns are used. The basic 9 variables counted for fantasy
    # points would be: ['FG%','FT%','3P','PTS','TRB','AST','STL','BLK','TOV'].
    X = df.loc[:, df.columns != 'VALUE']._get_numeric_data()
    y = df['VALUE']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

    # Learn the min/max on the training rows only; fitting the scaler on the
    # full data would let test-row statistics leak into the model's inputs.
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # define model evaluation method
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    # NOTE(review): the grid starts at alpha=0, which the scikit-learn Lasso
    # documentation advises against for numerical reasons - consider starting
    # the grid at 0.01.
    model = LassoCV(alphas=np.arange(0, 1, 0.01), cv=cv, n_jobs=-1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # results
    print(weight)
    print(mae(y_test, y_pred))
    # RMSE via an explicit square root: the `squared=False` keyword is not
    # accepted by every scikit-learn release.
    print(np.sqrt(mse(y_test, y_pred)))
    print(r2(y_test, y_pred))
    print('\n')
#errors = utils.cross_val(reg, X, y, n_folds=5, verbose=0)
#utils.summarize_errors(errors)

In [None]:
def select(df):
    """Split ``df`` into train/test parts and run ``select_features`` on them.

    The predictors are the numeric columns of ``df`` other than ``'VALUE'``;
    ``'VALUE'`` is the target. 33% of the rows go to the test part and the
    split uses ``random_state=1``.

    Returns
    -------
    tuple
        ``(X_train_fs, X_test_fs, fs)`` exactly as produced by
        ``select_features``.
    """
    is_predictor = df.columns != 'VALUE'
    features = df.loc[:, is_predictor]._get_numeric_data()
    target = df['VALUE']

    # The test targets are not needed for feature selection, hence the `_`.
    train_X, test_X, train_y, _ = train_test_split(
        features, target, test_size=0.33, random_state=1
    )

    train_selected, test_selected, selector = select_features(train_X, train_y, test_X)
    # selector.scores_ holds the per-feature scores, e.g. for printing or a
    # bar plot.
    return train_selected, test_selected, selector

def select_features(X_train, y_train, X_test, k=9):
    """Keep the ``k`` best-scoring features according to ``f_regression``.

    The selector is fitted on the training data only and then applied to both
    the training and the test inputs.

    Parameters
    ----------
    X_train, y_train
        Training inputs and targets used to score the features.
    X_test
        Test inputs; transformed with the selector fitted on the training data.
    k : int or 'all', default 9
        Number of features to keep, passed straight to ``SelectKBest``.

    Returns
    -------
    tuple
        ``(X_train_fs, X_test_fs, fs)``: the reduced training inputs, the
        reduced test inputs and the fitted ``SelectKBest`` instance.
    """
    # configure to keep the k highest-scoring features
    fs = SelectKBest(score_func=f_regression, k=k)
    # learn relationship from training data
    fs.fit(X_train, y_train)
    # reduce both splits to the selected columns
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs

In [None]:
# Baseline: plain linear regression on all numeric columns, evaluated once per
# weighting scheme on the four seasons stacked together.
# Input files are read from DATA_DIR/Weighted/<weight>/<season>.csv.
weights = ['base','sqrt','linear','quad']
seasons = ['2016-17','2017-18','2018-19','2019-20']
SEED = 1  # fixed seed so the split and the CV folds are reproducible

# mean cross-validated score per weighting scheme, in the order of `weights`
mean = []

for weight in weights:
    # Stack every season of this weighting scheme into one frame.
    # (pd.concat replaces DataFrame.append, which pandas 2.0 removed.)
    season_frames = [
        pd.read_csv(os.path.join(DATA_DIR,'Weighted',weight,season+'.csv'))
        for season in seasons
    ]
    df = pd.concat(season_frames).fillna(0)

    X = df.loc[:, df.columns != 'VALUE']._get_numeric_data()
    y = df['VALUE']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED)
    cv = RepeatedKFold(n_splits=2, n_repeats=1000, random_state=SEED)

    reg = LinearRegression()
    # The scores are negated mean absolute errors, so values closer to 0 are
    # better.
    scores = cross_val_score(reg, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv)
    model = reg.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mean.append(np.mean(scores))

    print(weight)
    print('Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
    print(mae(y_test, y_pred))
    # RMSE via an explicit square root: the `squared=False` keyword is not
    # accepted by every scikit-learn release.
    print(np.sqrt(mse(y_test, y_pred)))
    print(r2(y_test, y_pred))
    print('\n')

# Average of the per-weighting CV scores, if needed: sum(mean) / len(mean)

In [None]:
simple average
linear regression w/8 predictors
linear regression w/all predictors
lasso w/all predictors
ridge w/all predictors
xgboost w/out parameter tuning
xgboost w/parameter tuning
neural networks