In [85]:
# std
import os
import sys
import inspect
import time
import pathlib
from math import sqrt
from math import log2
# packgaes
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
%matplotlib inline

# packages
from matplotlib.colors import ListedColormap

## sklearn
from sklearn.preprocessing import StandardScaler,PowerTransformer,MinMaxScaler,QuantileTransformer,normalize
from sklearn.model_selection import train_test_split, learning_curve, ShuffleSplit
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.feature_selection import chi2

from sklearn.metrics import r2_score
from sklearn.metrics import mean_poisson_deviance
from sklearn.metrics import mean_gamma_deviance
from sklearn.metrics import median_absolute_error

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor


# for selection the right path
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)

from common.DataParser import parse_moneyball
from common.model_trainer_reg import *
from common.regression_plotfunctions import *

from GD.LinearRegression import LinearRegression
from KNN.KNNRegressor import KNNRegressor

import moneyball_preprocession

In [86]:
df_raw = parse_moneyball()
df_raw

Unnamed: 0,Team,League,Year,RS,RA,W,OBP,SLG,BA,Playoffs,RankSeason,RankPlayoffs,G,OOBP,OSLG
0,ARI,NL,2012,734,688,81,0.328,0.418,0.259,0,,,162,0.317,0.415
1,ATL,NL,2012,700,600,94,0.320,0.389,0.247,1,4.0,5.0,162,0.306,0.378
2,BAL,AL,2012,712,705,93,0.311,0.417,0.247,1,5.0,4.0,162,0.315,0.403
3,BOS,AL,2012,734,806,69,0.315,0.415,0.260,0,,,162,0.331,0.428
4,CHC,NL,2012,613,759,61,0.302,0.378,0.240,0,,,162,0.335,0.424
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1227,PHI,NL,1962,705,759,81,0.330,0.390,0.260,0,,,161,,
1228,PIT,NL,1962,706,626,93,0.321,0.394,0.268,0,,,161,,
1229,SFG,NL,1962,878,690,103,0.341,0.441,0.278,1,1.0,2.0,165,,
1230,STL,NL,1962,774,664,84,0.335,0.394,0.271,0,,,163,,


# Train and Test

In [87]:
mode="mean"
X, Y = moneyball_preprocession.preprocessing(df_raw, drop_missing_values=False)
#mode="dropped"
#X, Y = moneyball_preprocession.preprocessing(df_raw, drop_missing_values=True)

n_splits = 10

## SGD-Regression

In [88]:
MODEL = SGDRegressor
NAME = "SGD"
PATH = "out/"+NAME+"/"
params = {"alpha" : [0.001],
          "max_iter" : [1000]}


modeltrainer = ModelTrainer(MODEL, params, X, Y, Variationerror = mean_gamma_deviance, NameOfError = "MGE", thread_cnt=1)
########### train with TrainTestSplit  ###################
modeltrainer.TTSplit(test_size = 0.5)
modeltrainer.train()
results = modeltrainer.retResults(PATH + f"sklearn_TTS_SGD_{mode}.csv")
display(results)
############ shuffle_Cross validation  ###################
modeltrainer.CV_shuffle_split(k = n_splits, test_size = 0.5, random_state = 42)
results = modeltrainer.retResults(PATH + f"sklearn_CV_SGD_{mode}.csv")
#display(results)




Training all model variations took 0.0040s - holdout


Unnamed: 0,alpha,max_iter,train_time,inference_time,k,R2_score,RMSE,D,N,n_test,n_train,MGE
0,0.001,1000,0.003002,0.0,0,0.876508,15.986514,8,1232,616,616,0.00271


Using CV with k=10 folds.
Training all model variations took 0.0030s - CV fold # =1
Training all model variations took 0.0030s - CV fold # =2
Training all model variations took 0.0030s - CV fold # =3
Training all model variations took 0.0020s - CV fold # =4
Training all model variations took 0.0040s - CV fold # =5
Training all model variations took 0.0030s - CV fold # =6
Training all model variations took 0.0030s - CV fold # =7
Training all model variations took 0.0040s - CV fold # =8
Training all model variations took 0.0040s - CV fold # =9
Training all model variations took 0.0020s - CV fold # =10


## My SGD-Regression

In [89]:
MODEL = LinearRegression
#params = {"alpha" : [0.001,0.001],
          #"max_iter" : [1000,10000,100000]}
#n_splits = 2
modeltrainer = ModelTrainer(MODEL, params, X, Y, Variationerror = mean_gamma_deviance, NameOfError = "MGE", thread_cnt=1)
########### train with TrainTestSplit  ###################
modeltrainer.TTSplit(test_size = 0.5)
modeltrainer.train()
results = modeltrainer.retResults(PATH + f"my_TTS_SGD_{mode}.csv")
display(results)
############ shuffle_Cross validation  ###################
modeltrainer.CV_shuffle_split(k = n_splits, test_size = 0.5, random_state = 42)
results = modeltrainer.retResults(PATH + f"my_CV_SGD_{mode}.csv")
#display(results)

Training all model variations took 0.0150s - holdout


Unnamed: 0,alpha,max_iter,train_time,inference_time,k,R2_score,RMSE,D,N,n_test,n_train,MGE
0,0.001,1000,0.014997,0.0,0,0.875578,16.106947,8,1232,616,616,0.002735


Using CV with k=10 folds.
Training all model variations took 0.0160s - CV fold # =1
Training all model variations took 0.0150s - CV fold # =2
Training all model variations took 0.0160s - CV fold # =3
Training all model variations took 0.0150s - CV fold # =4
Training all model variations took 0.0150s - CV fold # =5
Training all model variations took 0.0150s - CV fold # =6
Training all model variations took 0.0150s - CV fold # =7
Training all model variations took 0.0150s - CV fold # =8
Training all model variations took 0.0150s - CV fold # =9
Training all model variations took 0.0150s - CV fold # =10


## KNN-Regression

In [90]:
MODEL = KNeighborsRegressor
params = {"n_neighbors": [10],
        "weights" : ["uniform"],
        #"algorithm": ["brute","kd_tree","ball_tree"]
        }
NAME = "KNN"
PATH = "out/"+NAME+"/"

#n_splits = 2
modeltrainer = ModelTrainer(MODEL, params, X, Y, Variationerror = mean_gamma_deviance, NameOfError = "MGE", thread_cnt=1)
########### train with TrainTestSplit  ###################
modeltrainer.TTSplit(test_size = 0.5)
modeltrainer.train()
results = modeltrainer.retResults(PATH + f"sklearn_TTS_KNN_{mode}.csv")
display(results)
############ shuffle_Cross validation  ###################
modeltrainer.CV_shuffle_split(k = n_splits, test_size = 0.5, random_state = 42)
results = modeltrainer.retResults(PATH + f"sklearn_CV_KNN_{mode}.csv")
display(results)

Training all model variations took 0.0060s - holdout


Unnamed: 0,n_neighbors,weights,train_time,inference_time,k,R2_score,RMSE,D,N,n_test,n_train,MGE
0,5,uniform,0.001,0.004,0,0.785048,27.826299,8,1232,616,616,0.004589


Using CV with k=10 folds.
Training all model variations took 0.0060s - CV fold # =1
Training all model variations took 0.0050s - CV fold # =2
Training all model variations took 0.0050s - CV fold # =3
Training all model variations took 0.0050s - CV fold # =4
Training all model variations took 0.0050s - CV fold # =5
Training all model variations took 0.0050s - CV fold # =6
Training all model variations took 0.0060s - CV fold # =7
Training all model variations took 0.0050s - CV fold # =8
Training all model variations took 0.0050s - CV fold # =9
Training all model variations took 0.0050s - CV fold # =10


Unnamed: 0,n_neighbors,weights,train_time,inference_time,k,R2_score,RMSE,D,N,n_test,n_train,MGE
0,5,uniform,0.001,0.004999,1,0.785048,27.826299,8,1232,616,616,0.004589
1,5,uniform,0.001001,0.003999,2,0.775452,26.522792,8,1232,616,616,0.004234
2,5,uniform,0.0,0.003999,3,0.774916,28.791364,8,1232,616,616,0.004601
3,5,uniform,0.001,0.002998,4,0.783267,29.830325,8,1232,616,616,0.00504
4,5,uniform,0.001,0.003998,5,0.763784,31.424026,8,1232,616,616,0.005034
5,5,uniform,0.001001,0.003999,6,0.797125,26.737403,8,1232,616,616,0.004425
6,5,uniform,0.001,0.003998,7,0.7983,27.387727,8,1232,616,616,0.004559
7,5,uniform,0.001,0.003997,8,0.763411,31.249545,8,1232,616,616,0.005175
8,5,uniform,0.001001,0.002998,9,0.788514,28.904805,8,1232,616,616,0.004918
9,5,uniform,0.001,0.003999,10,0.774185,29.712013,8,1232,616,616,0.004895


## my KNN-Regression

In [91]:
MODEL = KNNRegressor
params = {"n_neighbors" : [10],
           "p": [2],
           "chunk_size": [600]}
NAME = "KNN"
PATH = "out/"+NAME+"/"

#n_splits = 2
modeltrainer = ModelTrainer(MODEL, params, X, Y, Variationerror = mean_gamma_deviance, NameOfError = "MGE", thread_cnt=1)
########### train with TrainTestSplit  ###################
modeltrainer.TTSplit(test_size = 0.5)
modeltrainer.train()
results = modeltrainer.retResults(PATH + f"my_TTS_KNN_{mode}.csv")
display(results)
############ shuffle_Cross validation  ###################
modeltrainer.CV_shuffle_split(k = n_splits, test_size = 0.5, random_state = 42)
results = modeltrainer.retResults(PATH + f"my_CV_KNN_{mode}.csv")
display(results)

Training all model variations took 0.0250s - holdout


Unnamed: 0,n_neighbors,p,chunk_size,train_time,inference_time,k,R2_score,RMSE,D,N,n_test,n_train,MGE
0,5,2,600,0.0,0.024,0,0.785048,27.826299,8,1232,616,616,0.004589


Using CV with k=10 folds.
Training all model variations took 0.0240s - CV fold # =1
Training all model variations took 0.0240s - CV fold # =2
Training all model variations took 0.0240s - CV fold # =3
Training all model variations took 0.0230s - CV fold # =4
Training all model variations took 0.0230s - CV fold # =5
Training all model variations took 0.0230s - CV fold # =6
Training all model variations took 0.0230s - CV fold # =7
Training all model variations took 0.0230s - CV fold # =8
Training all model variations took 0.0230s - CV fold # =9
Training all model variations took 0.0240s - CV fold # =10


Unnamed: 0,n_neighbors,p,chunk_size,train_time,inference_time,k,R2_score,RMSE,D,N,n_test,n_train,MGE
0,5,2,600,0.0,0.023998,1,0.785048,27.826299,8,1232,616,616,0.004589
1,5,2,600,0.0,0.023006,2,0.775452,26.522792,8,1232,616,616,0.004234
2,5,2,600,0.0,0.023998,3,0.774916,28.791364,8,1232,616,616,0.004601
3,5,2,600,0.0,0.023001,4,0.783267,29.830325,8,1232,616,616,0.00504
4,5,2,600,0.0,0.023002,5,0.763784,31.424026,8,1232,616,616,0.005034
5,5,2,600,0.0,0.021997,6,0.797125,26.737403,8,1232,616,616,0.004425
6,5,2,600,0.0,0.023,7,0.7983,27.387727,8,1232,616,616,0.004559
7,5,2,600,0.0,0.023001,8,0.763411,31.249545,8,1232,616,616,0.005175
8,5,2,600,0.0,0.022,9,0.788514,28.904805,8,1232,616,616,0.004918
9,5,2,600,0.0,0.022998,10,0.774185,29.712013,8,1232,616,616,0.004895


## RF-Regression

In [92]:
MODEL = RandomForestRegressor
params = {"n_estimators" : [100]}
NAME = "RF"
PATH = "out/"+NAME+"/"


#n_splits = 2
modeltrainer = ModelTrainer(MODEL, params, X, Y, Variationerror = mean_gamma_deviance, NameOfError = "MGE", thread_cnt=1)
########### train with TrainTestSplit  ###################
modeltrainer.TTSplit(test_size = 0.5)
modeltrainer.train()
results = modeltrainer.retResults(PATH + f"sklearn_TTS_RF_{mode}.csv")
display(results)
############ shuffle_Cross validation  ###################
modeltrainer.CV_shuffle_split(k = n_splits, test_size = 0.5, random_state = 42)
results = modeltrainer.retResults(PATH + f"sklearn_CV_RF_{mode}.csv")
display(results)

Training all model variations took 0.1950s - holdout


Unnamed: 0,n_estimators,train_time,inference_time,k,R2_score,RMSE,D,N,n_test,n_train,MGE
0,100,0.184997,0.009,0,0.846807,19.831416,8,1232,616,616,0.003267


Using CV with k=10 folds.
Training all model variations took 0.1920s - CV fold # =1
Training all model variations took 0.1950s - CV fold # =2
Training all model variations took 0.1920s - CV fold # =3
Training all model variations took 0.1940s - CV fold # =4
Training all model variations took 0.1920s - CV fold # =5
Training all model variations took 0.1930s - CV fold # =6
Training all model variations took 0.1940s - CV fold # =7
Training all model variations took 0.1910s - CV fold # =8
Training all model variations took 0.1920s - CV fold # =9
Training all model variations took 0.1910s - CV fold # =10


Unnamed: 0,n_estimators,train_time,inference_time,k,R2_score,RMSE,D,N,n_test,n_train,MGE
0,100,0.181999,0.009999,1,0.847386,19.756484,8,1232,616,616,0.003244
1,100,0.184999,0.009001,2,0.841231,18.753167,8,1232,616,616,0.002954
2,100,0.182003,0.009007,3,0.84815,19.423698,8,1232,616,616,0.002987
3,100,0.183991,0.009,4,0.85164,20.419629,8,1232,616,616,0.00349
4,100,0.182,0.009,5,0.843465,20.824019,8,1232,616,616,0.003317
5,100,0.182997,0.009001,6,0.861276,18.282771,8,1232,616,616,0.002996
6,100,0.184,0.009,7,0.855455,19.627069,8,1232,616,616,0.003327
7,100,0.180996,0.009003,8,0.822955,23.384748,8,1232,616,616,0.00392
8,100,0.182001,0.008998,9,0.847105,20.896924,8,1232,616,616,0.003544
9,100,0.180999,0.009001,10,0.83657,21.503672,8,1232,616,616,0.003523


## DT-Regression

In [93]:
MODEL = DecisionTreeRegressor
params = {"criterion": ["mse"]}
NAME = "DT"
PATH = "out/"+NAME+"/"


modeltrainer = ModelTrainer(MODEL, params, X, Y, Variationerror = mean_gamma_deviance, NameOfError = "MGE", thread_cnt=1)
########### train with TrainTestSplit  ###################
modeltrainer.TTSplit(test_size = 0.5)
modeltrainer.train()
results = modeltrainer.retResults(PATH + f"sklearn_TTS_DT_{mode}.csv")
display(results)
############ shuffle_Cross validation  ###################
modeltrainer.CV_shuffle_split(k = n_splits, test_size = 0.5, random_state = 42)
results = modeltrainer.retResults(PATH + f"sklearn_CV_DT_{mode}.csv")
display(results)

Training all model variations took 0.0030s - holdout


Unnamed: 0,criterion,train_time,inference_time,k,R2_score,RMSE,D,N,n_test,n_train,MGE
0,mse,0.003002,0.0,0,0.709155,37.650974,8,1232,616,616,0.006106


Using CV with k=10 folds.
Training all model variations took 0.0030s - CV fold # =1
Training all model variations took 0.0020s - CV fold # =2
Training all model variations took 0.0020s - CV fold # =3
Training all model variations took 0.0030s - CV fold # =4
Training all model variations took 0.0030s - CV fold # =5
Training all model variations took 0.0030s - CV fold # =6
Training all model variations took 0.0020s - CV fold # =7
Training all model variations took 0.0020s - CV fold # =8
Training all model variations took 0.0020s - CV fold # =9
Training all model variations took 0.0020s - CV fold # =10


Unnamed: 0,criterion,train_time,inference_time,k,R2_score,RMSE,D,N,n_test,n_train,MGE
0,mse,0.001998,0.0,1,0.708678,37.712662,8,1232,616,616,0.006187
1,mse,0.002,0.0,2,0.666092,39.439935,8,1232,616,616,0.00625
2,mse,0.001997,0.0,3,0.718777,35.972403,8,1232,616,616,0.005833
3,mse,0.002003,0.001,4,0.719982,38.540584,8,1232,616,616,0.006452
4,mse,0.001997,0.0,5,0.728471,36.121753,8,1232,616,616,0.005785
5,mse,0.001998,0.0,6,0.743842,33.75974,8,1232,616,616,0.005581
6,mse,0.001998,0.0,7,0.708823,39.537338,8,1232,616,616,0.006818
7,mse,0.002001,0.0,8,0.695428,40.228896,8,1232,616,616,0.006768
8,mse,0.002,0.0,9,0.697346,41.36526,8,1232,616,616,0.006716
9,mse,0.001999,0.0,10,0.719413,36.918831,8,1232,616,616,0.006025
