### Import Module

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from keras import optimizers
from keras.models import Sequential, load_model
from keras.layers import Dense,Activation, LSTM, Dropout, TimeDistributed, Flatten
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

from sklearn.utils import class_weight
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC 
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from math import sqrt
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  from numpy.core.umath_tests import inner1d


### Data Preparation

In [2]:
training_load = pd.read_csv('training_1961_2016.csv', delimiter=',')
testing_2017_load = pd.read_csv('testing_2017.csv', delimiter=',')
testing_2018_load = pd.read_csv('testing_2018.csv', delimiter=',')
testing_2019_load = pd.read_csv('testing_2019.csv', delimiter=',')

In [3]:
training_load.head(5)

Unnamed: 0,FirstYear_index,FirstYear_playerID,FirstYear_yearID,FirstYear_teamID,FirstYear_lgID,FirstYear_stint,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,...,SixthYear_HBP,SixthYear_SH,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age
0,41261,aaronha01,1961,ML1,NL,1.0,155.0,603.0,115.0,197.0,...,1.0,0.0,8.0,14.0,Hank,Aaron,Henry Louis,180.0,72.0,32.0
1,41959,aaronha01,1962,ML1,NL,1.0,156.0,592.0,127.0,191.0,...,0.0,0.0,6.0,11.0,Hank,Aaron,Henry Louis,180.0,72.0,33.0
2,42719,aaronha01,1963,ML1,NL,1.0,161.0,631.0,121.0,201.0,...,1.0,0.0,5.0,21.0,Hank,Aaron,Henry Louis,180.0,72.0,34.0
3,43471,aaronha01,1964,ML1,NL,1.0,145.0,570.0,103.0,187.0,...,2.0,0.0,3.0,14.0,Hank,Aaron,Henry Louis,180.0,72.0,35.0
4,44225,aaronha01,1965,ML1,NL,1.0,150.0,570.0,109.0,181.0,...,2.0,0.0,6.0,13.0,Hank,Aaron,Henry Louis,180.0,72.0,36.0


In [4]:
var_name = ['index', 'playerID', 'teamID', 'lgID', 'stint','nameFirst', 'nameLast', 'nameGiven']

First = []
for i in range(len(var_name)):
    First.append(f"FirstYear_{var_name[i]}")

Second = []
for i in range(len(var_name)):
    Second.append(f"SecondYear_{var_name[i]}")

Third = []
for i in range(len(var_name)):
    Third.append(f"ThirdYear_{var_name[i]}")
    
Fourth = []
for i in range(len(var_name)):
    Fourth.append(f"FourthYear_{var_name[i]}")
    
Fifth = []
for i in range(len(var_name)):
    Fifth.append(f"FifthYear_{var_name[i]}")
    
FileColumnName = First + Second + Third + Fourth + Fifth

In [5]:
x_train = training_load.iloc[:,0:-29].drop(FileColumnName, axis=1).values
y_train = training_load['SixthYear_HR'].values
x_test_2017 = testing_2017_load.iloc[:,0:-29].drop(FileColumnName, axis=1).values
y_test_2017 = testing_2017_load['SixthYear_HR'].values
x_test_2018 = testing_2018_load.iloc[:,0:-29].drop(FileColumnName, axis=1).values
y_test_2018 = testing_2018_load['SixthYear_HR'].values
x_test_2019 = testing_2019_load.iloc[:,0:-29].drop(FileColumnName, axis=1).values
y_test_2019 = testing_2019_load['SixthYear_HR'].values

In [6]:
y_train_class = np.select([np.floor(y_train/5) < 8, np.floor(y_train/5) > 7], [np.floor(y_train/5), 8])
y_test_2017_class = np.select([np.floor(y_test_2017/5) < 8, np.floor(y_test_2017/5) > 7], [np.floor(y_test_2017/5), 8])
y_test_2018_class = np.select([np.floor(y_test_2018/5) < 8, np.floor(y_test_2018/5) > 7], [np.floor(y_test_2018/5), 8])
y_test_2019_class = np.select([np.floor(y_test_2019/5) < 8, np.floor(y_test_2019/5) > 7], [np.floor(y_test_2019/5), 8])

In [7]:
y_train_cat = np_utils.to_categorical(y_train_class, 9)
y_test_2017_cat = np_utils.to_categorical(y_test_2017_class, 9)
y_test_2018_cat = np_utils.to_categorical(y_test_2018_class, 9)
y_test_2019_cat = np_utils.to_categorical(y_test_2019_class, 9)

In [8]:
x_train_2017 = np.concatenate((x_train, x_test_2017))
y_train_2017 = np.concatenate((y_train, y_test_2017))
y_train_2017_class = np.concatenate((y_train_class, y_test_2017_class))
y_train_2017_cat = np.concatenate((y_train_cat, y_test_2017_cat))

In [9]:
x_train_2018 = np.concatenate((x_train, x_test_2017, x_test_2018))
y_train_2018 = np.concatenate((y_train, y_test_2017, y_test_2018))
y_train_2018_class = np.concatenate((y_train_class, y_test_2017_class, y_test_2018_class))
y_train_2018_cat = np.concatenate((y_train_cat, y_test_2017_cat, y_test_2018_cat))

In [107]:
testing_2018 = testing_2018_load.drop(FileColumnName, axis=1).copy()
testing_2019 = testing_2019_load.drop(FileColumnName, axis=1).copy()

In [10]:
from sklearn.utils import class_weight
class_weight = np.array(class_weight.compute_class_weight('balanced'
                                               ,np.unique(y_train_class)
                                               ,y_train_class))

In [11]:
scaler = StandardScaler().fit(training_load.iloc[:,0:-29].drop(FileColumnName, axis=1).values)

### Linear Regression

In [21]:
regr_2017 = LinearRegression()
regr_2017.fit(x_train_2017,y_train_2017)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [108]:
t = 0
a = 0
b = 0
c = 0
d = 0
g = []
l = []
m = []
m1 = []
n = []
n1 = []
p = []
p1 = []
q = []
q1 = []
predictions = regr_2017.predict(x_test_2018)
for i in range(len(y_test_2018)):
    k = np.round(predictions)[i] - y_test_2018[i]
    if abs(k) < 11:
        d += 1
        g.append(y_test_2018[i])
        if abs(k) < 6:
            c += 1
            l.append(y_test_2018[i])
            if abs(k) < 4:
                b += 1
                m.append(y_test_2018[i])
                m1.append(i)
                if abs(k) < 2:
                    a += 1
                    n.append(y_test_2018[i])
                    n1.append(i)
                    if abs(k) == 0:
                        t += 1
                        p.append(y_test_2018[i])
                        p1.append(i)
    else:
        q.append(y_test_2018[i])
        q1.append(i)
print("答對：%.4f" % (t/len(y_test_2018)))
print("正負1：%.4f" % (a/len(y_test_2018)))
print("正負3：%.4f" % (b/len(y_test_2018)))
print("正負5：%.4f" % (c/len(y_test_2018)))
print("正負10：%.4f" % (d/len(y_test_2018)))

答對：0.0435
正負1：0.1739
正負3：0.4130
正負5：0.6033
正負10：0.9022


In [109]:
print("mean_absolute_error:",mean_absolute_error(predictions,y_test_2018))
print("mean_squared_error:",mean_squared_error(predictions,y_test_2018))
print("rmse:",sqrt(mean_squared_error(predictions,y_test_2018)))

mean_absolute_error: 5.205107357191003
mean_squared_error: 43.38290756464203
rmse: 6.586570242898958


In [110]:
Counter(sorted(p))

Counter({0.0: 1, 5.0: 1, 10.0: 2, 16.0: 1, 20.0: 1, 22.0: 1, 23.0: 1})

In [111]:
Counter(sorted(n))

Counter({0.0: 2,
         1.0: 1,
         2.0: 2,
         3.0: 1,
         4.0: 2,
         5.0: 2,
         6.0: 1,
         9.0: 3,
         10.0: 3,
         11.0: 2,
         12.0: 1,
         13.0: 1,
         14.0: 1,
         15.0: 1,
         16.0: 1,
         19.0: 1,
         20.0: 1,
         21.0: 2,
         22.0: 1,
         23.0: 2,
         32.0: 1})

In [112]:
Counter(sorted(m))

Counter({0.0: 2,
         1.0: 6,
         2.0: 3,
         3.0: 2,
         4.0: 4,
         5.0: 4,
         6.0: 5,
         7.0: 1,
         8.0: 4,
         9.0: 5,
         10.0: 6,
         11.0: 5,
         12.0: 3,
         13.0: 2,
         14.0: 4,
         15.0: 4,
         16.0: 2,
         18.0: 1,
         19.0: 2,
         20.0: 2,
         21.0: 3,
         22.0: 1,
         23.0: 2,
         24.0: 1,
         32.0: 1,
         38.0: 1})

In [113]:
Counter(sorted(q))

Counter({2.0: 1,
         3.0: 1,
         4.0: 1,
         6.0: 1,
         8.0: 1,
         9.0: 2,
         10.0: 1,
         12.0: 1,
         22.0: 1,
         23.0: 2,
         27.0: 1,
         36.0: 2,
         39.0: 1,
         43.0: 1,
         48.0: 1})

In [114]:
print('0:',p1)
print('1:',n1)
print('3:',m1)
print('NA:',q1)

0: [71, 77, 83, 108, 119, 129, 131, 159]
1: [2, 11, 13, 16, 22, 34, 47, 50, 58, 62, 71, 73, 77, 83, 107, 108, 114, 115, 119, 123, 129, 131, 133, 144, 152, 154, 155, 158, 159, 162, 180, 182]
3: [1, 2, 8, 10, 11, 13, 15, 16, 17, 22, 23, 28, 34, 39, 47, 48, 49, 50, 51, 54, 58, 59, 60, 62, 69, 70, 71, 73, 77, 78, 81, 83, 86, 90, 91, 92, 94, 97, 98, 99, 107, 108, 113, 114, 115, 116, 119, 120, 123, 124, 129, 131, 133, 136, 142, 144, 146, 152, 154, 155, 157, 158, 159, 160, 161, 162, 165, 166, 169, 171, 173, 176, 177, 178, 180, 182]
NA: [9, 18, 21, 24, 25, 30, 38, 41, 44, 52, 87, 88, 104, 111, 145, 167, 175, 179]


In [116]:
regr_2018_dataframe = testing_2018.copy()
regr_2018_dataframe['Prediction'] = np.round(predictions)
regr_2018_dataframe['Difference'] = regr_2018_dataframe.SixthYear_HR - regr_2018_dataframe.Prediction

In [185]:
regr_2018_dataframe[regr_2018_dataframe.SixthYear_HR > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference
38,2013,56.0,136.0,27.0,38.0,10.0,0.0,11.0,27.0,3.0,...,7.0,16.0,Khris,Davis,Khristopher Adrian,205.0,71.0,31.0,30.0,18.0
111,2013,86.0,296.0,24.0,74.0,17.0,0.0,7.0,36.0,2.0,...,7.0,19.0,J. D.,Martinez,Julio Daniel,230.0,75.0,31.0,29.0,14.0


In [29]:
regr_2018 = LinearRegression()
regr_2018.fit(x_train_2018,y_train_2018)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [167]:
t = 0
a = 0
b = 0
c = 0
d = 0
g = []
l = []
m = []
m1 = []
n = []
n1 = []
p = []
p1 = []
q = []
q1 = []
predictions = regr_2018.predict(x_test_2019)
for i in range(len(y_test_2019)):
    k = np.round(predictions)[i] - y_test_2019[i]
    if abs(k) < 11:
        d += 1
        g.append(y_test_2019[i])
        if abs(k) < 6:
            c += 1
            l.append(y_test_2019[i])
            if abs(k) < 4:
                b += 1
                m.append(y_test_2019[i])
                m1.append(i)
                if abs(k) < 2:
                    a += 1
                    n.append(y_test_2019[i])
                    n1.append(i)
                    if abs(k) == 0:
                        t += 1
                        p.append(y_test_2019[i])
                        p1.append(i)
    else:
        q.append(y_test_2019[i])
        q1.append(i)
print("答對：%.4f" % (t/len(y_test_2019)))
print("正負1：%.4f" % (a/len(y_test_2019)))
print("正負3：%.4f" % (b/len(y_test_2019)))
print("正負5：%.4f" % (c/len(y_test_2019)))
print("正負10：%.4f" % (d/len(y_test_2019)))

答對：0.0314
正負1：0.1466
正負3：0.3874
正負5：0.5131
正負10：0.8010


In [168]:
print("mean_absolute_error:",mean_absolute_error(predictions,y_test_2019))
print("mean_squared_error:",mean_squared_error(predictions,y_test_2019))
print("rmse:",sqrt(mean_squared_error(predictions,y_test_2019)))

mean_absolute_error: 6.702678261002945
mean_squared_error: 79.7636809415237
rmse: 8.931051502568089


In [169]:
Counter(sorted(p))

Counter({5.0: 3, 7.0: 1, 12.0: 1, 15.0: 1})

In [170]:
Counter(sorted(n))

Counter({1.0: 1,
         2.0: 3,
         3.0: 1,
         5.0: 4,
         6.0: 1,
         7.0: 2,
         9.0: 3,
         10.0: 1,
         11.0: 1,
         12.0: 4,
         13.0: 2,
         14.0: 1,
         15.0: 1,
         16.0: 1,
         17.0: 1,
         32.0: 1})

In [171]:
Counter(sorted(m))

Counter({0.0: 2,
         1.0: 4,
         2.0: 4,
         3.0: 4,
         4.0: 1,
         5.0: 5,
         6.0: 3,
         7.0: 5,
         8.0: 3,
         9.0: 7,
         10.0: 2,
         11.0: 5,
         12.0: 9,
         13.0: 3,
         14.0: 2,
         15.0: 2,
         16.0: 1,
         17.0: 2,
         18.0: 2,
         20.0: 1,
         21.0: 1,
         22.0: 1,
         23.0: 2,
         27.0: 1,
         29.0: 1,
         32.0: 1})

In [172]:
Counter(sorted(q))

Counter({1.0: 2,
         2.0: 3,
         3.0: 2,
         6.0: 1,
         12.0: 1,
         16.0: 1,
         17.0: 1,
         18.0: 1,
         21.0: 1,
         23.0: 2,
         24.0: 2,
         26.0: 1,
         27.0: 1,
         28.0: 1,
         31.0: 2,
         33.0: 4,
         34.0: 2,
         35.0: 2,
         37.0: 1,
         38.0: 1,
         39.0: 1,
         41.0: 1,
         44.0: 1,
         45.0: 1,
         48.0: 1,
         49.0: 1})

In [173]:
print('0:',p1)
print('1:',n1)
print('3:',m1)
print('NA:',q1)

0: [2, 72, 75, 98, 136, 160]
1: [2, 21, 22, 25, 36, 42, 59, 67, 72, 75, 80, 91, 97, 98, 108, 111, 113, 114, 117, 119, 123, 130, 136, 148, 153, 160, 165, 177]
3: [2, 4, 7, 9, 11, 13, 20, 21, 22, 23, 25, 27, 29, 36, 42, 48, 55, 56, 57, 59, 60, 67, 69, 72, 74, 75, 78, 80, 88, 91, 94, 95, 96, 97, 98, 102, 103, 108, 110, 111, 113, 114, 117, 118, 119, 121, 123, 124, 125, 130, 131, 134, 136, 137, 140, 142, 146, 147, 148, 153, 154, 157, 158, 159, 160, 163, 164, 165, 167, 168, 175, 177, 184, 185]
NA: [0, 6, 15, 24, 34, 38, 41, 47, 54, 61, 63, 65, 66, 71, 79, 89, 99, 100, 106, 107, 126, 128, 129, 139, 141, 145, 155, 162, 166, 169, 170, 172, 173, 174, 179, 180, 181, 187]


In [174]:
regr_2019_dataframe = testing_2019.copy()
regr_2019_dataframe['Prediction'] = np.round(predictions)
regr_2019_dataframe['Difference'] = regr_2019_dataframe.SixthYear_HR - regr_2019_dataframe.Prediction

In [186]:
regr_2019_dataframe[regr_2019_dataframe.SixthYear_HR > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference
8,2014,111.0,432.0,58.0,124.0,34.0,2.0,18.0,61.0,2.0,...,8.0,14.0,Nolan,Arenado,Nolan James,215.0,74.0,28.0,33.0,8.0
38,2014,159.0,613.0,87.0,166.0,32.0,2.0,40.0,108.0,4.0,...,3.0,14.0,Nelson,Cruz,Nelson Ramon,230.0,74.0,39.0,28.0,13.0
170,2014,24.0,89.0,11.0,26.0,8.0,1.0,5.0,20.0,1.0,...,4.0,16.0,Jorge,Soler,Jorge Carlos,235.0,76.0,27.0,10.0,38.0
174,2014,85.0,244.0,33.0,59.0,9.0,1.0,4.0,23.0,3.0,...,6.0,12.0,Eugenio,Suarez,Eugenio Alejandro,213.0,71.0,28.0,24.0,25.0
179,2014,157.0,602.0,115.0,173.0,39.0,9.0,36.0,111.0,16.0,...,4.0,5.0,Mike,Trout,Michael Nelson,235.0,74.0,28.0,33.0,12.0
187,2014,144.0,582.0,94.0,165.0,30.0,6.0,9.0,54.0,21.0,...,3.0,8.0,Christian,Yelich,Christian Stephen,195.0,75.0,28.0,24.0,20.0


### SVM

In [45]:
from sklearn.utils import class_weight
class_weight = np.array(class_weight.compute_class_weight('balanced'
                                               ,np.unique(y_train_2017)
                                               ,y_train_2017))

clf_2017 = SVC(class_weight=dict(zip(list(np.unique(y_train_2017)), list(class_weight))))
clf_2017.fit(scaler.transform(x_train_2017), y_train_2017)

SVC(C=1.0, cache_size=200,
  class_weight={0.0: 0.18552147239263803, 1.0: 0.17104072398190046, 2.0: 0.20712328767123286, 3.0: 0.2507462686567164, 4.0: 0.301195219123506, 5.0: 0.301195219123506, 6.0: 0.32238805970149254, 7.0: 0.4053619302949062, 8.0: 0.4108695652173913, 9.0: 0.45, 10.0: 0.504, 11.0: 0.5361702127659574, 12.0: 0.5...0: 151.2, 59.0: 151.2, 63.0: 151.2, 64.0: 151.2, 65.0: 151.2, 66.0: 151.2, 70.0: 151.2, 73.0: 151.2},
  coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [118]:
t = 0
a = 0
b = 0
c = 0
d = 0
g = []
l = []
m = []
m1 = []
n = []
n1 = []
p = []
p1 = []
q = []
q1 = []
predictions = clf_2017.predict(scaler.transform(x_test_2018))
for i in range(len(y_test_2018)):
    k = np.round(predictions)[i] - y_test_2018[i]
    if abs(k) < 11:
        d += 1
        g.append(y_test_2018[i])
        if abs(k) < 6:
            c += 1
            l.append(y_test_2018[i])
            if abs(k) < 4:
                b += 1
                m.append(y_test_2018[i])
                m1.append(i)
                if abs(k) < 2:
                    a += 1
                    n.append(y_test_2018[i])
                    n1.append(i)
                    if abs(k) == 0:
                        t += 1
                        p.append(y_test_2018[i])
                        p1.append(i)
    else:
        q.append(y_test_2018[i])
        q1.append(i)
print("答對：%.4f" % (t/len(y_test_2018)))
print("正負1：%.4f" % (a/len(y_test_2018)))
print("正負3：%.4f" % (b/len(y_test_2018)))
print("正負5：%.4f" % (c/len(y_test_2018)))
print("正負10：%.4f" % (d/len(y_test_2018)))

答對：0.0870
正負1：0.1467
正負3：0.3641
正負5：0.5217
正負10：0.7880


In [119]:
print("mean_absolute_error:",mean_absolute_error(predictions,y_test_2018))
print("mean_squared_error:",mean_squared_error(predictions,y_test_2018))
print("rmse:",sqrt(mean_squared_error(predictions,y_test_2018)))

mean_absolute_error: 6.706521739130435
mean_squared_error: 78.3913043478261
rmse: 8.853886397951246


In [120]:
Counter(sorted(p))

Counter({0.0: 1,
         1.0: 1,
         2.0: 2,
         7.0: 1,
         8.0: 1,
         13.0: 1,
         14.0: 4,
         22.0: 1,
         27.0: 1,
         34.0: 1,
         38.0: 2})

In [121]:
Counter(sorted(n))

Counter({0.0: 1,
         1.0: 3,
         2.0: 4,
         3.0: 2,
         5.0: 1,
         7.0: 1,
         8.0: 1,
         10.0: 1,
         13.0: 2,
         14.0: 4,
         15.0: 1,
         22.0: 1,
         27.0: 1,
         33.0: 1,
         34.0: 1,
         38.0: 2})

In [122]:
Counter(sorted(m))

Counter({0.0: 2,
         1.0: 5,
         2.0: 4,
         3.0: 2,
         4.0: 5,
         5.0: 5,
         6.0: 1,
         7.0: 1,
         8.0: 2,
         9.0: 3,
         10.0: 4,
         11.0: 5,
         12.0: 1,
         13.0: 3,
         14.0: 5,
         15.0: 2,
         16.0: 3,
         20.0: 1,
         21.0: 3,
         22.0: 1,
         27.0: 2,
         28.0: 1,
         33.0: 1,
         34.0: 1,
         37.0: 1,
         38.0: 2,
         43.0: 1})

In [123]:
Counter(sorted(q))

Counter({0.0: 1,
         1.0: 3,
         2.0: 2,
         3.0: 1,
         4.0: 1,
         5.0: 1,
         6.0: 2,
         7.0: 2,
         8.0: 3,
         9.0: 3,
         10.0: 1,
         11.0: 2,
         12.0: 3,
         13.0: 5,
         15.0: 2,
         18.0: 1,
         21.0: 1,
         23.0: 2,
         27.0: 1,
         36.0: 1,
         37.0: 1})

In [124]:
print('0:',p1)
print('1:',n1)
print('3:',m1)
print('NA:',q1)

0: [7, 8, 13, 15, 34, 41, 60, 77, 80, 81, 109, 135, 136, 165, 169, 174]
1: [2, 7, 8, 13, 15, 16, 22, 34, 41, 47, 49, 60, 66, 77, 78, 80, 81, 85, 94, 102, 109, 115, 135, 136, 165, 169, 174]
3: [0, 2, 4, 7, 8, 13, 15, 16, 17, 22, 32, 33, 34, 35, 41, 43, 47, 48, 49, 51, 59, 60, 62, 66, 67, 71, 76, 77, 78, 79, 80, 81, 83, 85, 94, 98, 99, 102, 107, 109, 111, 112, 113, 115, 117, 123, 126, 129, 130, 132, 135, 136, 138, 152, 154, 155, 156, 160, 162, 164, 165, 169, 171, 172, 173, 174, 177]
NA: [9, 18, 21, 24, 27, 28, 30, 40, 42, 44, 45, 52, 55, 56, 57, 68, 69, 74, 82, 86, 87, 88, 104, 106, 118, 121, 124, 125, 127, 128, 137, 139, 141, 145, 151, 175, 176, 179, 181]


In [125]:
SVM_2018_dataframe = testing_2018.copy()
SVM_2018_dataframe['Prediction'] = np.round(predictions)
SVM_2018_dataframe['Difference'] = SVM_2018_dataframe.SixthYear_HR - SVM_2018_dataframe.Prediction

In [187]:
SVM_2018_dataframe[SVM_2018_dataframe.SixthYear_HR > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference
38,2013,56.0,136.0,27.0,38.0,10.0,0.0,11.0,27.0,3.0,...,7.0,16.0,Khris,Davis,Khristopher Adrian,205.0,71.0,31.0,43.0,5.0
111,2013,86.0,296.0,24.0,74.0,17.0,0.0,7.0,36.0,2.0,...,7.0,19.0,J. D.,Martinez,Julio Daniel,230.0,75.0,31.0,40.0,3.0


In [53]:
from sklearn.utils import class_weight
class_weight = np.array(class_weight.compute_class_weight('balanced'
                                               ,np.unique(y_train_2018)
                                               ,y_train_2018))

clf_2018 = SVC(class_weight=dict(zip(list(np.unique(y_train_2018)), list(class_weight))))
clf_2018.fit(scaler.transform(x_train_2018), y_train_2018)

SVC(C=1.0, cache_size=200,
  class_weight={0.0: 0.1878424015009381, 1.0: 0.1717176914501329, 2.0: 0.2092809364548495, 3.0: 0.25375744519072363, 4.0: 0.30084134615384617, 5.0: 0.3032101756511205, 6.0: 0.32156736791392326, 7.0: 0.4074888074888075, 8.0: 0.4107487179487179, 9.0: 0.4464659977703456, 10.0: 0.5017288900025056, 11.0: 0...0: 154.03076923076924, 66.0: 154.03076923076924, 70.0: 154.03076923076924, 73.0: 154.03076923076924},
  coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [159]:
t = 0
a = 0
b = 0
c = 0
d = 0
g = []
l = []
m = []
m1 = []
n = []
n1 = []
p = []
p1 = []
q = []
q1 = []
predictions = clf_2018.predict(scaler.transform(x_test_2019))
for i in range(len(y_test_2019)):
    k = np.round(predictions)[i] - y_test_2019[i]
    if abs(k) < 11:
        d += 1
        g.append(y_test_2019[i])
        if abs(k) < 6:
            c += 1
            l.append(y_test_2019[i])
            if abs(k) < 4:
                b += 1
                m.append(y_test_2019[i])
                m1.append(i)
                if abs(k) < 2:
                    a += 1
                    n.append(y_test_2019[i])
                    n1.append(i)
                    if abs(k) == 0:
                        t += 1
                        p.append(y_test_2019[i])
                        p1.append(i)
    else:
        q.append(y_test_2019[i])
        q1.append(i)
print("答對：%.4f" % (t/len(y_test_2019)))
print("正負1：%.4f" % (a/len(y_test_2019)))
print("正負3：%.4f" % (b/len(y_test_2019)))
print("正負5：%.4f" % (c/len(y_test_2019)))
print("正負10：%.4f" % (d/len(y_test_2019)))

答對：0.0785
正負1：0.1937
正負3：0.3246
正負5：0.4974
正負10：0.7330


In [160]:
print("mean_absolute_error:",mean_absolute_error(predictions,y_test_2019))
print("mean_squared_error:",mean_squared_error(predictions,y_test_2019))
print("rmse:",sqrt(mean_squared_error(predictions,y_test_2019)))

mean_absolute_error: 7.780104712041885
mean_squared_error: 115.63350785340315
rmse: 10.753302183673773


In [161]:
Counter(sorted(p))

Counter({1.0: 1,
         2.0: 4,
         3.0: 1,
         6.0: 1,
         7.0: 2,
         11.0: 1,
         12.0: 1,
         13.0: 1,
         14.0: 1,
         23.0: 2})

In [162]:
Counter(sorted(n))

Counter({0.0: 1,
         1.0: 5,
         2.0: 4,
         3.0: 2,
         5.0: 2,
         6.0: 2,
         7.0: 3,
         8.0: 1,
         9.0: 1,
         11.0: 1,
         12.0: 4,
         13.0: 2,
         14.0: 3,
         23.0: 2,
         24.0: 1,
         26.0: 1,
         29.0: 1,
         32.0: 1})

In [163]:
Counter(sorted(m))

Counter({0.0: 2,
         1.0: 5,
         2.0: 6,
         3.0: 3,
         4.0: 1,
         5.0: 5,
         6.0: 2,
         7.0: 3,
         8.0: 1,
         9.0: 3,
         11.0: 3,
         12.0: 6,
         13.0: 3,
         14.0: 3,
         20.0: 2,
         22.0: 2,
         23.0: 2,
         24.0: 1,
         25.0: 1,
         26.0: 1,
         27.0: 1,
         28.0: 1,
         29.0: 2,
         32.0: 1,
         34.0: 1,
         41.0: 1})

In [164]:
Counter(sorted(q))

Counter({1.0: 2,
         2.0: 6,
         3.0: 3,
         4.0: 1,
         6.0: 2,
         7.0: 1,
         8.0: 1,
         9.0: 1,
         10.0: 3,
         11.0: 2,
         12.0: 4,
         13.0: 1,
         15.0: 3,
         16.0: 2,
         17.0: 3,
         19.0: 1,
         20.0: 1,
         23.0: 3,
         24.0: 1,
         27.0: 1,
         28.0: 1,
         31.0: 1,
         32.0: 1,
         35.0: 2,
         37.0: 1,
         44.0: 1,
         48.0: 1,
         49.0: 1})

In [165]:
print('0:',p1)
print('1:',n1)
print('3:',m1)
print('NA:',q1)

0: [21, 23, 57, 67, 74, 80, 88, 90, 93, 119, 124, 148, 150, 154, 164]
1: [2, 4, 13, 19, 21, 23, 42, 57, 67, 73, 74, 75, 80, 86, 88, 90, 91, 93, 94, 98, 110, 111, 119, 121, 124, 137, 138, 148, 149, 150, 153, 154, 157, 160, 164, 165, 175]
3: [2, 4, 9, 13, 18, 19, 21, 23, 28, 31, 37, 38, 42, 44, 55, 57, 62, 67, 69, 73, 74, 75, 76, 78, 80, 82, 86, 88, 90, 91, 93, 94, 98, 104, 108, 109, 110, 111, 119, 121, 124, 130, 135, 136, 137, 138, 140, 142, 148, 149, 150, 153, 154, 155, 157, 159, 160, 164, 165, 171, 175, 186]
NA: [1, 3, 5, 6, 14, 16, 26, 30, 36, 39, 40, 41, 45, 47, 49, 50, 52, 54, 56, 65, 66, 70, 71, 81, 87, 99, 100, 105, 106, 115, 122, 126, 128, 129, 141, 145, 147, 151, 152, 158, 161, 170, 173, 174, 176, 177, 181, 183, 187, 189, 190]


In [166]:
SVM_2019_dataframe = testing_2019.copy()
SVM_2019_dataframe['Prediction'] = np.round(predictions)
SVM_2019_dataframe['Difference'] = SVM_2019_dataframe.SixthYear_HR - SVM_2019_dataframe.Prediction

In [188]:
SVM_2019_dataframe[SVM_2019_dataframe.SixthYear_HR > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference
8,2014,111.0,432.0,58.0,124.0,34.0,2.0,18.0,61.0,2.0,...,8.0,14.0,Nolan,Arenado,Nolan James,215.0,74.0,28.0,33.0,8.0
38,2014,159.0,613.0,87.0,166.0,32.0,2.0,40.0,108.0,4.0,...,3.0,14.0,Nelson,Cruz,Nelson Ramon,230.0,74.0,39.0,39.0,2.0
170,2014,24.0,89.0,11.0,26.0,8.0,1.0,5.0,20.0,1.0,...,4.0,16.0,Jorge,Soler,Jorge Carlos,235.0,76.0,27.0,7.0,41.0
174,2014,85.0,244.0,33.0,59.0,9.0,1.0,4.0,23.0,3.0,...,6.0,12.0,Eugenio,Suarez,Eugenio Alejandro,213.0,71.0,28.0,34.0,15.0
179,2014,157.0,602.0,115.0,173.0,39.0,9.0,36.0,111.0,16.0,...,4.0,5.0,Mike,Trout,Michael Nelson,235.0,74.0,28.0,39.0,6.0
187,2014,144.0,582.0,94.0,165.0,30.0,6.0,9.0,54.0,21.0,...,3.0,8.0,Christian,Yelich,Christian Stephen,195.0,75.0,28.0,29.0,15.0


### Random Forest

In [90]:
forest_2017 = RandomForestClassifier(n_estimators=200)
forest_2017.fit(x_train_2017, y_train_2017)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [126]:
t = 0
a = 0
b = 0
c = 0
d = 0
g = []
l = []
m = []
m1 = []
n = []
n1 = []
p = []
p1 = []
q = []
q1 = []
predictions = forest_2017.predict(x_test_2018)
for i in range(len(y_test_2018)):
    k = np.round(predictions)[i] - y_test_2018[i]
    if abs(k) < 11:
        d += 1
        g.append(y_test_2018[i])
        if abs(k) < 6:
            c += 1
            l.append(y_test_2018[i])
            if abs(k) < 4:
                b += 1
                m.append(y_test_2018[i])
                m1.append(i)
                if abs(k) < 2:
                    a += 1
                    n.append(y_test_2018[i])
                    n1.append(i)
                    if abs(k) == 0:
                        t += 1
                        p.append(y_test_2018[i])
                        p1.append(i)
    else:
        q.append(y_test_2018[i])
        q1.append(i)
print("答對：%.4f" % (t/len(y_test_2018)))
print("正負1：%.4f" % (a/len(y_test_2018)))
print("正負3：%.4f" % (b/len(y_test_2018)))
print("正負5：%.4f" % (c/len(y_test_2018)))
print("正負10：%.4f" % (d/len(y_test_2018)))

答對：0.0543
正負1：0.1739
正負3：0.3207
正負5：0.4565
正負10：0.7446


In [127]:
print("mean_absolute_error:",mean_absolute_error(predictions,y_test_2018))
print("mean_squared_error:",mean_squared_error(predictions,y_test_2018))
print("rmse:",sqrt(mean_squared_error(predictions,y_test_2018)))

mean_absolute_error: 7.331521739130435
mean_squared_error: 88.44021739130434
rmse: 9.404265914536039


In [128]:
Counter(sorted(p))

Counter({0.0: 2, 1.0: 4, 3.0: 1, 6.0: 1, 8.0: 1, 11.0: 1})

In [129]:
Counter(sorted(n))

Counter({0.0: 2,
         1.0: 8,
         2.0: 2,
         3.0: 2,
         4.0: 2,
         5.0: 2,
         6.0: 2,
         8.0: 2,
         10.0: 2,
         11.0: 1,
         12.0: 1,
         13.0: 2,
         14.0: 2,
         15.0: 1,
         24.0: 1})

In [130]:
Counter(sorted(m))

Counter({0.0: 3,
         1.0: 8,
         2.0: 5,
         3.0: 3,
         4.0: 9,
         5.0: 2,
         6.0: 5,
         7.0: 1,
         8.0: 4,
         9.0: 1,
         10.0: 3,
         11.0: 2,
         12.0: 2,
         13.0: 2,
         14.0: 4,
         15.0: 2,
         16.0: 1,
         21.0: 1,
         24.0: 1})

In [131]:
Counter(sorted(q))

Counter({0.0: 1,
         2.0: 1,
         4.0: 1,
         7.0: 1,
         8.0: 1,
         9.0: 2,
         10.0: 1,
         13.0: 2,
         15.0: 3,
         16.0: 1,
         17.0: 1,
         18.0: 1,
         20.0: 3,
         21.0: 3,
         22.0: 1,
         23.0: 8,
         24.0: 2,
         25.0: 1,
         27.0: 3,
         28.0: 1,
         36.0: 2,
         37.0: 2,
         38.0: 2,
         39.0: 1,
         43.0: 1,
         48.0: 1})

In [132]:
print('0:',p1)
print('1:',n1)
print('3:',m1)
print('NA:',q1)

0: [2, 15, 26, 56, 77, 115, 128, 147, 155, 156]
1: [2, 11, 13, 15, 22, 26, 33, 36, 46, 49, 51, 56, 62, 72, 77, 78, 86, 94, 99, 105, 113, 115, 128, 132, 136, 139, 142, 147, 151, 155, 156, 157]
3: [2, 6, 11, 13, 15, 22, 26, 33, 34, 36, 46, 47, 49, 51, 55, 56, 59, 62, 69, 71, 72, 73, 77, 78, 81, 85, 86, 93, 94, 96, 98, 99, 101, 105, 113, 115, 116, 117, 123, 125, 127, 128, 130, 132, 134, 136, 139, 142, 147, 150, 151, 152, 155, 156, 157, 160, 169, 177, 182]
NA: [0, 3, 4, 7, 9, 14, 18, 19, 24, 25, 30, 32, 35, 38, 40, 41, 44, 52, 57, 58, 63, 64, 67, 75, 76, 82, 84, 87, 88, 95, 104, 106, 110, 111, 118, 122, 126, 135, 140, 143, 146, 148, 149, 165, 167, 179, 183]


In [133]:
RF_2018_dataframe = testing_2018.copy()
RF_2018_dataframe['Prediction'] = np.round(predictions)
RF_2018_dataframe['Difference'] = RF_2018_dataframe.SixthYear_HR - RF_2018_dataframe.Prediction

In [190]:
RF_2018_dataframe[RF_2018_dataframe.SixthYear_HR > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference
38,2013,56.0,136.0,27.0,38.0,10.0,0.0,11.0,27.0,3.0,...,7.0,16.0,Khris,Davis,Khristopher Adrian,205.0,71.0,31.0,28.0,20.0
111,2013,86.0,296.0,24.0,74.0,17.0,0.0,7.0,36.0,2.0,...,7.0,19.0,J. D.,Martinez,Julio Daniel,230.0,75.0,31.0,24.0,19.0


In [99]:
forest_2018 = RandomForestClassifier(n_estimators=200)
forest_2018.fit(x_train_2018, y_train_2018)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [151]:
t = 0
a = 0
b = 0
c = 0
d = 0
g = []
l = []
m = []
m1 = []
n = []
n1 = []
p = []
p1 = []
q = []
q1 = []
predictions = forest_2018.predict(x_test_2019)
for i in range(len(y_test_2019)):
    k = np.round(predictions)[i] - y_test_2019[i]
    if abs(k) < 11:
        d += 1
        g.append(y_test_2019[i])
        if abs(k) < 6:
            c += 1
            l.append(y_test_2019[i])
            if abs(k) < 4:
                b += 1
                m.append(y_test_2019[i])
                m1.append(i)
                if abs(k) < 2:
                    a += 1
                    n.append(y_test_2019[i])
                    n1.append(i)
                    if abs(k) == 0:
                        t += 1
                        p.append(y_test_2019[i])
                        p1.append(i)
    else:
        q.append(y_test_2019[i])
        q1.append(i)
print("答對：%.4f" % (t/len(y_test_2019)))
print("正負1：%.4f" % (a/len(y_test_2019)))
print("正負3：%.4f" % (b/len(y_test_2019)))
print("正負5：%.4f" % (c/len(y_test_2019)))
print("正負10：%.4f" % (d/len(y_test_2019)))

答對：0.0681
正負1：0.1780
正負3：0.2984
正負5：0.4346
正負10：0.6387


In [152]:
print("mean_absolute_error:",mean_absolute_error(predictions,y_test_2019))
print("mean_squared_error:",mean_squared_error(predictions,y_test_2019))
print("rmse:",sqrt(mean_squared_error(predictions,y_test_2019)))

mean_absolute_error: 8.397905759162304
mean_squared_error: 122.47120418848168
rmse: 11.066670871968755


In [153]:
Counter(sorted(p))

Counter({1.0: 4, 2.0: 2, 3.0: 1, 6.0: 2, 8.0: 1, 9.0: 1, 12.0: 1, 13.0: 1})

In [154]:
Counter(sorted(n))

Counter({0.0: 3,
         1.0: 5,
         2.0: 8,
         3.0: 2,
         5.0: 1,
         6.0: 3,
         7.0: 2,
         8.0: 1,
         9.0: 2,
         12.0: 2,
         13.0: 2,
         15.0: 1,
         22.0: 1,
         34.0: 1})

In [155]:
Counter(sorted(m))

Counter({0.0: 4,
         1.0: 5,
         2.0: 8,
         3.0: 6,
         4.0: 2,
         5.0: 2,
         6.0: 3,
         7.0: 2,
         8.0: 2,
         9.0: 5,
         10.0: 2,
         12.0: 4,
         13.0: 2,
         14.0: 1,
         15.0: 1,
         17.0: 1,
         20.0: 1,
         22.0: 1,
         25.0: 1,
         27.0: 1,
         32.0: 1,
         34.0: 2})

In [156]:
Counter(sorted(q))

Counter({2.0: 1,
         3.0: 1,
         6.0: 1,
         12.0: 4,
         13.0: 2,
         15.0: 2,
         16.0: 2,
         17.0: 4,
         18.0: 2,
         19.0: 3,
         20.0: 2,
         21.0: 4,
         22.0: 3,
         23.0: 6,
         24.0: 4,
         26.0: 2,
         27.0: 2,
         28.0: 2,
         29.0: 3,
         30.0: 1,
         31.0: 2,
         32.0: 1,
         33.0: 4,
         34.0: 2,
         35.0: 2,
         37.0: 1,
         38.0: 1,
         39.0: 1,
         41.0: 1,
         44.0: 1,
         48.0: 1,
         49.0: 1})

In [157]:
print('0:',p1)
print('1:',n1)
print('3:',m1)
print('NA:',q1)

0: [4, 42, 73, 80, 91, 93, 105, 123, 138, 140, 142, 157, 184]
1: [4, 21, 25, 32, 35, 42, 43, 55, 67, 68, 72, 73, 80, 82, 88, 90, 91, 92, 93, 94, 95, 105, 119, 123, 138, 140, 142, 148, 151, 157, 167, 171, 175, 184]
3: [4, 5, 9, 21, 25, 32, 35, 37, 42, 43, 44, 45, 50, 53, 55, 67, 68, 70, 72, 73, 74, 75, 78, 80, 82, 88, 90, 91, 92, 93, 94, 95, 96, 103, 105, 108, 110, 111, 119, 123, 134, 137, 138, 140, 142, 148, 151, 154, 156, 157, 158, 159, 167, 171, 175, 184, 190]
NA: [0, 1, 6, 10, 13, 14, 15, 17, 18, 19, 24, 26, 28, 29, 30, 31, 33, 34, 38, 39, 41, 46, 47, 52, 54, 58, 60, 61, 63, 64, 65, 76, 79, 87, 89, 100, 106, 107, 115, 121, 124, 126, 127, 129, 131, 133, 135, 139, 141, 143, 144, 145, 149, 152, 155, 162, 163, 164, 166, 168, 170, 172, 173, 174, 176, 180, 181, 182, 187]


In [158]:
RF_2019_dataframe = testing_2019.copy()
RF_2019_dataframe['Prediction'] = np.round(predictions)
RF_2019_dataframe['Difference'] = RF_2019_dataframe.SixthYear_HR - RF_2019_dataframe.Prediction

In [191]:
RF_2019_dataframe[RF_2019_dataframe.SixthYear_HR > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference
8,2014,111.0,432.0,58.0,124.0,34.0,2.0,18.0,61.0,2.0,...,8.0,14.0,Nolan,Arenado,Nolan James,215.0,74.0,28.0,33.0,8.0
38,2014,159.0,613.0,87.0,166.0,32.0,2.0,40.0,108.0,4.0,...,3.0,14.0,Nelson,Cruz,Nelson Ramon,230.0,74.0,39.0,16.0,25.0
170,2014,24.0,89.0,11.0,26.0,8.0,1.0,5.0,20.0,1.0,...,4.0,16.0,Jorge,Soler,Jorge Carlos,235.0,76.0,27.0,2.0,46.0
174,2014,85.0,244.0,33.0,59.0,9.0,1.0,4.0,23.0,3.0,...,6.0,12.0,Eugenio,Suarez,Eugenio Alejandro,213.0,71.0,28.0,15.0,34.0
179,2014,157.0,602.0,115.0,173.0,39.0,9.0,36.0,111.0,16.0,...,4.0,5.0,Mike,Trout,Michael Nelson,235.0,74.0,28.0,39.0,6.0
187,2014,144.0,582.0,94.0,165.0,30.0,6.0,9.0,54.0,21.0,...,3.0,8.0,Christian,Yelich,Christian Stephen,195.0,75.0,28.0,13.0,31.0


### Neural Networks

In [303]:
model_2017 = Sequential()
model_2017.add(Dense(1024, input_dim = 105))
model_2017.add(Dropout(rate=0.5))
model_2017.add(Dense(512, activation="relu"))
model_2017.add(Dropout(rate=0.5))
model_2017.add(Dense(128, activation="relu"))
model_2017.add(Dense(1, activation='relu'))
adam = optimizers.Adam(lr = 0.001)
model_2017.compile(loss = "mse", optimizer=adam, metrics=["accuracy"])
model_2017.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_68 (Dense)             (None, 1024)              108544    
_________________________________________________________________
dropout_33 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_69 (Dense)             (None, 512)               524800    
_________________________________________________________________
dropout_34 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_70 (Dense)             (None, 128)               65664     
_________________________________________________________________
dense_71 (Dense)             (None, 1)                 129       
Total params: 699,137
Trainable params: 699,137
Non-trainable params: 0
_________________________________________________________________


In [304]:
#model_9 LSTM128, FC9

model_2018 = Sequential()
model_2018.add(Dense(1024, input_dim = 105))
model_2018.add(Dropout(rate=0.5))
model_2018.add(Dense(512, activation="relu"))
model_2018.add(Dropout(rate=0.5))
model_2018.add(Dense(128, activation="relu"))
model_2018.add(Dense(1, activation='relu'))
adam = optimizers.Adam(lr = 0.001)
model_2018.compile(loss = "mse", optimizer=adam, metrics=["accuracy"])
model_2018.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_72 (Dense)             (None, 1024)              108544    
_________________________________________________________________
dropout_35 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_73 (Dense)             (None, 512)               524800    
_________________________________________________________________
dropout_36 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_74 (Dense)             (None, 128)               65664     
_________________________________________________________________
dense_75 (Dense)             (None, 1)                 129       
Total params: 699,137
Trainable params: 699,137
Non-trainable params: 0
_________________________________________________________________


### Training

In [305]:
from sklearn.utils import class_weight
class_weight = class_weight.compute_class_weight('balanced'
                                               ,np.unique(y_train_class)
                                               ,y_train_class)

filepath = "saved-model-{epoch:03d}-{val_acc:.4f}-2018.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
model_2017_stat = model_2017.fit(x_train_2017, y_train_2017, 
                              batch_size=128, 
                              epochs=10, 
                              validation_data=(x_test_2018, y_test_2018),
                              callbacks=[checkpoint],
                              shuffle=True,
                              class_weight=class_weight
                             )

Train on 9828 samples, validate on 184 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.02717, saving model to B/saved-model-001-0.0272.hdf5
Epoch 2/10

Epoch 00002: val_acc did not improve
Epoch 3/10

Epoch 00003: val_acc did not improve
Epoch 4/10

Epoch 00004: val_acc did not improve
Epoch 5/10

Epoch 00005: val_acc did not improve
Epoch 6/10

Epoch 00006: val_acc did not improve
Epoch 7/10

Epoch 00007: val_acc did not improve
Epoch 8/10

Epoch 00008: val_acc did not improve
Epoch 9/10

Epoch 00009: val_acc did not improve
Epoch 10/10

Epoch 00010: val_acc did not improve


In [306]:
from sklearn.utils import class_weight
class_weight = class_weight.compute_class_weight('balanced'
                                               ,np.unique(y_train_class)
                                               ,y_train_class)

filepath = "saved-model-{epoch:03d}-{val_acc:.4f}-2019.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
model_2018_stat = model_2018.fit(x_train_2018, y_train_2018, 
                              batch_size=128, 
                              epochs=10, 
                              validation_data=(x_test_2019, y_test_2019),
                              callbacks=[checkpoint],
                              shuffle=True,
                              class_weight=class_weight
                             )

Train on 10012 samples, validate on 191 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.02094, saving model to C/saved-model-001-0.0209.hdf5
Epoch 2/10

Epoch 00002: val_acc did not improve
Epoch 3/10

Epoch 00003: val_acc did not improve
Epoch 4/10

Epoch 00004: val_acc did not improve
Epoch 5/10

Epoch 00005: val_acc did not improve
Epoch 6/10

Epoch 00006: val_acc did not improve
Epoch 7/10

Epoch 00007: val_acc did not improve
Epoch 8/10

Epoch 00008: val_acc did not improve
Epoch 9/10

Epoch 00009: val_acc did not improve
Epoch 10/10

Epoch 00010: val_acc did not improve


In [73]:
BBB = load_model('ML/saved-model-001-0.0272-2018.hdf5')
CCC = load_model('ML/saved-model-001-0.0209-2019.hdf5')

In [134]:
t = 0
a = 0
b = 0
c = 0
d = 0
g = []
l = []
m = []
m1 = []
n = []
n1 = []
p = []
p1 = []
q = []
q1 = []
predictions = BBB.predict(x_test_2018)
for i in range(len(y_test_2018)):
    k = np.round(predictions)[i] - y_test_2018[i]
    if abs(k) < 11:
        d += 1
        g.append(y_test_2018[i])
        if abs(k) < 6:
            c += 1
            l.append(y_test_2018[i])
            if abs(k) < 4:
                b += 1
                m.append(y_test_2018[i])
                m1.append(i)
                if abs(k) < 2:
                    a += 1
                    n.append(y_test_2018[i])
                    n1.append(i)
                    if abs(k) == 0:
                        t += 1
                        p.append(y_test_2018[i])
                        p1.append(i)
    else:
        q.append(y_test_2018[i])
        q1.append(i)
print("答對：%.4f" % (t/len(y_test_2018)))
print("正負1：%.4f" % (a/len(y_test_2018)))
print("正負3：%.4f" % (b/len(y_test_2018)))
print("正負5：%.4f" % (c/len(y_test_2018)))
print("正負10：%.4f" % (d/len(y_test_2018)))

答對：0.0272
正負1：0.0978
正負3：0.1522
正負5：0.2391
正負10：0.4457


In [135]:
print("mean_absolute_error:",mean_absolute_error(predictions,y_test_2018))
print("mean_squared_error:",mean_squared_error(predictions,y_test_2018))
print("rmse:",sqrt(mean_squared_error(predictions,y_test_2018)))

mean_absolute_error: 13.293478260869565
mean_squared_error: 273.4673913043478
rmse: 16.53684949754178


In [136]:
Counter(sorted(p))

Counter({0.0: 5})

In [137]:
Counter(sorted(n))

Counter({0.0: 5, 1.0: 13})

In [138]:
Counter(sorted(m))

Counter({0.0: 5, 1.0: 13, 2.0: 6, 3.0: 4})

In [139]:
Counter(sorted(q))

Counter({11.0: 10,
         12.0: 6,
         13.0: 9,
         14.0: 7,
         15.0: 9,
         16.0: 6,
         17.0: 2,
         18.0: 3,
         19.0: 3,
         20.0: 5,
         21.0: 7,
         22.0: 2,
         23.0: 9,
         24.0: 3,
         25.0: 3,
         27.0: 3,
         28.0: 1,
         29.0: 1,
         30.0: 1,
         32.0: 1,
         33.0: 1,
         34.0: 1,
         36.0: 2,
         37.0: 2,
         38.0: 2,
         39.0: 1,
         43.0: 1,
         48.0: 1})

In [140]:
print('0:',p1)
print('1:',n1)
print('3:',m1)
print('NA:',q1)

0: [65, 77, 82, 101, 155]
1: [15, 27, 31, 36, 39, 49, 65, 77, 82, 101, 115, 137, 139, 142, 147, 155, 156, 171]
3: [2, 13, 15, 21, 27, 31, 36, 39, 47, 49, 55, 65, 77, 82, 85, 88, 93, 94, 101, 115, 136, 137, 139, 142, 147, 155, 156, 171]
NA: [0, 3, 4, 7, 10, 11, 12, 14, 16, 17, 19, 23, 25, 28, 29, 32, 34, 35, 37, 38, 40, 41, 42, 43, 45, 46, 50, 52, 53, 57, 58, 59, 60, 61, 62, 63, 64, 66, 67, 70, 71, 72, 74, 75, 76, 79, 80, 84, 87, 90, 92, 95, 97, 99, 100, 102, 103, 104, 106, 109, 110, 111, 112, 119, 122, 123, 124, 125, 126, 127, 128, 131, 133, 135, 138, 140, 143, 144, 146, 148, 149, 151, 153, 157, 158, 159, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 174, 175, 176, 179, 181, 183]


In [141]:
BB = testing_2018.copy()
BB['Prediction'] = np.round(predictions)
BB['Difference'] = BB.SixthYear_HR - BB.Prediction

In [143]:
t = 0
a = 0
b = 0
c = 0
d = 0
g = []
l = []
m = []
m1 = []
n = []
n1 = []
p = []
p1 = []
q = []
q1 = []
predictions = CCC.predict(x_test_2019)
for i in range(len(y_test_2019)):
    k = np.round(predictions)[i] - y_test_2019[i]
    if abs(k) < 11:
        d += 1
        g.append(y_test_2019[i])
        if abs(k) < 6:
            c += 1
            l.append(y_test_2019[i])
            if abs(k) < 4:
                b += 1
                m.append(y_test_2019[i])
                m1.append(i)
                if abs(k) < 2:
                    a += 1
                    n.append(y_test_2019[i])
                    n1.append(i)
                    if abs(k) == 0:
                        t += 1
                        p.append(y_test_2019[i])
                        p1.append(i)
    else:
        q.append(y_test_2019[i])
        q1.append(i)
print("答對：%.4f" % (t/len(y_test_2019)))
print("正負1：%.4f" % (a/len(y_test_2019)))
print("正負3：%.4f" % (b/len(y_test_2019)))
print("正負5：%.4f" % (c/len(y_test_2019)))
print("正負10：%.4f" % (d/len(y_test_2019)))

答對：0.0209
正負1：0.0733
正負3：0.1885
正負5：0.2356
正負10：0.3927


In [144]:
print("mean_absolute_error:",mean_absolute_error(predictions,y_test_2019))
print("mean_squared_error:",mean_squared_error(predictions,y_test_2019))
print("rmse:",sqrt(mean_squared_error(predictions,y_test_2019)))

mean_absolute_error: 15.178010471204189
mean_squared_error: 360.3193717277487
rmse: 18.982080279246233


In [145]:
Counter(sorted(p))

Counter({0.0: 4})

In [146]:
Counter(sorted(n))

Counter({0.0: 4, 1.0: 10})

In [147]:
Counter(sorted(m))

Counter({0.0: 4, 1.0: 10, 2.0: 14, 3.0: 8})

In [148]:
Counter(sorted(q))

Counter({11.0: 8,
         12.0: 15,
         13.0: 5,
         14.0: 4,
         15.0: 5,
         16.0: 4,
         17.0: 7,
         18.0: 3,
         19.0: 4,
         20.0: 5,
         21.0: 4,
         22.0: 4,
         23.0: 7,
         24.0: 4,
         25.0: 1,
         26.0: 2,
         27.0: 3,
         28.0: 2,
         29.0: 3,
         30.0: 1,
         31.0: 2,
         32.0: 2,
         33.0: 4,
         34.0: 4,
         35.0: 3,
         36.0: 1,
         37.0: 1,
         38.0: 1,
         39.0: 1,
         41.0: 2,
         44.0: 1,
         45.0: 1,
         48.0: 1,
         49.0: 1})

In [149]:
print('0:',p1)
print('1:',n1)
print('3:',m1)
print('NA:',q1)

0: [35, 82, 96, 175]
1: [4, 35, 42, 82, 84, 88, 96, 99, 138, 157, 169, 175, 178, 189]
3: [4, 32, 35, 42, 43, 66, 67, 70, 71, 74, 78, 81, 82, 84, 88, 91, 93, 96, 99, 119, 126, 128, 132, 134, 138, 142, 148, 151, 157, 161, 169, 171, 173, 175, 178, 189]
NA: [0, 1, 3, 6, 7, 8, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 22, 23, 24, 25, 26, 28, 29, 30, 31, 33, 34, 36, 37, 38, 39, 40, 41, 44, 45, 46, 47, 48, 52, 53, 54, 56, 58, 59, 60, 61, 62, 63, 64, 65, 68, 69, 72, 73, 75, 76, 77, 79, 83, 86, 87, 89, 90, 92, 97, 100, 101, 103, 106, 107, 109, 111, 112, 115, 116, 120, 121, 124, 127, 129, 130, 131, 133, 135, 139, 140, 141, 143, 144, 146, 149, 150, 152, 153, 154, 155, 158, 162, 163, 164, 165, 166, 168, 170, 172, 174, 176, 177, 179, 180, 181, 182, 183, 185, 186, 187]


### Analysis

In [150]:
CC = testing_2019.copy()
CC['Prediction'] = np.round(predictions)
CC['Difference'] = CC.SixthYear_HR - CC.Prediction

In [197]:
regr_2018_dataframe[regr_2018_dataframe.Prediction > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference


In [204]:
len(regr_2018_dataframe[regr_2018_dataframe.Difference < 0])

101

In [205]:
len(regr_2018_dataframe[regr_2018_dataframe.Difference == 0])

8

In [206]:
len(regr_2018_dataframe[regr_2018_dataframe.Difference > 0])

75

In [196]:
SVM_2018_dataframe[SVM_2018_dataframe.Prediction > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference
38,2013,56.0,136.0,27.0,38.0,10.0,0.0,11.0,27.0,3.0,...,7.0,16.0,Khris,Davis,Khristopher Adrian,205.0,71.0,31.0,43.0,5.0
111,2013,86.0,296.0,24.0,74.0,17.0,0.0,7.0,36.0,2.0,...,7.0,19.0,J. D.,Martinez,Julio Daniel,230.0,75.0,31.0,40.0,3.0


In [203]:
len(SVM_2018_dataframe[SVM_2018_dataframe.Difference < 0])

115

In [207]:
len(SVM_2018_dataframe[SVM_2018_dataframe.Difference == 0])

16

In [208]:
len(SVM_2018_dataframe[SVM_2018_dataframe.Difference > 0])

53

In [195]:
RF_2018_dataframe[RF_2018_dataframe.Prediction > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference


In [202]:
len(RF_2018_dataframe[RF_2018_dataframe.Difference < 0])

64

In [209]:
len(RF_2018_dataframe[RF_2018_dataframe.Difference == 0])

10

In [210]:
len(RF_2018_dataframe[RF_2018_dataframe.Difference > 0])

110

In [183]:
BB[BB.SixthYear_HR > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference
38,2013,56.0,136.0,27.0,38.0,10.0,0.0,11.0,27.0,3.0,...,7.0,16.0,Khris,Davis,Khristopher Adrian,205.0,71.0,31.0,0.0,48.0
111,2013,86.0,296.0,24.0,74.0,17.0,0.0,7.0,36.0,2.0,...,7.0,19.0,J. D.,Martinez,Julio Daniel,230.0,75.0,31.0,0.0,43.0


In [211]:
len(BB[BB.Difference < 0])

0

In [212]:
len(BB[BB.Difference == 0])

5

In [213]:
len(BB[BB.Difference > 0])

179

In [194]:
regr_2019_dataframe[regr_2019_dataframe.Prediction > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference


In [201]:
len(regr_2019_dataframe[regr_2019_dataframe.Difference < 0])

71

In [214]:
len(regr_2019_dataframe[regr_2019_dataframe.Difference == 0])

6

In [215]:
len(regr_2019_dataframe[regr_2019_dataframe.Difference > 0])

114

In [193]:
SVM_2019_dataframe[SVM_2019_dataframe.Prediction > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference
83,2014,100.0,352.0,41.0,96.0,10.0,2.0,13.0,32.0,2.0,...,4.0,10.0,Bryce,Harper,Bryce Aron Max,210.0,75.0,27.0,40.0,-5.0
116,2014,123.0,441.0,57.0,139.0,30.0,3.0,23.0,76.0,6.0,...,5.0,19.0,J. D.,Martinez,Julio Daniel,230.0,75.0,32.0,44.0,-8.0


In [200]:
len(SVM_2019_dataframe[SVM_2019_dataframe.Difference < 0])

99

In [216]:
len(SVM_2019_dataframe[SVM_2019_dataframe.Difference == 0])

15

In [217]:
len(SVM_2019_dataframe[SVM_2019_dataframe.Difference > 0])

77

In [192]:
RF_2019_dataframe[RF_2019_dataframe.Prediction > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference
83,2014,100.0,352.0,41.0,96.0,10.0,2.0,13.0,32.0,2.0,...,4.0,10.0,Bryce,Harper,Bryce Aron Max,210.0,75.0,27.0,40.0,-5.0


In [199]:
len(RF_2019_dataframe[RF_2019_dataframe.Difference < 0])

43

In [218]:
len(RF_2019_dataframe[RF_2019_dataframe.Difference == 0])

13

In [219]:
len(RF_2019_dataframe[RF_2019_dataframe.Difference > 0])

135

In [184]:
CC[CC.SixthYear_HR > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference
8,2014,111.0,432.0,58.0,124.0,34.0,2.0,18.0,61.0,2.0,...,8.0,14.0,Nolan,Arenado,Nolan James,215.0,74.0,28.0,0.0,41.0
38,2014,159.0,613.0,87.0,166.0,32.0,2.0,40.0,108.0,4.0,...,3.0,14.0,Nelson,Cruz,Nelson Ramon,230.0,74.0,39.0,0.0,41.0
170,2014,24.0,89.0,11.0,26.0,8.0,1.0,5.0,20.0,1.0,...,4.0,16.0,Jorge,Soler,Jorge Carlos,235.0,76.0,27.0,0.0,48.0
174,2014,85.0,244.0,33.0,59.0,9.0,1.0,4.0,23.0,3.0,...,6.0,12.0,Eugenio,Suarez,Eugenio Alejandro,213.0,71.0,28.0,0.0,49.0
179,2014,157.0,602.0,115.0,173.0,39.0,9.0,36.0,111.0,16.0,...,4.0,5.0,Mike,Trout,Michael Nelson,235.0,74.0,28.0,0.0,45.0
187,2014,144.0,582.0,94.0,165.0,30.0,6.0,9.0,54.0,21.0,...,3.0,8.0,Christian,Yelich,Christian Stephen,195.0,75.0,28.0,0.0,44.0


In [220]:
len(CC[CC.Difference < 0])

0

In [221]:
len(CC[CC.Difference == 0])

4

In [222]:
len(CC[CC.Difference > 0])

187

### Save Models

In [223]:
from sklearn.externals import joblib

In [224]:
joblib.dump(regr, 'LR.pkl')

['LR.pkl']

In [225]:
joblib.dump(regr_2017, 'LR_2017.pkl')

['LR_2017.pkl']

In [226]:
joblib.dump(regr_2018, 'LR_2018.pkl')

['LR_2018.pkl']

In [228]:
joblib.dump(clf, 'SVM.pkl')

['SVM.pkl']

In [229]:
joblib.dump(clf_2017, 'SVM_2017.pkl')

['SVM_2017.pkl']

In [230]:
joblib.dump(clf_2018, 'SVM_2018.pkl')

['SVM_2018.pkl']

In [231]:
joblib.dump(forest, 'RF.pkl')

['RF.pkl']

In [232]:
joblib.dump(forest_2017, 'RF_2017.pkl')

['RF_2017.pkl']

In [233]:
joblib.dump(forest_2018, 'RF_2018.pkl')

['RF_2018.pkl']