In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import json
# import os
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
def Train_Test_XY_normalize_PCA(TrainX, TrainY, PCA_n = 40):
    
    # normalize data
    scaler_new = StandardScaler()
    scaler_new.fit(TrainX)
    X_train_scale = scaler_new.transform(TrainX)
    print(f'X shape {X_train_scale.shape}')
    
    # dimension reduction
    TrainX_PCA = Train_Test_XY_PCA(X_train_scale, PCA_n = PCA_n)
    print(f'X shape {TrainX_PCA.shape}')
    
    X_train, X_test, y_train, y_test = train_test_split(TrainX_PCA,TrainY,
                                                       test_size=.2,
                                                       shuffle=True,
                                                       random_state=2020,
                                                       stratify=TrainY)
    return X_train, X_test, y_train, y_test

In [3]:
def Train_Test_XY_PCA(TrainX, PCA_n = 40):
    pca = PCA(n_components=PCA_n)
    pca.fit(TrainX)
    TrainX = pca.transform(TrainX)
    print(f'PCA accuracy {np.sum(pca.explained_variance_ratio_)}')
    return TrainX

In [4]:
def NN_prototype(TrainX, TrainY, layers, PCA_n = 40):
    X_train, X_test, y_train, y_test = Train_Test_XY_normalize_PCA(TrainX, TrainY, PCA_n = PCA_n)
    
    print(f'mean of one of the features {np.mean(X_train[:,3])}')
    print(f'var of one of the features {np.var(X_train[:,3])}')
    
    X_train_train, X_val, y_train_train, y_val = train_test_split(X_train,
                                                                  y_train,
                                                                  test_size=.2,
                                                                  shuffle=True,
                                                                  random_state=2021,
                                                                  stratify=y_train)

    accuracy_Tr = []; accuracy_V = []
    for l in layers:
        print(f'Working with structure {l}')
        mlp = MLPClassifier(hidden_layer_sizes=l, max_iter=5000)
        mlp.fit(X_train_train, y_train_train)
        a_train = np.round(100*accuracy_score(y_train_train, mlp.predict(X_train_train)),2)
        a_valid = np.round(100*accuracy_score(y_val, mlp.predict(X_val)),2)
        print(f"training accuracy is {a_train}, vaidation accuracy is {a_valid}.")
        print(f'home win rate (validation set) = {sum(mlp.predict(X_val))/len(mlp.predict(X_val))}')
        print()
        accuracy_Tr.append(a_train); accuracy_V.append(a_valid)
    return accuracy_Tr, accuracy_V

# Read training X and training Y data

In [5]:
g_list = np.array([0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1])
filename = str(20142021)+'_'+str(0.9)+'.csv'

In [6]:
Traindf = pd.read_csv(filename)
Traindf = Traindf.dropna()
columns_name = Traindf.columns
TrainX_array = Traindf[columns_name[:-1]].to_numpy()
TrainY_array = Traindf[columns_name[-1]].to_numpy()

home wins are smaller than away wins

In [7]:
sum(TrainY_array)/len(TrainY_array)

0.44613374510036824

# simple NN model

In [8]:
see_column = np.array(['AvAge', 'W/GP', 'P/GP', 'W', 'PTS', 'GF/G', 'GA/G',
                       'SRS', 'SOS', 'PP%', 'PPA'])

column_home = np.array([i+'_H' for i in see_column])
column_away = np.array([i+'_A' for i in see_column])

column_name = np.concatenate((column_home,column_away))
column_name = np.concatenate((column_name,['class']))

In [9]:
# layers = [(10,), (10, 20), (10, 20, 10)]
layers = [(5,), (5, 10), (5, 10, 5)]

In [10]:
for g in g_list:
    print('---------------------')
    print(f'------g = {g}--------')
    filename = str(20142021)+'_'+str(g)+'.csv'
    Traindf = pd.read_csv(filename)[column_name]
    Traindf = Traindf.dropna()
    columns_name = Traindf.columns
    TrainX_array = Traindf[columns_name[:-1]].to_numpy()
    TrainY_array = Traindf[columns_name[-1]].to_numpy()
    
    NN_prototype(TrainX_array, TrainY_array, layers, PCA_n = 15)
    print()

---------------------
------g = 0.9--------
X shape (8683, 22)
PCA accuracy 0.9980929238903969
X shape (8683, 15)
mean of one of the features 0.007808411611065889
var of one of the features 1.1521589789635702
Working with structure (5,)
training accuracy is 57.16, vaidation accuracy is 55.04.
home win rate (validation set) = 0.24676258992805755

Working with structure (5, 10)
training accuracy is 58.03, vaidation accuracy is 53.74.
home win rate (validation set) = 0.3302158273381295

Working with structure (5, 10, 5)
training accuracy is 58.44, vaidation accuracy is 54.96.
home win rate (validation set) = 0.20719424460431654


---------------------
------g = 0.8--------
X shape (8683, 22)
PCA accuracy 0.9981013101611644
X shape (8683, 15)
mean of one of the features -0.007996413045609512
var of one of the features 1.1514537619557137
Working with structure (5,)
training accuracy is 57.24, vaidation accuracy is 55.18.
home win rate (validation set) = 0.23237410071942446

Working with str