In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import os
import random
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import plot_importance
from sklearn import svm
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [None]:
path = os.getcwd()

In [4]:
train_PL = pd.read_csv('PL_preprocessed.csv')
train_LIGA = pd.read_csv('LIGA_preprocessed.csv')
train_SE = pd.read_csv('SE_preprocessed.csv')
train_BL = pd.read_csv('BL_preprocessed.csv')
test_PL = pd.read_csv('PL_test_pp.csv')
test_LIGA = pd.read_csv('LIGA_test_pp.csv')
test_SE = pd.read_csv('SE_test_pp.csv')
test_BL = pd.read_csv('BL_test_pp.csv')

In [5]:
train_df = pd.concat([train_PL, train_LIGA, train_SE, train_BL], axis=0)
test_df = pd.concat([test_PL, test_LIGA, test_SE, test_BL], axis=0)

In [6]:
train_df.columns

Index(['season', 'home', 'away', 'full_home_score', 'full_away_score',
       'result', 'home_team', 'P_h', 'W_h', 'D_h', 'L_h', 'GF_h', 'GA_h',
       'GD_h', 'Pts_h', 'raised_h', 'home_bs', 'away_bs', 'h_skill', 'HA',
       'away_team', 'P_a', 'W_a', 'D_a', 'L_a', 'GF_a', 'GA_a', 'GD_a',
       'Pts_a', 'raised_a', 'a_skill', 'home_team_fixed', 'home_bs_fixed',
       'away_bs_fixed', 'h_skill_fixed', 'HA_fixed', 'away_team_fixed',
       'a_skill_fixed'],
      dtype='object')

### Train/test set

In [7]:
# set1 : 이전 시즌의 팀별 스텟
# set2 : 동일한 HA 적용한 model에서 얻은 parameter 추가
# set3 : 코로나 이후 변화한 HA 를 적용한 model에서 얻은 parameter 추가
set1 = ['W_h', 'D_h', 'L_h', 'GF_h', 'GA_h', 'GD_h', 'Pts_h', 'raised_h',
        'W_a', 'D_a', 'L_a', 'GF_a', 'GA_a', 'GD_a', 'Pts_h', 'raised_a']
set2 = set1 + ['home_bs_fixed', 'away_bs_fixed', 'h_skill_fixed', 'a_skill_fixed', 'HA_fixed']
set3 = set1 + ['home_bs', 'away_bs', 'h_skill', 'a_skill', 'HA']

In [70]:
train_x_1, train_y_1 = train_df[set1], train_df['result']
X_test_1, y_test_1 = test_df[set1], test_df['result']
train_x_2, train_y_2 = train_df[set2], train_df['result']
X_test_2, y_test_2 = test_df[set2], test_df['result']
train_x_3, train_y_3 = train_df[set3], train_df['result']
X_test_3, y_test_3 = test_df[set3], test_df['result']

In [9]:
train_x

Unnamed: 0,W_h,D_h,L_h,GF_h,GA_h,GD_h,Pts_h,raised_h,W_a,D_a,...,GF_a,GA_a,GD_a,Pts_h.1,raised_a,home_bs,away_bs,h_skill,a_skill,HA
0,13,8,17,45,52,-7,47,0,14,10,...,47,40,7,47,0,0.409582,0.141514,-0.391699,0.000000,0.354171
1,21,7,10,74,49,25,70,0,11,12,...,45,46,-1,70,0,0.409582,0.141514,-0.086634,-0.398331,0.354171
2,10,7,21,43,66,-23,37,0,12,11,...,44,51,-7,37,0,0.409582,0.141514,-0.537188,-0.370428,0.354171
3,14,10,14,48,51,-3,52,0,12,11,...,52,66,-14,52,0,0.409582,0.141514,-0.438315,-0.482256,0.354171
4,19,8,11,56,51,5,65,0,20,9,...,66,41,25,65,0,0.409582,0.141514,-0.388565,-0.148938,0.354171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2361,16,7,11,62,50,12,55,0,19,9,...,63,29,34,55,0,0.530207,0.275442,-0.269599,-0.050384,0.382172
2362,16,7,11,55,42,13,55,0,23,7,...,81,44,37,55,0,0.530207,0.275442,-0.212703,0.000000,0.382172
2363,12,7,15,46,57,-11,43,0,13,5,...,49,65,-16,43,0,0.530207,0.275442,-0.432233,-0.506149,0.382172
2364,24,6,4,88,32,56,78,0,8,8,...,51,71,-20,78,0,0.530207,0.275442,0.230830,-0.448249,0.382172


In [10]:
X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, test_size = 0.3, random_state=1)

### logistic Regression

In [71]:
lm = linear_model.LogisticRegression(multi_class='ovr', solver='liblinear', C=10)
lm.fit(train_x_1, train_y_1)
lm.score(X_test_1, y_test_1)

0.5062326869806094

In [72]:
lm = linear_model.LogisticRegression(multi_class='ovr', solver='liblinear', C=10)
lm.fit(train_x_2, train_y_2)
lm.score(X_test_2, y_test_2)

0.5408587257617729

In [73]:
lm = linear_model.LogisticRegression(multi_class='ovr', solver='liblinear', C=10)
lm.fit(train_x_3, train_y_3)
lm.score(X_test_3, y_test_3)

0.5470914127423823

## MLP

In [294]:
train_x, train_y = train_df[set3], train_df['result']
X_test, y_test = test_df[set3], test_df['result']

In [78]:
scaler1 = StandardScaler()
scaler2 = StandardScaler()
scaler3 = StandardScaler()
scaler1.fit(pd.concat([train_x_1, X_test_1], axis=0).values)
scaler2.fit(pd.concat([train_x_2, X_test_2], axis=0).values)
scaler3.fit(pd.concat([train_x_3, X_test_3], axis=0).values)
train_x_no_1 = scaler1.transform(train_x_1.values)
train_x_no_2 = scaler2.transform(train_x_2.values)
train_x_no_3 = scaler3.transform(train_x_3.values)
X_test_no_1 = scaler1.transform(X_test_1.values)
X_test_no_2 = scaler2.transform(X_test_2.values)
X_test_no_3 = scaler3.transform(X_test_3.values)

In [79]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-2,
                     hidden_layer_sizes=(8), random_state=1, max_iter = 1000)

In [87]:
clf.fit(train_x_no_1, train_y)
clf.score(X_test_no_1, y_test)

0.4979224376731302

In [91]:
clf.fit(train_x_no_2, train_y)
clf.score(X_test_no_2, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


0.5380886426592798

In [95]:
clf.fit(train_x_no_3, train_y)
clf.score(X_test_no_3, y_test)

0.5380886426592798

In [96]:
#################### RANDOM FOREST CLASSIFIER
####################

In [101]:
rf = RandomForestClassifier(n_estimators=100, max_features=5)

In [102]:
rf.fit(train_x_1, train_y)
rf.score(X_test_1,y_test)

0.46883656509695293

In [103]:
rf.fit(train_x_2, train_y)
rf.score(X_test_2, y_test)

0.5117728531855956

In [104]:
rf.fit(train_x_3, train_y)
rf.score(X_test_3, y_test)

0.5020775623268698

In [356]:
################### SVM 까지 비교 score 예측
###################

In [115]:
clf = svm.SVC(kernel='linear', verbose = 1, tol = 1e-4, random_state=1)

In [116]:
clf.fit(train_x_1, train_y)
clf.score(X_test_1,y_test)

[LibSVM]

0.4951523545706371

In [117]:
clf.fit(train_x_2, train_y)
clf.score(X_test_2,y_test)

[LibSVM]

0.5367036011080333

In [None]:
clf.fit(train_x_3, train_y)
clf.score(X_test_3,y_test)

[LibSVM]

In [109]:
######################### naive_bayes

In [110]:
clf = GaussianNB()
clf.fit(train_x_1, train_y)
clf.score(X_test_1, y_test)

0.4792243767313019

In [111]:
clf = GaussianNB()
clf.fit(train_x_2, train_y)
clf.score(X_test_2, y_test)

0.4826869806094183

In [112]:
clf = GaussianNB()
clf.fit(train_x_3, train_y)
clf.score(X_test_3, y_test)

0.4868421052631579