In [237]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import os
import random
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import linear_model
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import plot_importance
from sklearn import svm
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [2]:
path = os.getcwd()

In [3]:
train_PL = pd.read_csv('PL_preprocessed.csv')
train_LIGA = pd.read_csv('LIGA_preprocessed.csv')
train_SE = pd.read_csv('SE_preprocessed.csv')
train_BL = pd.read_csv('BL_preprocessed.csv')
test_PL = pd.read_csv('PL_test_pp_new.csv')
test_LIGA = pd.read_csv('LIGA_test_pp_new.csv')
test_SE = pd.read_csv('SE_test_pp_new.csv')
test_BL = pd.read_csv('BL_test_pp_new.csv')

In [4]:
train_df = pd.concat([train_PL, train_LIGA, train_SE, train_BL], axis=0)
test_df = pd.concat([test_PL, test_LIGA, test_SE, test_BL], axis=0)

In [5]:
train_df.columns

Index(['season', 'home', 'away', 'full_home_score', 'full_away_score',
       'result', 'home_team', 'P_h', 'W_h', 'D_h', 'L_h', 'GF_h', 'GA_h',
       'GD_h', 'Pts_h', 'raised_h', 'home_bs', 'away_bs', 'h_skill', 'HA',
       'away_team', 'P_a', 'W_a', 'D_a', 'L_a', 'GF_a', 'GA_a', 'GD_a',
       'Pts_a', 'raised_a', 'a_skill', 'home_team_fixed', 'home_bs_fixed',
       'away_bs_fixed', 'h_skill_fixed', 'HA_fixed', 'away_team_fixed',
       'a_skill_fixed'],
      dtype='object')

In [6]:
############ lamba 추가 set
train_df['h_lambda'] = np.exp(train_df['home_bs'] + train_df['h_skill'] - train_df['a_skill'])
train_df['a_lambda'] = np.exp(train_df['away_bs'] + train_df['a_skill'] - train_df['h_skill'])
train_df['h_lambda_fixed'] = np.exp(train_df['home_bs_fixed'] + 
                                    train_df['h_skill_fixed'] - train_df['a_skill_fixed'])
train_df['a_lambda_fixed'] = np.exp(train_df['away_bs_fixed'] +
                                    train_df['a_skill_fixed'] - train_df['h_skill_fixed'])
test_df['h_lambda'] = np.exp(test_df['home_bs'] + test_df['h_skill'] - test_df['a_skill'])
test_df['a_lambda'] = np.exp(test_df['away_bs'] + test_df['a_skill'] - test_df['h_skill'])
test_df['h_lambda_fixed'] = np.exp(test_df['home_bs_fixed'] + 
                                    test_df['h_skill_fixed'] - test_df['a_skill_fixed'])
test_df['a_lambda_fixed'] = np.exp(test_df['away_bs_fixed'] +
                                    test_df['a_skill_fixed'] - test_df['h_skill_fixed'])

# set1 : 이전 시즌의 팀별 스텟
# set2 : 동일한 HA 적용한 model에서 얻은 parameter 추가
# set3 : 코로나 이후 변화한 HA 를 적용한 model에서 얻은 parameter 추가
set1 = ['W_h', 'D_h', 'L_h', 'GF_h', 'GA_h', 'GD_h', 'Pts_h', 'raised_h',
        'W_a', 'D_a', 'L_a', 'GF_a', 'GA_a', 'GD_a', 'Pts_h', 'raised_a']
set2 = set1 + ['home_bs_fixed', 'away_bs_fixed', 'h_skill_fixed', 'a_skill_fixed', 
               'HA_fixed', 'h_lambda_fixed', 'a_lambda_fixed']
set3 = set1 + ['home_bs', 'away_bs', 'h_skill', 'a_skill', 'HA', 'h_lambda', 'a_lambda']

### Train/test set

In [7]:
# set1 : 이전 시즌의 팀별 스텟
# set2 : 동일한 HA 적용한 model에서 얻은 parameter 추가
# set3 : 코로나 이후 변화한 HA 를 적용한 model에서 얻은 parameter 추가
set1 = ['W_h', 'D_h', 'L_h', 'GF_h', 'GA_h', 'GD_h', 'Pts_h', 'raised_h',
        'W_a', 'D_a', 'L_a', 'GF_a', 'GA_a', 'GD_a', 'Pts_h', 'raised_a']
set2 = set1 + ['home_bs_fixed', 'away_bs_fixed', 'h_skill_fixed', 'a_skill_fixed', 
               'HA_fixed']
set3 = set1 + ['home_bs', 'away_bs', 'h_skill', 'a_skill', 'HA']

In [8]:
train_x_1, train_y_1 = train_df[set1], train_df['result']
X_test_1, y_test_1 = test_df[set1], test_df['result']
train_x_2, train_y_2 = train_df[set2], train_df['result']
X_test_2, y_test_2 = test_df[set2], test_df['result']
train_x_3, train_y_3 = train_df[set3], train_df['result']
X_test_3, y_test_3 = test_df[set3], test_df['result']

In [244]:
X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, test_size = 0.3, random_state=1)

### logistic Regression

In [265]:
lm = linear_model.LogisticRegression(multi_class='ovr', solver='liblinear', C=10)
lm.fit(train_x_1, train_y_1)
lm.score(X_test_1, y_test_1)

0.5062326869806094

In [266]:
RPS(lm.predict_proba(X_test_1), enc.fit_transform(np.array(y_test_1).reshape(-1,1)).toarray())

0.2011134079806868

In [172]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

In [174]:
a = np.array([0.6, 0.2, 0.2])

In [80]:
enc = OneHotEncoder()

In [94]:
y_test_1_enc = enc.fit_transform(np.array(y_test_1).reshape(-1,1)).toarray()

In [103]:
def RMS(p_proba, target_y) :
    output = ((p_proba - target_y)**2).sum()/(target_y.shape[0])
    return(output)

In [325]:
def RPS(p_proba, target_y) :
    output = ((p_proba - target_y)**2)[:, 2]
    output2 = np.sum((p_proba - target_y)[:,1:2], axis=1)**2
    output3 = np.sum((p_proba - target_y), axis=1)**2
    output = output + output2 + output3
    output /= 2
    output = np.mean(output)
    return(output)

In [279]:
lm = linear_model.LogisticRegression(multi_class='ovr', solver='liblinear', C=10)
lm.fit(train_x_2, train_y_2)
lm.score(X_test_2, y_test_2)

0.5207756232686981

In [280]:
RPS(lm.predict_proba(X_test_2), enc.fit_transform(np.array(y_test_2).reshape(-1,1)).toarray())

0.2000821013422271

In [281]:
lm = linear_model.LogisticRegression(multi_class='ovr', solver='liblinear', C=10)
lm.fit(train_x_3, train_y_3)
lm.score(X_test_3, y_test_3)

0.5228531855955678

In [282]:
RPS(lm.predict_proba(X_test_3), enc.fit_transform(np.array(y_test_3).reshape(-1,1)).toarray())

0.19985719764598608

In [102]:
0.5**2 + 0.5**2 + 0.5**2

0.75

## MLP

In [283]:
train_x, train_y = train_df[set3], train_df['result']
X_test, y_test = test_df[set3], test_df['result']

In [284]:
scaler1 = StandardScaler()
scaler2 = StandardScaler()
scaler3 = StandardScaler()
scaler1.fit(pd.concat([train_x_1, X_test_1], axis=0).values)
scaler2.fit(pd.concat([train_x_2, X_test_2], axis=0).values)
scaler3.fit(pd.concat([train_x_3, X_test_3], axis=0).values)
train_x_no_1 = scaler1.transform(train_x_1.values)
train_x_no_2 = scaler2.transform(train_x_2.values)
train_x_no_3 = scaler3.transform(train_x_3.values)
X_test_no_1 = scaler1.transform(X_test_1.values)
X_test_no_2 = scaler2.transform(X_test_2.values)
X_test_no_3 = scaler3.transform(X_test_3.values)

In [336]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-1,
                     hidden_layer_sizes=(3,3), random_state=15, max_iter = 1000)

In [337]:
clf.fit(train_x_no_1, train_y)
clf.score(X_test_no_1, y_test)

0.5083102493074793

In [338]:
RPS(clf.predict_proba(X_test_no_1), enc.fit_transform(np.array(y_test).reshape(-1,1)).toarray())

0.20033708254571317

In [339]:
clf.fit(train_x_no_2, train_y)
clf.score(X_test_no_2, y_test)

0.5186980609418282

In [340]:
RPS(clf.predict_proba(X_test_no_2), enc.fit_transform(np.array(y_test).reshape(-1,1)).toarray())

0.20098307946137636

In [341]:
clf.fit(train_x_no_3, train_y)
clf.score(X_test_no_3, y_test)

0.5186980609418282

In [342]:
RPS(clf.predict_proba(X_test_no_3), enc.fit_transform(np.array(y_test).reshape(-1,1)).toarray())

0.20089599135517072

In [96]:
#################### RANDOM FOREST CLASSIFIER
####################

In [164]:
rf = RandomForestClassifier(n_estimators=100, max_features=5, random_state=10)

In [297]:
rf.fit(train_x_1, train_y)
rf.score(X_test_1,y_test)

0.471606648199446

In [298]:
RPS(rf.predict_proba(X_test_1), enc.fit_transform(np.array(y_test).reshape(-1,1)).toarray())

0.21231617036011083

In [299]:
rf.fit(train_x_2, train_y)
rf.score(X_test_2,y_test)

0.4937673130193906

In [300]:
RPS(rf.predict_proba(X_test_2), enc.fit_transform(np.array(y_test).reshape(-1,1)).toarray())

0.21003795013850415

In [301]:
rf.fit(train_x_3, train_y)
rf.score(X_test_3,y_test)

0.49584487534626037

In [302]:
RPS(rf.predict_proba(X_test_3), enc.fit_transform(np.array(y_test).reshape(-1,1)).toarray())

0.20733840027700834

In [171]:
################### SVM 까지 비교 score 예측
###################

In [303]:
clf = svm.SVC(kernel='linear', verbose = 1, tol = 1e-4, C=1, 
              decision_function_shape='ovr', probability=True)

In [304]:
clf.fit(train_x_1, train_y)
clf.score(X_test_1,y_test)

[LibSVM]

0.4951523545706371

In [305]:
RPS(clf.predict_proba(X_test_1), enc.fit_transform(np.array(y_test).reshape(-1,1)).toarray())

0.20501532205002634

In [306]:
clf.fit(train_x_2, train_y)
clf.score(X_test_2,y_test)

[LibSVM]

0.5159279778393352

In [307]:
RPS(clf.predict_proba(X_test_2), enc.fit_transform(np.array(y_test).reshape(-1,1)).toarray())

0.20234669815442152

In [308]:
clf.fit(train_x_3, train_y)
clf.score(X_test_3,y_test)

[LibSVM]

0.5193905817174516

In [309]:
RPS(clf.predict_proba(X_test_3), enc.fit_transform(np.array(y_test).reshape(-1,1)).toarray())

0.20153335609341885

In [None]:
######################### naive_bayes

In [329]:
clf = GaussianNB()
clf.fit(train_x_1, train_y)
clf.score(X_test_1, y_test)

0.4792243767313019

In [330]:
RPS(clf.predict_proba(X_test_1), enc.fit_transform(np.array(y_test).reshape(-1,1)).toarray())/2

0.11651770520811668

In [345]:
clf = GaussianNB()
clf.fit(train_x_2, train_y)
clf.score(X_test_2, y_test)

0.481994459833795

In [332]:
RPS(clf.predict_proba(X_test_2), enc.fit_transform(np.array(y_test).reshape(-1,1)).toarray())/2

0.1175407121844732

In [343]:
clf = GaussianNB()
clf.fit(train_x_3, train_y)
clf.score(X_test_3, y_test)

0.4778393351800554

In [334]:
RPS(clf.predict_proba(X_test_3), enc.fit_transform(np.array(y_test).reshape(-1,1)).toarray())/2

0.11771561411320036

In [335]:
clf.predict_proba(X_test_3)

array([[2.35086520e-05, 2.44002937e-04, 9.99732488e-01],
       [2.51073104e-01, 5.74532973e-01, 1.74393923e-01],
       [3.18113990e-01, 5.11661344e-01, 1.70224666e-01],
       ...,
       [1.40094760e-01, 4.55437686e-01, 4.04467555e-01],
       [9.03606643e-07, 7.95257000e-05, 9.99919571e-01],
       [8.78975751e-01, 1.10878376e-01, 1.01458725e-02]])

In [347]:
clf.class_prior_

array([0.29779548, 0.24462764, 0.45757688])

In [366]:
len(train_y[train_y==-1])/len(train_y)

0.29779547980733606

In [361]:
len(y_test)

1444

In [358]:
369/1444

0.25554016620498615

In [364]:
X_test_1==-1

Unnamed: 0,W_h,D_h,L_h,GF_h,GA_h,GD_h,Pts_h,raised_h,W_a,D_a,L_a,GF_a,GA_a,GD_a,Pts_h.1,raised_a
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
301,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
302,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
303,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
