In [485]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import os
import random
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  
from sklearn import linear_model
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import plot_importance

In [450]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-1.3.3-py3-none-win_amd64.whl (95.2 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.3.3
Note: you may need to restart the kernel to use updated packages.


In [389]:
path = os.getcwd()

In [390]:
os.listdir(path)

['.ipynb_checkpoints',
 '.RData',
 '.Rhistory',
 '.Rproj.user',
 'BL.csv',
 'BL.RData',
 'data',
 'LIGA.csv',
 'LIGA.RData',
 'note.R',
 'PL.csv',
 'PL.RData',
 'plot',
 'plot 그리기.R',
 'preprocessed.R',
 'preprocessing_utils.R',
 'SE.csv',
 'SE.RData',
 'soccer.Rproj',
 'Untitled.ipynb']

In [391]:
########### read csv files

In [392]:
PL = pd.read_csv('PL.csv').iloc[:,2:]
LIGA = pd.read_csv('LIGA.csv').iloc[:,2:]
BL = pd.read_csv('BL.csv').iloc[:,2:]
SE = pd.read_csv('SE.csv').iloc[:,2:]

df = pd.concat([PL,LIGA,BL,SE], axis=0)
df = df.dropna()

In [393]:
def win(x):
    if x > 0:
        return 1
    if x == 0:
        return 0
    else :
        return -1

In [394]:
result = list(map(win, (df['full_home_score'] - df['full_away_score'])))
len(result)

11250

In [395]:
df['result'] = result

In [396]:
df[df['result']==0].shape[0]/11250

0.24515555555555554

In [397]:
result[result == 0]

0

In [398]:
df[df['result']>0].shape[0]/11250

0.46044444444444443

In [399]:
df[df['result']<0].shape[0]/11250

0.2944

In [400]:
def null_classifier(n_games):
    random_list = np.random.rand(n_games)
    def classifier(x):
        if x < 0.460:
            return 1
        elif x < 0.705:
            return 0
        else :
            return -1
    return(np.array(list(map(classifier, random_list))))    

In [401]:
aa = null_classifier(11250)

In [402]:
aa[aa == df['result']].shape[0]/11250

0.3551111111111111

In [403]:
df.columns

Index(['home_shot', 'away_shot', 'home_possession', 'away_possession',
       'home_pass_success', 'away_pass_success', 'home_dribbles',
       'away_dribbles', 'home_aerials_won', 'away_aerials_won', 'home_tackles',
       'away_tackles', 'home_corners', 'away_corners', 'home_dispossessed',
       'away_dispossessed', 'home_missing_player', 'away_missing_player',
       'home', 'away', 'half_home_score', 'half_away_score', 'full_home_score',
       'full_away_score', 'kick_off', 'day', 'date', 'matchup_home_goals',
       'matchup_away_goals', 'matchup_home_wins', 'matchup_draw',
       'matchup_away_wins', 'home_total_att', 'away_total_att',
       'home_open_att', 'away_open_att', 'home_set_att', 'away_set_att',
       'home_counter_att', 'away_counter_att', 'home_pk_att', 'away_pk_att',
       'home_own_att', 'away_own_att', 'home_total_passes',
       'away_total_passes', 'home_crosses_passes', 'away_crosses_passes',
       'home_long_balls', 'away_long_balls', 'home_short_passes'

In [404]:
df = df[['day','home_missing_player','away_missing_player','matchup_home_wins','matchup_draw',
    'matchup_away_wins', 'home_last_5_points', 'away_last_5_points', 'home_last_5_goals',
   'away_last_5_goals', 'home_last_5_conceded','away_last_5_conceded','home_last_5_passes',
   'away_last_5_passes', 'home_last_5_shot', 'away_last_5_shot','home_uefa','away_uefa','result']]

In [405]:
df['day'] = ['weekend' if i in ['Sat', 'Sun'] else 'week' for i in df['day']]

In [406]:
df['result'] = df['result']+1

In [407]:
df.head()

Unnamed: 0,day,home_missing_player,away_missing_player,matchup_home_wins,matchup_draw,matchup_away_wins,home_last_5_points,away_last_5_points,home_last_5_goals,away_last_5_goals,home_last_5_conceded,away_last_5_conceded,home_last_5_passes,away_last_5_passes,home_last_5_shot,away_last_5_shot,home_uefa,away_uefa,result
49,weekend,1,7,33,17,50,3.0,3.0,3.0,4.0,8.0,7.0,2116.0,2255.0,68.0,73.0,17.157,40.157,1
50,weekend,2,3,50,33,17,7.0,7.0,6.0,4.0,7.0,6.0,2565.0,2087.0,75.0,67.0,115.157,17.157,2
51,weekend,4,5,0,17,83,9.0,4.0,4.0,7.0,2.0,10.0,1919.0,1715.0,63.0,69.0,34.157,30.157,2
52,weekend,6,5,83,0,17,4.0,3.0,6.0,8.0,14.0,13.0,2677.0,2112.0,85.0,66.0,108.157,29.157,2
55,weekend,4,7,0,0,100,8.0,15.0,3.0,21.0,5.0,4.0,1582.0,2752.0,42.0,92.0,17.157,151.157,1


In [408]:
df[['week','weekend']] = pd.get_dummies(df['day'])

In [409]:
df = df.drop(['day'], axis=1)

In [410]:
df

Unnamed: 0,home_missing_player,away_missing_player,matchup_home_wins,matchup_draw,matchup_away_wins,home_last_5_points,away_last_5_points,home_last_5_goals,away_last_5_goals,home_last_5_conceded,away_last_5_conceded,home_last_5_passes,away_last_5_passes,home_last_5_shot,away_last_5_shot,home_uefa,away_uefa,result,week,weekend
49,1,7,33,17,50,3.0,3.0,3.0,4.0,8.0,7.0,2116.0,2255.0,68.0,73.0,17.157,40.157,1,0,1
50,2,3,50,33,17,7.0,7.0,6.0,4.0,7.0,6.0,2565.0,2087.0,75.0,67.0,115.157,17.157,2,0,1
51,4,5,0,17,83,9.0,4.0,4.0,7.0,2.0,10.0,1919.0,1715.0,63.0,69.0,34.157,30.157,2,0,1
52,6,5,83,0,17,4.0,3.0,6.0,8.0,14.0,13.0,2677.0,2112.0,85.0,66.0,108.157,29.157,2,0,1
55,4,7,0,0,100,8.0,15.0,3.0,21.0,5.0,4.0,1582.0,2752.0,42.0,92.0,17.157,151.157,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3414,5,4,0,0,100,6.0,6.0,9.0,8.0,10.0,10.0,2546.0,2176.0,83.0,65.0,14.945,14.945,0,0,1
3415,3,3,0,40,60,1.0,10.0,3.0,10.0,16.0,3.0,2124.0,2103.0,59.0,63.0,14.945,53.945,0,0,1
3416,6,3,33,33,33,5.0,7.0,10.0,5.0,8.0,5.0,2943.0,2176.0,70.0,54.0,18.945,14.945,0,0,1
3417,4,2,67,17,17,6.0,5.0,4.0,7.0,13.0,9.0,2324.0,2478.0,53.0,66.0,14.945,14.945,2,0,1


In [411]:
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(df.values)

In [412]:
df = pd.DataFrame(x_scaled)

In [413]:
df[17] = df[17].astype(np.str)

In [414]:
X = df.drop([17],axis=1)

In [415]:
y = np.array(result)

In [416]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=1)

In [417]:
lm = linear_model.LogisticRegression(multi_class='ovr', solver='liblinear', C=0.1)
lm.fit(X_train, y_train)

LogisticRegression(C=0.1, multi_class='ovr', solver='liblinear')

In [418]:
lm

LogisticRegression(C=0.1, multi_class='ovr', solver='liblinear')

In [419]:
lm.score(X_test, y_test)

0.5214814814814814

In [357]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(2, 2), random_state=1, max_iter = 1000)

In [358]:
clf.fit(np.array(X_train), np.array(y_train))

MLPClassifier(alpha=1e-05, hidden_layer_sizes=(2, 2), max_iter=1000,
              random_state=1, solver='lbfgs')

In [359]:
clf.predict(X_test)

array([ 1,  1, -1, ..., -1,  1, -1])

In [360]:
clf.score(X_test,y_test)

0.5214814814814814

In [352]:
clf.predict_proba(X_test)

array([[0.25620075, 0.2897787 , 0.45402055],
       [0.20591698, 0.26240664, 0.53167638],
       [0.48390742, 0.25464273, 0.26144985],
       ...,
       [0.48345293, 0.25476557, 0.2617815 ],
       [0.09556793, 0.17192272, 0.73250935],
       [0.71045725, 0.16734733, 0.12219541]])

In [278]:
np.array(X_train)

array([[0.46666667, 0.1875    , 0.17      , ..., 0.50984522, 0.        ,
        1.        ],
       [0.2       , 0.3125    , 0.5       , ..., 0.20419713, 0.        ,
        1.        ],
       [0.26666667, 0.9375    , 0.33      , ..., 0.11132827, 0.        ,
        1.        ],
       ...,
       [0.26666667, 0.375     , 0.5       , ..., 0.59273175, 0.        ,
        1.        ],
       [0.33333333, 0.25      , 0.67      , ..., 0.05252534, 0.        ,
        1.        ],
       [0.33333333, 0.125     , 0.5       , ..., 0.03011032, 1.        ,
        0.        ]])

In [281]:
np.array(y_train)

array([-1,  0,  1, ...,  1,  1,  0])

In [387]:
X_train

array([[ 1.44331303, -0.42313392, -0.78285026, ...,  1.44099909,
        -0.50614347,  0.50614347],
       [-0.4232699 ,  0.50920509,  0.50460203, ...,  0.10189669,
        -0.50614347,  0.50614347],
       [ 0.04337583,  5.17090012, -0.15863097, ..., -0.30497941,
        -0.50614347,  0.50614347],
       ...,
       [ 0.04337583,  0.97537459,  0.50460203, ...,  1.80414074,
        -0.50614347,  0.50614347],
       [ 0.51002156,  0.04303558,  1.16783503, ..., -0.56260626,
        -0.50614347,  0.50614347],
       [ 0.51002156, -0.88930342,  0.50460203, ..., -0.66081068,
         1.9757244 , -1.9757244 ]])

In [304]:
scaler = StandardScaler()

In [307]:
scaler.fit(X_train.values)

StandardScaler()

In [309]:
X_train = scaler.transform(X_train.values)
X_test = scaler.transform(X_test.values)

In [447]:
rf = RandomForestClassifier(n_estimators=300, max_features=2)
rf.fit(X_train, y_train)

RandomForestClassifier(max_features=2, n_estimators=300)

In [475]:
y_train+1

array([0, 1, 2, ..., 2, 2, 1])

In [472]:
(pd.get_dummies(y_train)).shape

(7875, 3)

In [476]:
dtrain = xgb.DMatrix(data=X_train, label = y_train+1)
dtest = xgb.DMatrix(data=X_test, label = y_test+1)

In [590]:
params = {'n_estimators' : 200,
          'subsample' : 0.5,
          'colsample_bytree' : 0.5,
          'num_class' : 3,
         'objective' : 'multi:softprob',
         'eval_metric' : 'mlogloss',
         'early_stoppings' : 100 }

num_rounds = 10

In [591]:
wlist = [(dtrain, 'train'), (dtest,'eval')]
xgb_model = xgb.train(params = params, dtrain=dtrain, num_boost_round=num_rounds, evals=wlist)

Parameters: { early_stoppings, n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-mlogloss:1.04042	eval-mlogloss:1.05583
[1]	train-mlogloss:1.00311	eval-mlogloss:1.03384
[2]	train-mlogloss:0.97396	eval-mlogloss:1.01851
[3]	train-mlogloss:0.95132	eval-mlogloss:1.01088
[4]	train-mlogloss:0.93511	eval-mlogloss:1.00512
[5]	train-mlogloss:0.92320	eval-mlogloss:1.00240
[6]	train-mlogloss:0.91204	eval-mlogloss:1.00112
[7]	train-mlogloss:0.90179	eval-mlogloss:1.00011
[8]	train-mlogloss:0.89257	eval-mlogloss:1.00035
[9]	train-mlogloss:0.88341	eval-mlogloss:0.99960


In [592]:
pred_probs = xgb_model.predict(dtest)

In [593]:
y_pred = np.argmax(pred_probs, axis=1)

In [594]:
def get_clf_eval(y_test, y_pred):
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    print('오차행렬:\n', confusion)
    print('\n정확도: {:.4f}'.format(accuracy))

In [595]:
get_clf_eval(y_test+1,y_pred)

오차행렬:
 [[ 448   25  509]
 [ 200   30  611]
 [ 236   39 1277]]

정확도: 0.5200
