# 머신러닝 모델 연구

현재 핵심적으로 진행하고 있는 기계학습은 특정 경기에서의 홈팀의 승부를 기준으로 하고있기 때문에, 분류로 볼 수 있다

따라서 분류모델을 중심적으로 연구하도록 하겠다.

## Logistic Regressor

종속변수와 독립변수의 관계를 구체적인 함수로 나타내어 향후 사건의 발생 확률을 예측하는 기법

종속변수가 범주형일 때,  해당 데이터의 결과가 특정 분류로 나타내기 때문에 일종의 분류(classification) 라고 볼 수 있음

In [7]:
import pandas as pd

In [8]:
train = pd.read_csv('./train_1005.csv') #~21.10.05 까지의 퍼포먼스 데이터를 통해 생성한 학습 데이터
test = pd.read_csv('./test_1005.csv') #21.10.05~21.10.18 까지의 예정된 경기를 통해 생성한 테스트 데이터

<h3>기본 문법</h3>

 - model = LogisticRegression()
 
 - model.fit(features, labels)
 
 - model.predict(features) #예측값 분류
 
 - model.predict_proba(features) #예측값 나올 확률

### 적용

In [9]:
from sklearn.linear_model import LogisticRegression

features : 종속변수를 분류하는데 사용할 독립변수들

labels : 분석을 통해 분류될 종속변수

In [10]:
train_f = train[['home_h2h','away_h2h','home_possession_percent','away_possession_percent','home_xg','away_xg',
       'home_goals','away_goals','home_progressive_runs','away_progressive_runs',
       'home_touches_in_box','away_touches_in_box','home_shots_on_target','away_shots_on_target',
       'home_through_passes_successful','away_through_passes_successful',
       'home_pass_to_final_thirds_successful','away_pass_to_final_thirds_successful',
       'home_tackles','away_tackles','home_ppda','away_ppda']]
train_labels = train['home_result_pts']

test_f = test[['home_h2h','away_h2h','home_possession_percent','away_possession_percent','home_xg','away_xg',
       'home_goals','away_goals','home_progressive_runs','away_progressive_runs',
       'home_touches_in_box','away_touches_in_box','home_shots_on_target','away_shots_on_target',
       'home_through_passes_successful','away_through_passes_successful',
       'home_pass_to_final_thirds_successful','away_pass_to_final_thirds_successful',
       'home_tackles','away_tackles','home_ppda','away_ppda']]
# test_l = test['home_result_pts']

학습/평가 세트 분리

In [13]:
# 현재 학습/평가가 분리되어있기 때문에 굳이 거칠 필요 없는 과정
# from sklearn.model_selection import train_test_split
# train_features, test_features, train_labels, test_labels = train_test_split(features, survival)

# print(model.score(train_features, train_labels))
# print(model.score(test_features, test_labels))

정규화

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
scaler = StandardScaler()
train_features = scaler.fit_transform(train_f)
test_features = scaler.transform(test_f)

모델생성

In [16]:
model = LogisticRegression()
model.fit(train_features, train_labels)

LogisticRegression()

예측

In [17]:
print(model.predict(test_features))

[1 1 1 1 3 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 0 1 3 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 3 1 1 1 3 1 1]


In [18]:
print(model.predict_proba(test_features))

[[5.16375642e-02 8.97764960e-01 5.05974754e-02]
 [1.14095419e-01 8.60084629e-01 2.58199524e-02]
 [3.24816099e-02 9.14393567e-01 5.31248227e-02]
 [7.79828036e-02 8.89200562e-01 3.28166346e-02]
 [1.56723130e-05 9.80460089e-02 9.01938319e-01]
 [4.21002298e-01 5.76711463e-01 2.28623905e-03]
 [1.08606863e-02 8.16246320e-01 1.72892994e-01]
 [7.74835974e-02 8.93552000e-01 2.89644024e-02]
 [4.95385815e-01 5.02901472e-01 1.71271284e-03]
 [6.72102628e-02 8.99694238e-01 3.30954992e-02]
 [1.80744634e-02 8.75321550e-01 1.06603986e-01]
 [2.80689771e-06 4.82272523e-02 9.51769941e-01]
 [4.08852182e-02 8.94580152e-01 6.45346296e-02]
 [6.12290597e-02 8.97815134e-01 4.09558067e-02]
 [4.68204889e-02 9.00787112e-01 5.23923996e-02]
 [1.87492328e-02 8.71377275e-01 1.09873493e-01]
 [2.47813639e-02 8.96779169e-01 7.84394672e-02]
 [1.95106584e-02 8.75105162e-01 1.05384180e-01]
 [1.47769151e-02 8.45230932e-01 1.39992153e-01]
 [1.87705150e-01 8.02554180e-01 9.74067054e-03]
 [8.73185456e-02 8.85697900e-01 2.698355

In [19]:
# 정리
test['Label'] = model.predict(test_features)
test = test[[ 'competition_id', 'competition_name',
        'season_name',  'round_name', 'matche_id',
       'date',  'home_name', 
       'away_name', 'Label','home_h2h', 'away_h2h', 'home_ppda',
       'home_tackles', 'home_pass_to_final_thirds_successful',
       'home_through_passes_successful', 'home_shots_on_target',
       'home_touches_in_box', 'home_progressive_runs', 'home_goals', 'home_xg',
       'home_possession_percent', 'away_ppda', 'away_tackles',
       'away_pass_to_final_thirds_successful',
       'away_through_passes_successful', 'away_shots_on_target',
       'away_touches_in_box', 'away_progressive_runs', 'away_goals', 'away_xg',
       'away_possession_percent']]
test

Unnamed: 0,competition_id,competition_name,season_name,round_name,matche_id,date,home_name,away_name,Label,home_h2h,...,away_ppda,away_tackles,away_pass_to_final_thirds_successful,away_through_passes_successful,away_shots_on_target,away_touches_in_box,away_progressive_runs,away_goals,away_xg,away_possession_percent
0,364,Premier League,2021/2022,Regular Season,5234881,2021-10-16 23:00:00,Aston Villa,Wolverhampton Wanderers,1,0,...,12.94784,4.953704,41.450617,2.462963,4.064815,15.37963,13.138889,1.354938,1.417229,49.104938
1,364,Premier League,2021/2022,Regular Season,5234882,2021-10-17 01:30:00,Brentford,Chelsea,1,0,...,10.903306,4.123967,52.151515,3.330579,5.289256,22.267218,15.528926,1.831956,1.756954,56.92011
2,364,Premier League,2021/2022,Regular Season,5234883,2021-10-17 22:00:00,Everton,West Ham United,1,0,...,12.169831,5.742373,38.79661,2.142373,4.098305,15.867797,10.481356,1.427119,1.463182,47.640678
3,364,Premier League,2021/2022,Regular Season,5234884,2021-10-16 23:00:00,Leicester City,Manchester United,1,0,...,10.166667,4.955381,52.664042,3.141732,5.149606,19.461942,15.102362,1.716535,1.649797,55.677165
4,364,Premier League,2021/2022,Regular Season,5234885,2021-10-16 23:00:00,Manchester City,Burnley,3,0,...,11.792388,4.851211,36.882353,2.076125,3.442907,13.66782,6.854671,1.103806,1.27081,43.415225
5,364,Premier League,2021/2022,Regular Season,5234886,2021-10-18 00:30:00,Newcastle United,Tottenham Hotspur,1,0,...,9.82409,5.554622,49.380952,3.2493,5.473389,18.142857,13.462185,1.901961,1.674955,56.47619
6,364,Premier League,2021/2022,Regular Season,5234887,2021-10-16 23:00:00,Norwich City,Brighton,1,0,...,11.889769,5.564356,38.481848,2.105611,3.914191,16.267327,11.085809,1.20132,1.382533,50.115512
7,364,Premier League,2021/2022,Regular Season,5234888,2021-10-16 23:00:00,Southampton,Leeds United,1,0,...,8.494704,7.29595,40.859813,2.669782,4.426791,18.186916,12.380062,1.401869,1.50168,54.922118
8,364,Premier League,2021/2022,Regular Season,5234889,2021-10-16 20:30:00,Watford,Liverpool,1,0,...,9.25618,4.685393,57.129213,4.089888,5.786517,24.002809,14.904494,1.974719,1.882216,58.80618
9,412,Ligue 1,2021/2022,Regular Season,5240983,2021-10-17 20:00:00,Troyes,Nice,1,0,...,11.742599,5.093863,40.472924,2.440433,4.263538,13.158845,14.032491,1.350181,1.297845,54.429603


In [20]:
test.to_csv('./LR.csv',index=False)

## SVM 서포트 벡터 머신

서포트 벡터 머신 중에서도 선형 SVM

클래스를 구분하는 초평면(결정 경계)와 훈련 샘플간의 거리(마진)을 최대화하는 분류 모델. 

이때 마진이 클수록 일반화의 오차가 작아진다.

### 기본 문법

 - model = SVC()
 
 - model.fit(x_train, y_train)
 
 - model.predict(x_test)  #예측값 분류
 
 - model.predict_proba(x_test) #예측값 나올 확률
 
     - predict_proba() 사용할 땐 model = SVC(probability = True) 사용

In [21]:
import pandas as pd
from sklearn.svm import SVC

In [22]:
x_train = train[['home_h2h','away_h2h','home_possession_percent','away_possession_percent','home_xg','away_xg',
       'home_goals','away_goals','home_progressive_runs','away_progressive_runs',
       'home_touches_in_box','away_touches_in_box','home_shots_on_target','away_shots_on_target',
       'home_through_passes_successful','away_through_passes_successful',
       'home_pass_to_final_thirds_successful','away_pass_to_final_thirds_successful',
       'home_tackles','away_tackles','home_ppda','away_ppda']]
y_train = train['home_result_pts']

x_test = test[['home_h2h','away_h2h','home_possession_percent','away_possession_percent','home_xg','away_xg',
       'home_goals','away_goals','home_progressive_runs','away_progressive_runs',
       'home_touches_in_box','away_touches_in_box','home_shots_on_target','away_shots_on_target',
       'home_through_passes_successful','away_through_passes_successful',
       'home_pass_to_final_thirds_successful','away_pass_to_final_thirds_successful',
       'home_tackles','away_tackles','home_ppda','away_ppda']]

In [23]:
svclassifier = SVC(kernel='linear',)
svclassifier.fit(x_train, y_train)

y_pred = svclassifier.predict(x_test) 
y_pred

array([1, 1, 1, 1, 3, 0, 1, 1, 0, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 3, 1, 1, 1, 3, 1, 1], dtype=int64)

In [24]:
svclassifier = SVC(kernel='linear',probability=True)
svclassifier.fit(x_train, y_train)

svclassifier.predict_proba(x_test)

array([[6.19052970e-02, 8.88305314e-01, 4.97893889e-02],
       [1.21879608e-01, 8.52325466e-01, 2.57949259e-02],
       [4.97189623e-02, 8.85410464e-01, 6.48705734e-02],
       [9.49386769e-02, 8.71960411e-01, 3.31009125e-02],
       [3.26277603e-04, 8.27866639e-02, 9.16887058e-01],
       [4.83784161e-01, 5.11160814e-01, 5.05502476e-03],
       [2.43361172e-02, 7.95366726e-01, 1.80297157e-01],
       [9.48234203e-02, 8.72116177e-01, 3.30604025e-02],
       [5.51928669e-01, 4.44237494e-01, 3.83383761e-03],
       [8.50025873e-02, 8.79159430e-01, 3.58379829e-02],
       [3.39202826e-02, 8.51201213e-01, 1.14878504e-01],
       [7.29207370e-04, 3.92847241e-02, 9.59986069e-01],
       [5.15991155e-02, 8.86321783e-01, 6.20791013e-02],
       [7.47199756e-02, 8.84545754e-01, 4.07342704e-02],
       [5.97110296e-02, 8.88447532e-01, 5.18414382e-02],
       [3.36982167e-02, 8.50421470e-01, 1.15880313e-01],
       [4.13098888e-02, 8.70462610e-01, 8.82275010e-02],
       [3.49711608e-02, 8.54716

In [25]:
test['Label'] = y_pred
test_svm = test[[ 'competition_id', 'competition_name',
        'season_name',  'round_name', 'matche_id',
       'date',  'home_name', 
       'away_name', 'Label','home_h2h', 'away_h2h', 'home_ppda',
       'home_tackles', 'home_pass_to_final_thirds_successful',
       'home_through_passes_successful', 'home_shots_on_target',
       'home_touches_in_box', 'home_progressive_runs', 'home_goals', 'home_xg',
       'home_possession_percent', 'away_ppda', 'away_tackles',
       'away_pass_to_final_thirds_successful',
       'away_through_passes_successful', 'away_shots_on_target',
       'away_touches_in_box', 'away_progressive_runs', 'away_goals', 'away_xg',
       'away_possession_percent']]

test_svm

Unnamed: 0,competition_id,competition_name,season_name,round_name,matche_id,date,home_name,away_name,Label,home_h2h,...,away_ppda,away_tackles,away_pass_to_final_thirds_successful,away_through_passes_successful,away_shots_on_target,away_touches_in_box,away_progressive_runs,away_goals,away_xg,away_possession_percent
0,364,Premier League,2021/2022,Regular Season,5234881,2021-10-16 23:00:00,Aston Villa,Wolverhampton Wanderers,1,0,...,12.94784,4.953704,41.450617,2.462963,4.064815,15.37963,13.138889,1.354938,1.417229,49.104938
1,364,Premier League,2021/2022,Regular Season,5234882,2021-10-17 01:30:00,Brentford,Chelsea,1,0,...,10.903306,4.123967,52.151515,3.330579,5.289256,22.267218,15.528926,1.831956,1.756954,56.92011
2,364,Premier League,2021/2022,Regular Season,5234883,2021-10-17 22:00:00,Everton,West Ham United,1,0,...,12.169831,5.742373,38.79661,2.142373,4.098305,15.867797,10.481356,1.427119,1.463182,47.640678
3,364,Premier League,2021/2022,Regular Season,5234884,2021-10-16 23:00:00,Leicester City,Manchester United,1,0,...,10.166667,4.955381,52.664042,3.141732,5.149606,19.461942,15.102362,1.716535,1.649797,55.677165
4,364,Premier League,2021/2022,Regular Season,5234885,2021-10-16 23:00:00,Manchester City,Burnley,3,0,...,11.792388,4.851211,36.882353,2.076125,3.442907,13.66782,6.854671,1.103806,1.27081,43.415225
5,364,Premier League,2021/2022,Regular Season,5234886,2021-10-18 00:30:00,Newcastle United,Tottenham Hotspur,0,0,...,9.82409,5.554622,49.380952,3.2493,5.473389,18.142857,13.462185,1.901961,1.674955,56.47619
6,364,Premier League,2021/2022,Regular Season,5234887,2021-10-16 23:00:00,Norwich City,Brighton,1,0,...,11.889769,5.564356,38.481848,2.105611,3.914191,16.267327,11.085809,1.20132,1.382533,50.115512
7,364,Premier League,2021/2022,Regular Season,5234888,2021-10-16 23:00:00,Southampton,Leeds United,1,0,...,8.494704,7.29595,40.859813,2.669782,4.426791,18.186916,12.380062,1.401869,1.50168,54.922118
8,364,Premier League,2021/2022,Regular Season,5234889,2021-10-16 20:30:00,Watford,Liverpool,0,0,...,9.25618,4.685393,57.129213,4.089888,5.786517,24.002809,14.904494,1.974719,1.882216,58.80618
9,412,Ligue 1,2021/2022,Regular Season,5240983,2021-10-17 20:00:00,Troyes,Nice,1,0,...,11.742599,5.093863,40.472924,2.440433,4.263538,13.158845,14.032491,1.350181,1.297845,54.429603


## 의사결정나무

데이터를 분석하여 이들 사이에 존재하는 패턴을 예측 가능한 규칙들의 조합으로 나타내고 그 모양이 나무와 같이 나타나는 모델. 구분 한 뒤 각 영역의 순도를 증가시키고 불순도/불확실성이 최대한 감소하도록 하는 방향으로 학습을 진행한다.

분류와 회귀 모두 가능하지만 주로 분류에서 사용하고 있다.


### 기본 문법 
 
 - model = DecisionTreeClassifier()
 
 - model.fit(x_train, y_train)
 
 - model.predict(x_test)  #예측값 분류
 
 - model.predict_proba(x_test) #예측값 나올 확률
 
     - predict_proba() 사용할 땐 model = DecisionTreeClassifier(probability = True) 사용

In [26]:
from sklearn import tree

In [27]:
model = tree.DecisionTreeClassifier()
model.fit(x_train, y_train)
model.predict(x_test)

array([1, 1, 1, 1, 3, 0, 1, 1, 0, 1, 1, 3, 1, 1, 1, 1, 1, 1, 3, 0, 1, 1,
       3, 1, 0, 3, 3, 1, 1, 0, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1,
       1, 1, 1, 3, 0, 0, 1, 3, 1, 1], dtype=int64)

In [28]:
model.predict_proba(x_test)

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0

## 랜덤 포레스트

다수의 의사결정나무를 학습하고 그 나무들의 분류를 집계하여 최종적으로 분류하는 앙상블 방법. 의사결정나무의 단점 중 하나인 과대적합(over fitting)이 나타나는 나무의 영향력을 줄일 수 있다.

### 기본 문법 
 
 - model = RandomForestClassifier()
 
 - model.fit(x_train, y_train)
 
 - model.predict(x_test)  #예측값 분류
 
 - model.predict_proba(x_test) #예측값 나올 확률
 
     - predict_proba() 사용할 땐 model = RandomForestClassifier(probability = True) 사용

In [29]:
from sklearn.ensemble import RandomForestClassifier

In [35]:
# 변수 조작
# model = RandomForestClassifier(n_estimators = 20, max_depth = 5, random_state=0)
# model = RandomForestClassifier(n_estimators = 50, max_depth = 5, random_state=0)
model = RandomForestClassifier(n_estimators = 300, max_depth = 5, random_state=0)
model.fit(x_train, y_train)
model.predict(x_test)


array([1, 1, 1, 1, 3, 0, 1, 1, 0, 1, 1, 3, 1, 1, 1, 1, 1, 1, 3, 0, 1, 1,
       3, 1, 0, 3, 3, 0, 1, 0, 1, 0, 1, 3, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1,
       1, 1, 1, 3, 0, 0, 1, 3, 1, 1], dtype=int64)

In [36]:
model.predict_proba(x_test)

array([[0.28951231, 0.4471605 , 0.26332719],
       [0.33071604, 0.35845002, 0.31083393],
       [0.28970133, 0.44471513, 0.26558353],
       [0.35002829, 0.362297  , 0.28767471],
       [0.06024827, 0.16513574, 0.774616  ],
       [0.69628775, 0.19776279, 0.10594947],
       [0.27175549, 0.43915298, 0.28909153],
       [0.29009988, 0.440084  , 0.26981612],
       [0.71946325, 0.18749334, 0.09304341],
       [0.28830405, 0.44738364, 0.26431231],
       [0.28204803, 0.44088176, 0.27707021],
       [0.069994  , 0.16945435, 0.76055165],
       [0.28516455, 0.4451158 , 0.26971965],
       [0.27888641, 0.44504149, 0.2760721 ],
       [0.267881  , 0.44606013, 0.28605886],
       [0.30340504, 0.3639952 , 0.33259976],
       [0.28917656, 0.45152915, 0.25929429],
       [0.28226257, 0.44528928, 0.27244815],
       [0.09656527, 0.19459729, 0.70883744],
       [0.69358891, 0.19829382, 0.10811727],
       [0.30160313, 0.4413756 , 0.25702127],
       [0.27849707, 0.36395734, 0.35754559],
       [0.