In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# 데이터 불러오기

In [31]:
# 데이터 로드
data = pd.read_csv('~/Documents/ds_study/EPL_prediction_ML/data/merge_data.csv')
data

Unnamed: 0,Date,HomeTeam,AwayTeam,FTR,HTR,HS,AS,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR,home_xG,away_xG
0,2017-08-11,Arsenal,Leicester,H,D,27,6,10,3,9,4,9,12,0,1,0,0,2.543290,1.464950
1,2017-08-12,Brighton,Man City,A,D,6,14,2,4,3,10,6,9,0,2,0,0,0.276343,1.867510
2,2017-08-12,Chelsea,Burnley,A,A,19,10,6,5,8,5,16,11,3,3,2,0,1.356510,0.564237
3,2017-08-12,Crystal Palace,Huddersfield,A,A,14,8,4,6,12,9,7,19,1,3,0,0,0.988934,1.736570
4,2017-08-12,Everton,Stoke,H,H,9,9,4,1,6,7,13,10,1,1,0,0,0.720574,0.276440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2107,2023-05-28,Everton,Bournemouth,H,D,13,7,6,2,11,12,9,3,1,3,0,0,1.129330,0.432084
2108,2023-05-28,Leeds,Tottenham,A,A,19,11,2,7,7,5,12,3,3,0,0,0,1.191510,2.033150
2109,2023-05-28,Leicester,West Ham,H,H,13,16,4,3,8,10,3,5,1,1,0,0,2.030130,0.970352
2110,2023-05-28,Man United,Fulham,H,D,21,10,8,3,14,10,5,4,1,2,0,0,2.747660,1.881670


In [32]:

# 결측지 있는지 확인

missing_values = data.isnull().sum()
print(missing_values)

Date        0
HomeTeam    0
AwayTeam    0
FTR         0
HTR         0
HS          0
AS          0
HST         0
AST         0
HC          0
AC          0
HF          0
AF          0
HY          0
AY          0
HR          0
AR          0
home_xG     0
away_xG     0
dtype: int64


In [33]:
#데이터 정확히 확인
print(data.head())

print(data.describe())

         Date        HomeTeam      AwayTeam FTR HTR  HS  AS  HST  AST  HC  AC  \
0  2017-08-11         Arsenal     Leicester   H   D  27   6   10    3   9   4   
1  2017-08-12        Brighton      Man City   A   D   6  14    2    4   3  10   
2  2017-08-12         Chelsea       Burnley   A   A  19  10    6    5   8   5   
3  2017-08-12  Crystal Palace  Huddersfield   A   A  14   8    4    6  12   9   
4  2017-08-12         Everton         Stoke   H   H   9   9    4    1   6   7   

   HF  AF  HY  AY  HR  AR   home_xG   away_xG  
0   9  12   0   1   0   0  2.543290  1.464950  
1   6   9   0   2   0   0  0.276343  1.867510  
2  16  11   3   3   2   0  1.356510  0.564237  
3   7  19   1   3   0   0  0.988934  1.736570  
4  13  10   1   1   0   0  0.720574  0.276440  
                HS           AS          HST          AST           HC  \
count  2112.000000  2112.000000  2112.000000  2112.000000  2112.000000   
mean     13.580966    11.305398     4.660038     3.970644     6.310606   
std

# 데이터 전처리

In [34]:
# 범주형 데이터를 원핫 인코딩으로 변환하는 함수
def preprocess_features(data, categorical_features, numerical_features, encoder=None, scaler=None):

    
# 범주형 데이터 원핫 인코딩
    if encoder is None: # encoder가 제공되지 않았다면 새로운 OneHotEncoder 객체를 생성후 데이터에 fit_transform 적용
        encoder = OneHotEncoder()
        encoded_categorical = encoder.fit_transform(data[categorical_features])
    else: # encoder가 제공되었다면 기존 인코더를 사용하여 데이터에 transform 적용
        encoded_categorical = encoder.transform(data[categorical_features])
    encoded_categorical = pd.DataFrame(encoded_categorical.toarray(), columns=encoder.get_feature_names_out(categorical_features))

 # 숫자형 데이터 스케일링
    if scaler is None:  # scaler가 제공되지 않았다면 새로운 StandardScaler 객체를 생성후 데이터에 fit_transform 적용
        scaler = StandardScaler()
        scaled_numerical = scaler.fit_transform(data[numerical_features])
    else:   # scaler가 제공되었다면 기존 스케일러를 사용하여 데이터에 transform 적용
        scaled_numerical = scaler.transform(data[numerical_features])
    scaled_numerical = pd.DataFrame(scaled_numerical, columns=numerical_features)

# 변환된 데이터 결합
    preprocessed_data = pd.concat([encoded_categorical, scaled_numerical], axis=1)
    # 결합된 데이터프레임과 사용된 encoder 및 scaler 객체를 반환
    return preprocessed_data, encoder, scaler


In [35]:
# 범주형 및 숫자형 특성 정의
categorical_features = ['HomeTeam', 'AwayTeam']  # 범주형 특성
numerical_features = ['HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR', 'home_xG', 'away_xG']  # 숫자형 특성


# 독립 변수 및 종속 변수 설정

In [36]:
# 전처리된 데이터셋 생성
X, encoder, scaler = preprocess_features(data, categorical_features, numerical_features)
# 타겟 변수
y = data['FTR']

# 모델 훈련 및 평가

In [37]:
# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
#최적의 하이퍼파라미터 찾기

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()

param_grid = {
    'C': [0.001, 0.01, 0.1, 1],
    'solver': ['liblinear', 'saga', 'newton-cg', 'lbfgs', 'sag'],
    'max_iter': [500, 1000, 2000]
}

grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# 최적의 매개변수
print("Best parameters found: ", grid_search.best_params_)

Best parameters found:  {'C': 1, 'max_iter': 500, 'solver': 'liblinear'}


In [39]:
# 과적합 검사
from sklearn.model_selection import cross_val_score

# 교차 검증 수행
clf = LogisticRegression(C=1, solver='liblinear', max_iter=500, random_state=42)
scores = cross_val_score(clf, X_train, y_train, cv=5)  # 5겹 교차 검증
print("Cross-validation scores:", scores)
print("Average score:", scores.mean())

Cross-validation scores: [0.63313609 0.64201183 0.65976331 0.66272189 0.67655786]
Average score: 0.6548381999192315


In [51]:
# 모델 학습
model = LogisticRegression(C=1, solver='liblinear', max_iter=500, random_state=42)
model.fit(X_train, y_train)

In [41]:
# 훈련 세트와 테스트 세트에서 성능 평가
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
print("Training F1 score:", f1_score(y_train, y_pred_train, average='macro'))
print("Test F1 score:", f1_score(y_test, y_pred_test, average='macro'))

Training F1 score: 0.6086192626277324
Test F1 score: 0.5972614405450226


In [42]:
# F1 점수를 사용하여 5-겹 교차 검증 수행
from sklearn.pipeline import make_pipeline
pipeline = make_pipeline(StandardScaler(), LogisticRegression(C=1, solver='liblinear', max_iter=500, random_state=42))

cross_val_f1_scores = cross_val_score(pipeline, X, y, cv=5, scoring='f1_macro')
print("교차 검증 F1 점수:", cross_val_f1_scores)
print("평균 F1 점수:", cross_val_f1_scores.mean())

교차 검증 F1 점수: [0.57818237 0.55520819 0.55841804 0.59416131 0.52409209]
평균 F1 점수: 0.5620124003401162


In [49]:
# 혼동 행렬 출력(성능 평가)
print("Confusion Matrix (Train):\n", confusion_matrix(y_train, y_pred_train))
print("Confusion Matrix (Test):\n", confusion_matrix(y_test, y_pred_test))

Confusion Matrix (Train):
 [[442  41  72]
 [122  93 172]
 [ 72  39 636]]
Confusion Matrix (Test):
 [[109  10  21]
 [ 30  25  47]
 [ 17  15 149]]


# 예측 수행

In [52]:
model.predict(X_test)

array(['H', 'H', 'H', 'H', 'A', 'H', 'H', 'A', 'D', 'H', 'H', 'A', 'H',
       'H', 'H', 'A', 'H', 'A', 'H', 'A', 'A', 'A', 'H', 'H', 'D', 'A',
       'A', 'A', 'H', 'D', 'H', 'A', 'H', 'H', 'H', 'A', 'A', 'A', 'H',
       'H', 'H', 'D', 'H', 'A', 'H', 'A', 'A', 'A', 'A', 'D', 'D', 'H',
       'H', 'A', 'H', 'H', 'H', 'A', 'A', 'H', 'A', 'H', 'A', 'A', 'H',
       'H', 'A', 'H', 'H', 'H', 'A', 'H', 'H', 'H', 'H', 'A', 'A', 'A',
       'H', 'H', 'H', 'D', 'H', 'A', 'H', 'A', 'D', 'H', 'A', 'H', 'H',
       'H', 'D', 'H', 'A', 'H', 'A', 'A', 'A', 'H', 'H', 'A', 'H', 'H',
       'H', 'A', 'H', 'H', 'H', 'H', 'D', 'A', 'A', 'D', 'H', 'H', 'H',
       'H', 'H', 'H', 'A', 'H', 'H', 'H', 'D', 'A', 'H', 'D', 'A', 'A',
       'A', 'H', 'H', 'A', 'H', 'H', 'H', 'A', 'H', 'D', 'D', 'A', 'D',
       'D', 'D', 'H', 'H', 'D', 'A', 'D', 'H', 'A', 'A', 'A', 'H', 'H',
       'H', 'H', 'A', 'H', 'A', 'A', 'D', 'A', 'H', 'D', 'H', 'A', 'A',
       'H', 'A', 'A', 'A', 'A', 'A', 'A', 'H', 'A', 'A', 'H', 'H

In [53]:
#각 테스트 샘플이 각 클래스에 속할 확률 구하기
Y_predict = model.predict_proba(X_test)
# 반환된 값은 각 행이 테스트 샘플을 나타내고, 각 열이 클래스의 확률을 나타냄
Y_predict = pd.DataFrame(Y_predict, columns=['Away Team','Draw','Home Team'])
# 확률을 퍼센트(백분율)로 변환
display((Y_predict *100).head(10))

Unnamed: 0,Away Team,Draw,Home Team
0,15.341114,14.111922,70.546965
1,17.530893,12.097811,70.371296
2,17.02485,31.674517,51.300633
3,16.098507,11.204018,72.697474
4,63.202413,30.370134,6.427453
5,0.790162,26.363326,72.846511
6,0.076027,6.344673,93.5793
7,65.762644,20.375373,13.861983
8,34.441409,51.903907,13.654684
9,28.94541,16.108832,54.945758


In [54]:
# 기본적인 팀 정보만을 유지, 나머지 통계는 초기화하여 모델이 이를 예측하게끔 준비하는 과정

fixtures = pd.read_csv('~/Documents/ds_study/EPL_prediction_ML/data/merge_data.csv')
fixtures = fixtures.drop(['Date'], axis=1)
fixtures.columns.values[0:2] = ['HomeTeam', 'AwayTeam']
# 나머지 열 초기화
fixtures['HS'] = 0
fixtures['AS'] = 0
fixtures['HST'] = 0
fixtures['AST'] = 0
fixtures['HF'] = 0
fixtures['AF'] = 0
fixtures['HC'] = 0
fixtures['AC'] = 0
fixtures['HY'] = 0
fixtures['AY'] = 0
fixtures['HR'] = 0
fixtures['AR'] = 0
fixtures['home_xG'] = 0
fixtures['away_xG'] = 0

In [55]:
display(fixtures.head(10))

Unnamed: 0,HomeTeam,AwayTeam,FTR,HTR,HS,AS,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR,home_xG,away_xG
0,Arsenal,Leicester,H,D,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Brighton,Man City,A,D,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Chelsea,Burnley,A,A,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Crystal Palace,Huddersfield,A,A,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Everton,Stoke,H,H,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,Southampton,Swansea,D,D,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,Watford,Liverpool,D,H,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,West Brom,Bournemouth,H,H,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,Man United,West Ham,H,H,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,Newcastle,Tottenham,A,D,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [56]:
# 똑같이 위에서 똑같이 수행한 원핫 인코딩 실행
preprocessedFixtures, _, _ = preprocess_features(data, categorical_features, numerical_features, encoder=None, scaler=None)

In [57]:
# 다시 한번 학습데이터와 예측데이터의 컬럼이 일치하는지 확인 

# 학습 데이터셋 컬럼 확인
train_columns = X_train.columns
print("학습 데이터셋 컬럼:")
print(train_columns)

# 예측 데이터셋 컬럼 확인
predict_columns = preprocessedFixtures.columns
print("예측 데이터셋 컬럼:")
print(predict_columns)

학습 데이터셋 컬럼:
Index(['HomeTeam_Arsenal', 'HomeTeam_Aston Villa', 'HomeTeam_Bournemouth',
       'HomeTeam_Brentford', 'HomeTeam_Brighton', 'HomeTeam_Burnley',
       'HomeTeam_Cardiff', 'HomeTeam_Chelsea', 'HomeTeam_Crystal Palace',
       'HomeTeam_Everton', 'HomeTeam_Fulham', 'HomeTeam_Huddersfield',
       'HomeTeam_Leeds', 'HomeTeam_Leicester', 'HomeTeam_Liverpool',
       'HomeTeam_Man City', 'HomeTeam_Man United', 'HomeTeam_Newcastle',
       'HomeTeam_Norwich', 'HomeTeam_Nott'm Forest',
       'HomeTeam_Sheffield United', 'HomeTeam_Southampton', 'HomeTeam_Stoke',
       'HomeTeam_Swansea', 'HomeTeam_Tottenham', 'HomeTeam_Watford',
       'HomeTeam_West Brom', 'HomeTeam_West Ham', 'HomeTeam_Wolves',
       'AwayTeam_Arsenal', 'AwayTeam_Aston Villa', 'AwayTeam_Bournemouth',
       'AwayTeam_Brentford', 'AwayTeam_Brighton', 'AwayTeam_Burnley',
       'AwayTeam_Cardiff', 'AwayTeam_Chelsea', 'AwayTeam_Crystal Palace',
       'AwayTeam_Everton', 'AwayTeam_Fulham', 'AwayTeam_Huddersfield

In [58]:
model.predict(preprocessedFixtures)

array(['H', 'A', 'A', ..., 'H', 'H', 'A'], dtype=object)

In [59]:
Result = fixtures.drop(['HS', 'AS', 'HST', 'HF', 'AF', 'HY', 
                        'AY', 'HR', 'AR', 'HC', 'AC', 'AST', 'HC', 'AC','home_xG', 'away_xG'], axis=1)
Result.head(10)

Unnamed: 0,HomeTeam,AwayTeam,FTR,HTR
0,Arsenal,Leicester,H,D
1,Brighton,Man City,A,D
2,Chelsea,Burnley,A,A
3,Crystal Palace,Huddersfield,A,A
4,Everton,Stoke,H,H
5,Southampton,Swansea,D,D
6,Watford,Liverpool,D,H
7,West Brom,Bournemouth,H,H
8,Man United,West Ham,H,H
9,Newcastle,Tottenham,A,D


In [60]:
fixturePredictedProbability = model.predict_proba(preprocessedFixtures) *100
fixturePredictedProbability = pd.DataFrame(fixturePredictedProbability, columns=['Away win %','Draw %','Home win %'])

display(fixturePredictedProbability)

Unnamed: 0,Away win %,Draw %,Home win %
0,9.746134,11.811010,78.442856
1,73.734844,20.853731,5.411426
2,51.916804,43.143967,4.939228
3,65.956249,30.477855,3.565896
4,1.915177,29.052528,69.032296
...,...,...,...
2107,10.140186,14.393617,75.466197
2108,73.522670,25.894415,0.582915
2109,2.538901,16.997421,80.463679
2110,4.689023,16.768254,78.542722


In [61]:
final = pd.concat([Result, fixturePredictedProbability], axis = 1)
final.head(20)

Unnamed: 0,HomeTeam,AwayTeam,FTR,HTR,Away win %,Draw %,Home win %
0,Arsenal,Leicester,H,D,9.746134,11.81101,78.442856
1,Brighton,Man City,A,D,73.734844,20.853731,5.411426
2,Chelsea,Burnley,A,A,51.916804,43.143967,4.939228
3,Crystal Palace,Huddersfield,A,A,65.956249,30.477855,3.565896
4,Everton,Stoke,H,H,1.915177,29.052528,69.032296
5,Southampton,Swansea,D,D,38.319003,56.816925,4.864072
6,Watford,Liverpool,D,H,73.598509,11.792079,14.609411
7,West Brom,Bournemouth,H,H,14.800565,38.731477,46.467957
8,Man United,West Ham,H,H,1.083755,33.384098,65.532146
9,Newcastle,Tottenham,A,D,82.629723,12.237717,5.13256


In [62]:
df = pd.read_csv('~/Documents/ds_study/EPL_prediction_ML/data/merge_data.csv')

# Date 컬럼만 출력
selected_columns = ['Date'
                   ] 
selected_df = df[selected_columns]

print(selected_df)

            Date
0     2017-08-11
1     2017-08-12
2     2017-08-12
3     2017-08-12
4     2017-08-12
...          ...
2107  2023-05-28
2108  2023-05-28
2109  2023-05-28
2110  2023-05-28
2111  2023-05-28

[2112 rows x 1 columns]


In [63]:
selected_df.reset_index(drop=True, inplace=True)
final.reset_index(drop=True, inplace=True)
# selected_df를 final의 앞에 붙이기
df = pd.concat([selected_df, final], axis=1)
df.head(20)

Unnamed: 0,Date,HomeTeam,AwayTeam,FTR,HTR,Away win %,Draw %,Home win %
0,2017-08-11,Arsenal,Leicester,H,D,9.746134,11.81101,78.442856
1,2017-08-12,Brighton,Man City,A,D,73.734844,20.853731,5.411426
2,2017-08-12,Chelsea,Burnley,A,A,51.916804,43.143967,4.939228
3,2017-08-12,Crystal Palace,Huddersfield,A,A,65.956249,30.477855,3.565896
4,2017-08-12,Everton,Stoke,H,H,1.915177,29.052528,69.032296
5,2017-08-12,Southampton,Swansea,D,D,38.319003,56.816925,4.864072
6,2017-08-12,Watford,Liverpool,D,H,73.598509,11.792079,14.609411
7,2017-08-12,West Brom,Bournemouth,H,H,14.800565,38.731477,46.467957
8,2017-08-13,Man United,West Ham,H,H,1.083755,33.384098,65.532146
9,2017-08-13,Newcastle,Tottenham,A,D,82.629723,12.237717,5.13256


In [64]:
# 컬럼 이름 수정, 추가, 순서 변경

df.drop('HTR', axis=1, inplace=True)
df.rename(columns={'FTR': 'Real result'}, inplace=True)
df['Predicted result'] = df[['Away win %', 'Draw %', 'Home win %']].idxmax(axis=1).map({
    'Away win %': 'A',
    'Draw %': 'D',
    'Home win %': 'H'
})

new = ['Date', 'HomeTeam', 'AwayTeam', 'Away win %', 'Draw %', 'Home win %', 'Real result', 'Predicted result']
df = df[new]

df

Unnamed: 0,Date,HomeTeam,AwayTeam,Away win %,Draw %,Home win %,Real result,Predicted result
0,2017-08-11,Arsenal,Leicester,9.746134,11.811010,78.442856,H,H
1,2017-08-12,Brighton,Man City,73.734844,20.853731,5.411426,A,A
2,2017-08-12,Chelsea,Burnley,51.916804,43.143967,4.939228,A,A
3,2017-08-12,Crystal Palace,Huddersfield,65.956249,30.477855,3.565896,A,A
4,2017-08-12,Everton,Stoke,1.915177,29.052528,69.032296,H,H
...,...,...,...,...,...,...,...,...
2107,2023-05-28,Everton,Bournemouth,10.140186,14.393617,75.466197,H,H
2108,2023-05-28,Leeds,Tottenham,73.522670,25.894415,0.582915,A,A
2109,2023-05-28,Leicester,West Ham,2.538901,16.997421,80.463679,H,H
2110,2023-05-28,Man United,Fulham,4.689023,16.768254,78.542722,H,H


In [65]:
df.to_csv('~/Documents/ds_study/EPL_prediction_ML/data/data_for_predict.csv', index=False)

In [89]:
# 토트넘 홈 데이터만 출력

tottenham_home_games = df[df['HomeTeam'] == 'Tottenham']
tottenham_home_games

Unnamed: 0,Date,HomeTeam,AwayTeam,Away win %,Draw %,Home win %,Real result,Predicted result
18,2017-08-20,Tottenham,Chelsea,46.486559,35.733818,17.779622,A,A
28,2017-08-27,Tottenham,Burnley,9.194639,35.788323,55.017039,D,H
45,2017-09-16,Tottenham,Swansea,16.338880,34.367266,49.293854,D,H
75,2017-10-14,Tottenham,Bournemouth,5.006526,11.460806,83.532668,H,H
89,2017-10-22,Tottenham,Liverpool,9.623908,11.709059,78.667033,H,H
...,...,...,...,...,...,...,...,...
2021,2023-02-19,Tottenham,West Ham,6.872047,19.525495,73.602458,H,H
2029,2023-02-26,Tottenham,Chelsea,16.670855,22.955595,60.373550,H,H
2047,2023-04-15,Tottenham,Bournemouth,37.062891,15.373122,47.563987,A,H
2070,2023-04-27,Tottenham,Man United,42.895096,22.319210,34.785694,D,A


In [90]:
#2023년도만 출력

df['Date'] = pd.to_datetime(df['Date'])
games_2023 = df[df['Date'].dt.year == 2023]
games_2023

Unnamed: 0,Date,HomeTeam,AwayTeam,Away win %,Draw %,Home win %,Real result,Predicted result
1986,2023-01-01,Tottenham,Aston Villa,66.005704,19.528660,14.465637,A,A
1987,2023-01-01,Nott'm Forest,Chelsea,45.398329,25.092763,29.508908,D,A
1988,2023-01-13,Aston Villa,Leeds,49.494878,18.367180,32.137941,H,A
1989,2023-01-14,Man United,Man City,11.063426,17.818172,71.118402,H,H
1990,2023-01-14,Brighton,Liverpool,4.372041,19.514983,76.112975,H,H
...,...,...,...,...,...,...,...,...
2107,2023-05-28,Everton,Bournemouth,8.435905,16.584014,74.980082,H,H
2108,2023-05-28,Leeds,Tottenham,86.869552,12.183035,0.947413,A,A
2109,2023-05-28,Leicester,West Ham,3.239772,14.816605,81.943624,H,H
2110,2023-05-28,Man United,Fulham,6.378475,20.246396,73.375129,H,H
