In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# 데이터 로드
data = pd.read_csv('~/Documents/ds_study/EPL_prediction_ML/data/merge_data.csv')
data

Unnamed: 0,Date,HomeTeam,AwayTeam,FTR,HTR,HS,AS,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR,home_xG,away_xG
0,2017-08-11,Arsenal,Leicester,H,D,27,6,10,3,9,4,9,12,0,1,0,0,2.543290,1.464950
1,2017-08-12,Brighton,Man City,A,D,6,14,2,4,3,10,6,9,0,2,0,0,0.276343,1.867510
2,2017-08-12,Chelsea,Burnley,A,A,19,10,6,5,8,5,16,11,3,3,2,0,1.356510,0.564237
3,2017-08-12,Crystal Palace,Huddersfield,A,A,14,8,4,6,12,9,7,19,1,3,0,0,0.988934,1.736570
4,2017-08-12,Everton,Stoke,H,H,9,9,4,1,6,7,13,10,1,1,0,0,0.720574,0.276440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2107,2023-05-28,Everton,Bournemouth,H,D,13,7,6,2,11,12,9,3,1,3,0,0,1.129330,0.432084
2108,2023-05-28,Leeds,Tottenham,A,A,19,11,2,7,7,5,12,3,3,0,0,0,1.191510,2.033150
2109,2023-05-28,Leicester,West Ham,H,H,13,16,4,3,8,10,3,5,1,1,0,0,2.030130,0.970352
2110,2023-05-28,Man United,Fulham,H,D,21,10,8,3,14,10,5,4,1,2,0,0,2.747660,1.881670


In [3]:
# 결측지 있는지 확인

missing_values = data.isnull().sum()
print(missing_values)

Date        0
HomeTeam    0
AwayTeam    0
FTR         0
HTR         0
HS          0
AS          0
HST         0
AST         0
HC          0
AC          0
HF          0
AF          0
HY          0
AY          0
HR          0
AR          0
home_xG     0
away_xG     0
dtype: int64


In [4]:
#데이터 정확히 확인
print(data.head())

print(data.describe())

         Date        HomeTeam      AwayTeam FTR HTR  HS  AS  HST  AST  HC  AC  \
0  2017-08-11         Arsenal     Leicester   H   D  27   6   10    3   9   4   
1  2017-08-12        Brighton      Man City   A   D   6  14    2    4   3  10   
2  2017-08-12         Chelsea       Burnley   A   A  19  10    6    5   8   5   
3  2017-08-12  Crystal Palace  Huddersfield   A   A  14   8    4    6  12   9   
4  2017-08-12         Everton         Stoke   H   H   9   9    4    1   6   7   

   HF  AF  HY  AY  HR  AR   home_xG   away_xG  
0   9  12   0   1   0   0  2.543290  1.464950  
1   6   9   0   2   0   0  0.276343  1.867510  
2  16  11   3   3   2   0  1.356510  0.564237  
3   7  19   1   3   0   0  0.988934  1.736570  
4  13  10   1   1   0   0  0.720574  0.276440  
                HS           AS          HST          AST           HC  \
count  2112.000000  2112.000000  2112.000000  2112.000000  2112.000000   
mean     13.580966    11.305398     4.660038     3.970644     6.310606   
std

In [5]:
# 범주형 데이터를 원핫 인코딩으로 변환하는 함수
def preprocess_features(data, categorical_features, numerical_features, encoder=None, scaler=None):

    
# 범주형 데이터 원핫 인코딩
    if encoder is None: # encoder가 제공되지 않았다면 새로운 OneHotEncoder 객체를 생성후 데이터에 fit_transform 적용
        encoder = OneHotEncoder()
        encoded_categorical = encoder.fit_transform(data[categorical_features])
    else: # encoder가 제공되었다면 기존 인코더를 사용하여 데이터에 transform 적용
        encoded_categorical = encoder.transform(data[categorical_features])
    encoded_categorical = pd.DataFrame(encoded_categorical.toarray(), columns=encoder.get_feature_names_out(categorical_features))

 # 숫자형 데이터 스케일링
    if scaler is None:  # scaler가 제공되지 않았다면 새로운 StandardScaler 객체를 생성후 데이터에 fit_transform 적용
        scaler = StandardScaler()
        scaled_numerical = scaler.fit_transform(data[numerical_features])
    else:   # scaler가 제공되었다면 기존 스케일러를 사용하여 데이터에 transform 적용
        scaled_numerical = scaler.transform(data[numerical_features])
    scaled_numerical = pd.DataFrame(scaled_numerical, columns=numerical_features)

# 변환된 데이터 결합
    preprocessed_data = pd.concat([encoded_categorical, scaled_numerical], axis=1)
    # 결합된 데이터프레임과 사용된 encoder 및 scaler 객체를 반환
    return preprocessed_data, encoder, scaler

In [6]:
# 범주형 및 숫자형 특성 정의
categorical_features = ['HomeTeam', 'AwayTeam']  # 범주형 특성
numerical_features = ['HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR', 'home_xG', 'away_xG']  # 숫자형 특성

In [9]:
# 전처리된 데이터셋 생성
X, encoder, scaler = preprocess_features(data, categorical_features, numerical_features)
# 타겟 변수
y = data['FTR']

In [10]:
# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# 행 수 확인
print("X_train 행 수:", len(X_train))
print("y_train 행 수:", len(y_train))

# 일치 여부 확인
if len(X_train) == len(y_train):
    print("행 수가 일치합니다.")
else:
    print("행 수가 일치하지 않습니다. 데이터 확인이 필요합니다.")

X_train 행 수: 1689
y_train 행 수: 1689
행 수가 일치합니다.


In [41]:
# 하이퍼파라미터 그리드 설정
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_neighbors': [30, 40, 50, 60, 70, 80, 90, 100],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 20, 30],
    'p': [1, 2],
    'metric': ['minkowski', 'euclidean', 'manhattan']
    }

# 그리드 서치 초기화 및 수행
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# 최적의 매개변수 출력
print("Best parameters:", grid_search.best_params_)

Best parameters: {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'minkowski', 'n_neighbors': 80, 'p': 2, 'weights': 'uniform'}


In [42]:
# 과적합 검사
from sklearn.model_selection import cross_val_score

# 교차 검증 수행
knn = KNeighborsClassifier(**grid_search.best_params_)
cross_val_scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
print("Cross-validation scores:", cross_val_scores)
print("Average score:", cross_val_scores.mean())

Cross-validation scores: [0.64066194 0.63120567 0.62085308 0.62085308 0.63270142]
Average score: 0.6292550390463065


In [62]:
# 모델 학습
knn = KNeighborsClassifier(
    algorithm='auto', 
    leaf_size=10, 
    metric='minkowski', 
    n_neighbors=80, 
    p=2, 
    weights='uniform'
)

knn.fit(X_train, y_train)


In [63]:
from sklearn.metrics import f1_score, confusion_matrix

# 훈련 세트와 테스트 세트에서 성능 평가
y_pred_train = knn.predict(X_train)
y_pred_test = knn.predict(X_test)
print("Training F1 score:", f1_score(y_train, y_pred_train, average='macro'))
print("Test F1 score:", f1_score(y_test, y_pred_test, average='macro'))

Training F1 score: 0.5062964073979578
Test F1 score: 0.49081189585391266


In [80]:
# F1 점수를 사용하여 5-겹 교차 검증 수행
from sklearn.pipeline import make_pipeline
pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(
    algorithm='auto', 
    leaf_size=10, 
    metric='minkowski', 
    n_neighbors=80, 
    p=1, 
    weights='uniform'
))

cross_val_f1_scores = cross_val_score(pipeline, X, y, cv=5, scoring='f1_macro')
print("교차 검증 F1 점수:", cross_val_f1_scores)
print("평균 F1 점수:", cross_val_f1_scores.mean())

교차 검증 F1 점수: [0.47063336 0.46536695 0.46869584 0.43883666 0.4199793 ]
평균 F1 점수: 0.4527024216425458


In [81]:
# 혼동 행렬 출력(성능 평가)
print("Confusion Matrix (Train):\n", confusion_matrix(y_train, y_pred_train))
print("Confusion Matrix (Test):\n", confusion_matrix(y_test, y_pred_test))

Confusion Matrix (Train):
 [[410  10 135]
 [123  21 243]
 [ 95   9 643]]
Confusion Matrix (Test):
 [[104   3  33]
 [ 28   3  71]
 [ 22   2 157]]


In [82]:
knn.predict(X_test)

array(['H', 'H', 'H', 'H', 'A', 'H', 'H', 'A', 'H', 'H', 'H', 'A', 'H',
       'H', 'H', 'H', 'H', 'A', 'H', 'A', 'A', 'A', 'H', 'H', 'A', 'H',
       'A', 'H', 'H', 'H', 'H', 'A', 'H', 'H', 'H', 'A', 'A', 'A', 'H',
       'H', 'H', 'H', 'H', 'A', 'H', 'A', 'A', 'A', 'A', 'H', 'D', 'H',
       'H', 'A', 'H', 'A', 'H', 'A', 'A', 'H', 'A', 'H', 'H', 'A', 'H',
       'H', 'A', 'H', 'H', 'H', 'A', 'H', 'H', 'H', 'H', 'A', 'A', 'A',
       'H', 'H', 'A', 'H', 'H', 'A', 'H', 'H', 'A', 'A', 'A', 'H', 'H',
       'H', 'H', 'H', 'A', 'H', 'A', 'H', 'A', 'H', 'H', 'A', 'H', 'H',
       'H', 'A', 'H', 'H', 'H', 'H', 'H', 'A', 'A', 'H', 'H', 'H', 'H',
       'H', 'H', 'D', 'A', 'H', 'H', 'H', 'H', 'A', 'A', 'H', 'A', 'H',
       'A', 'H', 'H', 'A', 'A', 'H', 'A', 'A', 'H', 'A', 'D', 'A', 'A',
       'H', 'A', 'H', 'H', 'H', 'A', 'H', 'H', 'A', 'H', 'A', 'H', 'H',
       'H', 'H', 'A', 'H', 'A', 'H', 'A', 'A', 'H', 'H', 'H', 'H', 'A',
       'H', 'A', 'A', 'A', 'H', 'A', 'H', 'H', 'A', 'A', 'H', 'H

In [79]:
#각 테스트 샘플이 각 클래스에 속할 확률 구하기
Y_predict = knn.predict_proba(X_test)
# 반환된 값은 각 행이 테스트 샘플을 나타내고, 각 열이 클래스의 확률을 나타냄
Y_predict = pd.DataFrame(Y_predict, columns=['Away Team','Draw','Home Team'])
# 확률을 퍼센트(백분율)로 변환
display((Y_predict *100).head(10))

Unnamed: 0,Away Team,Draw,Home Team
0,12.5,22.5,65.0
1,20.0,21.25,58.75
2,31.25,23.75,45.0
3,35.0,28.75,36.25
4,47.5,28.75,23.75
5,2.5,20.0,77.5
6,3.75,10.0,86.25
7,53.75,18.75,27.5
8,33.75,26.25,40.0
9,28.75,28.75,42.5


In [49]:
# 기본적인 팀 정보만을 유지, 나머지 통계는 초기화하여 모델이 이를 예측하게끔 준비하는 과정

fixtures = pd.read_csv('~/Documents/ds_study/EPL_prediction_ML/data/merge_data.csv')
fixtures = fixtures.drop(['Date'], axis=1)
fixtures.columns.values[0:2] = ['HomeTeam', 'AwayTeam']
# 나머지 열 초기화
fixtures['HS'] = 0
fixtures['AS'] = 0
fixtures['HST'] = 0
fixtures['AST'] = 0
fixtures['HF'] = 0
fixtures['AF'] = 0
fixtures['HC'] = 0
fixtures['AC'] = 0
fixtures['HY'] = 0
fixtures['AY'] = 0
fixtures['HR'] = 0
fixtures['AR'] = 0
fixtures['home_xG'] = 0
fixtures['away_xG'] = 0

In [50]:
display(fixtures.head(10))

Unnamed: 0,HomeTeam,AwayTeam,FTR,HTR,HS,AS,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR,home_xG,away_xG
0,Arsenal,Leicester,H,D,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Brighton,Man City,A,D,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Chelsea,Burnley,A,A,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Crystal Palace,Huddersfield,A,A,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Everton,Stoke,H,H,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,Southampton,Swansea,D,D,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,Watford,Liverpool,D,H,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,West Brom,Bournemouth,H,H,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,Man United,West Ham,H,H,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,Newcastle,Tottenham,A,D,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [51]:
# 똑같이 위에서 똑같이 수행한 원핫 인코딩 실행
preprocessedFixtures, _, _ = preprocess_features(data, categorical_features, numerical_features, encoder=None, scaler=None)

In [52]:
# 다시 한번 학습데이터와 예측데이터의 컬럼이 일치하는지 확인 

# 학습 데이터셋 컬럼 확인
train_columns = X_train.columns
print("학습 데이터셋 컬럼:")
print(train_columns)

# 예측 데이터셋 컬럼 확인
predict_columns = preprocessedFixtures.columns
print("예측 데이터셋 컬럼:")
print(predict_columns)

학습 데이터셋 컬럼:
Index(['HomeTeam_Arsenal', 'HomeTeam_Aston Villa', 'HomeTeam_Bournemouth',
       'HomeTeam_Brentford', 'HomeTeam_Brighton', 'HomeTeam_Burnley',
       'HomeTeam_Cardiff', 'HomeTeam_Chelsea', 'HomeTeam_Crystal Palace',
       'HomeTeam_Everton', 'HomeTeam_Fulham', 'HomeTeam_Huddersfield',
       'HomeTeam_Leeds', 'HomeTeam_Leicester', 'HomeTeam_Liverpool',
       'HomeTeam_Man City', 'HomeTeam_Man United', 'HomeTeam_Newcastle',
       'HomeTeam_Norwich', 'HomeTeam_Nott'm Forest',
       'HomeTeam_Sheffield United', 'HomeTeam_Southampton', 'HomeTeam_Stoke',
       'HomeTeam_Swansea', 'HomeTeam_Tottenham', 'HomeTeam_Watford',
       'HomeTeam_West Brom', 'HomeTeam_West Ham', 'HomeTeam_Wolves',
       'AwayTeam_Arsenal', 'AwayTeam_Aston Villa', 'AwayTeam_Bournemouth',
       'AwayTeam_Brentford', 'AwayTeam_Brighton', 'AwayTeam_Burnley',
       'AwayTeam_Cardiff', 'AwayTeam_Chelsea', 'AwayTeam_Crystal Palace',
       'AwayTeam_Everton', 'AwayTeam_Fulham', 'AwayTeam_Huddersfield

In [53]:
knn.predict(preprocessedFixtures)

array(['H', 'A', 'A', ..., 'H', 'H', 'A'], dtype=object)

In [54]:
Result = fixtures.drop(['HS', 'AS', 'HST', 'HF', 'AF', 'HY', 
                        'AY', 'HR', 'AR', 'HC', 'AC', 'AST', 'HC', 'AC','home_xG', 'away_xG'], axis=1)
Result.head(10)

Unnamed: 0,HomeTeam,AwayTeam,FTR,HTR
0,Arsenal,Leicester,H,D
1,Brighton,Man City,A,D
2,Chelsea,Burnley,A,A
3,Crystal Palace,Huddersfield,A,A
4,Everton,Stoke,H,H
5,Southampton,Swansea,D,D
6,Watford,Liverpool,D,H
7,West Brom,Bournemouth,H,H
8,Man United,West Ham,H,H
9,Newcastle,Tottenham,A,D


In [55]:
fixturePredictedProbability = knn.predict_proba(preprocessedFixtures) *100
fixturePredictedProbability = pd.DataFrame(fixturePredictedProbability, columns=['Away win %','Draw %','Home win %'])

display(fixturePredictedProbability)

Unnamed: 0,Away win %,Draw %,Home win %
0,10.00,12.50,77.50
1,52.50,18.75,28.75
2,57.50,22.50,20.00
3,41.25,30.00,28.75
4,20.00,28.75,51.25
...,...,...,...
2107,18.75,22.50,58.75
2108,63.75,25.00,11.25
2109,23.75,25.00,51.25
2110,16.25,21.25,62.50


In [56]:
final = pd.concat([Result, fixturePredictedProbability], axis = 1)
final.head(20)

Unnamed: 0,HomeTeam,AwayTeam,FTR,HTR,Away win %,Draw %,Home win %
0,Arsenal,Leicester,H,D,10.0,12.5,77.5
1,Brighton,Man City,A,D,52.5,18.75,28.75
2,Chelsea,Burnley,A,A,57.5,22.5,20.0
3,Crystal Palace,Huddersfield,A,A,41.25,30.0,28.75
4,Everton,Stoke,H,H,20.0,28.75,51.25
5,Southampton,Swansea,D,D,18.75,31.25,50.0
6,Watford,Liverpool,D,H,43.75,21.25,35.0
7,West Brom,Bournemouth,H,H,15.0,32.5,52.5
8,Man United,West Ham,H,H,6.25,20.0,73.75
9,Newcastle,Tottenham,A,D,68.75,16.25,15.0


In [57]:
df = pd.read_csv('~/Documents/ds_study/EPL_prediction_ML/data/merge_data.csv')

# Date 컬럼만 출력
selected_columns = ['Date'] 
selected_df = df[selected_columns]

print(selected_df)

            Date
0     2017-08-11
1     2017-08-12
2     2017-08-12
3     2017-08-12
4     2017-08-12
...          ...
2107  2023-05-28
2108  2023-05-28
2109  2023-05-28
2110  2023-05-28
2111  2023-05-28

[2112 rows x 1 columns]


In [58]:
selected_df.reset_index(drop=True, inplace=True)
final.reset_index(drop=True, inplace=True)
# selected_df를 final의 앞에 붙이기
df = pd.concat([selected_df, final], axis=1)
df.head(20)

Unnamed: 0,Date,HomeTeam,AwayTeam,FTR,HTR,Away win %,Draw %,Home win %
0,2017-08-11,Arsenal,Leicester,H,D,10.0,12.5,77.5
1,2017-08-12,Brighton,Man City,A,D,52.5,18.75,28.75
2,2017-08-12,Chelsea,Burnley,A,A,57.5,22.5,20.0
3,2017-08-12,Crystal Palace,Huddersfield,A,A,41.25,30.0,28.75
4,2017-08-12,Everton,Stoke,H,H,20.0,28.75,51.25
5,2017-08-12,Southampton,Swansea,D,D,18.75,31.25,50.0
6,2017-08-12,Watford,Liverpool,D,H,43.75,21.25,35.0
7,2017-08-12,West Brom,Bournemouth,H,H,15.0,32.5,52.5
8,2017-08-13,Man United,West Ham,H,H,6.25,20.0,73.75
9,2017-08-13,Newcastle,Tottenham,A,D,68.75,16.25,15.0


In [59]:
# 컬럼 이름 수정, 추가, 순서 변경

df.drop('HTR', axis=1, inplace=True)
df.rename(columns={'FTR': 'Real result'}, inplace=True)
df['Predicted result'] = df[['Away win %', 'Draw %', 'Home win %']].idxmax(axis=1).map({
    'Away win %': 'A',
    'Draw %': 'D',
    'Home win %': 'H'
})

new = ['Date', 'HomeTeam', 'AwayTeam', 'Away win %', 'Draw %', 'Home win %', 'Real result', 'Predicted result']
df = df[new]

df

Unnamed: 0,Date,HomeTeam,AwayTeam,Away win %,Draw %,Home win %,Real result,Predicted result
0,2017-08-11,Arsenal,Leicester,10.00,12.50,77.50,H,H
1,2017-08-12,Brighton,Man City,52.50,18.75,28.75,A,A
2,2017-08-12,Chelsea,Burnley,57.50,22.50,20.00,A,A
3,2017-08-12,Crystal Palace,Huddersfield,41.25,30.00,28.75,A,A
4,2017-08-12,Everton,Stoke,20.00,28.75,51.25,H,H
...,...,...,...,...,...,...,...,...
2107,2023-05-28,Everton,Bournemouth,18.75,22.50,58.75,H,H
2108,2023-05-28,Leeds,Tottenham,63.75,25.00,11.25,A,A
2109,2023-05-28,Leicester,West Ham,23.75,25.00,51.25,H,H
2110,2023-05-28,Man United,Fulham,16.25,21.25,62.50,H,H
