In [88]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import numpy as np


In [89]:
data_to_train = pd.read_csv('~/Documents/ds_study/EPL_prediction_ML/data/2017-2023_predict.csv')
data_to_predict = pd.read_csv('~/Documents/ds_study/EPL_prediction_ML/data/epl_data_for_predict.csv')

In [90]:
# 변수 인코딩 (라벨 인코딩)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data_to_train['Real result'])
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(data_to_train[['HomeTeam', 'AwayTeam']])

In [99]:
# 데이터 분류
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [92]:
# StandardScaler 적용
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [100]:
# SVC 모델 
model = SVC(probability=True,C=1, class_weight=None, coef0=0.0, degree=2, gamma='auto', kernel='rbf', max_iter=10000)
model.fit(X_train_scaled, y_train)

In [101]:
# 새로운 데이터에 대한 예측을 위한 특성 인코딩
X_to_predict_encoded = encoder.transform(data_to_predict[['HomeTeam', 'AwayTeam']])
X_to_predict_scaled = scaler.transform(X_to_predict_encoded)

In [102]:
# 새로운 데이터에 대한 예측 수행
y_pred = model.predict(X_to_predict_scaled)
# 예측된 정수 레이블을 원래의 문자 레이블로 변환
predicted_results = label_encoder.inverse_transform(y_pred)
# 예측 결과를 데이터 프레임에 추가
data_to_predict['Predicted result'] = predicted_results

In [103]:
# 확률 예측 추가 
predicted_probabilities = model.predict_proba(X_to_predict_scaled)
data_to_predict['Away win %'] = predicted_probabilities[:, label_encoder.transform(['A'])[0]] * 100
data_to_predict['Draw %'] = predicted_probabilities[:, label_encoder.transform(['D'])[0]] * 100
data_to_predict['Home win %'] = predicted_probabilities[:, label_encoder.transform(['H'])[0]] * 100

In [104]:
data_to_predict

Unnamed: 0,Date,HomeTeam,AwayTeam,Predicted result,Away win %,Draw %,Home win %
0,2023-08-11,Burnley,Man City,A,54.408707,23.137092,22.454201
1,2023-08-12,Arsenal,Nott'm Forest,H,19.450813,23.122031,57.427156
2,2023-08-12,Bournemouth,West Ham,A,39.062317,30.535944,30.401739
3,2023-08-12,Everton,Fulham,H,21.683245,20.632438,57.684317
4,2023-08-12,Sheffield United,Crystal Palace,H,37.083159,18.439200,44.477642
...,...,...,...,...,...,...,...
337,2024-05-19,Chelsea,Bournemouth,H,27.515919,19.294019,53.190062
338,2024-05-19,Crystal Palace,Aston Villa,H,22.396194,19.661160,57.942645
339,2024-05-19,Sheffield United,Tottenham,A,42.647784,18.260668,39.091548
340,2024-05-19,Arsenal,Everton,H,22.075145,19.542545,58.382310


In [105]:
# 컬럼 순서 변경한 2023-2024시즌 예측 데이터
new_column_order = ['Date', 'HomeTeam', 'AwayTeam', 'Home win %', 'Draw %', 'Away win %', 'Predicted result']
data_to_predict = data_to_predict[new_column_order]
data_to_predict

Unnamed: 0,Date,HomeTeam,AwayTeam,Home win %,Draw %,Away win %,Predicted result
0,2023-08-11,Burnley,Man City,22.454201,23.137092,54.408707,A
1,2023-08-12,Arsenal,Nott'm Forest,57.427156,23.122031,19.450813,H
2,2023-08-12,Bournemouth,West Ham,30.401739,30.535944,39.062317,A
3,2023-08-12,Everton,Fulham,57.684317,20.632438,21.683245,H
4,2023-08-12,Sheffield United,Crystal Palace,44.477642,18.439200,37.083159,H
...,...,...,...,...,...,...,...
337,2024-05-19,Chelsea,Bournemouth,53.190062,19.294019,27.515919,H
338,2024-05-19,Crystal Palace,Aston Villa,57.942645,19.661160,22.396194,H
339,2024-05-19,Sheffield United,Tottenham,39.091548,18.260668,42.647784,A
340,2024-05-19,Arsenal,Everton,58.382310,19.542545,22.075145,H


In [106]:
#data_to_predict.to_csv('~/Documents/ds_study/EPL_prediction_ML/data/2023-2024_predict.csv', index=False)

In [118]:
#복사본 생성
data_to_predict_copy = data_to_predict.copy()

# 각 경기 홈 팀과 어웨이 팀 승점계산
data_to_predict_copy['Home Points'] = data_to_predict_copy['Predicted result'].map({'H': 3, 'D': 1, 'A': 0})
data_to_predict_copy['Away Points'] = data_to_predict_copy['Predicted result'].map({'A': 3, 'D': 1, 'H': 0})

# 팀별로 승점 집계
home_points = data_to_predict_copy.groupby('HomeTeam')['Home Points'].sum()
away_points = data_to_predict_copy.groupby('AwayTeam')['Away Points'].sum()

# 홈과 어웨이 승점을 합산
total_points = home_points.add(away_points, fill_value=0)

# 순위 결정을 위한 승점 정렬
team_rankings = total_points.sort_values(ascending=False)

# 팀별 승점을 데이터프레임으로 변환
team_rankings_df = total_points.reset_index()
team_rankings_df.columns = ['Team', 'Points']

# 팀별 순위 추가
team_rankings_df['Rank'] = team_rankings_df['Points'].rank(ascending=False, method='min').astype(int)

# 순위에 따라 데이터프레임 정렬
team_rankings_df = team_rankings_df.sort_values(by='Rank')

team_rankings_df


Unnamed: 0,Team,Points,Rank
11,Man City,102,1
10,Liverpool,98,2
12,Man United,88,3
6,Chelsea,75,4
16,Tottenham,73,5
0,Arsenal,67,6
3,Brentford,54,7
14,Nott'm Forest,52,8
1,Aston Villa,46,9
8,Everton,46,9


In [119]:
new_column_order = ['Rank', 'Team', 'Points']
team_rankings_df = team_rankings_df[new_column_order]
team_rankings_df

Unnamed: 0,Rank,Team,Points
11,1,Man City,102
10,2,Liverpool,98
12,3,Man United,88
6,4,Chelsea,75
16,5,Tottenham,73
0,6,Arsenal,67
3,7,Brentford,54
14,8,Nott'm Forest,52
1,9,Aston Villa,46
8,9,Everton,46
