In [None]:
# Mount Google Drive at /content/drive; load_data() reads its CSV files from a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import tree

import time
import warnings
# Suppress every Python warning for the rest of the kernel session, so any
# warnings raised by the libraries used below are not shown.
warnings.filterwarnings(action='ignore')

In [None]:
# Data loading function
def load_data(Position, data_path='/content/drive/MyDrive/Colab Notebooks/캡스톤_4조/dataset/'):
  """Read one position's CSV and return it without zero-salary rows.

  Args:
    Position: Text inserted into the file name 'epl_{}_3grade_data.csv'
      (this notebook passes 'GK', 'DF', 'MF' and 'FW').
    data_path: Directory that holds the CSV files. Defaults to the Google
      Drive folder this notebook was written against; pass another
      directory to run the notebook elsewhere. A trailing path separator
      is optional.

  Returns:
    A DataFrame with the rows whose '연봉(€)' value is 0 removed and the
    'Unnamed: 0' column dropped. The row index of the CSV is kept as-is
    (it is not reset after filtering).

  Raises:
    KeyError: If the CSV has no '연봉(€)' or no 'Unnamed: 0' column.
  """
  import os  # local import so this cell does not depend on edits to the import cell

  csv_path = os.path.join(data_path, 'epl_{}_3grade_data.csv'.format(Position))
  data = pd.read_csv(csv_path)
  # Keep only the rows with a non-zero salary.
  data = data[data['연봉(€)'] != 0]
  # 'Unnamed: 0' is not used by any later step; drop it.
  return data.drop(columns='Unnamed: 0')

# Train/test split function
def data_split(df):
  """Split a position DataFrame into train/test features and labels by season.

  Rows whose '시즌' is neither 2021 nor 2022 form the training set; rows
  whose '시즌' is 2021 form the test set. Season 2022 rows are placed in
  neither split. The '시즌' column is removed from both splits and
  '가치 등급' is separated out as the label.

  Args:
    df: DataFrame containing at least the '시즌' and '가치 등급' columns.
      It is not modified.

  Returns:
    Tuple (train_X, train_y, test_X, test_y).

  Raises:
    KeyError: If '시즌' or '가치 등급' is missing from df.
  """
  season = df['시즌']
  # The previous filter compared against `(2021 or 2022)`, which Python
  # evaluates to 2021, so season 2022 rows leaked into the training set.
  # isin() excludes both seasons as intended.
  train_df = df[~season.isin([2021, 2022])].drop(columns='시즌')
  test_df = df[season == 2021].drop(columns='시즌')

  train_X, train_y = train_df.drop('가치 등급', axis=1), train_df['가치 등급']
  test_X, test_y = test_df.drop('가치 등급', axis=1), test_df['가치 등급']

  return train_X, train_y, test_X, test_y

# Data scaling function
def data_scaler(data):
  """Split `data` with data_split() and standardize the feature matrices.

  The StandardScaler is fitted on the training features only and the same
  fitted scaler is then applied to the test features. Labels are passed
  through unchanged.

  Args:
    data: DataFrame accepted by data_split().

  Returns:
    Tuple (x_train, y_train, x_test, y_test) where the two feature
    entries are the scaler's outputs.
  """
  train_features, train_labels, test_features, test_labels = data_split(data)

  standardizer = StandardScaler()
  scaled_train = standardizer.fit_transform(train_features)
  scaled_test = standardizer.transform(test_features)

  return scaled_train, train_labels, scaled_test, test_labels

# LR, KNN and DT models plus a soft-voting ensemble
def model1(x_train, y_train, x_test, y_test):
  """Fit three baseline classifiers and a soft-voting ensemble; print test accuracy.

  Logistic regression, k-nearest neighbours and a decision tree are each
  fitted on the training data and scored on the test data. A soft-voting
  VotingClassifier built from the same three estimators is then fitted
  and scored the same way.

  Args:
    x_train: Training feature matrix.
    y_train: Training labels.
    x_test: Test feature matrix.
    y_test: Test labels the predictions are compared against.

  Returns:
    None. All accuracies are printed.
  """
  lr_clf = LogisticRegression()
  knn_clf = KNeighborsClassifier()
  # Seed the tree (same value model2 uses for its forest) so that the tree
  # and the voting ensemble give the same accuracy on every run.
  dt_clf = tree.DecisionTreeClassifier(random_state=42)

  models = [lr_clf, knn_clf, dt_clf]
  for model in models:
    model.fit(x_train, y_train)
    pred_y = model.predict(x_test)
    model_name = model.__class__.__name__
    print(f"{model_name} Accuracy :{accuracy_score(y_test, pred_y)}")
    print()

  vo_clf = VotingClassifier(estimators=[('LR', lr_clf),
                                        ('KNN', knn_clf),
                                        ('DT', dt_clf)],
                            voting='soft')

  vo_clf.fit(x_train, y_train)
  pred_y = vo_clf.predict(x_test)
  print('------- Voting -------')
  print('Voting Accuracy', accuracy_score(y_test, pred_y))

# Random Forest with Grid Search
def model2(x_train, y_train, x_test, y_test):
  """Grid-search a random forest on the training data and print the best result.

  The search uses 2-fold cross-validation on the training data only and
  prints the best parameter combination with its cross-validation score.

  Args:
    x_train: Training feature matrix.
    y_train: Training labels.
    x_test: Accepted so the signature matches the other model functions;
      not used here.
    y_test: Accepted so the signature matches the other model functions;
      not used here.

  Returns:
    None. The best parameters and best score are printed.
  """
  search_space = dict(
      n_estimators=[50, 100, 150, 200],
      max_depth=[5, 10, 20],
      min_samples_leaf=[8, 16, 32],
      min_samples_split=[4, 8, 16],
  )

  forest = RandomForestClassifier(random_state=42, n_jobs=-1)
  search = GridSearchCV(forest, param_grid=search_space, cv=2, n_jobs=-1)
  search.fit(x_train, y_train)

  print()
  print('------- Random Forest -------')
  print("Best Parameters : {}".format(search.best_params_))
  print("Best Score : {}".format(search.best_score_))

# Gradient Boosting with Grid Search
def model3(x_train, y_train, x_test, y_test):
  """Grid-search a gradient boosting classifier and print the best result.

  The search uses 2-fold cross-validation on the training data only and
  prints the best parameter combination with its cross-validation score.

  Args:
    x_train: Training feature matrix.
    y_train: Training labels.
    x_test: Accepted so the signature matches the other model functions;
      not used here.
    y_test: Accepted so the signature matches the other model functions;
      not used here.

  Returns:
    None. The best parameters and best score are printed.
  """
  params = {
    'n_estimators': [10, 15, 20, 25],
    'learning_rate': [0.001, 0.01, 0.05, 0.1]
  }

  # Seed the booster (same value model2 uses for its forest) so repeated
  # runs of the search select the same parameters and score.
  gb_clf = GradientBoostingClassifier(random_state=42)
  grid_cv = GridSearchCV(gb_clf, param_grid=params,
                         n_jobs=-1, cv=2, verbose=1)
  grid_cv.fit(x_train, y_train)
  print()
  print('------- Gradiant Boosting -------')
  print("Best Parameters:", grid_cv.best_params_)
  print("Best Accuracy:", grid_cv.best_score_)

# Show the prediction results
def print_result(data):
  """Scale `data` once, then run model1, model2 and model3 on the same splits.

  Args:
    data: DataFrame accepted by data_scaler().

  Returns:
    None. Each model function prints its own results.
  """
  splits = data_scaler(data)
  # Same order as before: baseline/voting models, random forest, gradient boosting.
  for run_model in (model1, model2, model3):
    run_model(*splits)

In [None]:
# Load one DataFrame per position code; each name stays in the kernel
# namespace and is passed to print_result() in the cells below.
GK = load_data('GK')
DF = load_data('DF')
MF = load_data('MF')
FW = load_data('FW')

GK # check: display the loaded GK frame as this cell's output

Unnamed: 0,연봉(€),시즌,전체 출장 시간,전체 골,전체 어시스트,클린시트,실점,경고,퇴장,90분당 골 관여,...,90분당 골,90분당 실점,카드/90분,공격수 순위,미드필더 순위,수비수 순위,득점 순위,나이,승점,가치 등급
0,5992270.0,2022,720.0,0.0,0.0,2.0,10.0,0.0,0.0,0.0,...,0.0,1.25,0.00,216.0,198.0,45.0,13.0,36,71,2
2,5992270.0,2022,720.0,0.0,0.0,1.0,21.0,1.0,0.0,0.0,...,0.0,2.63,0.13,235.0,236.0,95.0,24.0,25,27,1
3,3595362.0,2022,720.0,0.0,0.0,3.0,7.0,0.0,0.0,0.0,...,0.0,0.88,0.00,219.0,227.0,13.0,8.0,30,73,1
4,22471014.0,2022,630.0,0.0,0.0,2.0,14.0,0.0,0.0,0.0,...,0.0,2.00,0.00,140.0,131.0,86.0,17.0,32,67,1
5,5992270.0,2022,720.0,0.0,0.0,4.0,8.0,0.0,0.0,0.0,...,0.0,1.00,0.00,141.0,110.0,23.0,14.0,29,94,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242,5992270.0,2014,2814.0,0.0,0.0,13.0,28.0,1.0,0.0,0.0,...,0.0,0.90,0.03,389.0,393.0,24.0,14.0,22,87,1
243,898841.0,2014,3420.0,0.0,0.0,10.0,51.0,1.0,0.0,0.0,...,0.0,1.34,0.03,380.0,385.0,123.0,16.0,28,33,3
244,3115981.0,2014,270.0,0.0,0.0,1.0,7.0,1.0,0.0,0.0,...,0.0,2.33,0.33,-1.0,-1.0,-1.0,15.0,38,38,3
245,1797681.0,2014,3060.0,0.0,0.0,9.0,44.0,2.0,0.0,0.0,...,0.0,1.29,0.06,359.0,366.0,110.0,11.0,30,38,2


In [None]:
# Run every model on the GK dataset and print the results.
print_result(GK)

LogisticRegression Accuracy :0.5714285714285714

KNeighborsClassifier Accuracy :0.5357142857142857

DecisionTreeClassifier Accuracy :0.6785714285714286

------- Voting -------
Voting Accuracy 0.6785714285714286

------- Random Forest -------
Best Parameters : {'max_depth': 5, 'min_samples_leaf': 16, 'min_samples_split': 4, 'n_estimators': 150}
Best Score : 0.6805555555555556
Fitting 2 folds for each of 16 candidates, totalling 32 fits

------- Gradiant Boosting -------
Best Parameters: {'learning_rate': 0.1, 'n_estimators': 20}
Best Accuracy: 0.6759259259259259


In [None]:
# Run every model on the DF dataset and print the results.
print_result(DF)

LogisticRegression Accuracy :0.7007874015748031

KNeighborsClassifier Accuracy :0.5984251968503937

DecisionTreeClassifier Accuracy :0.6535433070866141

------- Voting -------
Voting Accuracy 0.6535433070866141

------- Random Forest -------
Best Parameters : {'max_depth': 10, 'min_samples_leaf': 8, 'min_samples_split': 4, 'n_estimators': 50}
Best Score : 0.6469979296066253
Fitting 2 folds for each of 16 candidates, totalling 32 fits

------- Gradiant Boosting -------
Best Parameters: {'learning_rate': 0.05, 'n_estimators': 20}
Best Accuracy: 0.6780538302277432


In [None]:
# Run every model on the MF dataset and print the results.
print_result(MF)

LogisticRegression Accuracy :0.6168831168831169

KNeighborsClassifier Accuracy :0.5584415584415584

DecisionTreeClassifier Accuracy :0.577922077922078

------- Voting -------
Voting Accuracy 0.5974025974025974

------- Random Forest -------
Best Parameters : {'max_depth': 20, 'min_samples_leaf': 8, 'min_samples_split': 4, 'n_estimators': 50}
Best Score : 0.6739961759082218
Fitting 2 folds for each of 16 candidates, totalling 32 fits

------- Gradiant Boosting -------
Best Parameters: {'learning_rate': 0.05, 'n_estimators': 10}
Best Accuracy: 0.6959847036328872


In [None]:
# Run every model on the FW dataset and print the results.
print_result(FW)

LogisticRegression Accuracy :0.6142857142857143

KNeighborsClassifier Accuracy :0.5285714285714286

DecisionTreeClassifier Accuracy :0.5857142857142857

------- Voting -------
Voting Accuracy 0.6142857142857143

------- Random Forest -------
Best Parameters : {'max_depth': 10, 'min_samples_leaf': 8, 'min_samples_split': 4, 'n_estimators': 150}
Best Score : 0.6741803278688525
Fitting 2 folds for each of 16 candidates, totalling 32 fits

------- Gradiant Boosting -------
Best Parameters: {'learning_rate': 0.1, 'n_estimators': 20}
Best Accuracy: 0.680327868852459
