# Classification

Classification models used in this notebook:

1. KNN (K-Nearest Neighbor)
2. Logistic Regression
3. Decision Tree

--- Ensemble models ---
1. Voting
2. Bagging (Random Forest)
3. Boosting
  - GBM
  - XGBoost
  - LightGBM


# Setting
- Mount Google Drive
- Load data & libraries


In [1]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime);
# the dataset path used later in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler 
from sklearn import tree
import warnings
warnings.filterwarnings(action='ignore')


In [33]:
# Directory on the mounted Drive that holds the project dataset.
data_path = '/content/drive/MyDrive/Colab Notebooks/캡스톤_4조/dataset/'

# Read the CSV and discard the 'Unnamed: 0' column
# (presumably a saved index column -- confirm against the CSV).
data = pd.read_csv(data_path + 'epl_grade_data.csv')
data = data.drop(columns='Unnamed: 0')

# Disabled alternatives kept for reference:
#   restrict the frame to a hand-picked column subset
# data = data[['시즌','클린시트','승점','전체 출장 시간','나이','90분당 실점','연봉 등급']]
#   additionally remove data['연봉(€)']

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 22 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   연봉(€)      3095 non-null   float64
 1   시즌         3095 non-null   int64  
 2   포지션        3095 non-null   int64  
 3   전체 출장 시간   3095 non-null   float64
 4   전체 골       3095 non-null   float64
 5   전체 어시스트    3095 non-null   float64
 6   클린시트       3095 non-null   float64
 7   실점         3095 non-null   float64
 8   경고         3095 non-null   float64
 9   퇴장         3095 non-null   float64
 10  90분당 골 관여  3095 non-null   float64
 11  90분당 어시스트  3095 non-null   float64
 12  90분당 골     3095 non-null   float64
 13  90분당 실점    3095 non-null   float64
 14  카드/90분     3095 non-null   float64
 15  공격수 순위     3095 non-null   float64
 16  미드필더 순위    3095 non-null   float64
 17  수비수 순위     3095 non-null   float64
 18  득점 순위      3095 non-null   float64
 19  나이         3095 non-null   int64  
 20  승점      

# Data split

In [34]:
def data_split(df, test_season=2022, target_col='가치 등급', season_col='시즌'):
  """Split ``df`` into train/test features and labels by season.

  Rows whose ``season_col`` equals ``test_season`` form the test set; every
  other row goes to the training set. The season column is removed from
  both feature frames and ``target_col`` is returned separately as the label.

  Args:
    df: DataFrame containing ``season_col`` and ``target_col``. Not modified.
    test_season: Season value held out for testing (default 2022).
    target_col: Name of the label column (default '가치 등급').
    season_col: Name of the column the split is based on (default '시즌').

  Returns:
    Tuple ``(train_X, train_y, test_X, test_y)``.

  Raises:
    KeyError: If ``season_col`` or ``target_col`` is not a column of ``df``.
  """
  # drop() returns new frames, so no column is deleted from a filtered slice.
  train_df = df[df[season_col] != test_season].drop(columns=season_col)
  test_df = df[df[season_col] == test_season].drop(columns=season_col)

  train_X, train_y = train_df.drop(columns=target_col), train_df[target_col]
  test_X, test_y = test_df.drop(columns=target_col), test_df[target_col]

  return train_X, train_y, test_X, test_y

# Baseline classifiers compared in the Modeling section.
lr_clf = LogisticRegression()
knn_clf = KNeighborsClassifier()
# Seeded so the fitted tree, and therefore its reported accuracy, is
# reproducible across kernel restarts.
dt_clf = tree.DecisionTreeClassifier(random_state=42)

# Season-based hold-out split (see data_split).
x_train, y_train, x_test, y_test = data_split(data)

# Fit the scaler on the training features only and reuse its statistics
# for the test features. Both names are rebound to the scaled arrays.
scaler = StandardScaler()

x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Modeling

In [35]:
# Fit every baseline classifier on the training split and print its
# accuracy on the test split, labelled with the estimator's class name.
models = [lr_clf, knn_clf, dt_clf]
for model in models:
    pred_y = model.fit(x_train, y_train).predict(x_test)
    model_name = type(model).__name__
    print(f"{model_name} Accuracy :{accuracy_score(y_test, pred_y)}")

LogisticRegression Accuracy :0.4326923076923077
KNeighborsClassifier Accuracy :0.4567307692307692
DecisionTreeClassifier Accuracy :0.4326923076923077


## Voting

In [36]:
# Voting
# Soft-voting ensemble built from the three baseline classifiers.
base_estimators = [('LR', lr_clf), ('KNN', knn_clf), ('DT', dt_clf)]
vo_clf = VotingClassifier(estimators=base_estimators, voting='soft')

# Train the ensemble and score it on the test split.
pred_y = vo_clf.fit(x_train, y_train).predict(x_test)
print('Voting Accuracy', accuracy_score(y_test, pred_y))

Voting Accuracy 0.46153846153846156


## Bagging (Random Forest)

In [37]:
# RandomForest

from sklearn.ensemble import RandomForestClassifier

# (number of trees, maximum tree depth) for each forest to compare.
rf_settings = [(20, 5), (100, 20)]

rf_predictions = []
for n_trees, depth in rf_settings:
    rf = RandomForestClassifier(n_estimators=n_trees,
                                max_depth=depth, random_state=0)
    rf.fit(x_train, y_train)
    rf_predictions.append(rf.predict(x_test))
    # The printed label keeps the "sample:" wording of the recorded output;
    # the number shown is n_estimators, i.e. the number of trees.
    print(f'Random Forest Accuracy (sample:{n_trees}, depth:{depth})',
          accuracy_score(y_test, rf_predictions[-1]))

# Keep the per-configuration prediction names available to later cells;
# `rf` is left bound to the last (100-tree) forest.
predict1, predict2 = rf_predictions

Random Forest Accuracy (sample:20, depth:5) 0.4423076923076923
Random Forest Accuracy (sample:100, depth:20) 0.4230769230769231


In [38]:
# Random Forest with default hyperparameters

# Seeded so the reported accuracy is reproducible across kernel restarts.
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(x_train, y_train)
y_pred = rf_clf.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy : {acc: .4f}")

Random Forest Accuracy :  0.3894


In [39]:
from sklearn.model_selection import GridSearchCV
import time

start_time = time.time()

# Candidate values for each random-forest hyperparameter; the grid search
# evaluates every combination.
params = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[5, 10, 20],
    min_samples_leaf=[8, 16, 32],
    min_samples_split=[4, 8, 16],
)

# 2-fold cross-validated search over the grid, on the training split only.
rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1)
grid_cv = GridSearchCV(estimator=rf_clf, param_grid=params, n_jobs=-1, cv=2)
grid_cv.fit(x_train, y_train)

# Report the winning combination, its cross-validation score, and wall time.
print(f"Best Parameters : {grid_cv.best_params_}")
print(f"Best Score : {grid_cv.best_score_}")
print(f"Running Time : {time.time()-start_time :.4f}")

Best Parameters : {'max_depth': 20, 'min_samples_leaf': 8, 'min_samples_split': 4, 'n_estimators': 150}
Best Score : 0.6782168381891374
Running Time : 65.3105


## Gradient Boosting

In [40]:
from sklearn.ensemble import GradientBoostingClassifier

In [41]:
start_time = time.time()

# Gradient boosting with default hyperparameters; seeded so the reported
# accuracy is reproducible across kernel restarts.
gb_clf = GradientBoostingClassifier(random_state=42)
gb_clf.fit(x_train, y_train)
gb_pred = gb_clf.predict(x_test)
gb_acc = accuracy_score(y_test, gb_pred)

print(f"Gradient Boosting Accuracy : {gb_acc :.4f}")
print(f"Running Time : {time.time()-start_time :.4f}")

Gradient Boosting Accuracy : 0.5192
Running Time : 2.9202


In [42]:
start_time = time.time()

# Grid of boosting rounds and learning rates to search over.
params = {
    'n_estimators':[10,15,20,25],
    'learning_rate':[0.001, 0.01, 0.05, 0.1]
}

# Seeded base estimator so the search result is reproducible.
gb_clf = GradientBoostingClassifier(random_state=42)
grid_cv = GridSearchCV(gb_clf, param_grid=params,
                      n_jobs=-1, cv=2, verbose=1)
grid_cv.fit(x_train, y_train)
# Test-set predictions from the refit best estimator; later cells read gb_pred.
gb_pred = grid_cv.predict(x_test)
print("Best Parameters:", grid_cv.best_params_)
# best_score_ is the mean cross-validated score on the training split,
# not accuracy on the test split.
print("Best Accuracy:", grid_cv.best_score_)
print(f"Running Time : {time.time()-start_time :.4f}")

Fitting 2 folds for each of 16 candidates, totalling 32 fits
Best Parameters: {'learning_rate': 0.1, 'n_estimators': 25}
Best Accuracy: 0.685148764788654
Running Time : 7.2964


In [43]:
from sklearn import metrics

# Rebind the notebook-level y_test to a NumPy array of the test labels.
y_test = np.array(y_test)
# Confusion matrix of the test labels against gb_pred (when cells run in
# order, gb_pred holds the grid-search predictions from the previous cell).
# scikit-learn convention: rows are true classes, columns are predicted.
confusion_matrix = metrics.confusion_matrix(y_test, gb_pred)
confusion_matrix

array([[77, 44, 15],
       [ 5, 24, 27],
       [ 0,  1, 15]])

In [44]:
start_time = time.time()

# Gradient boosting with hand-picked hyperparameters; seeded so the reported
# accuracy is reproducible across kernel restarts.
# NOTE(review): n_estimators=20 differs from the grid-search best recorded
# above (n_estimators=25). Confirm this value was not chosen by looking at
# test-set accuracy; consider GradientBoostingClassifier(**grid_cv.best_params_).
gb_clf = GradientBoostingClassifier(learning_rate=0.1, n_estimators=20,
                                    random_state=42)
gb_clf.fit(x_train, y_train)
gb_pred = gb_clf.predict(x_test)
gb_acc = accuracy_score(y_test, gb_pred)

print(f"Gradient Boosting Accuracy : {gb_acc :.4f}")
print(f"Running Time : {time.time()-start_time :.4f}")

Gradient Boosting Accuracy : 0.5817
Running Time : 0.5704
