In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from itertools import cycle
import warnings

warnings.filterwarnings('ignore')

---
### 데이터 불러오기

In [26]:
names = [
    "make", "address", "all", "3d", "our", "over", "remove", "internet", "order", "mail",
    "receive", "will", "people", "report", "addresses", "free", "business", "email", "you",
    "credit", "your", "font", "000", "money", "hp", "hpl", "george", "650", "lab", "labs",
    "telnet", "857", "data", "415", "85", "technology", "1999", "parts", "pm", "direct",
    "cs", "meeting", "original", "project", "re", "edu", "table", "conference", "freq_;",
    "freq_(", "freq_[", "freq_!", "freq_$", "freq_#", "capital_run_length_average",
    "capital_run_length_longest", "capital_run_length_total", "is_spam"]

spam_df = pd.read_csv("data/spambase.data",header=None, names=names)

spam_X = spam_df.drop('is_spam', axis=1)
spam_y = spam_df['is_spam']

# print(spam_y[spam_y==0].count() / 4601 * 100)
# print(spam_y[spam_y==1].count() / 4601 * 100)
names[41]

'meeting'

---
### 데이터 정규화

In [3]:
standard_sc = StandardScaler()
spam_X = standard_sc.fit_transform(spam_X)

---
### 데이터 분할

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(spam_X,
                                                    spam_y,
                                                    test_size=0.2,
                                                    stratify=spam_y,
                                                    random_state=0)

---
### 모델 정의

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import precision_score


lr_clf =  LogisticRegression(max_iter=1000)
dt_clf =  DecisionTreeClassifier(max_features='sqrt')
svm_clf =  SVC(max_iter=1000)
knn_clf =  KNeighborsClassifier()
xgb_clf =  XGBClassifier(n_estimators=500)
lgbm_clf =  LGBMClassifier(n_estimators=500)
hist_gb_clf =  HistGradientBoostingClassifier(max_bins=255,
                                              early_stopping=True,
                                              n_iter_no_change=5)

models_names = [lr_clf, dt_clf, svm_clf, knn_clf, xgb_clf, lgbm_clf, hist_gb_clf]

---
### 파라미터 목록

In [6]:
lr_params =  {
            'penalty' : ['l1', 'l2'],
            'C' : np.arange(200) / 10,
            'solver' : ['lbfgs', 'newton-cg', 'liblinear']
            }
dt_params =  {
            'max_depth' : range(1, 8),
            'min_samples_split' : range(2, 11),
            'min_samples_leaf' : range(2, 11)
            }
svm_params =  {
            'C' : np.arange(200) / 10,
            'kernel' : ['linear', 'poly', 'rbf', 'sigmoid']
            }
knn_params =  {
            'n_neighbors' : range(3, 12, 2),
            'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute']
            }
xgb_params =  {
            'learning_rate' : np.arange(101) / 100,
            'max_depth' : range(3, 9)
            }
lgbm_params =  {
            'learning_rate' : np.arange(101) / 100,
            'max_depth' : range(3, 9)
            }
hist_gb_params =  {
            'learning_rate' : np.arange(101) / 100,
            'max_depth' : range(3, 9),
            
            }

model_params = [lr_params, dt_params, svm_params, knn_params, xgb_params, lgbm_params, hist_gb_params]

---
### 평가지표

In [7]:
from sklearn.metrics import precision_score, recall_score

def evaluate_score(y_true, y_pred):
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    print('precision: {0:.6f}, recall: {1:.6f}'\
          .format(precision, recall))

---
### 하이퍼 파라미터 최적화

In [None]:
# from sklearn.model_selection import RandomizedSearchCV

# model_best_params = []
# model_best_estimators = []

# for idx, model_ in enumerate(models_names):
#     rd_search = RandomizedSearchCV(model_, model_params[idx], cv=5, n_iter=100, random_state=0)
#     rd_search.fit(X_train, y_train)
    
#     model_best_params.append(rd_search.best_params_)
#     model_best_estimators.append(rd_search.best_estimator_)

In [26]:
# from joblib import dump
# file_names = ['lr_clf.joblib', 'dt_clf.joblib', 'svm_clf.joblib', 'knn_clf.joblib', 'xgb_clf.joblib', 'lgbm_clf.joblib', 'hist_gb_clf.joblib']

# for idx, file_name in enumerate(file_names):
#     dump(model_best_estimators[idx], file_name)

In [27]:
# for idx, best_param in enumerate(model_best_params):
#     print(f'{file_names[idx]} | best parameters:\n{best_param}')

lr_clf.joblib | best parameters:
{'solver': 'liblinear', 'penalty': 'l1', 'C': np.float64(3.7)}

dt_clf.joblib | best parameters:
{'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 7}

svm_clf.joblib | best parameters:
{'kernel': 'rbf', 'C': np.float64(6.9)}

knn_clf.joblib | best parameters:
{'n_neighbors': 5, 'algorithm': 'auto'}

xgb_clf.joblib | best parameters:
{'max_depth': 4, 'learning_rate': np.float64(0.06)}

lgbm_clf.joblib | best parameters:
{'max_depth': 7, 'learning_rate': np.float64(0.05)}

hist_gb_clf.joblib | best parameters:
{'max_depth': 8, 'learning_rate': np.float64(0.38)}

---
### 파라미터 최적화 결과

In [28]:
# for idx, best_model in enumerate(model_best_estimators):
#     y_pred = best_model.predict(X_test)
#     print(f'{file_names[idx]}')
#     evaluate_score(y_test, y_pred)
#     print()

lr_clf.joblib
precision: 0.915068, recall: 0.920110

dt_clf.joblib
precision: 0.920732, recall: 0.831956

svm_clf.joblib
precision: 0.928375, recall: 0.928375

knn_clf.joblib
precision: 0.881844, recall: 0.842975

xgb_clf.joblib
precision: 0.940054, recall: 0.950413

lgbm_clf.joblib
precision: 0.945055, recall: 0.947658

hist_gb_clf.joblib
precision: 0.942466, recall: 0.947658

---
### 성능 좋은 3가지 모델로 앙상블

In [29]:
# from sklearn.ensemble import VotingClassifier

# voting_clf = VotingClassifier(estimators = [
#                                             ("svm_clf", model_best_estimators[2]),
#                                             ("xgb_clf", model_best_estimators[4]),
#                                             ("hist_gb_clf", model_best_estimators[6]),
#                                             ],
#                               voting = "hard")

# voting_clf.fit(X_train, y_train)
# y_pred = voting_clf.predict(X_test)
# evaluate_score(y_test, y_pred)

# dump(voting_clf, 'voting_clf.joblib')

precision: 0.945504, recall: 0.955923