# Data

- 데이터 출처
  - https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset/data

In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
data = pd.read_csv("healthcare-dataset-stroke-data.csv")
data = data.drop(['id'], axis=1)
data = data[data['gender'] != 'Other']
data.head()

df = data.copy()
df.loc[:, ["hypertension", "heart_disease", "stroke"]] = data.loc[:, ["hypertension", "heart_disease", "stroke"]].applymap(lambda x: "Yes" if x == 1 else "No")
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,No,Yes,Yes,Private,Urban,228.69,36.6,formerly smoked,Yes
1,Female,61.0,No,No,Yes,Self-employed,Rural,202.21,,never smoked,Yes
2,Male,80.0,No,Yes,Yes,Private,Rural,105.92,32.5,never smoked,Yes
3,Female,49.0,No,No,Yes,Private,Urban,171.23,34.4,smokes,Yes
4,Female,79.0,Yes,No,Yes,Self-employed,Rural,174.12,24.0,never smoked,Yes


## A. 전처리

In [3]:
from sklearn.preprocessing import LabelEncoder

X = df.drop(["stroke"], axis=1)
y = LabelEncoder().fit_transform(df['stroke'])

In [4]:
from sklearn.impute import SimpleImputer

X_num = X.select_dtypes(include = 'number')
X_cat = X.select_dtypes(exclude = 'number')

X[X_num.columns] = SimpleImputer(strategy="mean").fit_transform(X_num)
X[X_cat.columns] = SimpleImputer(strategy="most_frequent").fit_transform(X_cat)

In [5]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

scaler = StandardScaler()

onehot = OneHotEncoder(drop = 'first', handle_unknown='ignore', sparse_output=False)

ct = ColumnTransformer([('scaler', scaler, X_num.columns),
                        ('onehot', onehot, X_cat.columns)], 
                       remainder='passthrough', n_jobs=-1)

ct

# 1. 하이퍼 파라미터 튜닝

## A. `GridSearchCV()`

In [6]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_validate, cross_val_predict

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

predictr = LogisticRegression(random_state=42, max_iter = 1000)

pipe = Pipeline([('ct', ct), ("model", predictr)]);pipe

In [9]:
param_grid = {
    'model__C': np.logspace(-4, 4, 10)
}

In [10]:
from sklearn.model_selection import GridSearchCV

start_time = time.time()

predictr_gs = GridSearchCV(pipe, 
                           param_grid, 
                           cv=5, 
                           scoring='accuracy',
                           refit = True,
                           n_jobs = -1)

predictr_gs.fit(X_train, y_train)

pipe.set_params(**{key: value for key, value in predictr_gs.best_params_.items()}).fit(X_train, y_train)

end_time = time.time()
print("코드 실행 시간: {:.1f} 초".format(end_time - start_time))

코드 실행 시간: 5.0 초


In [11]:
start_time = time.time()

# estimator = pipe에 predictr_gs 수행 시 반올림 하기 전 파라미터로 수행. (시간이 더 소요)
scores = cross_validate(estimator = pipe,
                        X=X_train,
                        y=y_train,
                        scoring = ['accuracy', 'roc_auc'], # 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'
                        cv=5,
                        n_jobs = -1)

accuracy_score_gs = np.mean(scores['test_accuracy']).round(4)

end_time = time.time()
print("코드 실행 시간: {:.1f} 초".format(end_time - start_time))

코드 실행 시간: 0.8 초


In [12]:
print("Best Parameters: ", predictr_gs.best_params_)
print("Best Parameters after round: ", pipe.named_steps["model"].set_params)
print("Best Score in CV: ", predictr_gs.best_score_)
print("mean Score after round:", accuracy_score_gs)
print("Best test Score: ", pipe.named_steps["model"].score(ct.transform(X_test), y_test))

Best Parameters:  {'model__C': 0.0001}
Best Parameters after round:  <bound method BaseEstimator.set_params of LogisticRegression(C=0.0001, max_iter=1000, random_state=42)>
Best Score in CV:  0.9513091308472467
mean Score after round: 0.9513
Best test Score:  0.9510763209393346


## B. `RandomizedSearchCV()`

In [13]:
predictr = LogisticRegression(random_state=42, max_iter = 1000)

pipe = Pipeline([('ct', ct), ("model", predictr)]);pipe

In [14]:
import scipy.stats

param_dist = {
    'model__C': scipy.stats.loguniform(1e-4, 1e4)
}

In [15]:
from sklearn.model_selection import RandomizedSearchCV

start_time = time.time()

predictr_rs = RandomizedSearchCV(pipe, 
                                 param_dist,
                                 n_iter = 50, # default
                                 cv=5, 
                                 scoring='accuracy',
                                 random_state=42, 
                                 refit = True,
                                 n_jobs = -1)

predictr_rs.fit(X_train, y_train)
pipe.set_params(**{key: round(value, 2) for key, value in predictr_rs.best_params_.items()}).fit(X_train, y_train)

end_time = time.time()
print("코드 실행 시간: {:.1f} 초".format(end_time - start_time))

코드 실행 시간: 4.9 초


In [16]:
start_time = time.time()

scores = cross_validate(estimator = pipe,
                        X=X_train,
                        y=y_train,
                        scoring = ['accuracy', 'roc_auc'],
                        cv=5,
                        n_jobs = -1)

accuracy_score_rs = np.mean(scores['test_accuracy']).round(4)
roc_auc_score_rs = np.mean(scores['test_roc_auc']).round(4)

end_time = time.time()
print("코드 실행 시간: {:.1f} 초".format(end_time - start_time))

코드 실행 시간: 0.2 초


In [17]:
print("Best Parameters: ", predictr_rs.best_params_)
print("Best Parameters after round: ", pipe.named_steps["model"].set_params)
print("Best Score in CV: ", predictr_rs.best_score_)
print("mean Score after round:", accuracy_score_rs)
print("Best test Score: ", pipe.named_steps["model"].score(ct.transform(X_test), y_test))

Best Parameters:  {'model__C': 0.09915644566638401}
Best Parameters after round:  <bound method BaseEstimator.set_params of LogisticRegression(C=0.1, max_iter=1000, random_state=42)>
Best Score in CV:  0.9513091308472467
mean Score after round: 0.9513
Best test Score:  0.9510763209393346


## C. `BayesSearchCV()`

In [18]:
predictr = LogisticRegression(random_state=42, max_iter = 1000)

pipe = Pipeline([('ct', ct), ("model", predictr)]);pipe

In [19]:
search_spaces = {
    'model__C': (0.001, 1000, 'log-uniform') # scipy.stats.loguniform(1e-4, 1e4)
}

In [20]:
from skopt import BayesSearchCV

start_time = time.time()

predictr_bs = BayesSearchCV(pipe, 
                    search_spaces, 
                    n_iter = 50, # defualt
                    cv=5, 
                    scoring='accuracy', 
                    random_state=42, 
                    n_jobs = -1)

predictr_bs.fit(X_train, y_train)
pipe.set_params(**{key: round(value, 2) for key, value in predictr_bs.best_params_.items()}).fit(X_train, y_train)

end_time = time.time()
print("코드 실행 시간: {:.1f} 초".format(end_time - start_time))

코드 실행 시간: 68.4 초


In [21]:
start_time = time.time()

scores = cross_validate(estimator = pipe,
                        X=X_train,
                        y=y_train,
                        scoring = ['accuracy', 'roc_auc'],
                        cv=5,
                        n_jobs = -1)

accuracy_score_bs = np.mean(scores['test_accuracy']).round(4)
roc_auc_score_bs = np.mean(scores['test_roc_auc']).round(4)

end_time = time.time()
print("코드 실행 시간: {:.1f} 초".format(end_time - start_time))

코드 실행 시간: 0.2 초


In [22]:
print("Best Parameters: ", predictr_bs.best_params_)
print("Best Parameters after round: ", pipe.named_steps["model"].set_params)
print("Best Score in CV: ", predictr_bs.best_score_)
print("mean Score after round:", accuracy_score_bs)
print("Best test Score: ", pipe.named_steps["model"].score(ct.transform(X_test), y_test))

Best Parameters:  OrderedDict([('model__C', 0.28881766539144715)])
Best Parameters after round:  <bound method BaseEstimator.set_params of LogisticRegression(C=0.29, max_iter=1000, random_state=42)>
Best Score in CV:  0.9513091308472467
mean Score after round: 0.9513
Best test Score:  0.952054794520548
