# Data

- 데이터 출처
  - https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset/data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the stroke dataset (the CSV is expected in the working directory),
# drop the `id` column and remove the rows whose gender is 'Other'.
data = pd.read_csv("healthcare-dataset-stroke-data.csv")
data = data.drop(columns=['id'])
data = data[data['gender'] != 'Other']

# Human-readable copy: the 0/1 indicator columns become "Yes"/"No" labels.
# The columns are replaced outright (df[cols] = ...) rather than written
# through df.loc[:, cols], because putting strings into int64 columns in place
# is an incompatible-dtype setitem. np.where also avoids the deprecated
# element-wise DataFrame.applymap.
binary_cols = ["hypertension", "heart_disease", "stroke"]
df = data.copy()
df[binary_cols] = np.where(data[binary_cols].eq(1), "Yes", "No")
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,No,Yes,Yes,Private,Urban,228.69,36.6,formerly smoked,Yes
1,Female,61.0,No,No,Yes,Self-employed,Rural,202.21,,never smoked,Yes
2,Male,80.0,No,Yes,Yes,Private,Rural,105.92,32.5,never smoked,Yes
3,Female,49.0,No,No,Yes,Private,Urban,171.23,34.4,smokes,Yes
4,Female,79.0,Yes,No,Yes,Self-employed,Rural,174.12,24.0,never smoked,Yes


## A. 전처리

- 결측치 처리, 표준화, 인코딩

In [3]:
from sklearn.preprocessing import LabelEncoder

# Feature matrix: every column except the target.
X = df.drop(columns=["stroke"])

# Target vector: the "stroke" labels encoded as integers.
stroke_encoder = LabelEncoder()
y = stroke_encoder.fit_transform(df['stroke'])

In [4]:
from sklearn.impute import SimpleImputer

# Partition the feature columns by dtype; both views are reused further down
# when the ColumnTransformer is assembled.
X_num = X.select_dtypes(include='number')
X_cat = X.select_dtypes(exclude='number')

# NOTE(review): both imputers are fitted on the whole of X, i.e. before the
# train/test split performed later in this notebook, so the fill values are
# computed from rows that later end up in the test set / validation folds.
mean_imputer = SimpleImputer(strategy="mean")
mode_imputer = SimpleImputer(strategy="most_frequent")

# Numeric gaps -> column mean, non-numeric gaps -> most frequent value.
X[X_num.columns] = mean_imputer.fit_transform(X_num)
X[X_cat.columns] = mode_imputer.fit_transform(X_cat)

In [5]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Per-dtype preprocessing steps: standardize the numeric columns and one-hot
# encode the non-numeric ones (first level dropped, dense output).
scaler = StandardScaler()
onehot = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)

# (name, transformer, columns) triples consumed by the ColumnTransformer.
transformers = [
    ('scaler', scaler, X_num.columns),
    ('onehot', onehot, X_cat.columns),
]

# Columns not listed above are passed through unchanged.
ct = ColumnTransformer(transformers, remainder='passthrough', n_jobs=-1)

ct

# 1. 교차검증

In [6]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_validate, cross_val_predict

In [7]:
# Hold out 20% of the rows as a test set; `stratify=y` keeps the class ratio
# of `y` the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Classifier: logistic regression with only the random seed set explicitly.
predictr = LogisticRegression(random_state=42)

# Full modelling pipeline: column-wise preprocessing followed by the classifier.
steps = [('ct', ct), ("model", predictr)]
pipe = Pipeline(steps)
pipe

:::{.callout-important}   
- **참고** 
- 교차 검증에서는 `LogisticRegression()`을 `random_state=42`만 지정하고 나머지는 기본값(default)으로 사용함
:::

## A. `StratifiedKFold()`

In [9]:
# Hand-rolled 5-fold stratified cross-validation on the training split:
# for every fold the preprocessing (`ct`) and the classifier (`predictr`) are
# re-fitted on the fold's training rows and scored on its held-out rows.
splitter = StratifiedKFold(n_splits=5)
kfold = splitter.split(X_train, y_train)

scores = []

for k, (train, test) in enumerate(kfold):
    # `train` / `test` are positional indices, hence .iloc on the frame and
    # plain indexing on the label array.
    X_fit, y_fit = X_train.iloc[train], y_train[train]
    X_val, y_val = X_train.iloc[test], y_train[test]

    predictr.fit(ct.fit_transform(X_fit), y_fit)
    score = predictr.score(ct.transform(X_val), y_val)
    scores.append(score)

    # NOTE(review): the class counts printed below are heavily skewed towards
    # class 0, so accuracy on its own stays close to a majority-class baseline.
    print(f'폴드: {k+1:02d}, '
          f'클래스 분포: {np.bincount(y_fit)}, '
          f'정확도: {score:.4f}')

# Summary over the folds: mean accuracy and its standard deviation.
mean_acc = np.mean(scores)
std_acc = np.std(scores)
print(f'\nCV 정확도: {mean_acc:.4f} +/- {std_acc:.4f}')

폴드: 01, 클래스 분포: [3110  159], 정확도: 0.9511
폴드: 02, 클래스 분포: [3110  159], 정확도: 0.9511
폴드: 03, 클래스 분포: [3110  160], 정확도: 0.9523
폴드: 04, 클래스 분포: [3111  159], 정확도: 0.9510
폴드: 05, 클래스 분포: [3111  159], 정확도: 0.9510

CV 정확도: 0.9513 +/- 0.0005


- `cross_validate()`

In [10]:
import time

# Time the run with perf_counter(): it is a monotonic, high-resolution clock
# meant for measuring elapsed intervals, whereas time.time() follows the
# system wall clock and can jump if that clock is adjusted.
start_time = time.perf_counter()

# Original author's note (translated): when `pipe` is run with `predictr_gs`,
# it uses the parameters before rounding and takes longer.
# NOTE(review): `predictr_gs` is not defined in the visible part of the notebook.
scores = cross_validate(estimator=pipe,
                        X=X_train,
                        y=y_train,
                        scoring=['accuracy', 'roc_auc'],  # 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'
                        cv=StratifiedKFold(n_splits=5),
                        n_jobs=-1)

# Mean test accuracy over the folds, rounded to 4 decimals. It is shown in the
# next cell; a bare expression here would not render because it is not the
# cell's last statement.
accuracy_score_gs = np.mean(scores['test_accuracy']).round(4)

end_time = time.perf_counter()
print("코드 실행 시간: {:.1f} 초".format(end_time - start_time))

코드 실행 시간: 0.8 초


In [11]:
# Show the mean cross-validated test accuracy (rounded to 4 decimals) computed above.
accuracy_score_gs

0.9513