# 데이터

- 데이터 출처
  - https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset/data

In [1]:
import pandas as pd

In [13]:
# Load the stroke-prediction CSV, then drop the "id" column and the rows
# whose gender is "Other".
data = pd.read_csv("healthcare-dataset-stroke-data.csv")
data = data.drop(columns=["id"])
data = data[data["gender"] != "Other"]

# Work on a copy so `data` keeps the original 0/1 coding of the flag columns.
df = data.copy()
# Recode the 0/1 flag columns as "Yes"/"No" labels (anything other than 1
# becomes "No", as before).  Each column is replaced outright instead of being
# written through `df.loc[:, cols] = ...`, because storing strings in place in
# an integer column raises pandas' incompatible-dtype FutureWarning, and
# `DataFrame.applymap` is deprecated since pandas 2.1.
for flag_col in ["hypertension", "heart_disease", "stroke"]:
    df[flag_col] = data[flag_col].eq(1).map({True: "Yes", False: "No"})
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,No,Yes,Yes,Private,Urban,228.69,36.6,formerly smoked,Yes
1,Female,61.0,No,No,Yes,Self-employed,Rural,202.21,,never smoked,Yes
2,Male,80.0,No,Yes,Yes,Private,Rural,105.92,32.5,never smoked,Yes
3,Female,49.0,No,No,Yes,Private,Urban,171.23,34.4,smokes,Yes
4,Female,79.0,Yes,No,Yes,Self-employed,Rural,174.12,24.0,never smoked,Yes


# 1. 결측치 처리

## A. 결측치 확인

In [3]:
# Names of the columns that contain at least one missing value.
df.columns[df.isna().any()].tolist()

['bmi']

In [4]:
# Count of missing values in every column.
df.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

## B. 결측치 대체

### 1) `SimpleImputer()`
- 한 특성(열)의 통계 값을 이용하여 결측치 채움.

`-` 연속형 변수
- `strategy` = `"mean"`, `"median"`, `"most_frequent"`, `"constant"` (`fill_value`로 채울 값을 지정)

`-` 범주형 변수
- `strategy` = `"most_frequent"`, `"constant"` (`fill_value`로 채울 값을 지정)

In [5]:
from sklearn.impute import SimpleImputer

In [6]:
def imputed_missing(df):
    """Return a copy of ``df`` whose missing values are filled per column.

    Numeric columns are imputed with ``SimpleImputer()`` using its default
    strategy; every other column is imputed with ``strategy='most_frequent'``.

    Args:
        df: DataFrame to impute. It is not modified.

    Returns:
        A new DataFrame with the same columns as ``df``. Numeric columns
        receive the imputer's array output, so integer columns come back as
        floats.
    """
    df_imputed = df.copy()
    # Nothing to impute, and SimpleImputer refuses input without rows/columns.
    if df.empty:
        return df_imputed

    df_num = df.select_dtypes(include="number")
    df_cat = df.select_dtypes(exclude="number")
    # Skip a group that has no columns: fitting SimpleImputer on a
    # zero-column frame raises, which used to break all-numeric or
    # all-categorical inputs.
    if len(df_num.columns) > 0:
        df_imputed[df_num.columns] = SimpleImputer().fit_transform(df_num)
    if len(df_cat.columns) > 0:
        df_imputed[df_cat.columns] = SimpleImputer(strategy='most_frequent').fit_transform(df_cat)
    return df_imputed

In [7]:
# Impute df with the helper; as the cell's last expression, the returned
# DataFrame is displayed.
imputed_missing(df)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046.0,Male,67.0,No,Yes,Yes,Private,Urban,228.69,36.600000,formerly smoked,Yes
1,51676.0,Female,61.0,No,No,Yes,Self-employed,Rural,202.21,28.893237,never smoked,Yes
2,31112.0,Male,80.0,No,Yes,Yes,Private,Rural,105.92,32.500000,never smoked,Yes
3,60182.0,Female,49.0,No,No,Yes,Private,Urban,171.23,34.400000,smokes,Yes
4,1665.0,Female,79.0,Yes,No,Yes,Self-employed,Rural,174.12,24.000000,never smoked,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234.0,Female,80.0,Yes,No,Yes,Private,Urban,83.75,28.893237,never smoked,No
5106,44873.0,Female,81.0,No,No,Yes,Self-employed,Urban,125.20,40.000000,never smoked,No
5107,19723.0,Female,35.0,No,No,Yes,Self-employed,Rural,82.99,30.600000,never smoked,No
5108,37544.0,Male,51.0,No,No,Yes,Private,Rural,166.29,25.600000,formerly smoked,No


### 2) `.fillna()`

In [8]:
# Columns that contain at least one missing value.
col = df.columns[df.isna().any()].tolist()

# Fill those columns with their own mean on a copy, leaving df untouched.
df_a = df.copy()
df_a[col] = df[col].fillna(df[col].mean())
df_a.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,No,Yes,Yes,Private,Urban,228.69,36.6,formerly smoked,Yes
1,51676,Female,61.0,No,No,Yes,Self-employed,Rural,202.21,28.893237,never smoked,Yes
2,31112,Male,80.0,No,Yes,Yes,Private,Rural,105.92,32.5,never smoked,Yes
3,60182,Female,49.0,No,No,Yes,Private,Urban,171.23,34.4,smokes,Yes
4,1665,Female,79.0,Yes,No,Yes,Self-employed,Rural,174.12,24.0,never smoked,Yes


### 3) `IterativeImputer()`
- 다른 특성(열)을 사용하여 결측치 예측.

- `initial_strategy 매개 변수`: `mean`, `median`, `most_frequent`, `constant`
  - 지정된 방식으로 결측치 초기화. 
  - 결측치가 있는 한 특성을 다른 특성들을 사용한 모델을 훈련하여 예측
- `imputation_order 매개 변수`:
  - `ascending`: 결측치가 가장 적은 특성부터
  - `descending`: 결측치가 가장 많은 특성부터
  - `roman`: 왼쪽에서 오른쪽으로
  - `arabic`: 오른쪽에서 왼쪽으로
  - `random`: 무작위
- `max_iter 매개 변수`: 지정된 횟수에 도달하면 종료. 기본값: 10
- `tol`: 각 반복 단계에서 이전 단계와의 절댓값 차이 중 가장 큰 값이, 누락된 값을 제외한 가장 큰 절댓값에 지정된 값을 곱한 것보다 작으면 종료. 기본값: 0.001
- `n_nearest_features`: 예측에 사용할 특성 개수. 상관 계수가 높은 특성을 우선하여 랜덤하게 선택. 기본값: None
- `estimator`: 예측에 사용되는 모델. 기본값: `BayesianRidge`

- `enable_iterative_imputer` 모듈: `IterativeImputer` 클래스가 아직 실험적이기 때문에, `sklearn.experimental`에서 이 모듈을 먼저 import 해야 함

In [9]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [10]:
# Impute the numeric columns with an IterativeImputer left at its defaults.
itrimr = IterativeImputer()
itrimr.fit_transform(
    df.select_dtypes(include=["number"]),
)

array([[9.04600000e+03, 6.70000000e+01, 2.28690000e+02, 3.66000000e+01],
       [5.16760000e+04, 6.10000000e+01, 2.02210000e+02, 3.25949405e+01],
       [3.11120000e+04, 8.00000000e+01, 1.05920000e+02, 3.25000000e+01],
       ...,
       [1.97230000e+04, 3.50000000e+01, 8.29900000e+01, 3.06000000e+01],
       [3.75440000e+04, 5.10000000e+01, 1.66290000e+02, 2.56000000e+01],
       [4.46790000e+04, 4.40000000e+01, 8.52800000e+01, 2.62000000e+01]])

### 4) `KNNImputer()`
- k-최근접 이웃 방법을 사용하여 결측치를 채움.

- `n_neighbors 매개변수`: 최근접 이웃 개수. 기본값 5
  - 사용할 수 있는 이웃 수가 `n_neighbors`보다 적고 훈련 세트까지 정의된 거리가 없으면, `SimpleImputer()`처럼 해당 특성의 훈련 세트 평균으로 채움

In [11]:
from sklearn.impute import KNNImputer

In [12]:
# Impute the numeric columns from their 5 nearest neighbours.
# NOTE(review): the original remark says the mean is used when fewer samples
# than n_neighbors are available -- confirm against the KNNImputer docs.
kimr = KNNImputer(n_neighbors=5)
kimr.fit_transform(
    df.select_dtypes(include=["number"]),
)

array([[9.0460e+03, 6.7000e+01, 2.2869e+02, 3.6600e+01],
       [5.1676e+04, 6.1000e+01, 2.0221e+02, 2.8080e+01],
       [3.1112e+04, 8.0000e+01, 1.0592e+02, 3.2500e+01],
       ...,
       [1.9723e+04, 3.5000e+01, 8.2990e+01, 3.0600e+01],
       [3.7544e+04, 5.1000e+01, 1.6629e+02, 2.5600e+01],
       [4.4679e+04, 4.4000e+01, 8.5280e+01, 2.6200e+01]])