# 데이터

- 데이터 출처
  - https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset/data

In [1]:
import pandas as pd

In [2]:
# Load the Kaggle stroke dataset, discard the row identifier, and leave out
# the rows whose gender is recorded as 'Other'.
data = (
    pd.read_csv("healthcare-dataset-stroke-data.csv")
    .drop(columns="id")
    .loc[lambda frame: frame["gender"] != "Other"]
)
data.head()

# Categorical variables stored as 0/1 integers count as numeric, so a later
# `.select_dtypes(exclude="number")` would not pick them up. Recode them to
# "Yes"/"No" strings on a copy and keep `data` untouched.
binary_columns = ["hypertension", "heart_disease", "stroke"]
df = data.copy()
# Assign whole columns instead of writing through `.loc[:, cols]`: the latter
# tries to store strings inside the existing int64 columns, which emits an
# incompatible-dtype FutureWarning on pandas >= 2.1. `Series.map` replaces
# `DataFrame.applymap`, which is deprecated since pandas 2.1.
df[binary_columns] = data[binary_columns].apply(
    lambda column: column.map(lambda value: "Yes" if value == 1 else "No")
)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,No,Yes,Yes,Private,Urban,228.69,36.6,formerly smoked,Yes
1,Female,61.0,No,No,Yes,Self-employed,Rural,202.21,,never smoked,Yes
2,Male,80.0,No,Yes,Yes,Private,Rural,105.92,32.5,never smoked,Yes
3,Female,49.0,No,No,Yes,Private,Urban,171.23,34.4,smokes,Yes
4,Female,79.0,Yes,No,Yes,Self-employed,Rural,174.12,24.0,never smoked,Yes


# 1. 인코딩

## A. **클래스 레이블(타깃 변수) 인코딩**

### 1) **`LabelEncoder()`**

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
# Encode the target labels as integers; the displayed result is 1 for the
# leading "Yes" rows and 0 for the trailing "No" rows.
label_encoder = LabelEncoder()
stroke_labels = df["stroke"].values
y = label_encoder.fit_transform(stroke_labels)
y

array([1, 1, 1, ..., 0, 0, 0])

:::{.callout-important}
- **참고**
- `LabelEncoder 클래스`
   - 타깃 레이블을 인코딩하기 위한 클래스. 입력 데이터로 1차원 배열을 기대함.
   - 여러 개의 특성(열)에 적용하려면 열마다 반복 작업을 해야 하므로, 범주형 데이터를 정수로 인코딩하는 `OrdinalEncoder`와 판다스 데이터프레임의 열마다 다른 변환을 적용하도록 도와주는 `ColumnTransformer`를 이용
   - `OrdinalEncoder`와 `ColumnTransformer`는 특성 변환에서 설명
:::

### 2) `numpy` 이용

In [5]:
import numpy as np

In [6]:
# Build a label -> integer lookup from the sorted unique target labels.
unique_labels = np.unique(df["stroke"])
Y_mapping = dict(zip(unique_labels, range(len(unique_labels))))
Y_mapping

{'No': 0, 'Yes': 1}

In [7]:
# Apply the label -> integer lookup to the target column and show it as an array.
df["stroke"].map(Y_mapping).to_numpy()

array([1, 1, 1, ..., 0, 0, 0])

## B. 원-핫 인코딩

### 1) **`OneHotEncoder-ColumnTransformer`**

In [8]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [9]:
# Feature matrix: every column except the target.
X = df.drop(columns="stroke")

In [10]:
# Columns that are not numeric are the ones to one-hot encode.
categorical_columns = X.select_dtypes(exclude="number").columns

# drop="first" removes the first category of every encoded column;
# sparse_output=False asks for a dense array.
onehot = OneHotEncoder(
    drop="first",
    handle_unknown="ignore",
    sparse_output=False,
)

# Encode only the categorical columns; the remaining (numeric) columns are
# passed through unchanged.
ct = ColumnTransformer(
    transformers=[("onehot", onehot, categorical_columns)],
    remainder="passthrough",
    n_jobs=-1,
)
ct

In [11]:
# Fit the column transformer on X and display the transformed array. In the
# output shown below, the one-hot columns come first and the passed-through
# numeric columns (age, avg_glucose_level, bmi) come last; missing bmi values
# remain NaN.
ct.fit_transform(X)

array([[  1.  ,   0.  ,   1.  , ...,  67.  , 228.69,  36.6 ],
       [  0.  ,   0.  ,   0.  , ...,  61.  , 202.21,    nan],
       [  1.  ,   0.  ,   1.  , ...,  80.  , 105.92,  32.5 ],
       ...,
       [  0.  ,   0.  ,   0.  , ...,  35.  ,  82.99,  30.6 ],
       [  1.  ,   0.  ,   0.  , ...,  51.  , 166.29,  25.6 ],
       [  0.  ,   0.  ,   0.  , ...,  44.  ,  85.28,  26.2 ]])

In [13]:
# Look up the fitted encoder by the name it was registered under in `ct` and
# read the categories it learned for each encoded column.
fitted_onehot = ct.named_transformers_["onehot"]
categories = fitted_onehot.categories_
categories

[array(['Female', 'Male'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['Govt_job', 'Never_worked', 'Private', 'Self-employed', 'children'],
       dtype=object),
 array(['Rural', 'Urban'], dtype=object),
 array(['Unknown', 'formerly smoked', 'never smoked', 'smokes'],
       dtype=object)]

In [14]:
# Report which category of each encoded column was dropped as the reference
# level. The position is read from the fitted encoder's `drop_idx_` instead of
# assuming index 0, so the report stays correct if the `drop` strategy changes.
fitted_onehot = ct.named_transformers_["onehot"]
encoded_columns = X.select_dtypes(exclude="number").columns
drop_indices = fitted_onehot.drop_idx_
if drop_indices is None:  # the encoder dropped nothing at all
    drop_indices = [None] * len(encoded_columns)
dropped_categories = {
    feature: levels[drop_index]
    for feature, levels, drop_index in zip(
        encoded_columns, fitted_onehot.categories_, drop_indices
    )
    if drop_index is not None  # None: this column kept every category
}
print("삭제된 범주와 변수명:", dropped_categories)

삭제된 범주와 변수명: {'gender': 'Female', 'hypertension': 'No', 'heart_disease': 'No', 'ever_married': 'No', 'work_type': 'Govt_job', 'Residence_type': 'Rural', 'smoking_status': 'Unknown'}


### 2) `pd.get_dummies(X, drop_first=True)`

In [15]:
# One-hot encode every non-numeric column, dropping the first level of each.
# dtype=int pins the indicator columns to 0/1: pandas >= 2.0 returns booleans
# (True/False) by default, which would not match the 0/1 table shown below.
pd.get_dummies(X, drop_first=True, dtype=int)

Unnamed: 0,age,avg_glucose_level,bmi,gender_Male,hypertension_Yes,heart_disease_Yes,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,228.69,36.6,1,0,1,1,0,1,0,0,1,1,0,0
1,61.0,202.21,,0,0,0,1,0,0,1,0,0,0,1,0
2,80.0,105.92,32.5,1,0,1,1,0,1,0,0,0,0,1,0
3,49.0,171.23,34.4,0,0,0,1,0,1,0,0,1,0,0,1
4,79.0,174.12,24.0,0,1,0,1,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,80.0,83.75,,0,1,0,1,0,1,0,0,1,0,1,0
5106,81.0,125.20,40.0,0,0,0,1,0,0,1,0,1,0,1,0
5107,35.0,82.99,30.6,0,0,0,1,0,0,1,0,0,0,1,0
5108,51.0,166.29,25.6,1,0,0,1,0,1,0,0,0,1,0,0
