In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Data

- 데이터 출처
  - https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset/data

In [27]:
# Load the stroke-prediction dataset, drop the row identifier column and
# exclude the rows whose gender is 'Other'.
data = pd.read_csv("healthcare-dataset-stroke-data.csv")
data = data.drop(columns=["id"])
data = data[data["gender"] != "Other"]

# Keep the numerically coded frame in `data`; `df` is the recoded working copy.
df = data.copy()

# Recode the 0/1 indicator columns as "No"/"Yes" labels (1 -> "Yes", anything
# else -> "No"). Plain column assignment replaces the columns outright instead
# of writing strings into integer columns through .loc, and np.where avoids the
# element-wise DataFrame.applymap, which is deprecated in recent pandas.
binary_cols = ["hypertension", "heart_disease", "stroke"]
df[binary_cols] = np.where(data[binary_cols] == 1, "Yes", "No")
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,No,Yes,Yes,Private,Urban,228.69,36.6,formerly smoked,Yes
1,Female,61.0,No,No,Yes,Self-employed,Rural,202.21,,never smoked,Yes
2,Male,80.0,No,Yes,Yes,Private,Rural,105.92,32.5,never smoked,Yes
3,Female,49.0,No,No,Yes,Private,Urban,171.23,34.4,smokes,Yes
4,Female,79.0,Yes,No,Yes,Self-employed,Rural,174.12,24.0,never smoked,Yes


In [28]:
# Print the dtype and non-null count of every column in the recoded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5109 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5109 non-null   object 
 1   age                5109 non-null   float64
 2   hypertension       5109 non-null   object 
 3   heart_disease      5109 non-null   object 
 4   ever_married       5109 non-null   object 
 5   work_type          5109 non-null   object 
 6   Residence_type     5109 non-null   object 
 7   avg_glucose_level  5109 non-null   float64
 8   bmi                4908 non-null   float64
 9   smoking_status     5109 non-null   object 
 10  stroke             5109 non-null   object 
dtypes: float64(3), object(8)
memory usage: 479.0+ KB


## 결측치 처리

In [12]:
# Count the missing values in each column.
df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [15]:
# List the names of the columns that contain at least one missing value.
df.columns[df.isna().any()].tolist()

['bmi']

In [17]:
# Target: integer-encode the "stroke" labels.
y = LabelEncoder().fit_transform(df["stroke"])
# Features: every column except the target.
X = df.drop(columns=["stroke"])

In [18]:
# Preview the feature frame (target column removed, bmi still has gaps).
X.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,67.0,No,Yes,Yes,Private,Urban,228.69,36.6,formerly smoked
1,Female,61.0,No,No,Yes,Self-employed,Rural,202.21,,never smoked
2,Male,80.0,No,Yes,Yes,Private,Rural,105.92,32.5,never smoked
3,Female,49.0,No,No,Yes,Private,Urban,171.23,34.4,smokes
4,Female,79.0,Yes,No,Yes,Self-employed,Rural,174.12,24.0,never smoked


In [19]:
# Display the integer-encoded target array.
y

array([1, 1, 1, ..., 0, 0, 0])

In [20]:
# Split the feature columns by dtype: numeric vs. everything else.
X_num = X.select_dtypes(include="number")
X_cat = X.select_dtypes(exclude="number")

# Fill missing values group by group: column mean for the numeric features,
# most frequent value for the remaining ones.
# NOTE(review): each imputer is fit on the full feature frame before the
# train/test split is made, so test rows contribute to the fill values.
# Consider fitting the imputers on the training split only.
for block, strategy in ((X_num, "mean"), (X_cat, "most_frequent")):
    X[block.columns] = SimpleImputer(strategy=strategy).fit_transform(block)

In [21]:
# Preview the feature frame after imputation (the missing bmi is now filled).
X.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,67.0,No,Yes,Yes,Private,Urban,228.69,36.6,formerly smoked
1,Female,61.0,No,No,Yes,Self-employed,Rural,202.21,28.893237,never smoked
2,Male,80.0,No,Yes,Yes,Private,Rural,105.92,32.5,never smoked
3,Female,49.0,No,No,Yes,Private,Urban,171.23,34.4,smokes
4,Female,79.0,Yes,No,Yes,Self-employed,Rural,174.12,24.0,never smoked


- `ColumnTransformer()`를 이용하여 표준화 및 인코딩 처리

In [22]:
# Standardise the numeric features.
scaler = StandardScaler()

# One-hot encode the non-numeric features: drop the first category of each
# feature, ignore categories unseen during fit, and return a dense array.
onehot = OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False)

# Route each column group to its transformer; columns not listed in either
# group are passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        ("scaler", scaler, X_num.columns),
        ("onehot", onehot, X_cat.columns),
    ],
    remainder="passthrough",
    n_jobs=-1,
)

In [23]:
# Display the ColumnTransformer to inspect its configuration.
ct

- train, test set 분리

In [24]:
# Hold out 20% of the rows as a test set, stratified on the target, with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [25]:
# Fit the transformer on the training split and show the transformed training features.
ct.fit_transform(X_train)

array([[ 0.20566087, -0.8199733 ,  0.53847936, ...,  0.        ,
         1.        ,  0.        ],
       [-1.25490055,  0.35207477, -1.02064076, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.04659017,  0.09066209, -0.51811444, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.78103355, -0.61137349,  0.93792335, ...,  1.        ,
         0.        ,  0.        ],
       [-0.54674956, -0.71302171, -0.37637625, ...,  0.        ,
         0.        ,  0.        ],
       [-1.65323548, -0.33736527, -0.87890256, ...,  0.        ,
         0.        ,  0.        ]])

In [26]:
# Transform the test split with the already-fitted transformer (transform only, no refit).
ct.transform(X_test)

array([[ 0.86955242, -0.62065442,  0.75752929, ...,  0.        ,
         1.        ,  0.        ],
       [-0.01563631, -0.43415205,  0.56424994, ...,  0.        ,
         1.        ,  0.        ],
       [-0.90082505,  0.44974544, -0.02847341, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.29417974, -0.31305809, -1.04641134, ...,  0.        ,
         1.        ,  0.        ],
       [-1.25490055, -0.96670029,  0.43539704, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.33843918, -0.50862041,  0.65444697, ...,  1.        ,
         0.        ,  0.        ]])