In [1]:
import warnings

warnings.filterwarnings('ignore')

import os
import random
import numpy as np
import pandas as pd
import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest
from statsmodels.imputation.mice import MICEData
from sklearn.experimental import enable_iterative_imputer
# KNNImputer cannot impute categorical variables directly; they are converted to statistical (numeric) values instead
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [2]:
def seed_everything(seed: int = 42):
    """Seed the global Python and NumPy random number generators.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed`` and
            stored, as a string, in the ``PYTHONHASHSEED`` environment
            variable.

    Note:
        Writing ``PYTHONHASHSEED`` from inside a running interpreter does not
        change the hash seed of the current process; it is only inherited by
        processes started afterwards.
    """
    # Seed both global generators in the same order every time.
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
    os.environ.update(PYTHONHASHSEED=str(seed))


# Seed once with the default value for the rest of the notebook.
seed_everything()

In [3]:
# Load the survey table that every imputation variant below starts from.
df = pd.read_csv('./survey.csv')

# Columns imputed as numeric values (median-based in the simple strategies).
num_col = [
    'FEV1', 'FEV1FVC', 'age', 'BS6_3', 'BS6_2_1', 'BD1',
    '건강문해력', 'Total_slp_wk', 'EQ_5D', 'BE3_31', 'BE5_1', '질환유병기간'
]

# Every other column except 'BS3_1' is handled as categorical.
# NOTE(review): 'BS3_1' is presumably the target/label column — confirm.
not_categorical = set(num_col) | {'BS3_1'}
cat_col = [name for name in df.columns if name not in not_categorical]

In [4]:
# Missing-value count per column, restricted to columns that have at least
# one missing entry and ordered from most to least missing.
na_values = (
    df.isna()
    .sum()
    .loc[lambda counts: counts != 0]
    .sort_values(ascending=False)
)
# Display the eight columns with the most missing values.
na_values.head(8)

가래양상          82
BS6_3         73
BS6_2_1       73
FEV1          29
FEV1FVC       29
심한금단증상경험여부     4
DI1_2          3
BH1_1          2
dtype: int64

In [5]:
# Same ranking as the counts above, expressed as a percentage of all rows.
na_ratio = na_values.div(len(df)).mul(100)
na_ratio.head(8)

가래양상          23.428571
BS6_3         20.857143
BS6_2_1       20.857143
FEV1           8.285714
FEV1FVC        8.285714
심한금단증상경험여부     1.142857
DI1_2          0.857143
BH1_1          0.571429
dtype: float64

### Fill 0

In [6]:
# Baseline imputation: every missing value, in every column, becomes 0.
# Create the output folder first so the notebook also runs on a fresh
# checkout where './impute_set' does not exist yet (to_csv would fail).
os.makedirs('./impute_set', exist_ok=True)

# fillna returns a new frame, so the original `df` stays untouched.
imp0 = df.fillna(0)
imp0.to_csv('./impute_set/imp0.csv', index=False)

### Fill median(numeric) and mode(categorical)

In [7]:
# Global simple imputation: numeric columns receive their median, categorical
# columns their most frequent value; any other column is left unchanged.
imp1 = df.copy()
for name, column in df.items():
    if name in num_col:
        replacement = column.median()
    elif name in cat_col:
        # mode() may list several values; the first one is used.
        replacement = column.mode()[0]
    else:
        continue
    imp1[name] = column.fillna(replacement)
imp1.to_csv('./impute_set/imp1.csv', index=False)

### Fill median(numeric) and mode(categorical) per age decade (`cage` = age // 10)

In [8]:
# Age-stratified simple imputation: within each age decade, numeric columns
# get the decade's median and categorical columns the decade's first mode.
imp2 = df.copy()

# Decade bucket of each row (e.g. age 47 -> 4). Kept as a separate Series so
# no helper column has to be added to, and later dropped from, the frame.
cage = df['age'] // 10

# Build the buckets from the decades actually present in the data instead of
# a fixed range, so rows outside 40-99 are not silently skipped.
decade_masks = {decade: cage == decade for decade in sorted(cage.dropna().unique())}


def _typical_value(values, numeric):
    """Return the median (numeric) or first mode (categorical) of ``values``.

    Returns ``None`` when ``values`` contains no observed entries, so the
    caller can skip the fill instead of writing NaN.
    """
    if numeric:
        center = values.median()
        return None if pd.isna(center) else center
    modes = values.mode()
    return None if modes.empty else modes.iloc[0]


for col in imp2.columns:
    is_numeric = col in num_col
    if not is_numeric and col not in cat_col:
        continue  # columns outside both lists are not imputed

    missing = imp2[col].isna()
    if not missing.any():
        continue

    # Statistics are always taken from the observed values in `df`, never
    # from values that were filled in earlier.
    for mask in decade_masks.values():
        rows = mask & missing
        if not rows.any():
            continue
        fill = _typical_value(df.loc[mask, col], is_numeric)
        if fill is not None:
            imp2.loc[rows, col] = fill

    # Fallback: cells still missing (row without an age, or a decade with no
    # observed value for this column) receive the overall statistic.
    leftover = imp2[col].isna()
    if leftover.any():
        fill = _typical_value(df[col], is_numeric)
        if fill is not None:
            imp2.loc[leftover, col] = fill

imp2.to_csv('./impute_set/imp2.csv', index=False)

### MICE Imputation

In [9]:
# MICE imputation with statsmodels, run on a copy of the survey frame.
mice = MICEData(df.copy())

# Ten MICE iterations, requested in one call rather than a Python loop.
mice.update_all(n_iter=10)

# The imputer keeps the completed data set on its `data` attribute.
imp3 = mice.data
imp3.to_csv('./impute_set/imp3.csv', index=False)

### Iterative Imputer

In [10]:
# Iterative (model-based) imputation, run separately for the two column
# groups: a random-forest regressor for the numeric columns and a
# random-forest classifier for the categorical ones.
imp4 = df.copy()

num_iter = IterativeImputer(estimator=RandomForestRegressor(random_state=42), random_state=42)
cat_iter = IterativeImputer(estimator=RandomForestClassifier(random_state=42), random_state=42)

# fit_transform returns a bare array. Rebuild the frames on df's own index:
# the assignments below align on index labels, so a fresh 0..n-1 index would
# silently reintroduce NaN whenever `df` does not carry the default index.
imp4_num = pd.DataFrame(num_iter.fit_transform(df[num_col]), columns=num_col, index=df.index)
imp4_cat = pd.DataFrame(cat_iter.fit_transform(df[cat_col]), columns=cat_col, index=df.index)

# Write the imputed groups back; columns in neither list keep their values.
imp4[num_col] = imp4_num
imp4[cat_col] = imp4_cat

imp4.to_csv('./impute_set/imp4.csv', index=False)

### KNN Imputer

In [11]:
# KNN imputation over all columns at once, with neighbours weighted by
# distance rather than uniformly.
imputer = KNNImputer(weights='distance')
imp5 = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# The imputer produces numeric values for every column, so the categorical
# columns are rounded to whole numbers; the other columns are left as they are.
imp5 = imp5.round({name: 0 for name in imp5.columns if name in cat_col})

imp5.to_csv('./impute_set/imp5.csv', index=False)

### MissForest

In [12]:
# MissForest imputation over the whole frame with a fixed random state.
# NOTE(review): no categorical column indices are passed to the imputer, so
# categorical columns appear to be imputed as numbers and are only rounded
# afterwards — confirm this is intended.
imputer = MissForest(random_state=42)
imp6 = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Round the categorical columns to whole numbers; other columns are untouched.
categorical_decimals = {name: 0 for name in imp6.columns if name in cat_col}
imp6 = imp6.round(categorical_decimals)

imp6.to_csv('./impute_set/imp6.csv', index=False)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
