In [62]:
import os
import random
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, f1_score
from xgboost import XGBClassifier

def seed_everything(seed:int=42):
    """Make runs repeatable by seeding the hash seed variable, `random` and NumPy.

    Parameters
    ----------
    seed : int, default 42
        Value written to the ``PYTHONHASHSEED`` environment variable (as a
        string) and passed to ``random.seed`` and ``np.random.seed``.
    """
    # Seed both global generators with the same value.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ.update({'PYTHONHASHSEED': str(seed)})

# Suppress every warning raised by the libraries used below, then fix the
# random seeds (default seed value, spelled out here for readers).
warnings.filterwarnings(action='ignore')
seed_everything(seed=42)

In [13]:
# Load the raw table from the CSV file, path relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='./dataset/ff.csv')

def make_targets(targets, threshold=5):
    """Binarise a single raw target value.

    Parameters
    ----------
    targets : number
        Raw value of the target variable for one row.
    threshold : number, default 5
        Cut-off; values greater than or equal to it become the positive class.

    Returns
    -------
    int
        1 if ``targets >= threshold`` else 0. NaN compares as False and
        therefore maps to 0.
    """
    return 1 if targets >= threshold else 0

# Binarise both target columns with make_targets.
# Series.map touches only the one column, whereas df.apply(..., axis=1)
# materialises every full row just to read a single cell from it.
for target_col in ('BE3_31', 'BE5_1'):
    df[target_col] = df[target_col].map(make_targets)

df.head(3)

Unnamed: 0,HE_FEV1FVC,age,Total_slp_wk,sex,occp,EC1_1,cfam,marri_1,BH9_11,HE_DM,...,EC_pedu_1,EC_pedu_2,BS5_1,BD7_4,eq_5d,sm_present,mh_stress,cage,BE3_31,BE5_1
0,0.699366,74.0,8.0,1.0,0.0,0.0,2.0,1.0,0.0,1.0,...,,,비흡연자,없었음,0.72,1,1.0,7,1,0
1,0.69863,80.0,6.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,...,,,비흡연자,없었음,0.723,0,0.0,7,0,0
2,0.698466,49.0,8.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,...,무학,무학,언젠가금연,1년동안있었음,1.0,0,0.0,4,0,0


In [16]:
# Percentage of missing values per column, most incomplete columns first.
missing_counts = df.isna().sum().sort_values(ascending=False)
missing_counts / len(df) * 100

HE_cough1       23.867550
HE_sput1        23.867550
EC_pedu_1       23.682119
EC_pedu_2       20.503311
HE_DM           16.529801
BD7_4           15.072848
BD7_5           13.086093
BH9_11          11.735099
BP6_10          10.728477
BP5             10.622517
BH1             10.490066
BS5_1            4.582781
occp             1.748344
EC1_1            1.589404
edu              1.589404
LQ4_00           1.509934
LQ_4EQL          1.483444
LQ1_sb           1.456954
BH2_61           1.456954
LQ1_mn           1.456954
LQ_5EQL          1.456954
LQ_3EQL          1.456954
LQ_1EQL          1.456954
LQ_2EQL          1.456954
MO1_wk           1.430464
D_1_1            1.377483
BS6_3            0.158940
BS6_2_1          0.158940
marri_1          0.132450
BP1              0.105960
BS3_1            0.105960
mh_stress        0.105960
BO1_1            0.079470
cfam             0.079470
BO2_1            0.052980
BD1              0.026490
sm_present       0.000000
cage             0.000000
BE3_31      

In [17]:
# Year 1
NUMERIC = ['HE_FEV1FVC', 'age', 'Total_slp_wk']
CATEGORICAL = [
    'sex', 'occp', 'EC1_1', 'cfam', 'marri_1', 'BH9_11',
    'HE_DM', 'DC6_dg', 'DF2_dg', 'HE_HPdg', 'BP5',
]
ONE_HOT = [
    'BS3_1', 'edu', 'BP1', 'D_1_1',
    'LQ_1EQL', 'LQ_2EQL', 'LQ_3EQL', 'LQ_4EQL', 'LQ_5EQL',
    'BO1_1', 'BO2_1',
]

# Year 2
NUMERIC2 = ['DI1_ag', 'DE1_ag', 'LQ1_mn', 'BS6_2_1', 'BS6_3']
CATEGORICAL2 = [
    'DI1_pt', 'DE1_pt', 'DE1_3', 'BH1', 'BH2_61',
    'LQ4_00', 'LQ4_05', 'LQ1_sb', 'MO1_wk',
    'HE_cough1', 'HE_sput1', 'BD1', 'BD7_5', 'BP6_10', 'BP6_31',
]
ONE_HOT2 = ['DI1_2', 'EC_pedu_1', 'EC_pedu_2', 'BS5_1', 'BD7_4']

# Derived variables
NUMERIC3 = ['eq_5d']
CATEGORICAL3 = ['sm_present', 'mh_stress']
ONE_HOT3 = ['cage']

# Targets
TARGETS = ['BE3_31', 'BE5_1']

# Combined lists across the three groups; each is a fresh list, so editing it
# later leaves the per-group lists above unchanged.
f_NUMERIC = [*NUMERIC, *NUMERIC2, *NUMERIC3]
f_CATEGORICAL = [*CATEGORICAL, *CATEGORICAL2, *CATEGORICAL3]
f_ONE_HOT = [*ONE_HOT, *ONE_HOT2, *ONE_HOT3]

In [36]:
# Print and remove numeric feature names that are not columns of df.
# Iterate over a snapshot: removing from the list being iterated makes the
# loop skip the element that follows each removed one.
for value in list(f_NUMERIC):
    if value not in df.columns:
        print(value)
        f_NUMERIC.remove(value)

In [39]:
# Print and remove categorical feature names that are not columns of df.
# Iterate over a snapshot: removing from the list being iterated makes the
# loop skip the element that follows each removed one.
for value in list(f_CATEGORICAL):
    if value not in df.columns:
        print(value)
        f_CATEGORICAL.remove(value)

In [42]:
# Print and remove one-hot feature names that are not columns of df.
# Iterate over a snapshot: removing from the list being iterated makes the
# loop skip the element that follows each removed one.
for value in list(f_ONE_HOT):
    if value not in df.columns:
        print(value)
        f_ONE_HOT.remove(value)

In [33]:
# Columns with more than this percentage of missing values are discarded.
MAX_MISSING_PCT = 10

# Percentage of missing values per column, highest first.
drops = df.isna().sum().sort_values(ascending=False) / df.shape[0] * 100

# Keep the sufficiently complete columns (in the sorted order of `drops`).
# .copy() makes df an independent frame so later column assignments do not
# write into a slice of the original.
df = df[drops[drops.values <= MAX_MISSING_PCT].index].copy()

# Keep the feature-name lists in sync with the surviving columns; otherwise a
# top-to-bottom run leaves names of dropped columns in them and the
# ColumnTransformer later fails on the missing columns.
for feature_names in (f_NUMERIC, f_CATEGORICAL, f_ONE_HOT):
    feature_names[:] = [name for name in feature_names if name in df.columns]

In [50]:
# Number of missing values left in each column, largest first.
na_per_column = df.isna().sum()
na_per_column.sort_values(ascending=False)

BS5_1           173
occp             66
EC1_1            60
edu              60
LQ4_00           57
LQ_4EQL          56
LQ_3EQL          55
LQ_2EQL          55
LQ_1EQL          55
LQ1_mn           55
LQ_5EQL          55
BH2_61           55
LQ1_sb           55
MO1_wk           54
D_1_1            52
BS6_3             6
BS6_2_1           6
marri_1           5
BP1               4
mh_stress         4
BS3_1             4
BO1_1             3
cfam              3
BO2_1             2
BD1               1
age               0
Total_slp_wk      0
sex               0
HE_HPdg           0
cage              0
HE_FEV1FVC        0
eq_5d             0
BE3_31            0
sm_present        0
BE5_1             0
dtype: int64

In [51]:
# Fill missing values in every column with that column's most frequent value
# (the first one when several values tie).
# Assign the result back: calling fillna(inplace=True) on df[col] is chained
# assignment and does not modify df when pandas Copy-on-Write is active.
for col in df.columns:
    df[col] = df[col].fillna(df[col].mode()[0])

In [65]:
# Separate the BE5_1 label from the remaining columns.
y = df['BE5_1']
X = df.drop(columns='BE5_1')

# Stratified 80/20 hold-out split with a fixed seed.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=.2, stratify=y, random_state=42
)

# Give the training part a fresh 0..n-1 index.
# train_y becomes a one-column DataFrame here.
train_X = train_X.reset_index(drop=True)
train_y = train_y.to_frame().reset_index(drop=True)

In [71]:
# Negative-to-positive label ratio in the training split.
n_negative = float(np.sum(train_y == 0))
n_positive = np.sum(train_y == 1)
ratio = n_negative / n_positive
ratio.values

array([8.09638554])

In [74]:
def objective(trial:optuna.Trial, xdata, ydata):
    """Optuna objective: mean F1 of a class-weighted XGBoost model over 5 stratified folds.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to sample ``n_estimators``, ``max_depth`` and ``learning_rate``.
    xdata : pandas.DataFrame
        Feature frame; must contain the columns named in ``f_NUMERIC`` and
        ``f_ONE_HOT``.
    ydata : pandas.Series, one-column pandas.DataFrame or array-like
        Binary labels (0/1), aligned row by row with ``xdata``.

    Returns
    -------
    float
        Mean validation F1 score across the folds.
    """
    scores = []

    param = {
        'n_estimators' : trial.suggest_int('n_estimators', 100, 1000),
        'max_depth' : trial.suggest_int('max_depth', 5, 100),
        'learning_rate' : trial.suggest_float('learning_rate', .0001, .01)
    }

    # Flatten the labels once so a Series, a one-column DataFrame and a plain
    # array are all handled the same way.
    labels = np.asarray(ydata).ravel()

    kf = StratifiedKFold(n_splits=5)
    for train_idx, val_idx in kf.split(xdata, labels):
        standard = StandardScaler()
        onehot = OneHotEncoder(handle_unknown='ignore')

        # NOTE(review): f_CATEGORICAL columns are not listed here, so they are
        # presumably discarded by ColumnTransformer's default remainder
        # handling - confirm that this is intended.
        preprocessor = ColumnTransformer(
            transformers=[
                ('numeric', standard, f_NUMERIC),
                ('onehot', onehot, f_ONE_HOT)
            ]
        )

        # kf.split yields positional indices, so select rows with .iloc; .loc
        # would only be correct when the index happens to be 0..n-1.
        # The preprocessor is fitted on the training fold only.
        tr_X = preprocessor.fit_transform(xdata.iloc[train_idx])
        tr_y = labels[train_idx]

        val_X = preprocessor.transform(xdata.iloc[val_idx])
        val_y = labels[val_idx]

        # Negative/positive ratio of this fold, used to up-weight positives.
        ratio = float(np.sum(tr_y == 0)) / float(np.sum(tr_y == 1))

        # eval_metric is set on the estimator: passing it to fit() is
        # deprecated and no longer accepted by XGBoost 2.x.
        model = XGBClassifier(
            objective='binary:logistic',
            scale_pos_weight=ratio,
            eval_metric='logloss',
            random_state=42,
            **param
        )
        model.fit(tr_X, tr_y, verbose=False, eval_set=[(tr_X, tr_y), (val_X, val_y)])

        pred = model.predict(val_X)
        # scikit-learn's argument order is (y_true, y_pred).
        scores.append(f1_score(val_y, pred))

    return np.mean(scores)

In [75]:
# Seed the sampler explicitly: Optuna's sampler keeps its own random state, so
# without a seed every run proposes different hyperparameters.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

study.optimize(lambda trial:objective(trial, train_X, train_y), n_trials=100)
print(f'Best trial: {study.best_trial}')
print(f'Best value: {study.best_value}')
print(f'Best params: {study.best_params}')

[I 2023-12-26 22:45:14,761] A new study created in memory with name: no-name-412fe1a9-272c-4a60-b3fd-3d653795bebc
[I 2023-12-26 22:45:32,573] Trial 0 finished with value: 0.19821119120086167 and parameters: {'n_estimators': 622, 'max_depth': 23, 'learning_rate': 0.0024987905089021946}. Best is trial 0 with value: 0.19821119120086167.
[I 2023-12-26 22:45:40,589] Trial 1 finished with value: 0.2005126756477083 and parameters: {'n_estimators': 289, 'max_depth': 36, 'learning_rate': 0.004410972268648074}. Best is trial 1 with value: 0.2005126756477083.
[I 2023-12-26 22:45:51,121] Trial 2 finished with value: 0.20057805281096247 and parameters: {'n_estimators': 389, 'max_depth': 78, 'learning_rate': 0.0046511112191074235}. Best is trial 2 with value: 0.20057805281096247.
[I 2023-12-26 22:45:58,121] Trial 3 finished with value: 0.21198019270305934 and parameters: {'n_estimators': 263, 'max_depth': 30, 'learning_rate': 0.004466501568550174}. Best is trial 3 with value: 0.21198019270305934.
[I

KeyboardInterrupt: 