# 언더샘플링 적용한 통계모델

In [6]:
import pandas as pd
import statsmodels.api as sm
from imblearn.under_sampling import RandomUnderSampler

male_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/male_final.csv")

variables = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP', 'DS1_DBP',
    'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR','exer_category',
    'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE',  
    'DS1_DRINK_RE',  
    'DS1_BMI',
    'walk_category'  
]

X_male = male_df[variables]
y_male = male_df['target']  # target 컬럼명 확인 필요

rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_male, y_male)

def stepwise_selection(X, y, 
                       initial_list=[], 
                       threshold_in=0.05, 
                       threshold_out=0.10, 
                       verbose=True):
    included = list(initial_list)
    while True:
        changed = False
        excluded = list(set(X.columns) - set(included))
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.Logit(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit(disp=0)
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print(f'Add {best_feature} (p={best_pval:.4f})')
        
        model = sm.Logit(y, sm.add_constant(pd.DataFrame(X[included]))).fit(disp=0)
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max()
        if worst_pval > threshold_out:
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            changed = True
            if verbose:
                print(f'Drop {worst_feature} (p={worst_pval:.4f})')
        if not changed:
            break
    return included

In [7]:
male_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/male_final.csv")

X_male = male_df[variables]
y_male = male_df['target']

rus = RandomUnderSampler(random_state=42)
X_male_resampled, y_male_resampled = rus.fit_resample(X_male, y_male)

selected_male_vars = stepwise_selection(X_male_resampled, y_male_resampled)
print("남성 최종 선택 변수:", selected_male_vars)

Add DS1_PBF (p=0.0000)
Add DS1_WHR (p=0.0000)
Add DS1_PULSE (p=0.0000)
Add DS1_FDM (p=0.0000)
Add DS1_SBP (p=0.0000)
Add DS1_AGE (p=0.0000)
Add DS1_SMOKE_RE (p=0.0001)
Add DS1_HTN (p=0.0034)
남성 최종 선택 변수: ['DS1_PBF', 'DS1_WHR', 'DS1_PULSE', 'DS1_FDM', 'DS1_SBP', 'DS1_AGE', 'DS1_SMOKE_RE', 'DS1_HTN']


In [8]:

female_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/female_final.csv")

X_female = female_df[variables]
y_female = female_df['target']

rus = RandomUnderSampler(random_state=42)
X_female_resampled, y_female_resampled = rus.fit_resample(X_female, y_female)

selected_female_vars = stepwise_selection(X_female_resampled, y_female_resampled)
print("여성 최종 선택 변수:", selected_female_vars)

Add DS1_WHR (p=0.0000)
Add DS1_PBF (p=0.0000)
Add DS1_PULSE (p=0.0000)
Add DS1_FDM (p=0.0000)
Add DS1_AGE (p=0.0000)
Add DS1_SBP (p=0.0000)
Add DS1_BMI (p=0.0003)
Add DS1_HTN (p=0.0010)
Add DS1_HIP (p=0.0012)
Add DS1_EDU_RE (p=0.0192)
Add walk_category (p=0.0324)
여성 최종 선택 변수: ['DS1_WHR', 'DS1_PBF', 'DS1_PULSE', 'DS1_FDM', 'DS1_AGE', 'DS1_SBP', 'DS1_BMI', 'DS1_HTN', 'DS1_HIP', 'DS1_EDU_RE', 'walk_category']


# 다중공선성

In [2]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

male_final_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/male_final.csv")

female_final_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/female_final.csv")

variables = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP', 'DS1_DBP', 
    'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR', 'exer_category', 
    'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE', 'DS1_SMOKE_RE', 'DS1_DRINK_RE', 
    'DS1_BMI', 'walk_category'
]

def calculate_vif(df, variables):
    X = df[variables].copy()
    X = sm.add_constant(X)
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data

male_vif = calculate_vif(male_final_df, variables)
print("남자 데이터 VIF 결과")
print(male_vif)

print("\n")

female_vif = calculate_vif(female_final_df, variables)
print("여자 데이터 VIF 결과")
print(female_vif)

남자 데이터 VIF 결과
          Feature          VIF
0           const  1178.828421
1         DS1_AGE     1.507129
2         DS1_HTN     1.144192
3         DS1_FDM     1.024239
4         DS1_HIP     3.102316
5       DS1_PULSE     1.043937
6         DS1_SBP     2.400956
7         DS1_DBP     2.331582
8         DS1_PBF     6.109906
9      DS1_MUSCLE     3.310502
10     DS1_VISFAT     4.583747
11        DS1_WHR     1.697799
12  exer_category     1.087119
13  DS1_INCOME_RE     1.363205
14     DS1_EDU_RE     1.357819
15   DS1_MARRY_RE     1.036553
16   DS1_SMOKE_RE     1.053308
17   DS1_DRINK_RE     1.064664
18        DS1_BMI     5.009487
19  walk_category     1.050519


여자 데이터 VIF 결과
          Feature          VIF
0           const  1038.230347
1         DS1_AGE     1.762353
2         DS1_HTN     1.197300
3         DS1_FDM     1.019695
4         DS1_HIP     3.142606
5       DS1_PULSE     1.039135
6         DS1_SBP     2.848742
7         DS1_DBP     2.649898
8         DS1_PBF     7.256148
9      DS

# 랜덤포레스트 피쳐임포턴스 남자/여자

In [1]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler

male_final_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/male_final.csv")

In [7]:
all_vars = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
    'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
    'exer_category', 'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE', 'DS1_DRINK_RE', 'DS1_BMI', 'walk_category'
]

X = male_final_df[all_vars]
y = male_final_df['target']

rus = RandomUnderSampler(random_state=42)
X_under, y_under = rus.fit_resample(X, y)


X_train, X_temp, y_train, y_temp = train_test_split(X_under, y_under, test_size=0.4, stratify=y_under, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# 4. 전체 변수로 랜덤포레스트 학습 → 상위 8개 변수 선택
model_for_importance = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42)
model_for_importance.fit(X_train, y_train)
importances = model_for_importance.feature_importances_
importance_df = pd.DataFrame({'feature': X.columns, 'importance': importances})
top8_features = importance_df.sort_values(by='importance', ascending=False).head(8)['feature'].tolist()
print("상위 8개 변수:", top8_features)

X_train_top8 =X_train[top8_features]
X_val_top8 = X_val[top8_features]
X_test_top8 = X_test[top8_features]

param_grid = {
    # 'n_estimators': [100, 200],
    # 'max_depth': [5, 10],
    # 'min_samples_split': [2, 5],
    # 'min_samples_leaf': [1, 2]

    # 'n_estimators': [150, 250],            
    # 'max_depth': [7, 12],                 
    # 'min_samples_split': [3, 4],           
    # 'min_samples_leaf': [1, 2]

   #  'n_estimators': [200, 300],
   # 'max_depth': [8, 10],
   # 'min_samples_split': [5, 10],          
   # 'min_samples_leaf': [1]

    #  'n_estimators': [100, 200],
    # 'max_depth': [5, 10],
    # 'min_samples_split': [2, 5],
    # 'min_samples_leaf': [1, 2],
    # 'max_features': ['sqrt', 0.6]

    # 'n_estimators': [150, 200],
    # 'max_depth': [None],     
    # 'min_samples_split': [2, 3],
    # 'min_samples_leaf': [1]

    # 'n_estimators': [200],
    # 'max_depth': [7, 12],
    # 'min_samples_split': [3],
    # 'min_samples_leaf': [1, 2],
    # 'max_features': ['sqrt', 0.6]


    # 'n_estimators': [400, 600],
    # 'max_depth': [4, 6],
    # 'min_samples_split': [4, 6],
    # 'min_samples_leaf': [2, 4]


    
}



cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1
)
grid_search.fit(X_train_top8, y_train)
best_model = grid_search.best_estimator_
print("최적 하이퍼파라미터:", grid_search.best_params_)

# 7. 유든 인덱스 기반 최적 threshold
y_val_proba = best_model.predict_proba(X_val_top8)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
youden_index = tpr - fpr
valid_range = (thresholds >= 0.4) & (thresholds <= 0.7)
if valid_range.sum() == 0:
    best_threshold = 0.5
    print("유효한 threshold 없음 → 기본값 0.5 사용")
else:
    best_threshold = thresholds[valid_range][youden_index[valid_range].argmax()]
print(f"최적 threshold (유든 인덱스 기반): {round(best_threshold, 2)}")

y_test_proba = best_model.predict_proba(X_test_top8)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\n테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)")
print("AUC:", round(roc_auc_score(y_test, y_test_proba), 2))
print("\n[Classification Report]")
print(classification_report(y_test, y_test_pred, digits=2))

상위 8개 변수: ['DS1_PBF', 'DS1_WHR', 'DS1_BMI', 'DS1_SBP', 'DS1_PULSE', 'DS1_MUSCLE', 'DS1_AGE', 'DS1_HIP']
최적 하이퍼파라미터: {'max_depth': 6, 'min_samples_leaf': 2, 'min_samples_split': 6, 'n_estimators': 600}
최적 threshold (유든 인덱스 기반): 0.46

테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)
AUC: 0.68

[Classification Report]
              precision    recall  f1-score   support

           0       0.68      0.50      0.58       319
           1       0.61      0.76      0.68       319

    accuracy                           0.63       638
   macro avg       0.64      0.63      0.63       638
weighted avg       0.64      0.63      0.63       638



In [9]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler

female_final_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/female_final.csv")

In [15]:
all_vars = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
    'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
    'exer_category', 'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE', 'DS1_DRINK_RE', 'DS1_BMI', 'walk_category'
]

X = female_final_df[all_vars]
y = female_final_df['target']

rus = RandomUnderSampler(random_state=42)
X_under, y_under = rus.fit_resample(X, y)

X_train, X_temp, y_train, y_temp = train_test_split(X_under, y_under, test_size=0.4, stratify=y_under, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)


model_for_importance = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42)
model_for_importance.fit(X_train, y_train)

importances = model_for_importance.feature_importances_
importance_df = pd.DataFrame({'feature': X.columns, 'importance': importances})
top11_features = importance_df.sort_values(by='importance', ascending=False).head(11)['feature'].tolist()

print("상위 11개 변수:", top11_features)

X_train_top11 = X_train[top11_features]
X_val_top11 = X_val[top11_features]
X_test_top11 = X_test[top11_features]

param_grid = {
    # 'n_estimators': [100, 200],
    # 'max_depth': [5, 10],
    # 'min_samples_split': [2, 5],
    # 'min_samples_leaf': [1, 2]
    
    # 'n_estimators': [150, 250],            
    # 'max_depth': [7, 12],                 
    # 'min_samples_split': [3, 4],           
    # 'min_samples_leaf': [1, 2]

    'n_estimators': [300],
    'max_depth': [None],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'max_features': [None, 'sqrt']

    #  'n_estimators': [100, 200],
    # 'max_depth': [5, 10],
    # 'min_samples_split': [2, 5],
    # 'min_samples_leaf': [1, 2],
    # 'max_features': ['sqrt', 0.6]

    # 'n_estimators': [200, 300, 500],
    # 'max_depth': [None, 10, 20, 30],
    # 'max_features': ['sqrt', 'log2', 0.3, 0.5],
    # 'min_samples_split': [2, 5, 10],
    # 'min_samples_leaf': [1, 2, 4],
    # 'criterion': ['gini', 'entropy'],
    # 'bootstrap': [True, False]

    #  'n_estimators': [150, 200],
    # 'max_depth': [None],     
    # 'min_samples_split': [2, 3],
    # 'min_samples_leaf': [1]

    # 'n_estimators': [200],
    # 'max_depth': [7, 12],
    # 'min_samples_split': [3],
    # 'min_samples_leaf': [1, 2],
    # 'max_features': ['sqrt', 0.6]

    
}


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1
)
grid_search.fit(X_train_top11, y_train)
best_model = grid_search.best_estimator_
print("최적 하이퍼파라미터:", grid_search.best_params_)


y_val_proba = best_model.predict_proba(X_val_top11)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
youden_index = tpr - fpr
valid_range = (thresholds >= 0.4) & (thresholds <= 0.7)
if valid_range.sum() == 0:
    best_threshold = 0.5
    print("유효한 threshold 없음 → 기본값 0.5 사용")
else:
    best_threshold = thresholds[valid_range][youden_index[valid_range].argmax()]
print(f"최적 threshold (유든 인덱스 기반): {round(best_threshold, 2)}")


y_test_proba = best_model.predict_proba(X_test_top11)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\n테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)")
print("AUC:", round(roc_auc_score(y_test, y_test_proba), 2))
print("\n[Classification Report]")
print(classification_report(y_test, y_test_pred, digits=2))

상위 11개 변수: ['DS1_BMI', 'DS1_PBF', 'DS1_WHR', 'DS1_VISFAT', 'DS1_SBP', 'DS1_AGE', 'DS1_PULSE', 'DS1_MUSCLE', 'DS1_HIP', 'DS1_DBP', 'DS1_FDM']
최적 하이퍼파라미터: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
최적 threshold (유든 인덱스 기반): 0.41

테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)
AUC: 0.71

[Classification Report]
              precision    recall  f1-score   support

           0       0.71      0.45      0.55       334
           1       0.60      0.81      0.69       334

    accuracy                           0.63       668
   macro avg       0.65      0.63      0.62       668
weighted avg       0.65      0.63      0.62       668



# 랜덤포레스트 rfe 10개 남자/여자

In [17]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from imblearn.under_sampling import RandomUnderSampler

male_final_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/male_final.csv")

In [None]:
all_vars = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
    'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
    'exer_category', 'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE', 'DS1_DRINK_RE', 'DS1_BMI', 'walk_category'
]

X_all = male_final_df[all_vars]
y_all = male_final_df['target']

rus = RandomUnderSampler(random_state=42)
X_under, y_under = rus.fit_resample(X_all, y_all)

base_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
selector = RFE(base_model, n_features_to_select=10)
selector = selector.fit(X_under, y_under)
rfe_top_10_features = X_all.columns[selector.support_].tolist()

print("RFE 기준 상위 10개 변수 (언더샘플링 기반):")
print(rfe_top_10_features)

X_rfe = pd.DataFrame(X_under, columns=X_all.columns)[rfe_top_10_features]
y_rfe = y_under

X_train, X_temp, y_train, y_temp = train_test_split(X_rfe, y_rfe, test_size=0.4, stratify=y_rfe, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

param_grid = {
    # 'n_estimators': [100, 200],
    # 'max_depth': [5, 10],
    # 'min_samples_split': [2, 5],
    # 'min_samples_leaf': [1, 2]
    
    # 'n_estimators': [150, 250],            
    # 'max_depth': [7, 12],                 
    # 'min_samples_split': [3, 4],           
    # 'min_samples_leaf': [1, 2]

    # 'n_estimators': [300],
    # 'max_depth': [None],
    # 'min_samples_split': [2],
    # 'min_samples_leaf': [1],
    # 'max_features': [None, 'sqrt']

    #  'n_estimators': [100, 200],
    # 'max_depth': [5, 10],
    # 'min_samples_split': [2, 5],
    # 'min_samples_leaf': [1, 2],
    # 'max_features': ['sqrt', 0.6]

    'n_estimators': [200, 300, 500],
    'max_depth': [None, 10, 20, 30],
    'max_features': ['sqrt', 'log2', 0.3, 0.5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]

    #  'n_estimators': [150, 200],
    # 'max_depth': [None],     
    # 'min_samples_split': [2, 3],
    # 'min_samples_leaf': [1]

    # 'n_estimators': [200],
    # 'max_depth': [7, 12],
    # 'min_samples_split': [3],
    # 'min_samples_leaf': [1, 2],
    # 'max_features': ['sqrt', 0.6]
}
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print("최적 하이퍼파라미터:")
print(grid_search.best_params_)

y_val_proba = best_model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
youden_index = tpr - fpr
valid_range = (thresholds >= 0.4) & (thresholds <= 0.7)
if valid_range.sum() == 0:
    best_threshold = 0.5
    print("유효한 threshold 없음 → 기본값 0.5 사용")
else:
    best_threshold = thresholds[valid_range][youden_index[valid_range].argmax()]
print(f"최적 threshold (유든 인덱스 기반): {round(best_threshold, 2)}")

y_test_proba = best_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\n테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)")
print("AUC:", round(roc_auc_score(y_test, y_test_proba), 2))
print("\n[Classification Report]")
print(classification_report(y_test, y_test_pred, digits=2))

RFE 기준 상위 10개 변수 (언더샘플링 기반):
['DS1_AGE', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP', 'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR', 'DS1_BMI']


In [13]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from imblearn.under_sampling import RandomUnderSampler

female_final_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/female_final.csv")

In [None]:
all_vars = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
    'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
    'exer_category', 'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE', 'DS1_DRINK_RE', 'DS1_BMI', 'walk_category'
]

X_all = female_final_df[all_vars]
y_all = female_final_df['target']

rus = RandomUnderSampler(random_state=42)
X_under, y_under = rus.fit_resample(X_all, y_all)

estimator = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
selector = RFE(estimator, n_features_to_select=10)
selector = selector.fit(X_under, y_under)

rfe_selected_features = X_all.columns[selector.support_].tolist()
print("RFE 기준 상위 10개 변수:")
print(rfe_selected_features)

X_rfe = pd.DataFrame(X_under, columns=X_all.columns)[rfe_selected_features]
y_rfe = y_under

X_train, X_temp, y_train, y_temp = train_test_split(X_rfe, y_rfe, test_size=0.4, stratify=y_rfe, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)


# 5. 하이퍼파라미터 튜닝
param_grid = {
    # 'n_estimators': [100, 200],
    # 'max_depth': [5, 10],
    # 'min_samples_split': [2, 5],
    # 'min_samples_leaf': [1, 2]
    
    # 'n_estimators': [150, 250],            
    # 'max_depth': [7, 12],                 
    # 'min_samples_split': [3, 4],           
    # 'min_samples_leaf': [1, 2]

    'n_estimators': [300],
    'max_depth': [None],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'max_features': [None, 'sqrt']

    #  'n_estimators': [100, 200],
    # 'max_depth': [5, 10],
    # 'min_samples_split': [2, 5],
    # 'min_samples_leaf': [1, 2],
    # 'max_features': ['sqrt', 0.6]

    # 'n_estimators': [200, 300, 500],
    # 'max_depth': [None, 10, 20, 30],
    # 'max_features': ['sqrt', 'log2', 0.3, 0.5],
    # 'min_samples_split': [2, 5, 10],
    # 'min_samples_leaf': [1, 2, 4],
    # 'criterion': ['gini', 'entropy'],
    # 'bootstrap': [True, False]

    #  'n_estimators': [150, 200],
    # 'max_depth': [None],     
    # 'min_samples_split': [2, 3],
    # 'min_samples_leaf': [1]

    # 'n_estimators': [200],
    # 'max_depth': [7, 12],
    # 'min_samples_split': [3],
    # 'min_samples_leaf': [1, 2],
    # 'max_features': ['sqrt', 0.6]

}
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

print("최적 하이퍼파라미터:")
print(grid_search.best_params_)

y_val_proba = best_model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
youden_index = tpr - fpr
valid_range = (thresholds >= 0.4) & (thresholds <= 0.7)
if valid_range.sum() == 0:
    best_threshold = 0.5
else:
    best_threshold = thresholds[valid_range][youden_index[valid_range].argmax()]
print(f"최적 threshold (유든 인덱스 기반): {round(best_threshold, 2)}")

y_test_proba = best_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\n테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)")
print("AUC:", round(roc_auc_score(y_test, y_test_proba), 2))
print("\n[Classification Report]")
print(classification_report(y_test, y_test_pred, digits=2))

# xgboost 피쳐임포턴스 남자/여자

In [21]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from imblearn.under_sampling import RandomUnderSampler


male_final_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/male_final.csv")

In [22]:
all_vars = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
    'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
    'exer_category', 'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE', 'DS1_DRINK_RE', 'DS1_BMI', 'walk_category'
]

X_all = male_final_df[all_vars]
y_all = male_final_df['target']

rus = RandomUnderSampler(random_state=42)
X_under, y_under = rus.fit_resample(X_all, y_all)

X_full_train, _, y_full_train, _ = train_test_split(X_under, y_under, test_size=0.4, stratify=y_under, random_state=42)

base_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
base_model.fit(X_full_train, y_full_train)

importances = base_model.feature_importances_
importance_df = pd.DataFrame({'feature': X_all.columns, 'importance': importances})
top_features = importance_df.sort_values(by='importance', ascending=False).head(12)['feature'].tolist()

print("XGBoost 기준 상위 12개 변수 (언더샘플링 기반):")
print(top_features)

X_selected = pd.DataFrame(X_under, columns=X_all.columns)[top_features]
y_selected = y_under

X_train, X_temp, y_train, y_temp = train_test_split(X_selected, y_selected, test_size=0.4, stratify=y_selected, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

neg, pos = np.bincount(y_train)
scale_pos_weight = neg / pos

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
    'scale_pos_weight': [scale_pos_weight]
}

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

print("최적 하이퍼파라미터:")
print(grid_search.best_params_)

y_val_proba = best_model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
youden_index = tpr - fpr
valid_range = (thresholds >= 0.4) & (thresholds <= 0.7)
if valid_range.sum() == 0:
    best_threshold = 0.5
else:
    best_threshold = thresholds[valid_range][youden_index[valid_range].argmax()]
print(f"최적 threshold (유든 인덱스 기반): {round(best_threshold, 2)}")

y_test_proba = best_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\n테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)")
print("AUC:", round(roc_auc_score(y_test, y_test_proba), 2))
print("\n[Classification Report]")
print(classification_report(y_test, y_test_pred, digits=2))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost 기준 상위 12개 변수 (언더샘플링 기반):
['DS1_MARRY_RE', 'DS1_WHR', 'DS1_FDM', 'DS1_HTN', 'DS1_PBF', 'DS1_PULSE', 'DS1_SBP', 'walk_category', 'DS1_SMOKE_RE', 'DS1_INCOME_RE', 'DS1_HIP', 'DS1_BMI']


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


최적 하이퍼파라미터:
{'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100, 'scale_pos_weight': 1.0, 'subsample': 1}
최적 threshold (유든 인덱스 기반): 0.5600000023841858

테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)
AUC: 0.68

[Classification Report]
              precision    recall  f1-score   support

           0       0.61      0.72      0.66       319
           1       0.66      0.53      0.59       319

    accuracy                           0.63       638
   macro avg       0.63      0.63      0.62       638
weighted avg       0.63      0.63      0.62       638



In [23]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from imblearn.under_sampling import RandomUnderSampler

female_final_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/female_final.csv")

In [24]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler

all_vars = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
    'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
    'exer_category', 'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE', 'DS1_DRINK_RE', 'DS1_BMI', 'walk_category'
]

X_all = female_final_df[all_vars]
y_all = female_final_df['target']

rus = RandomUnderSampler(random_state=42)
X_under, y_under = rus.fit_resample(X_all, y_all)

X_full_train, _, y_full_train, _ = train_test_split(X_under, y_under, test_size=0.4, stratify=y_under, random_state=42)

base_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
base_model.fit(X_full_train, y_full_train)

importances = base_model.feature_importances_
importance_df = pd.DataFrame({'feature': X_all.columns, 'importance': importances})
top_features = importance_df.sort_values(by='importance', ascending=False).head(12)['feature'].tolist()

print("XGBoost 기준 상위 12개 변수 (언더샘플링 기반):")
print(top_features)

X_selected = pd.DataFrame(X_under, columns=X_all.columns)[top_features]
y_selected = y_under

X_train, X_temp, y_train, y_temp = train_test_split(X_selected, y_selected, test_size=0.4, stratify=y_selected, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

neg, pos = np.bincount(y_train)
scale_pos_weight = neg / pos

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
    'scale_pos_weight': [scale_pos_weight]
}

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

print("최적 하이퍼파라미터:")
print(grid_search.best_params_)

y_val_proba = best_model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
youden_index = tpr - fpr
valid_range = (thresholds >= 0.4) & (thresholds <= 0.7)
if valid_range.sum() == 0:
    best_threshold = 0.5
else:
    best_threshold = thresholds[valid_range][youden_index[valid_range].argmax()]
print(f"최적 threshold (유든 인덱스 기반): {round(best_threshold, 2)}")

y_test_proba = best_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\n테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)")
print("AUC:", round(roc_auc_score(y_test, y_test_proba), 2))
print("\n[Classification Report]")
print(classification_report(y_test, y_test_pred, digits=2))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost 기준 상위 12개 변수 (언더샘플링 기반):
['DS1_FDM', 'DS1_PBF', 'DS1_WHR', 'DS1_VISFAT', 'DS1_BMI', 'DS1_PULSE', 'DS1_SMOKE_RE', 'DS1_HTN', 'DS1_EDU_RE', 'DS1_AGE', 'DS1_HIP', 'DS1_MARRY_RE']


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


최적 하이퍼파라미터:
{'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100, 'scale_pos_weight': 0.999000999000999, 'subsample': 0.8}
최적 threshold (유든 인덱스 기반): 0.5

테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)
AUC: 0.72

[Classification Report]
              precision    recall  f1-score   support

           0       0.69      0.60      0.64       334
           1       0.65      0.73      0.69       334

    accuracy                           0.67       668
   macro avg       0.67      0.67      0.66       668
weighted avg       0.67      0.67      0.66       668



# xgboost rfe 10개 남자/여자

In [25]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from imblearn.under_sampling import RandomUnderSampler

male_final_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/male_final.csv")

In [26]:
all_vars = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
    'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
    'exer_category', 'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE', 'DS1_DRINK_RE', 'DS1_BMI', 'walk_category'
]

X_all = male_final_df[all_vars]
y_all = male_final_df['target']

rus = RandomUnderSampler(random_state=42)
X_under, y_under = rus.fit_resample(X_all, y_all)

neg, pos = np.bincount(y_under)
scale_pos_weight = neg / pos

base_model = XGBClassifier(
    n_estimators=100,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    scale_pos_weight=scale_pos_weight
)
selector = RFE(base_model, n_features_to_select=10)
selector = selector.fit(X_under, y_under)
rfe_top_10_features = pd.DataFrame(X_under, columns=X_all.columns).columns[selector.support_].tolist()

print("RFE 기준 상위 10개 변수:")
print(rfe_top_10_features)

X_selected = pd.DataFrame(X_under, columns=X_all.columns)[rfe_top_10_features]
y_selected = y_under

X_train, X_temp, y_train, y_temp = train_test_split(X_selected, y_selected, test_size=0.4, stratify=y_selected, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
    'scale_pos_weight': [scale_pos_weight]
}

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

print("최적 하이퍼파라미터:")
print(grid_search.best_params_)

y_val_proba = best_model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
youden_index = tpr - fpr
valid_range = (thresholds >= 0.4) & (thresholds <= 0.7)
if valid_range.sum() == 0:
    best_threshold = 0.5
else:
    best_threshold = thresholds[valid_range][youden_index[valid_range].argmax()]
print(f"최적 threshold (유든 인덱스 기반): {round(best_threshold, 2)}")

y_test_proba = best_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\n테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)")
print("AUC:", round(roc_auc_score(y_test, y_test_proba), 2))
print("\n[Classification Report]")
print(classification_report(y_test, y_test_pred, digits=2))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


RFE 기준 상위 10개 변수:
['DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_PULSE', 'DS1_SBP', 'DS1_DBP', 'DS1_PBF', 'DS1_WHR', 'DS1_MARRY_RE', 'DS1_BMI']


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


최적 하이퍼파라미터:
{'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100, 'scale_pos_weight': 1.0, 'subsample': 0.8}
최적 threshold (유든 인덱스 기반): 0.5400000214576721

테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)
AUC: 0.7

[Classification Report]
              precision    recall  f1-score   support

           0       0.63      0.69      0.66       319
           1       0.66      0.59      0.62       319

    accuracy                           0.64       638
   macro avg       0.64      0.64      0.64       638
weighted avg       0.64      0.64      0.64       638



In [27]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from imblearn.under_sampling import RandomUnderSampler


female_final_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/female_final.csv")

In [28]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from imblearn.under_sampling import RandomUnderSampler

all_vars = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
    'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
    'exer_category', 'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE', 'DS1_DRINK_RE', 'DS1_BMI', 'walk_category'
]

X_all = female_final_df[all_vars]
y_all = female_final_df['target']

rus = RandomUnderSampler(random_state=42)
X_under, y_under = rus.fit_resample(X_all, y_all)

scale_pos_weight = 10

base_model = XGBClassifier(
    n_estimators=100,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    scale_pos_weight=scale_pos_weight
)
selector = RFE(base_model, n_features_to_select=10)
selector = selector.fit(X_under, y_under)
rfe_top_10_features = pd.DataFrame(X_under, columns=X_all.columns).columns[selector.support_].tolist()

print("RFE 기준 상위 10개 변수:")
print(rfe_top_10_features)

X_selected = pd.DataFrame(X_under, columns=X_all.columns)[rfe_top_10_features]
y_selected = y_under

X_train, X_temp, y_train, y_temp = train_test_split(X_selected, y_selected, test_size=0.4, stratify=y_selected, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
    'scale_pos_weight': [scale_pos_weight]
}

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

print("최적 하이퍼파라미터:")
print(grid_search.best_params_)

y_val_proba = best_model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
youden_index = tpr - fpr
valid_range = (thresholds >= 0.4) & (thresholds <= 0.7)
if valid_range.sum() == 0:
    best_threshold = 0.5
else:
    best_threshold = thresholds[valid_range][youden_index[valid_range].argmax()]
print(f"최적 threshold (유든 인덱스 기반): {round(best_threshold, 2)}")

y_test_proba = best_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\n테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)")
print("AUC:", round(roc_auc_score(y_test, y_test_proba), 2))
print("\n[Classification Report]")
print(classification_report(y_test, y_test_pred, digits=2))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


RFE 기준 상위 10개 변수:
['DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_PULSE', 'DS1_SBP', 'DS1_DBP', 'DS1_PBF', 'DS1_WHR', 'DS1_EDU_RE', 'DS1_BMI']


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


최적 하이퍼파라미터:
{'colsample_bytree': 1, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100, 'scale_pos_weight': 10, 'subsample': 0.8}
최적 threshold (유든 인덱스 기반): 0.6600000262260437

테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)
AUC: 0.7

[Classification Report]
              precision    recall  f1-score   support

           0       0.80      0.31      0.44       334
           1       0.57      0.92      0.71       334

    accuracy                           0.62       668
   macro avg       0.68      0.62      0.58       668
weighted avg       0.68      0.62      0.58       668

