# 로지스틱 통계분석의 STEPWISE를 활용한 피쳐임포턴스 개수 선정

In [11]:
import pandas as pd
import statsmodels.api as sm
import numpy as np


In [13]:
male_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/male_final.csv")
variables = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP', 'DS1_DBP',
    'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR','exer_category',
    'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE',  
    'DS1_DRINK_RE',  
    'DS1_BMI',
    'walk_category'  
]

def stepwise_selection(X, y, 
                       initial_list=[], 
                       threshold_in=0.05, 
                       threshold_out=0.10, 
                       verbose=True):
    included = list(initial_list)
    while True:
        changed = False
        # forward step
        excluded = list(set(X.columns) - set(included))
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.Logit(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit(disp=0)
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print(f'[남성] Add {best_feature} with p-value {best_pval:.4f}')

        model = sm.Logit(y, sm.add_constant(pd.DataFrame(X[included]))).fit(disp=0)
        pvalues = model.pvalues.iloc[1:]  # intercept 제외
        worst_pval = pvalues.max()
        if worst_pval > threshold_out:
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            changed = True
            if verbose:
                print(f'[남성] Drop {worst_feature} with p-value {worst_pval:.4f}')
        if not changed:
            break
    return included

X_male = male_df[variables]
y_male = male_df['target']  # target 컬럼명 확인 필요

selected_male_vars = stepwise_selection(X_male, y_male)
print(" 남성 최종 선택 변수:", selected_male_vars)


[남성] Add DS1_PBF with p-value 0.0000
[남성] Add DS1_PULSE with p-value 0.0000
[남성] Add DS1_WHR with p-value 0.0000
[남성] Add DS1_FDM with p-value 0.0000
[남성] Add DS1_SBP with p-value 0.0000
[남성] Add DS1_AGE with p-value 0.0000
[남성] Add DS1_SMOKE_RE with p-value 0.0000
[남성] Add DS1_HTN with p-value 0.0015
[남성] Add walk_category with p-value 0.0020
[남성] Add exer_category with p-value 0.0168
[남성] Add DS1_VISFAT with p-value 0.0192
[남성] Add DS1_DBP with p-value 0.0210
 남성 최종 선택 변수: ['DS1_PBF', 'DS1_PULSE', 'DS1_WHR', 'DS1_FDM', 'DS1_SBP', 'DS1_AGE', 'DS1_SMOKE_RE', 'DS1_HTN', 'walk_category', 'exer_category', 'DS1_VISFAT', 'DS1_DBP']


In [14]:
female_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/female_final.csv")

X_female = female_df[variables]
y_female = female_df['target']  # target 컬럼명 확인 필요

selected_female_vars = stepwise_selection(X_female, y_female)
print(" 여성 최종 선택 변수:", selected_female_vars)

[남성] Add DS1_BMI with p-value 0.0000
[남성] Add DS1_WHR with p-value 0.0000
[남성] Add DS1_PULSE with p-value 0.0000
[남성] Add DS1_FDM with p-value 0.0000
[남성] Add DS1_AGE with p-value 0.0000
[남성] Add DS1_SBP with p-value 0.0000
[남성] Add DS1_HTN with p-value 0.0000
[남성] Add walk_category with p-value 0.0016
[남성] Add DS1_HIP with p-value 0.0025
[남성] Add DS1_PBF with p-value 0.0288
[남성] Add DS1_MUSCLE with p-value 0.0003
[남성] Add DS1_EDU_RE with p-value 0.0425
 여성 최종 선택 변수: ['DS1_BMI', 'DS1_WHR', 'DS1_PULSE', 'DS1_FDM', 'DS1_AGE', 'DS1_SBP', 'DS1_HTN', 'walk_category', 'DS1_HIP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_EDU_RE']


In [None]:
# 12개수도 같고 목록 같음 / 피쳐임토턴스는 상위 12개 

# 모델링

In [7]:
## 최종 변수 선택 :  [ DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP','DS1_PULSE', 'DS1_SBP', 'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR', 'exer_category', 'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE', 'DS1_smoke_RE', 'DS1_drink_RE', 'DS1_BMI, 'walk_category']

# 랜덤포레스트 남자/여자

## 랜덤포레스트 피쳐임포턴스 상위 12개 남자

In [1]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split

male_final_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/male_final.csv")

In [12]:
### 하이퍼파라미터 고정값 학습 

In [13]:
all_vars = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
    'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
    'exer_category', 'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE', 'DS1_DRINK_RE', 'DS1_BMI', 'walk_category'
]

X_full = male_final_df[all_vars]
y = male_final_df['target']

X_full_train, X_full_temp, y_full_train, y_full_temp = train_test_split(X_full, y, test_size=0.4, stratify=y, random_state=42)

model_for_importance = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42)
model_for_importance.fit(X_full_train, y_full_train)

importances = model_for_importance.feature_importances_
importance_df = pd.DataFrame({'feature': X_full.columns, 'importance': importances})
top_features = importance_df.sort_values(by='importance', ascending=False).head(12)['feature'].tolist()

print(" 중요도 기준 상위 12개 변수:")
print(top_features)

X = male_final_df[top_features]
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

model = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

y_val_proba = model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
youden_index = tpr - fpr
valid_range = (thresholds >= 0.4) & (thresholds <= 0.7)
if valid_range.sum() == 0:
    best_threshold = 0.5
    print("유효한 threshold 없음 → 기본값 0.5 사용")
else:
    best_threshold = thresholds[valid_range][youden_index[valid_range].argmax()]
print(f"최적 threshold (유든 인덱스 기반): {round(best_threshold, 3)}")

y_test_proba = model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\n테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)")
print("AUC:", round(roc_auc_score(y_test, y_test_proba), 3))
print("\n[Classification Report]")
print(classification_report(y_test, y_test_pred, digits=3))

 중요도 기준 상위 12개 변수:
['DS1_PULSE', 'DS1_WHR', 'DS1_PBF', 'DS1_SBP', 'DS1_BMI', 'DS1_VISFAT', 'DS1_AGE', 'DS1_MUSCLE', 'DS1_HIP', 'DS1_DBP', 'DS1_FDM', 'exer_category']
최적 threshold (유든 인덱스 기반): 0.414

테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)
AUC: 0.679

[Classification Report]
              precision    recall  f1-score   support

           0      0.967     0.790     0.870      6890
           1      0.084     0.417     0.140       319

    accuracy                          0.774      7209
   macro avg      0.526     0.604     0.505      7209
weighted avg      0.928     0.774     0.838      7209



In [None]:
#### GridSearchCV k=5

In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

all_vars = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
    'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
    'exer_category', 'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE', 'DS1_DRINK_RE', 'DS1_BMI', 'walk_category'
]

X_full = male_final_df[all_vars]
y = male_final_df['target']
X_full_train, X_full_temp, y_full_train, y_full_temp = train_test_split(X_full, y, test_size=0.4, stratify=y, random_state=42)

model_for_importance = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42)
model_for_importance.fit(X_full_train, y_full_train)

importances = model_for_importance.feature_importances_
importance_df = pd.DataFrame({'feature': X_full.columns, 'importance': importances})
top_features = importance_df.sort_values(by='importance', ascending=False).head(12)['feature'].tolist()

print("중요도 기준 상위 12개 변수:")
print(top_features)

X = male_final_df[top_features]
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# 하이퍼파라미터 튜닝 설정
param_grid = {
    # 'n_estimators': [100, 200],
    # 'max_depth': [5, 10],
    # 'min_samples_split': [2, 5],
    # 'min_samples_leaf': [1, 2],
    # 'class_weight': ['balanced']

    # 'n_estimators': [150, 250],            
    # 'max_depth': [7, 12],                 
    # 'min_samples_split': [3, 4],           
    # 'min_samples_leaf': [1, 2],           
    # 'class_weight': ['balanced']

   #  'n_estimators': [200, 300],
   # 'max_depth': [8, 10],
   # 'min_samples_split': [5, 10],          
   # 'min_samples_leaf': [1],
   # 'class_weight': ['balanced']

    #  'n_estimators': [100, 200],
    # 'max_depth': [5, 10],
    # 'min_samples_split': [2, 5],
    # 'min_samples_leaf': [1, 2],
    # 'max_features': ['sqrt', 0.6],                  
    # 'class_weight': ['balanced'] #[{0: 1, 1: 2}] 
}
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

print("최적 하이퍼파라미터:")
print(grid_search.best_params_)

y_val_proba = best_model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
youden_index = tpr - fpr
valid_range = (thresholds >= 0.4) & (thresholds <= 0.7)
if valid_range.sum() == 0:
    best_threshold = 0.5
    print("유효한 threshold 없음 → 기본값 0.5 사용")
else:
    best_threshold = thresholds[valid_range][youden_index[valid_range].argmax()]
print(f"최적 threshold (유든 인덱스 기반): {round(best_threshold, 3)}")

# 테스트셋 평가
y_test_proba = best_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\n테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)")
print("AUC:", round(roc_auc_score(y_test, y_test_proba), 3))
print("\n[Classification Report]")
print(classification_report(y_test, y_test_pred, digits=3))

중요도 기준 상위 12개 변수:
['DS1_PULSE', 'DS1_WHR', 'DS1_PBF', 'DS1_SBP', 'DS1_BMI', 'DS1_VISFAT', 'DS1_AGE', 'DS1_MUSCLE', 'DS1_HIP', 'DS1_DBP', 'DS1_FDM', 'exer_category']
최적 하이퍼파라미터:
{'class_weight': 'balanced', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
최적 threshold (유든 인덱스 기반): 0.456

테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)
AUC: 0.692

[Classification Report]
              precision    recall  f1-score   support

           0      0.979     0.556     0.709      6890
           1      0.072     0.740     0.131       319

    accuracy                          0.564      7209
   macro avg      0.525     0.648     0.420      7209
weighted avg      0.939     0.564     0.683      7209



#### RandomizedSearchCV k=5

In [18]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV


# 전체 변수 리스트
all_vars = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
    'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
    'exer_category', 'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE', 'DS1_DRINK_RE', 'DS1_BMI', 'walk_category'
]

# 1. 전체 변수로 중요도 확인용 학습
X_full = male_final_df[all_vars]
y = male_final_df['target']
X_full_train, X_full_temp, y_full_train, y_full_temp = train_test_split(X_full, y, test_size=0.4, stratify=y, random_state=42)

model_for_importance = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42)
model_for_importance.fit(X_full_train, y_full_train)

# 2. 중요도 상위 12개 변수 선택
importances = model_for_importance.feature_importances_
importance_df = pd.DataFrame({'feature': X_full.columns, 'importance': importances})
top_features = importance_df.sort_values(by='importance', ascending=False).head(12)['feature'].tolist()

print(" 중요도 기준 상위 12개 변수:")
print(top_features)

# 3. 상위 12개 변수만 사용
X = male_final_df[top_features]
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# 4. 하이퍼파라미터 튜닝 
param_dist = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'class_weight': ['balanced']
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=10,  
    scoring='roc_auc',
    cv=cv,
    random_state=42,
    n_jobs=-1
)
random_search.fit(X_train, y_train)
best_model = random_search.best_estimator_

print("최적 하이퍼파라미터:")
print(random_search.best_params_)

# 5. 유든 인덱스 기반 threshold 설정
y_val_proba = best_model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
youden_index = tpr - fpr
valid_range = (thresholds >= 0.4) & (thresholds <= 0.7)
if valid_range.sum() == 0:
    best_threshold = 0.5
    print("유효한 threshold 없음 → 기본값 0.5 사용")
else:
    best_threshold = thresholds[valid_range][youden_index[valid_range].argmax()]
print(f"최적 threshold (유든 인덱스 기반): {round(best_threshold, 3)}")

# 6. 테스트셋 평가
y_test_proba = best_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\n테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)")
print("AUC:", round(roc_auc_score(y_test, y_test_proba), 3))
print("\n[Classification Report]")
print(classification_report(y_test, y_test_pred, digits=3))

 중요도 기준 상위 12개 변수:
['DS1_PULSE', 'DS1_WHR', 'DS1_PBF', 'DS1_SBP', 'DS1_BMI', 'DS1_VISFAT', 'DS1_AGE', 'DS1_MUSCLE', 'DS1_HIP', 'DS1_DBP', 'DS1_FDM', 'exer_category']
최적 하이퍼파라미터:
{'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 5, 'class_weight': 'balanced'}
최적 threshold (유든 인덱스 기반): 0.456

테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)
AUC: 0.692

[Classification Report]
              precision    recall  f1-score   support

           0      0.979     0.556     0.709      6890
           1      0.072     0.740     0.131       319

    accuracy                          0.564      7209
   macro avg      0.525     0.648     0.420      7209
weighted avg      0.939     0.564     0.683      7209



In [None]:
# 전체 변수 리스트
all_vars = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
    'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
    'exer_category', 'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE', 'DS1_DRINK_RE', 'DS1_BMI', 'walk_category'
]

# 1. 전체 변수로 중요도 확인용 학습
X_full = male_final_df[all_vars]
y = male_final_df['target']
X_full_train, X_full_temp, y_full_train, y_full_temp = train_test_split(X_full, y, test_size=0.4, stratify=y, random_state=42)

model_for_importance = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42)
model_for_importance.fit(X_full_train, y_full_train)

# 2. 중요도 상위 12개 변수 선택
importances = model_for_importance.feature_importances_
importance_df = pd.DataFrame({'feature': X_full.columns, 'importance': importances})
top_features = importance_df.sort_values(by='importance', ascending=False).head(12)['feature'].tolist()

print(" 중요도 기준 상위 12개 변수:")
print(top_features)

# 3. 상위 12개 변수만 사용
X = male_final_df[top_features]
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# 4. 하이퍼파라미터 튜닝 
param_dist = {
    # 'n_estimators': [100, 200, 300, 500, 700],
    # 'max_depth': [5, 10, 15, 20, None],
    # 'min_samples_split': [2, 3, 4, 5, 10],
    # 'min_samples_leaf': [1, 2, 3, 5],
    # 'max_features': ['sqrt', 'log2', 0.6, 0.8],
    # 'bootstrap': [True, False],
    # 'class_weight': ['balanced']

    'n_estimators': [100, 300, 500],
    'max_depth': [10, 15, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 0.6],
    'bootstrap': [True],
    'class_weight': ['balanced']

}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=10,  
    scoring='roc_auc',
    cv=cv,
    random_state=42,
    n_jobs=-1
)
random_search.fit(X_train, y_train)
best_model = random_search.best_estimator_

print("최적 하이퍼파라미터:")
print(random_search.best_params_)

# 5. 유든 인덱스 기반 threshold 설정
y_val_proba = best_model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
youden_index = tpr - fpr
valid_range = (thresholds >= 0.4) & (thresholds <= 0.7)
if valid_range.sum() == 0:
    best_threshold = 0.5
    print("유효한 threshold 없음 → 기본값 0.5 사용")
else:
    best_threshold = thresholds[valid_range][youden_index[valid_range].argmax()]
print(f"최적 threshold (유든 인덱스 기반): {round(best_threshold, 3)}")

# 6. 테스트셋 평가
y_test_proba = best_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\n테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)")
print("AUC:", round(roc_auc_score(y_test, y_test_proba), 3))
print("\n[Classification Report]")
print(classification_report(y_test, y_test_pred, digits=3))

## 랜덤포레스트 피쳐임포턴스 상위 12개 여자

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split

female_final_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/female_final.csv")

In [None]:
all_vars = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
    'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
    'exer_category', 'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE', 'DS1_DRINK_RE', 'DS1_BMI', 'walk_category'
]

X_full = female_final_df[all_vars]
y = female_final_df['target']

X_full_train, X_full_temp, y_full_train, y_full_temp = train_test_split(X_full, y, test_size=0.4, stratify=y, random_state=42)

model_for_importance = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42)
model_for_importance.fit(X_full_train, y_full_train)

importances = model_for_importance.feature_importances_
importance_df = pd.DataFrame({'feature': X_full.columns, 'importance': importances})
top_features = importance_df.sort_values(by='importance', ascending=False).head(12)['feature'].tolist()

print(" 중요도 기준 상위 12개 변수:")
print(top_features)

X = male_final_df[top_features]
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

model = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

y_val_proba = model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
youden_index = tpr - fpr
valid_range = (thresholds >= 0.4) & (thresholds <= 0.7)
if valid_range.sum() == 0:
    best_threshold = 0.5
    print("유효한 threshold 없음 → 기본값 0.5 사용")
else:
    best_threshold = thresholds[valid_range][youden_index[valid_range].argmax()]
print(f"최적 threshold (유든 인덱스 기반): {round(best_threshold, 3)}")

y_test_proba = model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\n테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)")
print("AUC:", round(roc_auc_score(y_test, y_test_proba), 3))
print("\n[Classification Report]")
print(classification_report(y_test, y_test_pred, digits=3))

In [None]:
#### GridSearchCV k=5

In [4]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

all_vars = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
    'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
    'exer_category', 'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE', 'DS1_DRINK_RE', 'DS1_BMI', 'walk_category'
]

X_full = female_final_df[all_vars]
y = female_final_df['target']
X_full_train, X_full_temp, y_full_train, y_full_temp = train_test_split(X_full, y, test_size=0.4, stratify=y, random_state=42)

model_for_importance = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42)
model_for_importance.fit(X_full_train, y_full_train)

importances = model_for_importance.feature_importances_
importance_df = pd.DataFrame({'feature': X_full.columns, 'importance': importances})
top_features = importance_df.sort_values(by='importance', ascending=False).head(12)['feature'].tolist()

print(" 중요도 기준 상위 12개 변수:")
print(top_features)

X = female_final_df[top_features]
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

param_grid = {
    # 'n_estimators': [100, 200],
    # 'max_depth': [5, 10],
    # 'min_samples_split': [2, 5],
    # 'min_samples_leaf': [1, 2],
    # 'class_weight': ['balanced']
    
    # 'n_estimators': [150, 250],            
    # 'max_depth': [7, 12],                 
    # 'min_samples_split': [3, 4],           
    # 'min_samples_leaf': [1, 2],           
    # 'class_weight': ['balanced']

    # 'n_estimators': [200, 300],
    # 'max_depth': [8, 10],
    # 'min_samples_split': [5, 10],          
    # 'min_samples_leaf': [1],
    # 'class_weight': ['balanced']

     'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 0.6],                  
    'class_weight':  ['balanced'] #[{0: 1, 1: 2}] 
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

print("최적 하이퍼파라미터:")
print(grid_search.best_params_)

y_val_proba = best_model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
youden_index = tpr - fpr
valid_range = (thresholds >= 0.4) & (thresholds <= 0.7)
if valid_range.sum() == 0:
    best_threshold = 0.5
    print("유효한 threshold 없음 → 기본값 0.5 사용")
else:
    best_threshold = thresholds[valid_range][youden_index[valid_range].argmax()]
print(f"최적 threshold (유든 인덱스 기반): {round(best_threshold, 3)}")

y_test_proba = best_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\n테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)")
print("AUC:", round(roc_auc_score(y_test, y_test_proba), 3))
print("\n[Classification Report]")
print(classification_report(y_test, y_test_pred, digits=3))

 중요도 기준 상위 12개 변수:
['DS1_WHR', 'DS1_PBF', 'DS1_BMI', 'DS1_SBP', 'DS1_VISFAT', 'DS1_AGE', 'DS1_PULSE', 'DS1_HIP', 'DS1_MUSCLE', 'DS1_DBP', 'DS1_HTN', 'DS1_FDM']
최적 하이퍼파라미터:
{'class_weight': 'balanced', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
최적 threshold (유든 인덱스 기반): 0.446

테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)
AUC: 0.753

[Classification Report]
              precision    recall  f1-score   support

           0      0.990     0.612     0.757     13436
           1      0.045     0.745     0.086       333

    accuracy                          0.616     13769
   macro avg      0.518     0.679     0.421     13769
weighted avg      0.967     0.616     0.740     13769



#### RandomizedSearchCV k=5

In [14]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV


# 전체 변수 리스트
all_vars = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
    'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
    'exer_category', 'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE', 'DS1_DRINK_RE', 'DS1_BMI', 'walk_category'
]

# 1. 전체 변수로 중요도 확인용 학습
X_full = female_final_df[all_vars]
y = female_final_df['target']
X_full_train, X_full_temp, y_full_train, y_full_temp = train_test_split(X_full, y, test_size=0.4, stratify=y, random_state=42)

model_for_importance = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42)
model_for_importance.fit(X_full_train, y_full_train)

# 2. 중요도 상위 12개 변수 선택
importances = model_for_importance.feature_importances_
importance_df = pd.DataFrame({'feature': X_full.columns, 'importance': importances})
top_features = importance_df.sort_values(by='importance', ascending=False).head(12)['feature'].tolist()

print(" 중요도 기준 상위 12개 변수:")
print(top_features)

# 3. 상위 12개 변수만 사용
X = female_final_df[top_features]
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# 4. 하이퍼파라미터 튜닝 
param_dist = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'class_weight': ['balanced']
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=10,  
    scoring='roc_auc',
    cv=cv,
    random_state=42,
    n_jobs=-1
)
random_search.fit(X_train, y_train)
best_model = random_search.best_estimator_

print("최적 하이퍼파라미터:")
print(random_search.best_params_)

# 5. 유든 인덱스 기반 threshold 설정
y_val_proba = best_model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
youden_index = tpr - fpr
valid_range = (thresholds >= 0.4) & (thresholds <= 0.7)
if valid_range.sum() == 0:
    best_threshold = 0.5
    print("유효한 threshold 없음 → 기본값 0.5 사용")
else:
    best_threshold = thresholds[valid_range][youden_index[valid_range].argmax()]
print(f"최적 threshold (유든 인덱스 기반): {round(best_threshold, 3)}")

# 6. 테스트셋 평가
y_test_proba = best_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\n테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)")
print("AUC:", round(roc_auc_score(y_test, y_test_proba), 3))
print("\n[Classification Report]")
print(classification_report(y_test, y_test_pred, digits=3))

 중요도 기준 상위 12개 변수:
['DS1_WHR', 'DS1_PBF', 'DS1_BMI', 'DS1_SBP', 'DS1_VISFAT', 'DS1_AGE', 'DS1_PULSE', 'DS1_HIP', 'DS1_MUSCLE', 'DS1_DBP', 'DS1_HTN', 'DS1_FDM']
최적 하이퍼파라미터:
{'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 5, 'class_weight': 'balanced'}
최적 threshold (유든 인덱스 기반): 0.446

테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)
AUC: 0.753

[Classification Report]
              precision    recall  f1-score   support

           0      0.990     0.612     0.757     13436
           1      0.045     0.745     0.086       333

    accuracy                          0.616     13769
   macro avg      0.518     0.679     0.421     13769
weighted avg      0.967     0.616     0.740     13769



## 랜덤포레스트 rfe = 10 남자

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_selection import RFE

male_final_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/male_final.csv")

In [2]:
all_vars = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
    'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
    'exer_category', 'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE', 'DS1_DRINK_RE', 'DS1_BMI', 'walk_category'
]

X_all = male_final_df[all_vars]
y = male_final_df['target']

base_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
selector = RFE(base_model, n_features_to_select=10)
selector = selector.fit(X_all, y)
rfe_top_10_features = X_all.columns[selector.support_].tolist()

print("RFE 기준 상위 10개 변수:")
print(rfe_top_10_features)

X = male_final_df[rfe_top_10_features]
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'class_weight': ['balanced']

      # 'n_estimators': [100],
      #   'max_depth': [7],
      #   'min_samples_split': [5],
      #   'min_samples_leaf': [2],
      #   'class_weight': ['balanced']

    # 'n_estimators': [300],
    #   'max_depth': [None],
    #   'min_samples_split': [2],
    #   'min_samples_leaf': [1],
    #   'class_weight': ['balanced']

     # 'n_estimators': [200,300],
     # 'max_depth': [15],
     # 'min_samples_split': [2],
     # 'min_samples_leaf': [1],
     # 'class_weight': ['balanced']

    # 'n_estimators': [150, 250],           
    # 'max_depth': [7, 12],                  
    # 'min_samples_split': [3, 4],         
    # 'min_samples_leaf': [1, 2],           
    # 'class_weight': ['balanced']
}
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

print("최적 하이퍼파라미터:")
print(grid_search.best_params_)

y_val_proba = best_model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
youden_index = tpr - fpr
valid_range = (thresholds >= 0.4) & (thresholds <= 0.7)
if valid_range.sum() == 0:
    best_threshold = 0.5
    print("유효한 threshold 없음 → 기본값 0.5 사용")
else:
    best_threshold = thresholds[valid_range][youden_index[valid_range].argmax()]
print(f"최적 threshold (유든 인덱스 기반): {round(best_threshold, 3)}")

y_test_proba = best_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\n테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)")
print("AUC:", round(roc_auc_score(y_test, y_test_proba), 3))
print("\n[Classification Report]")
print(classification_report(y_test, y_test_pred, digits=3))

RFE 기준 상위 10개 변수:
['DS1_AGE', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP', 'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR', 'DS1_BMI']
최적 하이퍼파라미터:
{'class_weight': 'balanced', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
최적 threshold (유든 인덱스 기반): 0.445

테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)
AUC: 0.681

[Classification Report]
              precision    recall  f1-score   support

           0      0.978     0.528     0.686      6890
           1      0.068     0.749     0.125       319

    accuracy                          0.538      7209
   macro avg      0.523     0.639     0.406      7209
weighted avg      0.938     0.538     0.661      7209



## 랜덤포레스트 rfe = 10 여자

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble ifrom sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_selection import RFEmport RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_selection import RFEfrom sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_selection import RFE

female_final_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/female_final.csv")

In [4]:
all_vars = [    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
    'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
    'exer_category', 'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE', 'DS1_DRINK_RE', 'DS1_BMI', 'walk_category'
]

X_all = female_final_df[all_vars]
y =female_final_df['target']

estimator = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
selector = RFE(estimator, n_features_to_select=10)
selector = selector.fit(X_all, y)

rfe_selected_features = X_all.columns[selector.support_].tolist()
print("RFE 기준 상위 10개 변수:")
print(rfe_selected_features)

X = female_final_df[rfe_selected_features]
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# 5. 하이퍼파라미터 튜닝
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'class_weight': ['balanced']

      # 'n_estimators': [100],
      #   'max_depth': [7],
      #   'min_samples_split': [5],
      #   'min_samples_leaf': [2],
      #   'class_weight': ['balanced']

      # 'n_estimators': [300],
      # 'max_depth': [None],
      # 'min_samples_split': [2],
      # 'min_samples_leaf': [1],
      # 'class_weight': ['balanced'] #-> 최악임

     # 'n_estimators': [200,300],
     # 'max_depth': [15],
     # 'min_samples_split': [2],
     # 'min_samples_leaf': [1],
     # 'class_weight': ['balanced']

    # 'n_estimators': [150, 250],           
    # 'max_depth': [7, 12],                  
    # 'min_samples_split': [3, 4],         
    # 'min_samples_leaf': [1, 2],           
    # 'class_weight': ['balanced']


    
}
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

print("최적 하이퍼파라미터:")
print(grid_search.best_params_)

y_val_proba = best_model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
youden_index = tpr - fpr
valid_range = (thresholds >= 0.4) & (thresholds <= 0.7)
if valid_range.sum() == 0:
    best_threshold = 0.5
    print("유효한 threshold 없음 → 기본값 0.5 사용")
else:
    best_threshold = thresholds[valid_range][youden_index[valid_range].argmax()]
print(f"최적 threshold (유든 인덱스 기반): {round(best_threshold, 3)}")

y_test_proba = best_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\n테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)")
print("AUC:", round(roc_auc_score(y_test, y_test_proba), 3))
print("\n[Classification Report]")
print(classification_report(y_test, y_test_pred, digits=3))

RFE 기준 상위 10개 변수:
['DS1_AGE', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP', 'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR', 'DS1_BMI']
최적 하이퍼파라미터:
{'class_weight': 'balanced', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
최적 threshold (유든 인덱스 기반): 0.468

테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)
AUC: 0.746

[Classification Report]
              precision    recall  f1-score   support

           0      0.989     0.639     0.777     13436
           1      0.047     0.721     0.089       333

    accuracy                          0.641     13769
   macro avg      0.518     0.680     0.433     13769
weighted avg      0.967     0.641     0.760     13769



# xgboost 남자/여자

# xgboost 피쳐임포턴스 상위 12개 남자

In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

male_final_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/male_final.csv")

In [6]:

all_vars = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
    'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
    'exer_category', 'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE', 'DS1_DRINK_RE', 'DS1_BMI', 'walk_category'
]

X_full = male_final_df[all_vars]
y = male_final_df['target']
X_full_train, _, y_full_train, _ = train_test_split(X_full, y, test_size=0.4, stratify=y, random_state=42)

# scale_pos_weight 계산
neg, pos = np.bincount(y_full_train)
scale_pos_weight = neg / pos

base_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
base_model.fit(X_full_train, y_full_train)

importances = base_model.feature_importances_
importance_df = pd.DataFrame({'feature': X_full.columns, 'importance': importances})
top_features = importance_df.sort_values(by='importance', ascending=False).head(12)['feature'].tolist()

print("기본 모델 기준 상위 12개 변수:")
print(top_features)

X = male_final_df[top_features]
y = male_final_df['target']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
    'scale_pos_weight': [scale_pos_weight]
}

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

print("최적 하이퍼파라미터:")
print(grid_search.best_params_)

y_val_proba = best_model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
youden_index = tpr - fpr
valid_range = (thresholds >= 0.4) & (thresholds <= 0.7)
if valid_range.sum() == 0:
    best_threshold = 0.5
    print("유효한 threshold 없음 → 기본값 0.5 사용")
else:
    best_threshold = thresholds[valid_range][youden_index[valid_range].argmax()]
print(f"최적 threshold (유든 인덱스 기반): {round(best_threshold, 3)}")

y_test_proba = best_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\n테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)")
print("AUC:", round(roc_auc_score(y_test, y_test_proba), 3))
print("\n[Classification Report]")
print(classification_report(y_test, y_test_pred, digits=3))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


기본 모델 기준 상위 12개 변수:
['DS1_FDM', 'DS1_WHR', 'DS1_PBF', 'DS1_HTN', 'walk_category', 'DS1_DRINK_RE', 'exer_category', 'DS1_PULSE', 'DS1_SBP', 'DS1_BMI', 'DS1_SMOKE_RE', 'DS1_MUSCLE']


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


최적 하이퍼파라미터:
{'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100, 'scale_pos_weight': 21.62133891213389, 'subsample': 1}
최적 threshold (유든 인덱스 기반): 0.4000000059604645

테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)
AUC: 0.683

[Classification Report]
              precision    recall  f1-score   support

           0      0.976     0.510     0.670      6890
           1      0.065     0.734     0.119       319

    accuracy                          0.520      7209
   macro avg      0.521     0.622     0.394      7209
weighted avg      0.936     0.520     0.646      7209



# xgboost 피쳐임포턴스 상위 12개 여자

In [1]:
import pandas as pd
import numpy as np

female_final_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/female_final.csv")

In [7]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

# 전체 변수 리스트
all_vars = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
    'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
    'exer_category', 'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE', 'DS1_DRINK_RE', 'DS1_BMI', 'walk_category'
]

# 전체 변수로 변수 중요도 확인용 기본 모델 학습
X_full = female_final_df[all_vars]
y = female_final_df['target']
X_full_train, _, y_full_train, _ = train_test_split(X_full, y, test_size=0.4, stratify=y, random_state=42)

# scale_pos_weight 계산
neg, pos = np.bincount(y_full_train)
scale_pos_weight = neg / pos

# 기본 모델로 중요도 계산
base_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
base_model.fit(X_full_train, y_full_train)

# 변수 중요도 추출
importances = base_model.feature_importances_
importance_df = pd.DataFrame({'feature': X_full.columns, 'importance': importances})
top_features = importance_df.sort_values(by='importance', ascending=False).head(12)['feature'].tolist()

print("기본 모델 기준 상위 12개 변수:")
print(top_features)

# ❷ 상위 변수로만 다시 데이터셋 구성
X = female_final_df[top_features]
y = female_final_df['target']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# ❸ 하이퍼파라미터 튜닝
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
    'scale_pos_weight': [scale_pos_weight]
}

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

print("최적 하이퍼파라미터:")
print(grid_search.best_params_)

# ❹ 유든 인덱스 기반 threshold 설정
y_val_proba = best_model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
youden_index = tpr - fpr
valid_range = (thresholds >= 0.4) & (thresholds <= 0.7)
if valid_range.sum() == 0:
    best_threshold = 0.5
    print("유효한 threshold 없음 → 기본값 0.5 사용")
else:
    best_threshold = thresholds[valid_range][youden_index[valid_range].argmax()]
print(f"최적 threshold (유든 인덱스 기반): {round(best_threshold, 3)}")

# ❺ 테스트셋 평가
y_test_proba = best_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\n테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)")
print("AUC:", round(roc_auc_score(y_test, y_test_proba), 3))
print("\n[Classification Report]")
print(classification_report(y_test, y_test_pred, digits=3))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


기본 모델 기준 상위 12개 변수:
['DS1_FDM', 'DS1_WHR', 'DS1_BMI', 'DS1_HTN', 'DS1_PBF', 'DS1_SBP', 'DS1_PULSE', 'DS1_MARRY_RE', 'DS1_AGE', 'DS1_DBP', 'DS1_HIP', 'DS1_VISFAT']


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


최적 하이퍼파라미터:
{'colsample_bytree': 1, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100, 'scale_pos_weight': 40.264735264735265, 'subsample': 0.8}
최적 threshold (유든 인덱스 기반): 0.4009999930858612

테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)
AUC: 0.754

[Classification Report]
              precision    recall  f1-score   support

           0      0.990     0.621     0.763     13436
           1      0.046     0.736     0.086       333

    accuracy                          0.624     13769
   macro avg      0.518     0.678     0.425     13769
weighted avg      0.967     0.624     0.747     13769



# xgboost rfe =10개 남자

In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_selection import RFE
male_final_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/male_final.csv")

In [3]:
all_vars = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
    'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
    'exer_category', 'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE', 'DS1_DRINK_RE', 'DS1_BMI', 'walk_category'
]

X_all = male_final_df[all_vars]
y = male_final_df['target']

neg, pos = np.bincount(y)
scale_pos_weight = neg / pos

base_model = XGBClassifier(
    n_estimators=100,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    scale_pos_weight=scale_pos_weight
)
selector = RFE(base_model, n_features_to_select=10)
selector = selector.fit(X_all, y)
rfe_top_10_features = X_all.columns[selector.support_].tolist()

print("RFE 기준 상위 10개 변수:")
print(rfe_top_10_features)

X = male_final_df[rfe_top_10_features]
y = male_final_df['target']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
    'scale_pos_weight': [scale_pos_weight]
}

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

print("최적 하이퍼파라미터:")
print(grid_search.best_params_)

y_val_proba = best_model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
youden_index = tpr - fpr
valid_range = (thresholds >= 0.4) & (thresholds <= 0.7)
if valid_range.sum() == 0:
    best_threshold = 0.5
    print("유효한 threshold 없음 → 기본값 0.5 사용")
else:
    best_threshold = thresholds[valid_range][youden_index[valid_range].argmax()]
print(f"최적 threshold (유든 인덱스 기반): {round(best_threshold, 3)}")

y_test_proba = best_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\n테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)")
print("AUC:", round(roc_auc_score(y_test, y_test_proba), 3))
print("\n[Classification Report]")
print(classification_report(y_test, y_test_pred, digits=3))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


RFE 기준 상위 10개 변수:
['DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_PULSE', 'DS1_SBP', 'DS1_PBF', 'DS1_VISFAT', 'DS1_WHR', 'DS1_SMOKE_RE', 'DS1_BMI']


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


최적 하이퍼파라미터:
{'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100, 'scale_pos_weight': 21.612296110414054, 'subsample': 0.8}
최적 threshold (유든 인덱스 기반): 0.4819999933242798

테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)
AUC: 0.695

[Classification Report]
              precision    recall  f1-score   support

           0      0.974     0.660     0.786      6890
           1      0.077     0.618     0.138       319

    accuracy                          0.658      7209
   macro avg      0.526     0.639     0.462      7209
weighted avg      0.934     0.658     0.758      7209



# xgboost rfe =10개 여자

In [4]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_selection import RFE

female_final_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/female_final.csv")

In [None]:
all_vars = [
    'DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
    'DS1_DBP', 'DS1_PBF', 'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
    'exer_category', 'DS1_INCOME_RE', 'DS1_EDU_RE', 'DS1_MARRY_RE',
    'DS1_SMOKE_RE', 'DS1_DRINK_RE', 'DS1_BMI', 'walk_category'
]

X_all = female_final_df[all_vars]
y = female_final_df['target']

# neg, pos = np.bincount(y)
scale_pos_weight = 10 # neg / pos

base_model = XGBClassifier(
    n_estimators=100,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    scale_pos_weight=scale_pos_weight
)
selector = RFE(base_model, n_features_to_select=10)
selector = selector.fit(X_all, y)
rfe_top_10_features = X_all.columns[selector.support_].tolist()

print("RFE 기준 상위 10개 변수:")
print(rfe_top_10_features)

X = female_final_df[rfe_top_10_features]
y = female_final_df['target']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
    'scale_pos_weight': [scale_pos_weight]
}

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

print("최적 하이퍼파라미터:")
print(grid_search.best_params_)

y_val_proba = best_model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
youden_index = tpr - fpr
valid_range = (thresholds >= 0.4) & (thresholds <= 0.7)
if valid_range.sum() == 0:
    best_threshold = 0.5
    print("유효한 threshold 없음 → 기본값 0.5 사용")
else:
    best_threshold = thresholds[valid_range][youden_index[valid_range].argmax()]
print(f"최적 threshold (유든 인덱스 기반): {round(best_threshold, 3)}")

y_test_proba = best_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\n테스트셋 평가 결과 (유든 인덱스 기반 threshold 적용)")
print("AUC:", round(roc_auc_score(y_test, y_test_proba), 3))
print("\n[Classification Report]")
print(classification_report(y_test, y_test_pred, digits=3))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


RFE 기준 상위 10개 변수:
['DS1_AGE', 'DS1_HTN', 'DS1_FDM', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP', 'DS1_PBF', 'DS1_WHR', 'DS1_EDU_RE', 'DS1_BMI']
