In [1]:
import numpy as np
import pandas as pd   
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay
import xgboost as xgb
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# -------------------------
# 데이터 로드
# -------------------------
df = pd.read_csv('univ355.csv')

# -------------------------
# 결측치 컬럼 제거
# -------------------------
drop_missing_cols = [
    'why_not_rechoice', 'future_job_desc', 'income_class', 'income_contest',
    'income_perf', 'income_broadcast', 'income_studio', 'income_creator',
    'income_choreo', 'info_dept', 'info_career_center', 'info_online',
    'info_family', 'info_friends', 'info_academy', 'info_self',
    'help_financial', 'help_resume', 'help_career', 'help_social',
    'help_no', 'non_stay_reason', 'total_univ'
]
df = df.drop(columns=drop_missing_cols, errors='ignore')

# -------------------------
# current_area 관련 변수 제거
# -------------------------
current_area_cols = [col for col in df.columns if col.startswith('current_area')]
df = df.drop(columns=current_area_cols, errors='ignore')

# -------------------------
# 오타 수정
# -------------------------
if 'birth_area' in df.columns:
    df['birth_area'] = df['birth_area'].replace({'g': 'gangwon'})  # 'g' -> 'gangwon'

# -------------------------
# 범주형 변수 매핑 및 원-핫 인코딩
# -------------------------
# dance_years 매핑
midpoint_map = {'less2':1, '2to4':2, '4to7':3, '7to10':4, 'over10':5}
if 'dance_years' in df.columns:
    df['dance_years'] = df['dance_years'].map(midpoint_map)

# 원-핫 인코딩
columns_to_encode = [
    'univ_factor', 'major_factor', 'job_region', 'weight_control',
    'birth_area', 'housing_type', 'enter_type',
    'major_detail', 'univ_name', 'univ_area'

]
df = pd.get_dummies(df, columns=columns_to_encode, drop_first=False, dtype=float)

# -------------------------
# 분석 제외 변수 제거
# -------------------------
remove_vars = ["prof_hi", "int_major", "enter_year"]
df = df.drop(columns=remove_vars, errors='ignore')

# -------------------------
# 상관계수 높은 변수 사전 삭제 (전처리 단계)
# -------------------------
high_corr_remove = ['lecture_qual', 'peer_personal', 'enjoy_major', 'prof_chat','peer_study']
df = df.drop(columns=high_corr_remove, errors='ignore')


# -------------------------
# sat 변수 생성 및 이진 분류
# -------------------------
sat_cols = ['univ_proud', 'univ_belong', 'major_proud', 'major_belong']
df['sat'] = df[sat_cols].mean(axis=1)
sat_median = df['sat'].median()
df['sat_group'] = (df['sat'] >= sat_median).astype(int)

# -------------------------
# 설명용 출력
# -------------------------
print("✅ 최종 데이터셋 컬럼 수:", df.shape[1])
print("sat 중위수:", sat_median)
print(df[['sat', 'sat_group']].head())



✅ 최종 데이터셋 컬럼 수: 178
sat 중위수: 4.0
    sat  sat_group
0  4.00          1
1  4.50          1
2  3.00          0
3  3.00          0
4  4.75          1


In [2]:
# ======================================
# ✅ 최적화 버전: ElasticNet + RFECV(LR/DT/RF/XGB)
# - 변수 선택은 항상 X_train 내부에서 수행
# - 폴드별 선택 변수 → 교집합(안정적 변수) 추출
# - CV(파이프라인 전체) & Hold-out 평가
# - 중복 코드 최소화, 가독성 강화
# ======================================

import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import warnings

warnings.filterwarnings("ignore")

# -------------------------
# 설정
# -------------------------
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# -------------------------
# 데이터 존재 확인
# -------------------------
try:
    df
except NameError:
    raise RuntimeError("데이터프레임 df가 존재하지 않습니다. 먼저 전처리 셀을 실행하세요.")

# -------------------------
# 타깃/설명변수 설정
# -------------------------
target_col = 'sat_group'
drop_cols = ['id','univ_proud','univ_belong','major_proud','major_belong','sat','sat_group']
X = df.drop(columns=[c for c in drop_cols if c in df.columns], errors='ignore')
y = df[target_col].copy()

# -------------------------
# 학습/테스트 분할
# -------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# -------------------------
# CV 설정
# -------------------------
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
scoring = {"accuracy": "accuracy", "f1": "f1", "roc_auc": "roc_auc"}

# -------------------------
# RFECV 파이프라인 생성 함수 
# -------------------------
def make_rfecv_pipeline(base_estimator, final_estimator=None):
    """RFECV 기반 파이프라인 생성"""
    if final_estimator is None:
        final_estimator = base_estimator
    rfecv = RFECV(
        estimator=base_estimator,
        step=1,
        cv=inner_cv,
        scoring='f1',
        min_features_to_select=max(1, int(0.05 * X_train.shape[1])),
        n_jobs=1,
        verbose=0
    )
    # LogisticRegression일 때만 스케일러 추가
    if isinstance(base_estimator, LogisticRegression):
        return Pipeline([("scaler", StandardScaler()),
                         ("feature_selection", rfecv),
                         ("clf", final_estimator)])
    else:
        return Pipeline([("feature_selection", rfecv),
                         ("clf", final_estimator)])

# -------------------------
# 모델 정의
# -------------------------
models = {
    "ElasticNet": Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegressionCV(
            Cs=10, cv=inner_cv, penalty="elasticnet", solver="saga",
            l1_ratios=[.1,.3,.5,.7,.9], scoring="f1",
            max_iter=5000, random_state=RANDOM_STATE, n_jobs=-1
        ))
    ]),
    "RFECV_LR": make_rfecv_pipeline(LogisticRegression(max_iter=1000, solver='liblinear', random_state=RANDOM_STATE),
                                    LogisticRegression(max_iter=1000, solver='liblinear', random_state=RANDOM_STATE)),
    "RFECV_DT": make_rfecv_pipeline(DecisionTreeClassifier(random_state=RANDOM_STATE),
                                    DecisionTreeClassifier(random_state=RANDOM_STATE)),
    "RFECV_RF": make_rfecv_pipeline(RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1),
                                    RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1)),
    "RFECV_XGB": make_rfecv_pipeline(XGBClassifier(n_estimators=200, random_state=RANDOM_STATE,
                                                   use_label_encoder=False, eval_metric='logloss', n_jobs=-1),
                                     XGBClassifier(n_estimators=200, random_state=RANDOM_STATE,
                                                   use_label_encoder=False, eval_metric='logloss', n_jobs=-1))
}

# -------------------------
# 폴드별 선택 변수 추출
# -------------------------
def extract_fold_selected(pipe_name, pipe):
    fold_sets = []
    for tr_idx, val_idx in outer_cv.split(X_train, y_train):
        X_tr, y_tr = X_train.iloc[tr_idx], y_train.iloc[tr_idx]

        if pipe_name == "ElasticNet":
            enet_fold = LogisticRegressionCV(
                Cs=10, cv=inner_cv, penalty="elasticnet", solver="saga",
                l1_ratios=[.1,.3,.5,.7,.9], scoring="f1",
                max_iter=5000, random_state=RANDOM_STATE, n_jobs=-1
            )
            enet_fold.fit(X_tr, y_tr)
            sel = set(X_tr.columns[enet_fold.coef_.ravel() != 0])
        else:
            # RFECV 모델
            base_map = {
                "RFECV_LR": LogisticRegression(max_iter=1000, solver='liblinear', random_state=RANDOM_STATE),
                "RFECV_DT": DecisionTreeClassifier(random_state=RANDOM_STATE),
                "RFECV_RF": RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1),
                "RFECV_XGB": XGBClassifier(n_estimators=200, random_state=RANDOM_STATE,
                                           eval_metric='logloss', n_jobs=-1)
            }
            rfecv_fold = RFECV(
                estimator=base_map[pipe_name], step=1, cv=inner_cv, scoring='f1',
                min_features_to_select=max(1, int(0.05 * X_train.shape[1])),
                n_jobs=1
            )
            rfecv_fold.fit(X_tr, y_tr)
            sel = set(X_tr.columns[rfecv_fold.support_])
        fold_sets.append(sel)
    return fold_sets

selected_per_model = {name: extract_fold_selected(name, pipe) for name, pipe in models.items()}

# -------------------------
# Jaccard 안정성 지수
# -------------------------
def mean_pairwise_jaccard(list_of_sets):
    if len(list_of_sets) < 2: return np.nan
    scores = []
    from itertools import combinations
    for i,j in combinations(range(len(list_of_sets)),2):
        a, b = list_of_sets[i], list_of_sets[j]
        scores.append(1.0 if len(a|b)==0 else len(a&b)/len(a|b))
    return np.mean(scores)

# -------------------------
# CV & Hold-out 평가
# -------------------------
results = []
for name, pipe in models.items():
    print(f"\n===== 평가 중: {name} =====")
    cv_res = cross_validate(pipe, X_train, y_train, cv=outer_cv, scoring=scoring,
                            n_jobs=-1, error_score=np.nan)
    cv_mean = {m: np.nanmean(cv_res[f"test_{m}"]) for m in scoring.keys()}
    cv_std = {m: np.nanstd(cv_res[f"test_{m}"]) for m in scoring.keys()}

    fitted_pipe = pipe.fit(X_train, y_train)
    y_pred = fitted_pipe.predict(X_test)
    try:
        y_score = fitted_pipe.predict_proba(X_test)[:,1]
    except:
        try: y_score = fitted_pipe.decision_function(X_test)
        except: y_score = None

    holdout_metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_score) if y_score is not None else np.nan
    }

    fold_sets = selected_per_model[name]
    stable_vars = set.intersection(*fold_sets) if fold_sets else set()
    jacc = mean_pairwise_jaccard(fold_sets)

    # 최종 선택 변수
    final_selected = []
    if "feature_selection" in fitted_pipe.named_steps:
        try: final_selected = list(X_train.columns[fitted_pipe.named_steps['feature_selection'].support_])
        except: final_selected = []
    elif name == "ElasticNet":
        try: final_selected = list(X_train.columns[fitted_pipe.named_steps['clf'].coef_.ravel()!=0])
        except: final_selected = []

    results.append({
        "model": name,
        "cv_mean": cv_mean,
        "cv_std": cv_std,
        "holdout": holdout_metrics,
        "jaccard": jacc,
        "stable_vars": sorted(list(stable_vars)),
        "final_selected": sorted(final_selected)
    })

# -------------------------
# 결과 정리
# -------------------------
summary_rows = []
for r in results:
    summary_rows.append({
        "model": r["model"],
        "n_stable_vars": len(r["stable_vars"]),
        "stable_vars": ", ".join(r["stable_vars"]),
        "CV_f1_mean": r["cv_mean"]["f1"],
        "CV_acc_mean": r["cv_mean"]["accuracy"],
        "CV_roc_mean": r["cv_mean"]["roc_auc"],
        "Holdout_f1": r["holdout"]["f1"],
        "Holdout_acc": r["holdout"]["accuracy"],
        "Holdout_roc": r["holdout"]["roc_auc"],
        "Jaccard": r["jaccard"]
    })

summary_df = pd.DataFrame(summary_rows).sort_values(by="CV_f1_mean", ascending=False)
print("\n\n=== 요약 표 ===")
display(summary_df)

for r in results:
    print(f"\n--- {r['model']} ---")
    print(f"폴드 교집합(안정적) 변수 수: {len(r['stable_vars'])}")
    print("안정적 변수:" + (", ".join(r['stable_vars']) if r['stable_vars'] else " 없음"))
    print(f"최종 선택 변수 수: {len(r['final_selected'])}")
    print("최종 선택 변수:" + (", ".join(r['final_selected']) if r['final_selected'] else " 없음"))



===== 평가 중: ElasticNet =====

===== 평가 중: RFECV_LR =====

===== 평가 중: RFECV_DT =====

===== 평가 중: RFECV_RF =====

===== 평가 중: RFECV_XGB =====


=== 요약 표 ===


Unnamed: 0,model,n_stable_vars,stable_vars,CV_f1_mean,CV_acc_mean,CV_roc_mean,Holdout_f1,Holdout_acc,Holdout_roc,Jaccard
3,RFECV_RF,81,"admin_serv, ask_q, birth_area_chungcheong, bur...",0.866966,0.83089,0.888623,0.886364,0.859155,0.877907,0.762252
0,ElasticNet,17,"course_variety, know_reason, move_univ, peer_a...",0.856121,0.816917,0.889922,0.873563,0.84507,0.858804,0.471099
4,RFECV_XGB,10,"move_univ, peer_event, prof_access, prof_passi...",0.829109,0.792231,0.873803,0.833333,0.802817,0.86794,0.333155
1,RFECV_LR,8,"abroad_practice, edu_doubt, exp_video, has_inc...",0.82255,0.792481,0.871897,0.825,0.802817,0.862126,0.295061
2,RFECV_DT,0,,0.73087,0.68302,0.673068,0.746988,0.704225,0.699751,0.184756



--- ElasticNet ---
폴드 교집합(안정적) 변수 수: 17
안정적 변수:course_variety, know_reason, move_univ, peer_advice, peer_event, peer_external, prof_access, prof_event, prof_passion, student_interact, students_diligent, study_hard, support_fac, theory_cls, time_eff, try_hard, work_fee
최종 선택 변수 수: 91
최종 선택 변수:abroad_practice, admin_serv, birth_area_chungcheong, birth_area_gangwon, birth_area_seoul, burnout, career_course, change_career, change_major, course_variety, drink_freq, drop_out, edu_doubt, enter_type_early, exercise_reg, exp_concours, exp_video, field_practice, fit_major, gender, go_grad, has_income, housing_edu, housing_green, housing_infra, housing_neighbor, housing_type_independent, housing_type_parents, job_prep, job_region_abroad, job_region_honam, job_region_yeongnam, job_test, know_reason, learn_fun, major_detail_jazzdance, major_detail_sportsdance, major_detail_streetdance, major_factor_on_interest, major_factor_total, major_rank, move_univ, peer_advice, peer_career, peer_class, peer_e

In [3]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

# -------------------------
# 최종 모델 후보 + Bayesian Search 공간
# -------------------------
bayes_models = {
    "LogReg": {
        "estimator": LogisticRegression(max_iter=1000, solver='liblinear', random_state=RANDOM_STATE),
        "search_space": {
            "C": Real(0.01, 10.0, prior='log-uniform'),
            "penalty": Categorical(["l1", "l2"])
        }
    },
    "DT": {
        "estimator": DecisionTreeClassifier(random_state=RANDOM_STATE),
        "search_space": {
            "max_depth": Integer(1, 20),
            "min_samples_split": Integer(2, 20),
            "min_samples_leaf": Integer(1, 20)
        }
    },
    "RF": {
        "estimator": RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1),
        "search_space": {
            "n_estimators": Integer(100, 500),
            "max_depth": Integer(3, 20),
            "min_samples_split": Integer(2, 10),
            "min_samples_leaf": Integer(1, 10)
        }
    },
    "XGB": {
        "estimator": XGBClassifier(random_state=RANDOM_STATE, use_label_encoder=False, eval_metric='logloss', n_jobs=-1),
        "search_space": {
            "n_estimators": Integer(100, 500),
            "max_depth": Integer(3, 10),
            "learning_rate": Real(0.01, 0.3, prior='log-uniform'),
            "subsample": Real(0.5, 1.0),
            "colsample_bytree": Real(0.5, 1.0)
        }
    }
}

# -------------------------
# 안정적 변수 기반 Bayesian Optimization + 평가
# -------------------------
bayes_results = []

for r in results:
    stable_vars = r["stable_vars"]
    if not stable_vars:
        print(f"{r['model']}는 안정적 변수가 없어 제외합니다.")
        continue

    print(f"\n=== {r['model']} 안정적 변수 기반 모델링 ===")
    X_tr_stable = X_train[stable_vars]
    X_te_stable = X_test[stable_vars]

    for name, config in bayes_models.items():
        print(f"> Bayesian Optimization: {name}")
        bayes = BayesSearchCV(
            estimator=config["estimator"],
            search_spaces=config["search_space"],
            n_iter=30,
            cv=outer_cv,
            scoring="f1",
            n_jobs=-1,
            random_state=RANDOM_STATE,
            verbose=0
        )
        bayes.fit(X_tr_stable, y_train)
        
        # 최적 파라미터 및 CV 최고 점수
        best_params = bayes.best_params_
        best_score = bayes.best_score_

        # Hold-out 평가
        y_pred = bayes.predict(X_te_stable)
        try: y_score = bayes.predict_proba(X_te_stable)[:,1]
        except:
            try: y_score = bayes.decision_function(X_te_stable)
            except: y_score = None
        
        holdout_metrics = {
            "accuracy": accuracy_score(y_test, y_pred),
            "f1": f1_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred, zero_division=0),
            "recall": recall_score(y_test, y_pred),
            "roc_auc": roc_auc_score(y_test, y_score) if y_score is not None else np.nan
        }

        bayes_results.append({
            "source_model": r["model"],      # 안정적 변수 추출 기준 모델
            "model": name,                   # 학습 모델
            "n_features": len(stable_vars),
            "stable_vars": ", ".join(stable_vars),
            "CV_best_f1": best_score,
            "best_params": best_params,
            **holdout_metrics
        })

bayes_results_df = pd.DataFrame(bayes_results).sort_values(by="CV_best_f1", ascending=False)
print("\n=== 안정적 변수 기반 Bayesian Optimization 결과 ===")
display(bayes_results_df)



=== ElasticNet 안정적 변수 기반 모델링 ===
> Bayesian Optimization: LogReg
> Bayesian Optimization: DT
> Bayesian Optimization: RF
> Bayesian Optimization: XGB

=== RFECV_LR 안정적 변수 기반 모델링 ===
> Bayesian Optimization: LogReg
> Bayesian Optimization: DT
> Bayesian Optimization: RF
> Bayesian Optimization: XGB
RFECV_DT는 안정적 변수가 없어 제외합니다.

=== RFECV_RF 안정적 변수 기반 모델링 ===
> Bayesian Optimization: LogReg
> Bayesian Optimization: DT
> Bayesian Optimization: RF
> Bayesian Optimization: XGB

=== RFECV_XGB 안정적 변수 기반 모델링 ===
> Bayesian Optimization: LogReg
> Bayesian Optimization: DT
> Bayesian Optimization: RF
> Bayesian Optimization: XGB

=== 안정적 변수 기반 Bayesian Optimization 결과 ===


Unnamed: 0,source_model,model,n_features,stable_vars,CV_best_f1,best_params,accuracy,f1,precision,recall,roc_auc
14,RFECV_XGB,RF,10,"move_univ, peer_event, prof_access, prof_passi...",0.880876,"{'max_depth': 3, 'min_samples_leaf': 5, 'min_s...",0.830986,0.860465,0.860465,0.860465,0.898256
11,RFECV_RF,XGB,81,"admin_serv, ask_q, birth_area_chungcheong, bur...",0.875509,"{'colsample_bytree': 0.9061979941786817, 'lear...",0.84507,0.873563,0.863636,0.883721,0.887043
15,RFECV_XGB,XGB,10,"move_univ, peer_event, prof_access, prof_passi...",0.87344,"{'colsample_bytree': 1.0, 'learning_rate': 0.0...",0.802817,0.833333,0.853659,0.813953,0.892442
10,RFECV_RF,RF,81,"admin_serv, ask_q, birth_area_chungcheong, bur...",0.872374,"{'max_depth': 17, 'min_samples_leaf': 3, 'min_...",0.859155,0.883721,0.883721,0.883721,0.881229
3,ElasticNet,XGB,17,"course_variety, know_reason, move_univ, peer_a...",0.871783,"{'colsample_bytree': 0.7265546037214652, 'lear...",0.71831,0.756098,0.794872,0.72093,0.872924
12,RFECV_XGB,LogReg,10,"move_univ, peer_event, prof_access, prof_passi...",0.871756,"{'C': 1.1408107308404316, 'penalty': 'l1'}",0.802817,0.837209,0.837209,0.837209,0.858389
2,ElasticNet,RF,17,"course_variety, know_reason, move_univ, peer_a...",0.865408,"{'max_depth': 3, 'min_samples_leaf': 3, 'min_s...",0.802817,0.840909,0.822222,0.860465,0.879568
0,ElasticNet,LogReg,17,"course_variety, know_reason, move_univ, peer_a...",0.847096,"{'C': 0.9480773400045165, 'penalty': 'l1'}",0.816901,0.847059,0.857143,0.837209,0.870432
5,RFECV_LR,DT,8,"abroad_practice, edu_doubt, exp_video, has_inc...",0.84485,"{'max_depth': 9, 'min_samples_leaf': 15, 'min_...",0.774648,0.809524,0.829268,0.790698,0.818522
6,RFECV_LR,RF,8,"abroad_practice, edu_doubt, exp_video, has_inc...",0.836051,"{'max_depth': 3, 'min_samples_leaf': 6, 'min_s...",0.802817,0.837209,0.837209,0.837209,0.826827
