# 00. 라이브러리 임포트

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif, SelectKBest
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report, confusion_matrix, roc_auc_score
)
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    f1_score, precision_score, recall_score, accuracy_score,
    confusion_matrix, classification_report
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier

# 트리 기반 모델
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings('ignore')

# 01. 데이터 로드

In [3]:
df = pd.read_csv("../data/processed/survey_2024_winsor.csv")

# 02. 분석대상 필터링(웰니스 관광객)

In [4]:
wellness_df = df[df["Q8a04"] == 1]
wellness_df = wellness_df.drop(columns="Q8a04") # 참여한 활동(휴양/휴식(웰니스)) 컬럼 제거

In [5]:
wellness_df.shape

(2591, 115)

# 03. 불필요한 컬럼 제외

In [6]:
# 만족도 관련 컬럼 통일 (전반적 만족도(Q11)로 통일)
wellness_df = wellness_df.drop(columns=["Q13","Q14"])  # 재방문 의사, 타인 추천 의향 제거

# 중복되는 의미 컬럼 제거
wellness_df = wellness_df.drop(columns=["총액1인TOT2"]) # 1인 지출경비(상하위 1% 대체) 제거

# 인구통계 (분기별, 월별) 제거
wellness_df = wellness_df.drop(columns=["D_AGE", "D_SEX", "D_BUN", "D_MON"]) # 연령별, 성별, 분기별, 월별 제거

# 방문지역 제거
wellness_df = wellness_df.drop(columns=wellness_df.filter(regex="^Q9_2a").columns) # 방문지역 제거

# 방한횟수 통일
wellness_df = wellness_df.drop(columns=["D_NUM"]) # 방한횟수별 제거

# 쇼핑 장소 관련 컬럼 제거
wellness_df = wellness_df.drop(columns=wellness_df.filter(regex="^Q10_3a").columns) # 쇼핑 장소 제거

# 쇼핑 항목 관련 컬럼 제거
wellness_df = wellness_df.drop(columns=wellness_df.filter(regex="^Q10_2a").columns) # 쇼핑 항목 제거

# 04. 분산 0.03 미만 변수 제거

In [7]:
# 설정
target_col = "Q11"   # 종속변수(전반적 만족도)
variance_threshold = 0.03

# 후보 독립변수 선택
num_cols = wellness_df.select_dtypes(include=[np.number]).columns.tolist()
feature_cols = [c for c in num_cols if c != target_col]

X0 = wellness_df[feature_cols].copy()

# 분산 계산 & 필터링
variances = X0.var(numeric_only=True)
low_var_cols = variances[variances < variance_threshold].index.tolist()

X1 = X0.drop(columns=low_var_cols)

print(f"[분산 필터] 총 {len(feature_cols)}개 → {X1.shape[1]}개 유지")
if low_var_cols:
    print(f" - 제거({len(low_var_cols)}): {low_var_cols[:20]}{' ...' if len(low_var_cols)>20 else ''}")
else:
    print(" - 제거된 변수 없음")

[분산 필터] 총 68개 → 50개 유지
 - 제거(18): ['Q7a8', 'Q8a17', 'Q8a20', 'Q9_5A3', 'Q9_5A5', 'Q9_5A6', 'Q9_5A9', 'TYP2', 'D_MOK4', 'D_MOK5', 'Q8_1a9', 'Q8_1a10', 'Q8_1a15', 'Q8_1a16', 'Q8_1a17', 'Q8_1a18', 'Q8_1a19', 'Q8_1a20']


# 05. 상관관계 기준 필터링 (자기상관 의심 제거)

In [8]:
corr_threshold = 0.80

X2 = X1.copy()

# 타깃과의 상관(있으면) 또는 평균 상관으로 제거 기준 결정
target_series = wellness_df[target_col] if target_col in wellness_df.columns else None

# 상관행렬(피어슨)
corr = X2.corr().abs()

# 상삼각만 사용하여 중복 제거
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop_corr = set()

for col in upper.columns:
    high_corr_partners = upper.index[upper[col] > corr_threshold].tolist()
    for partner in high_corr_partners:
        if col in to_drop_corr or partner in to_drop_corr:
            continue
        # 제거 기준 결정
        if target_series is not None:
            r_col = abs(np.corrcoef(X2[col], target_series)[0,1]) if X2[col].var() > 0 else 0
            r_partner = abs(np.corrcoef(X2[partner], target_series)[0,1]) if X2[partner].var() > 0 else 0
            loser = col if r_col < r_partner else partner
        else:
            mean_col = corr[col].mean()
            mean_partner = corr[partner].mean()
            loser = col if mean_col > mean_partner else partner  # 평균 상관이 더 큰 쪽 제거
        to_drop_corr.add(loser)

X3 = X2.drop(columns=list(to_drop_corr)) if to_drop_corr else X2

print(f"[상관 필터] 입력 {X2.shape[1]}개 → {X3.shape[1]}개 유지")
print(f" - 제거({len(to_drop_corr)}): {list(to_drop_corr)[:20]}{' ...' if len(to_drop_corr)>20 else ''}")

[상관 필터] 입력 50개 → 48개 유지
 - 제거(2): ['Q7a_dk', 'TYP3']


# 06. 다중공선성 확인

In [9]:
vif_threshold = 10.0

def compute_vif(df_num: pd.DataFrame):
    X_ = sm.add_constant(df_num, has_constant='add')
    vifs = []
    for i, col in enumerate(X_.columns):
        if col == 'const':
            vifs.append(np.nan)
        else:
            vifs.append(variance_inflation_factor(X_.values, i))
    out = pd.DataFrame({"variable": X_.columns, "VIF": vifs})
    return out[out["variable"] != "const"].sort_values("VIF", ascending=False)

Xv = X3.copy()
removed_vif = []

while True:
    vif_df = compute_vif(Xv)
    max_row = vif_df.iloc[0]
    if max_row["VIF"] <= vif_threshold or np.isinf(max_row["VIF"]):
        # 임계 이하이거나 (드물게) 무한대 처리시 루프 종료
        break
    drop_var = max_row["variable"]
    removed_vif.append((drop_var, max_row["VIF"]))
    Xv = Xv.drop(columns=[drop_var])

print(f"[VIF 필터] 입력 {X3.shape[1]}개 → {Xv.shape[1]}개 유지")
if removed_vif:
    print(" - 제거(변수, VIF):", removed_vif[:10], "..." if len(removed_vif)>10 else "")

# 최종 피처 목록
final_features = Xv.columns.tolist()

# 종속변수와 합치기
final_df = pd.concat([wellness_df[[target_col]], Xv], axis=1)

print(f"\n[최종 데이터셋] 독립변수 {len(final_features)}개 + 종속변수 1개")

[VIF 필터] 입력 48개 → 48개 유지

[최종 데이터셋] 독립변수 48개 + 종속변수 1개


# 07. 만족도 지표 이진화

In [10]:
# 타깃 이진화 (5점: 만족, 1~4점: 불만족)
final_df["rate"] = final_df["Q11"].apply(lambda x: 1 if x == 5 else 0)
final_df.drop(columns=["Q11"], inplace=True)

# 변환 후 분포 확인
print(final_df["rate"].value_counts())
print("\n",final_df["rate"].value_counts(normalize=True))  # 비율 확인

rate
1    1918
0     673
Name: count, dtype: int64

 rate
1    0.740255
0    0.259745
Name: proportion, dtype: float64


# 08. 트리모형 학습

In [11]:
target = "rate"
X = final_df.drop(columns=[target])
y = final_df[target].astype(int)

In [12]:
# 학습/테스트 분할
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [13]:
# 교차검증 & 스코어
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
SCORING = "f1_macro"   # macro-F1로 최적화

def extract_feature_importance(best_estimator, X_tr, y_tr, X_te, y_te, top_k=7):
    # 1) 트리 계열: feature_importances_
    if hasattr(best_estimator, "feature_importances_"):
        imp = best_estimator.feature_importances_
        return pd.DataFrame({"feature": X_tr.columns, "importance": imp})\
                 .sort_values("importance", ascending=False).head(top_k)

    # 2) 선형 계열: coef_
    if hasattr(best_estimator, "coef_"):
        coefs = np.abs(best_estimator.coef_).ravel()  # 절대값 기준
        return pd.DataFrame({"feature": X_tr.columns, "importance": coefs})\
                 .sort_values("importance", ascending=False).head(top_k)

    # 3) 그 외: 퍼뮤테이션 중요도 (검증 데이터 기준, macro-F1)
    perm = permutation_importance(
        best_estimator, X_te, y_te,
        n_repeats=10, random_state=42, n_jobs=-1, scoring=SCORING
    )
    return pd.DataFrame({
        "feature": X_te.columns,
        "importance": perm.importances_mean,
        "importance_std": perm.importances_std
    }).sort_values("importance", ascending=False).head(top_k)

In [14]:
# 그리드 서치를 위한 모델과 파라미터 그리드 설정
model_grids = []

# 1) Logistic Regression (표준화 필수)
pipe_logit = Pipeline([
    ("scaler", StandardScaler(with_mean=False) if hasattr(X_train, "sparse") and X_train.sparse else StandardScaler()),
    ("clf", LogisticRegression(max_iter=2000, class_weight="balanced", random_state=42))
])
param_logit = {
    "clf__C": [0.1, 1.0, 3.0],
    "clf__penalty": ["l2"],
    "clf__solver": ["liblinear", "lbfgs"],
}
model_grids.append(("LogisticRegression", pipe_logit, param_logit))

# 2) Random Forest
rf = RandomForestClassifier(n_jobs=-1, class_weight="balanced", random_state=42)
param_rf = {
    "n_estimators": [300, 600],
    "max_depth": [None, 12, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", 0.3],
}
model_grids.append(("RandomForest", rf, param_rf))

# 3) Gradient Boosting
gb = GradientBoostingClassifier(random_state=42)
param_gb = {
    "n_estimators": [200, 400],
    "learning_rate": [0.05, 0.1],
    "max_depth": [2, 3],
    "subsample": [0.7, 1.0]
}
model_grids.append(("GradientBoosting", gb, param_gb))

# 4) HistGradientBoosting
hgb = HistGradientBoostingClassifier(random_state=42)
param_hgb = {
    "max_depth": [None, 10],
    "learning_rate": [0.05, 0.1],
    "max_leaf_nodes": [31, 63],
    "min_samples_leaf": [20, 50]
}
model_grids.append(("HistGradientBoosting", hgb, param_hgb))

# 5) XGBoost
xgb = XGBClassifier(
    objective="binary:logistic",
    n_jobs=-1,
    random_state=42,
    tree_method="hist",
    eval_metric=None
)
param_xgb = {
    "n_estimators": [300, 600],
    "learning_rate": [0.05, 0.1],
    "max_depth": [3, 5, 7],
    "subsample": [0.7, 1.0],
    "colsample_bytree": [0.7, 1.0],
    "min_child_weight": [1, 3],
}
model_grids.append(("XGBoost", xgb, param_xgb))


# 6) LightGBM
lgbm = LGBMClassifier(
    objective="binary",
    random_state=42,
    n_jobs=-1,
    class_weight="balanced"
)
param_lgbm = {
    "n_estimators": [200, 400],
    "learning_rate": [0.05],
    "num_leaves": [31, 63],
    "max_depth": [-1, 7],
    "min_child_samples": [20, 50],
    "subsample": [0.9],
    "colsample_bytree": [0.8]
}

model_grids.append(("LightGBM", lgbm, param_lgbm))

In [15]:
results = []

# 모델별 그리드 서치
for name, est, grid in model_grids:
    gs = GridSearchCV(
        estimator=est,
        param_grid=grid,
        scoring=SCORING,
        cv=cv,
        n_jobs=-1,
        verbose=1,
        refit=True
    )
    gs.fit(X_train, y_train)
    results.append({
        "model": name,
        "best_score_cv_f1_macro": gs.best_score_,
        "best_params": gs.best_params_,
        "best_estimator": gs.best_estimator_
    })
    print(f"[{name}] best CV macro-F1: {gs.best_score_:.4f}")
    print(f"[{name}] best params: {gs.best_params_}\n")

# 요약 테이블
summary = pd.DataFrame([
    {"model": r["model"], "cv_macroF1": r["best_score_cv_f1_macro"], "params": r["best_params"]}
    for r in results
]).sort_values("cv_macroF1", ascending=False)
summary

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[LogisticRegression] best CV macro-F1: 0.5483
[LogisticRegression] best params: {'clf__C': 0.1, 'clf__penalty': 'l2', 'clf__solver': 'lbfgs'}

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[RandomForest] best CV macro-F1: 0.5868
[RandomForest] best params: {'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[GradientBoosting] best CV macro-F1: 0.5294
[GradientBoosting] best params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 400, 'subsample': 0.7}

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[HistGradientBoosting] best CV macro-F1: 0.5461
[HistGradientBoosting] best params: {'learning_rate': 0.1, 'max_depth': 10, 'max_leaf_nodes': 31, 'min_samples_leaf': 20}

Fitting 5 folds for each of 96 candidates, totalling 480 fits
[XGBoost] best CV macro-F1: 0.5502
[XGBoost] 

Unnamed: 0,model,cv_macroF1,params
1,RandomForest,0.586789,"{'max_depth': 12, 'max_features': 'sqrt', 'min..."
5,LightGBM,0.56323,"{'colsample_bytree': 0.8, 'learning_rate': 0.0..."
4,XGBoost,0.550209,"{'colsample_bytree': 1.0, 'learning_rate': 0.0..."
0,LogisticRegression,0.548342,"{'clf__C': 0.1, 'clf__penalty': 'l2', 'clf__so..."
3,HistGradientBoosting,0.546133,"{'learning_rate': 0.1, 'max_depth': 10, 'max_l..."
2,GradientBoosting,0.529375,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti..."


In [33]:
# CV 상에서 가장 좋은 모델 선택
best_entry = max(results, key=lambda r: r["best_score_cv_f1_macro"])
best_name = best_entry["model"]
best_model = best_entry["best_estimator"]

print(f"\n=== Best by CV macro-F1: {best_name} ===")
y_pred = best_model.predict(X_test)

print("Test macro-F1 :", f1_score(y_test, y_pred, average="macro"))
print("Test precision:", precision_score(y_test, y_pred, average="macro"))
print("Test recall   :", recall_score(y_test, y_pred, average="macro"))
print("Accuracy      :", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=3))


=== Best by CV macro-F1: RandomForest ===
Test macro-F1 : 0.6028324388789505
Test precision: 0.6065841340231585
Test recall   : 0.6001446759259259
Accuracy      : 0.7032755298651252

Confusion Matrix:
 [[ 52  83]
 [ 71 313]]

Classification Report:
               precision    recall  f1-score   support

           0      0.423     0.385     0.403       135
           1      0.790     0.815     0.803       384

    accuracy                          0.703       519
   macro avg      0.607     0.600     0.603       519
weighted avg      0.695     0.703     0.699       519



In [34]:
# 중요도 추출 (상위 10개)
selected_feature = extract_feature_importance(best_model, X_train, y_train, X_test, y_test, top_k=10)
print("=== Top-10 Important Features ===")
selected_feature

=== Top-10 Important Features ===


Unnamed: 0,feature,importance
31,MDAY전체TOT_RAW61,0.153282
0,M일HAP_61,0.096235
1,MVIT,0.060341
13,Q8a06,0.038535
29,Q9_5A7,0.024581
14,Q8a07,0.024498
40,Q8_1a5,0.024094
41,Q8_1a6,0.022521
38,Q8_1a3,0.021047
37,Q8_1a2,0.020229


# 09. 결과 정리 및 저장

In [35]:
selected_df = final_df[selected_feature['feature'].tolist()].reset_index(drop=True)

In [36]:
selected_df.to_csv("../data/processed/selected_features.csv", index=False)
selected_df.to_excel("../data/processed/selected_features.xlsx", index=False)