In [8]:
# 1. 라이브러리 임포트
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path

import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    StratifiedKFold,
    train_test_split,
)
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone

try:
    from xgboost import XGBClassifier
except ImportError as exc:
    raise ImportError('xgboost 패키지가 필요합니다. pip install xgboost 로 설치해 주세요.') from exc

try:
    from lightgbm import LGBMClassifier
except ImportError as exc:
    raise ImportError('lightgbm 패키지가 필요합니다. pip install lightgbm 로 설치해 주세요.') from exc

print('라이브러리 임포트 완료')

라이브러리 임포트 완료


In [9]:
# 2. 데이터 로드 및 특징 선택
DATA_PATH = Path('../data/raw/dataset.csv')  # ✅ Raw 데이터 사용
if not DATA_PATH.exists():
    raise FileNotFoundError('dataset.csv 파일이 존재하지 않습니다. 경로를 확인하세요.')

df = pd.read_csv(DATA_PATH)
print(f'✅ 원본 데이터 형태: {df.shape}')

if 'Target' not in df.columns:
    raise KeyError('데이터셋에 Target 컬럼이 없습니다.')

df = df.copy()
removed_enrolled = df.loc[df['Target'] == 'Enrolled'].shape[0]
if removed_enrolled:
    print(f"Enrolled 라벨 {removed_enrolled}건 제거")
    df = df[df['Target'] != 'Enrolled']

TARGET_MAPPING = {'Dropout': 0, 'Graduate': 1}
df['Target'] = df['Target'].map(TARGET_MAPPING)
if df['Target'].isna().any():
    missing_labels = df.loc[df['Target'].isna(), 'Target']
    raise ValueError(f'정의되지 않은 타깃 라벨이 있습니다: {missing_labels.unique()}')

before = len(df)
df = df.drop_duplicates()
after = len(df)
print(f'중복 제거: {before - after}행 삭제 (현재 {after}행)')

missing_summary = df.isnull().sum()
print('결측치 현황:')
print(missing_summary[missing_summary > 0] if missing_summary.any() else '결측치 없음')

feature_cols = [col for col in df.columns if col != 'Target']
X = df[feature_cols]
y = df['Target']

columns_to_drop = set()

# 2-1. 식별자 및 상수 컬럼 제거
id_like = [col for col in feature_cols if 'id' in col.lower()]
constant_cols = [col for col in feature_cols if X[col].nunique(dropna=False) <= 1]
columns_to_drop.update(id_like)
columns_to_drop.update(constant_cols)

# 2-2. 타깃과 상관이 거의 없는 수치 컬럼 제거
numeric_candidates = X.select_dtypes(include=['number']).columns.tolist()
low_corr_numeric = []
for col in numeric_candidates:
    corr = X[col].corr(y)
    if pd.isna(corr) or abs(corr) < 0.02:
        low_corr_numeric.append(col)
columns_to_drop.update(low_corr_numeric)

# 2-3. 고유값 비율이 높은 범주형 컬럼 제거
categorical_candidates = X.select_dtypes(include=['object', 'category']).columns.tolist()
high_cardinality = [
    col
    for col in categorical_candidates
    if (X[col].nunique(dropna=False) / len(X)) > 0.6
]
columns_to_drop.update(high_cardinality)

if columns_to_drop:
    print(f'✅ 제거 대상 컬럼 ({len(columns_to_drop)}개):', sorted(columns_to_drop))
    X = X.drop(columns=columns_to_drop)
else:
    print('제거 대상 컬럼 없음')

feature_cols = X.columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

print(f'\n✅ 최종 특징 수: {len(feature_cols)}')
print(f'   - 수치형: {len(numeric_cols)}개')
print(f'   - 범주형: {len(categorical_cols)}개')

✅ 원본 데이터 형태: (4424, 35)
Enrolled 라벨 794건 제거
중복 제거: 0행 삭제 (현재 3630행)
결측치 현황:
결측치 없음
✅ 제거 대상 컬럼 (6개): ['Course', 'Educational special needs', "Father's qualification", 'International', 'Nacionality', 'Unemployment rate']

✅ 최종 특징 수: 28
   - 수치형: 28개
   - 범주형: 0개


In [10]:
# 3. 학습/검증 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print('데이터 분할 완료:')
print(' - X_train:', X_train.shape, '| X_test:', X_test.shape)
print(' - y_train 분포:')
print(y_train.value_counts(normalize=True).rename('ratio'))

데이터 분할 완료:
 - X_train: (2904, 28) | X_test: (726, 28)
 - y_train 분포:
Target
1    0.608471
0    0.391529
Name: ratio, dtype: float64


In [11]:
# 4. 모델 및 하이퍼파라미터 탐색 공간 정의
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ],
    remainder='drop',
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search_spaces = {
    'LogisticRegression': {
        'pipeline': Pipeline([
            ('preprocessor', preprocessor),
            ('clf', LogisticRegression(class_weight='balanced', solver='liblinear', max_iter=5000)),
        ]),
        'search': GridSearchCV,
        'param_grid': {
            'clf__C': [0.01, 0.1, 1, 10, 50],
            'clf__penalty': ['l1', 'l2'],
        },
    },
    'DecisionTree': {
        'pipeline': Pipeline([
            ('preprocessor', preprocessor),
            ('clf', DecisionTreeClassifier(random_state=42)),
        ]),
        'search': GridSearchCV,
        'param_grid': {
            'clf__max_depth': [None, 6, 12, 20],
            'clf__min_samples_split': [2, 5, 10],
            'clf__min_samples_leaf': [1, 2, 4],
            'clf__class_weight': [None, 'balanced'],
        },
    },
    'RandomForest': {
        'pipeline': Pipeline([
            ('preprocessor', preprocessor),
            ('clf', RandomForestClassifier(random_state=42, n_jobs=-1)),
        ]),
        'search': RandomizedSearchCV,
        'param_grid': {
            'clf__n_estimators': [200, 400, 600, 800],
            'clf__max_depth': [None, 12, 20, 30],
            'clf__min_samples_leaf': [1, 2, 4],
            'clf__min_samples_split': [2, 5, 10],
            'clf__max_features': ['sqrt', 'log2', 0.7],
            'clf__class_weight': [None, 'balanced_subsample'],
        },
        'n_iter': 30,
    },
    'AdaBoost': {
        'pipeline': Pipeline([
            ('preprocessor', preprocessor),
            ('clf', AdaBoostClassifier(random_state=42)),
        ]),
        'search': GridSearchCV,
        'param_grid': {
            'clf__n_estimators': [200, 400, 600, 800],
            'clf__learning_rate': [0.05, 0.1, 0.25, 0.5, 1.0],
        },
    },
    'XGBoost': {
        'pipeline': Pipeline([
            ('preprocessor', preprocessor),
            ('clf', XGBClassifier(
                objective='binary:logistic',
                eval_metric='logloss',
                tree_method='auto',
                use_label_encoder=False,
                random_state=42,
                n_jobs=-1,
            )),
        ]),
        'search': RandomizedSearchCV,
        'param_grid': {
            'clf__n_estimators': [200, 400, 600, 800],
            'clf__max_depth': [3, 5, 7, 9],
            'clf__learning_rate': [0.03, 0.05, 0.1, 0.2],
            'clf__subsample': [0.7, 0.8, 0.9, 1.0],
            'clf__colsample_bytree': [0.6, 0.8, 1.0],
            'clf__gamma': [0, 0.5, 1.0],
            'clf__reg_lambda': [1.0, 2.0, 3.0, 5.0],
        },
        'n_iter': 35,
    },
    'LightGBM': {
        'pipeline': Pipeline([
            ('preprocessor', preprocessor),
            ('clf', LGBMClassifier(objective='binary', random_state=42, n_jobs=-1)),
        ]),
        'search': RandomizedSearchCV,
        'param_grid': {
            'clf__n_estimators': [400, 600, 800, 1000],
            'clf__learning_rate': [0.03, 0.05, 0.1],
            'clf__num_leaves': [64, 128, 256],
            'clf__max_depth': [-1, 10, 20, 30],
            'clf__subsample': [0.7, 0.8, 0.9, 1.0],
            'clf__colsample_bytree': [0.6, 0.8, 1.0],
            'clf__reg_lambda': [0.0, 1.0, 3.0],
        },
        'n_iter': 35,
    },
}

print(f"탐색 대상 모델 수: {len(search_spaces)}")

탐색 대상 모델 수: 6


In [12]:
# 5. 하이퍼파라미터 튜닝
best_models = {}
tuning_records = []

for name, config in search_spaces.items():
    base_pipeline = config['pipeline']
    search_cls = config['search']
    param_grid = config['param_grid']

    print(f"\n=== {name} 모델 튜닝 중 ===")

    if search_cls is GridSearchCV:
        searcher = search_cls(
            estimator=base_pipeline,
            param_grid=param_grid,
            scoring='f1',
            cv=cv,
            n_jobs=-1,
            verbose=0,
        )
    else:
        searcher = search_cls(
            estimator=base_pipeline,
            param_distributions=param_grid,
            scoring='f1',
            cv=cv,
            n_jobs=-1,
            n_iter=config.get('n_iter', 25),
            random_state=42,
            verbose=0,
        )

    searcher.fit(X_train, y_train)
    best_models[name] = searcher.best_estimator_

    tuning_records.append(
        {
            'model': name,
            'best_cv_f1': searcher.best_score_,
            'best_params': searcher.best_params_,
        }
    )

    print(f"최적 교차검증 F1: {searcher.best_score_:.4f}")
    print('최적 하이퍼파라미터:', searcher.best_params_)


if tuning_records:
    tuning_results_df = (
        pd.DataFrame(tuning_records)
        .sort_values(by='best_cv_f1', ascending=False)
        .reset_index(drop=True)
    )
    print('\n=== 하이퍼파라미터 탐색 결과 요약 ===')
    display(tuning_results_df)
else:
    raise RuntimeError('하이퍼파라미터 탐색 결과가 비어 있습니다.')


=== LogisticRegression 모델 튜닝 중 ===
최적 교차검증 F1: 0.9217
최적 하이퍼파라미터: {'clf__C': 0.1, 'clf__penalty': 'l1'}

=== DecisionTree 모델 튜닝 중 ===
최적 교차검증 F1: 0.9217
최적 하이퍼파라미터: {'clf__C': 0.1, 'clf__penalty': 'l1'}

=== DecisionTree 모델 튜닝 중 ===
최적 교차검증 F1: 0.9032
최적 하이퍼파라미터: {'clf__class_weight': None, 'clf__max_depth': 6, 'clf__min_samples_leaf': 2, 'clf__min_samples_split': 2}

=== RandomForest 모델 튜닝 중 ===
최적 교차검증 F1: 0.9032
최적 하이퍼파라미터: {'clf__class_weight': None, 'clf__max_depth': 6, 'clf__min_samples_leaf': 2, 'clf__min_samples_split': 2}

=== RandomForest 모델 튜닝 중 ===
최적 교차검증 F1: 0.9219
최적 하이퍼파라미터: {'clf__n_estimators': 400, 'clf__min_samples_split': 2, 'clf__min_samples_leaf': 1, 'clf__max_features': 'log2', 'clf__max_depth': 30, 'clf__class_weight': None}

=== AdaBoost 모델 튜닝 중 ===
최적 교차검증 F1: 0.9219
최적 하이퍼파라미터: {'clf__n_estimators': 400, 'clf__min_samples_split': 2, 'clf__min_samples_leaf': 1, 'clf__max_features': 'log2', 'clf__max_depth': 30, 'clf__class_weight': None}

=== AdaBoost 모델 튜닝 

Unnamed: 0,model,best_cv_f1,best_params
0,XGBoost,0.928373,"{'clf__subsample': 0.8, 'clf__reg_lambda': 3.0..."
1,LightGBM,0.925592,"{'clf__subsample': 1.0, 'clf__reg_lambda': 0.0..."
2,AdaBoost,0.923119,"{'clf__learning_rate': 1.0, 'clf__n_estimators..."
3,RandomForest,0.921896,"{'clf__n_estimators': 400, 'clf__min_samples_s..."
4,LogisticRegression,0.921744,"{'clf__C': 0.1, 'clf__penalty': 'l1'}"
5,DecisionTree,0.903192,"{'clf__class_weight': None, 'clf__max_depth': ..."


In [13]:
# 6. 튜닝된 모델 평가 및 저장
def evaluate_models(model_dict):
    records = []
    reports = {}

    for name, estimator in model_dict.items():
        print(f"\n=== {name} ===")
        fitted = clone(estimator)
        fitted.fit(X_train, y_train)
        y_pred = fitted.predict(X_test)

        report = classification_report(
            y_test,
            y_pred,
            target_names=['Dropout', 'Graduate'],
            zero_division=0,
        )
        print(report)
        reports[name] = report

        precision, recall, f1, _ = precision_recall_fscore_support(
            y_test,
            y_pred,
            average='binary',
            zero_division=0,
        )

        records.append(
            {
                'model': name,
                'accuracy': accuracy_score(y_test, y_pred),
                'precision': precision,
                'recall': recall,
                'f1': f1,
            }
        )

    results_df = pd.DataFrame(records).sort_values(by='f1', ascending=False).reset_index(drop=True)
    best_model_name = results_df.loc[0, 'model']

    print('\n=== 모델 성능 요약 (F1 기준 정렬) ===')
    display(results_df)
    print(f"\n최적 모델: {best_model_name}")
    print('=== 최적 모델 classification_report ===')
    print(reports[best_model_name])

    return results_df, best_model_name

results_df, best_model_name = evaluate_models(best_models)

# ⚠️ 최종 모델 저장: Train 데이터만 사용 (데이터 누수 방지)
print('\n' + '='*70)
print('📦 최종 모델 저장 중...')
print('='*70)

best_pipeline = clone(best_models[best_model_name])
best_pipeline.fit(X_train, y_train)  # ✅ Train 데이터만 사용!

model_dir = Path('../model')
model_dir.mkdir(parents=True, exist_ok=True)
output_path = model_dir / 'model_trained.pkl'
joblib.dump(best_pipeline, output_path)

print(f'\n✅ 최적 파이프라인을 {output_path}에 저장했습니다.')
print(f'   - 모델: {best_model_name}')
print(f'   - 학습 데이터: X_train {X_train.shape}, y_train {y_train.shape}')
print(f'   - 테스트 데이터: X_test {X_test.shape}, y_test {y_test.shape}')
print(f'\n⚠️ 중요: 이 모델은 Train 데이터로만 학습되었으므로,')
print(f'   Test 데이터로 평가 시 실제 일반화 성능을 확인할 수 있습니다!')
print('='*70)


=== LogisticRegression ===
              precision    recall  f1-score   support

     Dropout       0.89      0.86      0.87       284
    Graduate       0.91      0.93      0.92       442

    accuracy                           0.90       726
   macro avg       0.90      0.89      0.90       726
weighted avg       0.90      0.90      0.90       726


=== DecisionTree ===
              precision    recall  f1-score   support

     Dropout       0.93      0.81      0.86       284
    Graduate       0.89      0.96      0.92       442

    accuracy                           0.90       726
   macro avg       0.91      0.88      0.89       726
weighted avg       0.90      0.90      0.90       726


=== RandomForest ===
              precision    recall  f1-score   support

     Dropout       0.89      0.86      0.87       284
    Graduate       0.91      0.93      0.92       442

    accuracy                           0.90       726
   macro avg       0.90      0.89      0.90       726
we

Unnamed: 0,model,accuracy,precision,recall,f1
0,RandomForest,0.914601,0.89749,0.970588,0.932609
1,XGBoost,0.911846,0.900424,0.961538,0.929978
2,AdaBoost,0.904959,0.892632,0.959276,0.924755
3,LightGBM,0.903581,0.892405,0.957014,0.923581
4,DecisionTree,0.900826,0.885417,0.961538,0.921909
5,LogisticRegression,0.900826,0.909292,0.929864,0.919463



최적 모델: RandomForest
=== 최적 모델 classification_report ===
              precision    recall  f1-score   support

     Dropout       0.95      0.83      0.88       284
    Graduate       0.90      0.97      0.93       442

    accuracy                           0.91       726
   macro avg       0.92      0.90      0.91       726
weighted avg       0.92      0.91      0.91       726


📦 최종 모델 저장 중...

✅ 최적 파이프라인을 ..\model\model_trained.pkl에 저장했습니다.
   - 모델: RandomForest
   - 학습 데이터: X_train (2904, 28), y_train (2904,)
   - 테스트 데이터: X_test (726, 28), y_test (726,)

⚠️ 중요: 이 모델은 Train 데이터로만 학습되었으므로,
   Test 데이터로 평가 시 실제 일반화 성능을 확인할 수 있습니다!

✅ 최적 파이프라인을 ..\model\model_trained.pkl에 저장했습니다.
   - 모델: RandomForest
   - 학습 데이터: X_train (2904, 28), y_train (2904,)
   - 테스트 데이터: X_test (726, 28), y_test (726,)

⚠️ 중요: 이 모델은 Train 데이터로만 학습되었으므로,
   Test 데이터로 평가 시 실제 일반화 성능을 확인할 수 있습니다!
