In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

# 1. Load the training and test datasets from CSV
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/train.csv", "./data/test.csv")
)

In [None]:
# 2. Missing-value imputation
# Fill values are computed once, from the training set only, and the same
# values are applied to both frames. Imputing the test frame with its own
# mean/median/mode would preprocess it differently from the data the model
# is trained on.
fill_values = {
    # Categorical columns: missing entries become their own 'Unknown' category.
    'Gender': 'Unknown',
    'Married': 'Unknown',
    'Self_Employed': 'Unknown',
    # Numeric columns: statistics of the training data.
    'LoanAmount': train_df['LoanAmount'].mean(),
    'Loan_Amount_Term': train_df['Loan_Amount_Term'].median(),
    'Credit_History': train_df['Credit_History'].mode()[0],
}

# Rebind instead of mutating in place; columns not listed above are untouched.
train_df = train_df.fillna(fill_values)
test_df = test_df.fillna(fill_values)


In [None]:
# 3. Label-encode the categorical columns
cat_cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']

# One fitted encoder is kept per column so that every mapping stays available
# afterwards (e.g. for inverse_transform or for encoding new data). Re-fitting
# a single shared encoder would discard all mappings except the last column's.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    # Fit on the training data only, then reuse the same mapping for test.
    # transform() raises ValueError if test contains a category that was
    # never seen in train.
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    label_encoders[col] = le

In [None]:
# 4. Encode the target variable ('Y' → 1, 'N' → 0)
# NOTE: values that are not keys of the mapping become NaN, so running this
# cell a second time on the already-encoded column would blank it out.
status_to_int = {'Y': 1, 'N': 0}
train_df['Loan_Status'] = train_df['Loan_Status'].map(status_to_int)

In [None]:
# 5. Define the feature columns and the label
# The order of this list fixes the column order of X and of everything
# derived from it further down.
features = [
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
    'Property_Area',
]

# Feature matrix and target vector, both taken from the training frame.
X = train_df.loc[:, features]
y = train_df.loc[:, 'Loan_Status']


In [None]:
# 6. Train/validation split (stratify keeps the class ratio in both parts)
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = split_parts


In [None]:
# 7. Feature scaling (needed for LogisticRegression)
# The scaler learns its statistics from the training split only; the
# validation split is transformed with those same statistics.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [None]:
# Convert the scaled arrays back to DataFrames (used for the MLflow
# input_example). The row index of the matching label Series is reattached:
# without it the frames would get a fresh 0..n-1 index while y_train / y_val
# keep their original row labels, and any index-aligned pandas operation
# between features and labels would pair up the wrong rows.
X_train = pd.DataFrame(X_train, columns=features, index=y_train.index)
X_val = pd.DataFrame(X_val, columns=features, index=y_val.index)


In [None]:
# 8. MLflow experiment setup
# Point the client at the tracking server and select the experiment that
# subsequent runs are recorded under.
tracking_uri = "http://127.0.0.1:5000"
experiment_name = "Loan_Prediction_Experiment"

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

In [None]:
# 9. Models to try: name -> estimator class and list of hyperparameter sets.
# Each entry of "params_list" is passed as keyword arguments to the class
# and produces one run in the experiment loop.
models = {
    "RandomForest": {
        "class": RandomForestClassifier,
        # random_state is pinned so the bootstrap samples and feature
        # subsets are the same on every execution; without it the logged
        # metrics change from run to run.
        "params_list": [
            {"max_depth": 5, "n_estimators": 100, "random_state": 42},
            {"max_depth": 7, "n_estimators": 300, "random_state": 42},
        ]
    },
    "LogisticRegression": {
        "class": LogisticRegression,
        "params_list": [
            {"C": 1.0, "max_iter": 100},
            {"C": 0.5, "max_iter": 300},
        ]
    }
}


In [None]:
# 10. Experiment loop: one MLflow run per (model, hyperparameter set)
for model_name, model_info in models.items():
    ModelClass = model_info["class"]
    for params in model_info["params_list"]:
        with mlflow.start_run():
            # Train the model
            model = ModelClass(**params)
            model.fit(X_train, y_train)

            # Predict on the validation split and evaluate
            preds = model.predict(X_val)
            acc = accuracy_score(y_val, preds)
            f1 = f1_score(y_val, preds)

            # ROC AUC needs class probabilities. Only estimators without
            # predict_proba are skipped; any other failure is allowed to
            # surface instead of being silently swallowed.
            if hasattr(model, "predict_proba"):
                proba = model.predict_proba(X_val)[:, 1]
                auc = roc_auc_score(y_val, proba)
            else:
                auc = None

            # Record the results of this run in MLflow
            mlflow.set_tag("model_name", model_name)
            mlflow.log_params(params)
            mlflow.log_metric("accuracy", acc)
            mlflow.log_metric("f1_score", f1)
            if auc is not None:
                mlflow.log_metric("roc_auc", auc)

            # Store the model together with its signature and an input example
            signature = infer_signature(X_train, preds)
            mlflow.sklearn.log_model(
                model,
                artifact_path="model",
                signature=signature,
                input_example=X_train.iloc[:1]
            )

            # Compare against None explicitly: a legitimate AUC of 0.0 is
            # falsy and would otherwise be reported as 'N/A'.
            print(f"[{model_name}] Params={params} → ACC={acc:.4f}, F1={f1:.4f}, AUC={auc if auc is not None else 'N/A'}")
