In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score
)
import matplotlib.pyplot as plt
import seaborn as sns

plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False
plt.figure(figsize=(10, 6))

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

In [13]:
def preprocess(file_path='data/titanic.csv'):
    """빠른 데이터 전처리"""
    df = pd.read_csv(file_path)
    
    # 필수 전처리만 수행
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna('S')
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    
    # 특성 선택
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    X = df[features]
    y = df['Survived']
    
    return X, y

In [14]:
def evaluate_model(model, X_test, y_test, model_name="model"): #model은 이미 학습된 모델이라고 가정
    # 학습된 모델을 기반으로 예측
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:,1]

    # 기본지표
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    #혼동행렬
    cm = confusion_matrix(y_test, y_pred)
    tn,fp,fn,tp = cm.ravel()

    #결과 측정
    print(f"정확도:{accuracy}")
    print(f"정밀도:{precision}")
    print(f"재현율:{recall}")
    print(f"f1:{f1}")
    print(f"ROC-AUC:{roc_auc}")
    print(f"-TN:{tn} -FP:{fp}")
    print(f"-FN:{fn} -TP:{tp}")

In [15]:
X,y = preprocess("data/titanic.csv")
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42, stratify=y)
# stratify : 원본 데이터셋의 클래스(label) 비율을 훈련(train) 데이터셋과 테스트(test) 데이터셋에서도 동일하게 유지시켜주는 기능

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
evaluate_model(rf, X_test, y_test, "RandomForest")

정확도:0.8156424581005587
정밀도:0.78125
재현율:0.7246376811594203
f1:0.7518796992481203
ROC-AUC:0.7986824769433466
-TN:96 -FP:14
-FN:19 -TP:50


In [16]:
def plot_confusion_matrix(cm, model_name):
    """혼동 행렬 시각화"""
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=['사망', '생존'], 
                yticklabels=['사망', '생존'],
                cbar_kws={'label': '예측 수'})
    plt.title(f'{model_name} - 혼동 행렬')
    plt.xlabel('예측')
    plt.ylabel('실제')
    plt.show()

In [17]:
def plot_metrics_comparison(results_dict):
    """여러 모델의 지표 비교"""
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    models = list(results_dict.keys())
    
    fig, ax = plt.subplots(figsize=(12, 6))
    
    x = np.arange(len(metrics))
    width = 0.35
    
    for i, model in enumerate(models):
        values = [results_dict[model][metric] for metric in metrics]
        ax.bar(x + i*width, values, width, label=model, alpha=0.8)
    
    ax.set_xlabel('평가 지표')
    ax.set_ylabel('점수')
    ax.set_title('모델별 성능 비교')
    ax.set_xticks(x + width/2)
    ax.set_xticklabels(['정확도', '정밀도', '재현율', 'F1', 'ROC-AUC'])
    ax.legend()
    ax.set_ylim(0, 1)
    
    # 값 표시
    for i, model in enumerate(models):
        values = [results_dict[model][metric] for metric in metrics]
        for j, v in enumerate(values):
            ax.text(j + i*width, v + 0.01, f'{v:.3f}', 
                   ha='center', va='bottom', fontsize=9)
    
    plt.tight_layout()
    plt.show()