##  선정된 피처로 모델 학습 및 평가
- 분산 데이
- 평가 지표(정확도, 정밀도, F1, 재현율)로 평가하여 어떤 모델과 어떤 피처 조합이 최적인지 테스트

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Path of the input CSV, which is then loaded into a DataFrame
file_path = "oversampled_data.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Independent variables (the selected feature columns) and the dependent
# variable (target). Column names are matched exactly as written here,
# including the leading space in each feature name.
X = data[
    [
        ' ROA(A) before interest and % after tax',
        ' Interest Expense Ratio',
        ' Net profit before tax/Paid-in capital',
        " Net Income to Stockholder's Equity",
        ' Persistent EPS in the Last Four Seasons',
        ' Total debt/Total net worth',
        ' Debt ratio %',
        ' Net Income to Total Assets',
        ' Degree of Financial Leverage (DFL)',
        ' Cash/Total Assets',
        ' Retained Earnings to Total Assets',
    ]
]
y = data['Bankrupt?']

In [4]:
# Display the selected feature matrix as the cell output
X

Unnamed: 0,ROA(A) before interest and % after tax,Interest Expense Ratio,Net profit before tax/Paid-in capital,Net Income to Stockholder's Equity,Persistent EPS in the Last Four Seasons,Total debt/Total net worth,Debt ratio %,Net Income to Total Assets,Degree of Financial Leverage (DFL),Cash/Total Assets,Retained Earnings to Total Assets
0,0.424389,0.629951,0.137757,0.827890,0.169141,0.021266,0.207576,0.716845,0.026601,0.004094,0.903225
1,0.538214,0.635172,0.168962,0.839969,0.208944,0.012502,0.171176,0.795297,0.264577,0.014948,0.931065
2,0.499019,0.629631,0.148036,0.836774,0.180581,0.021248,0.207516,0.774670,0.026555,0.000991,0.909903
3,0.451265,0.630228,0.147561,0.834697,0.193722,0.009572,0.151465,0.739555,0.026697,0.018851,0.906902
4,0.538432,0.636055,0.167461,0.839973,0.212537,0.005150,0.106509,0.795016,0.024752,0.014161,0.913850
...,...,...,...,...,...,...,...,...,...,...,...
13193,0.422698,0.629597,0.137559,0.830657,0.175798,0.017201,0.191892,0.728868,0.026592,0.023739,0.905469
13194,0.522676,0.631923,0.160716,0.838803,0.203294,0.013970,0.178502,0.785764,0.025463,0.039376,0.928041
13195,0.211932,0.630534,0.057502,0.856789,0.079995,0.021116,0.524151,0.520229,0.026769,0.024766,0.778029
13196,0.545405,0.633857,0.170132,0.840366,0.208961,0.015637,0.187162,0.798551,0.030415,0.021974,0.898060


In [5]:
# Split into training and test sets: 20% is held out for testing, and the
# seed is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=111,
)

In [6]:
# Factory for the DataFrame that collects evaluation results
def create_results_df():
    """Return an empty DataFrame holding the columns of the results table.

    Returns:
        A DataFrame with no rows and the columns 'Model', 'Dataset',
        'Accuracy', 'Precision', 'Recall' and 'F1', in that order.
    """
    result_columns = ['Model', 'Dataset', 'Accuracy', 'Precision', 'Recall', 'F1']
    return pd.DataFrame(columns=result_columns)

In [7]:
# Model evaluation helpers
def _score_split(model, model_name, dataset_name, X, y):
    """Score an already-fitted model on one dataset and return one result row.

    Args:
        model: Fitted estimator exposing ``predict``.
        model_name: Label stored in the 'Model' column.
        dataset_name: Label stored in the 'Dataset' column (e.g. 'Train').
        X: Feature matrix to predict on.
        y: True labels corresponding to ``X``.

    Returns:
        A dict keyed by the results-table column names.
    """
    y_pred = model.predict(X)
    return {
        'Model': model_name,
        'Dataset': dataset_name,
        'Accuracy': accuracy_score(y, y_pred),
        'Precision': precision_score(y, y_pred),
        'Recall': recall_score(y, y_pred),
        'F1': f1_score(y, y_pred),
    }


def evaluate_model(model, model_name, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and score it on train and test sets.

    The estimator is fitted in place, so the caller's ``model`` object is
    trained after this call returns.

    Args:
        model: Unfitted estimator exposing ``fit`` and ``predict``.
        model_name: Label stored in the 'Model' column of the result.
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.

    Returns:
        A two-row DataFrame (row 0 = 'Train', row 1 = 'Test') with the
        columns defined by ``create_results_df``: 'Model', 'Dataset',
        'Accuracy', 'Precision', 'Recall' and 'F1'.
    """
    model.fit(X_train, y_train)

    rows = [
        _score_split(model, model_name, 'Train', X_train, y_train),
        _score_split(model, model_name, 'Test', X_test, y_test),
    ]

    # Build the frame in one step instead of concatenating onto an empty
    # DataFrame: concat with empty frames is deprecated in pandas and, once the
    # deprecation is enforced, would turn the metric columns into object dtype.
    # The column order still comes from create_results_df() so the schema has a
    # single source of truth.
    return pd.DataFrame(rows, columns=create_results_df().columns)

In [8]:
# Logistic Regression
# NOTE(review): the recorded output below shows recall 1.0 with precision and
# accuracy both around 0.5, i.e. almost every sample is predicted as the
# positive class. Possibly the features need scaling; TODO confirm.
log_reg_model = LogisticRegression(max_iter=3000)
log_reg_results = evaluate_model(
    model=log_reg_model,
    model_name='LogisticRegression',
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
print(log_reg_results)

                Model Dataset  Accuracy  Precision  Recall        F1
0  LogisticRegression   Train  0.498485   0.498485     1.0  0.665318
1  LogisticRegression    Test  0.506439   0.506252     1.0  0.672201


In [9]:
# Random Forest Classifier
# NOTE(review): no random_state is passed, so the scores presumably vary
# slightly between runs, unlike the seeded train/test split. Confirm whether
# that is intended.
rf_model = RandomForestClassifier()
rf_results = evaluate_model(
    model=rf_model,
    model_name='RandomForestClassifier',
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
print(rf_results)

                    Model Dataset  Accuracy  Precision    Recall        F1
0  RandomForestClassifier   Train  1.000000   1.000000  1.000000  1.000000
1  RandomForestClassifier    Test  0.963636   0.952555  0.976796  0.964523


In [None]:
# Support Vector Classifier with a linear kernel
svc_model = SVC(kernel='linear')
svc_results = evaluate_model(
    model=svc_model,
    model_name='SVC',
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
print(svc_results)