##  선정된 피처로 모델 학습 및 평가
- Original과 SMOTE로 증간된 데이터에서 평가
- Feature Selection에서 RFE,RFECV,상관 계수 고려한 경우
- 평가지표 정확도, 정밀도, f1, 재현율으로 평가해서 어떤 모델과 어떤 피처가 최적인지 테스트

In [5]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [6]:
# 파일 경로 설정
file_path1 = "data.csv"
file_path2 = "oversampled_data.csv"

# CSV 파일을 DataFrame으로 불러오기
data1 = pd.read_csv(file_path1)
data2 = pd.read_csv(file_path2)

In [7]:
X1 = data1[[' ROA(A) before interest and % after tax', ' Interest Expense Ratio', ' Net profit before tax/Paid-in capital', 
           " Net Income to Stockholder's Equity", ' Persistent EPS in the Last Four Seasons', ' Total debt/Total net worth', 
           ' Debt ratio %', ' Net Income to Total Assets', ' Degree of Financial Leverage (DFL)', ' Cash/Total Assets', ' Retained Earnings to Total Assets']]

In [8]:
X2 = data2[[' ROA(A) before interest and % after tax', ' Interest Expense Ratio', ' Net profit before tax/Paid-in capital', 
           " Net Income to Stockholder's Equity", ' Persistent EPS in the Last Four Seasons', ' Total debt/Total net worth', 
           ' Debt ratio %', ' Net Income to Total Assets', ' Degree of Financial Leverage (DFL)', ' Cash/Total Assets', ' Retained Earnings to Total Assets']]

In [9]:
# 독립변수와 종속변수 설정
y1 = data1['Bankrupt?']
y2 = data2['Bankrupt?']

In [10]:
# 훈련 데이터셋으로 분할
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=111)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X1, y1, test_size=0.2, random_state=111)

In [11]:
# 결과를 정리할 DataFrame 생성 함수
def create_results_df():
    return pd.DataFrame(columns=['Model', 'Dataset', 'Accuracy', 'Precision', 'Recall', 'F1'])

In [12]:
# 모델 평가 함수
def evaluate_model(model, model_name, X_train, y_train, X_test, y_test):
    results = create_results_df()
    
    # 모델 학습
    model.fit(X_train, y_train)
    
    # 훈련 데이터 성능 평가
    y_train_pred = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)

    # 결과 저장 (훈련 데이터)
    train_row = pd.DataFrame({
        'Model': [model_name],
        'Dataset': ['Train'],
        'Accuracy': [train_accuracy],
        'Precision': [train_precision],
        'Recall': [train_recall],
        'F1': [train_f1],
    })
    results = pd.concat([results, train_row], ignore_index=True)
    
    # 테스트 데이터 성능 평가
    y_test_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    # 결과 저장 (테스트 데이터)
    test_row = pd.DataFrame({
        'Model': [model_name],
        'Dataset': ['Test'],
        'Accuracy': [test_accuracy],
        'Precision': [test_precision],
        'Recall': [test_recall],
        'F1': [test_f1],
    })
    results = pd.concat([results, test_row], ignore_index=True)
    
    return results

In [13]:
# Logistic Regression
log_reg_model = LogisticRegression(max_iter=3000)
log_reg_results = evaluate_model(log_reg_model, 'LogisticRegression', X_train1, y_train1, X_test1, y_test1)
print(log_reg_results)

                Model Dataset  Accuracy  Precision  Recall   F1
0  LogisticRegression   Train  0.967919        0.0     0.0  0.0
1  LogisticRegression    Test  0.967009        0.0     0.0  0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=1000,  # 기본 트리의 수
        max_depth=5,  # 트리의 최대 깊이 제한
        min_samples_split=2,  # 노드를 분할하는 데 필요한 최소 샘플 수
        min_samples_leaf=1,  # 리프 노드에 있어야 하는 최소 샘플 수
        max_features='sqrt',  # 각 분할에 사용할 피처의 최대 수
        random_state=111  # 결과 재현성을 위한 랜덤 시드
    )
rf_results = evaluate_model(rf_model, 'RandomForestClassifier', X_train1, y_train1, X_test1, y_test1)
print(rf_results)

                    Model Dataset  Accuracy  Precision    Recall        F1
0  RandomForestClassifier   Train  0.976535   0.979592  0.274286  0.428571
1  RandomForestClassifier    Test  0.967742   0.555556  0.111111  0.185185


In [None]:
# SVC
svc_model = SVC(kernel='linear')
svc_results = evaluate_model(svc_model, 'SVC', X_train1, y_train1, X_test1, y_test1)
print(svc_results)

In [17]:
# Logistic Regression
log_reg_model = LogisticRegression(max_iter=3000)
log_reg_results = evaluate_model(log_reg_model, 'LogisticRegression', X_train2, y_train2, X_test2, y_test2)
print(log_reg_results)

                Model Dataset  Accuracy  Precision  Recall   F1
0  LogisticRegression   Train  0.967919        0.0     0.0  0.0
1  LogisticRegression    Test  0.967009        0.0     0.0  0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
# Random Forest Classifier
rf_model =  RandomForestClassifier(n_estimators=1000,  # 기본 트리의 수
        max_depth=5,  # 트리의 최대 깊이 제한
        min_samples_split=2,  # 노드를 분할하는 데 필요한 최소 샘플 수
        min_samples_leaf=1,  # 리프 노드에 있어야 하는 최소 샘플 수
        max_features='sqrt',  # 각 분할에 사용할 피처의 최대 수
        random_state=111  # 결과 재현성을 위한 랜덤 시드
    )
rf_results = evaluate_model(rf_model, 'RandomForestClassifier', X_train2, y_train2, X_test2, y_test2)
print(rf_results)

                    Model Dataset  Accuracy  Precision    Recall        F1
0  RandomForestClassifier   Train  0.976535   0.979592  0.274286  0.428571
1  RandomForestClassifier    Test  0.967742   0.555556  0.111111  0.185185


In [None]:
# SVC
svc_model = SVC(kernel='linear')
svc_results = evaluate_model(svc_model, 'SVC', X_train2, y_train2, X_test2, y_test2)
print(svc_results)