##  선정된 피처로 모델 학습 및 평가
- Feature Selection에서 분산 고려한 경우
- Original과 SMOTE로 증간된 데이터에서 평가
- 평가지표 정확도, 정밀도, f1, 재현율으로 평가해서 어떤 모델과 어떤 피처가 최적인지 테스트

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# 파일 경로 설정
file_path1 = "data.csv"
file_path2 = "oversampled_data.csv"

# CSV 파일을 DataFrame으로 불러오기
data1 = pd.read_csv(file_path1)
data2 = pd.read_csv(file_path2)

In [3]:
correlation_features = [' Net Income to Total Assets', ' ROA(A) before interest and % after tax', 
                        ' ROA(B) before interest and depreciation after tax', ' ROA(C) before interest and depreciation before interest', 
                        ' Net worth/Assets', ' Debt ratio %', ' Persistent EPS in the Last Four Seasons', 
                        ' Retained Earnings to Total Assets', ' Net profit before tax/Paid-in capital', ' Per Share Net profit before tax (Yuan ¥)']

forward_features = [' Net Income to Total Assets', ' ROA(A) before interest and % after tax', ' ROA(B) before interest and depreciation after tax', ' ROA(C) before interest and depreciation before interest', ' Net worth/Assets', ' Persistent EPS in the Last Four Seasons', ' Net profit before tax/Paid-in capital', ' Per Share Net profit before tax (Yuan ¥)']

backward_features = [' Net Income to Total Assets', ' ROA(A) before interest and % after tax', ' ROA(B) before interest and depreciation after tax', ' ROA(C) before interest and depreciation before interest', ' Net worth/Assets', ' Debt ratio %', ' Persistent EPS in the Last Four Seasons', ' Retained Earnings to Total Assets', ' Net profit before tax/Paid-in capital', ' Per Share Net profit before tax (Yuan ¥)']

stepwise_features = [' Net Income to Total Assets', ' ROA(A) before interest and % after tax', ' ROA(B) before interest and depreciation after tax', ' Net worth/Assets', ' Persistent EPS in the Last Four Seasons', ' Net profit before tax/Paid-in capital', ' Per Share Net profit before tax (Yuan ¥)']


# RFE로 선택된 Feature
rfe_features = [' Non-industry income and expenditure/revenue', ' Interest-bearing debt interest rate', ' Net Value Per Share (B)',
                ' Persistent EPS in the Last Four Seasons', ' Net Value Growth Rate', ' Quick Ratio', ' Interest Expense Ratio',
                ' Total debt/Total net worth', ' Borrowing dependency', ' Net profit before tax/Paid-in capital',
                ' Accounts Receivable Turnover', ' Cash/Total Assets', ' Working Capital/Equity', " Net Income to Stockholder's Equity",
                ' Degree of Financial Leverage (DFL)']

# RFECV로 선택된 Feature
rfecv_features = [' ROA(A) before interest and % after tax', ' After-tax net Interest Rate',
                  ' Non-industry income and expenditure/revenue', ' Continuous interest rate (after tax)', ' Cash flow rate',
                  ' Tax rate (A)', ' Net Value Per Share (B)', ' Net Value Per Share (A)', ' Net Value Per Share (C)',
                  ' Operating Profit Per Share (Yuan ¥)', ' Total Asset Growth Rate', ' Net Value Growth Rate', ' Current Ratio',
                  ' Quick Ratio', ' Total debt/Total net worth', ' Debt ratio %', ' Long-term fund suitability ratio (A)',
                  ' Operating profit/Paid-in capital', ' Total Asset Turnover', ' Accounts Receivable Turnover', ' Average Collection Days',
                  ' Inventory Turnover Rate (times)', ' Net Worth Turnover Rate (times)', ' Revenue per person', ' Operating profit per person',
                  ' Allocation rate per person', ' Current Assets/Total Assets', ' Cash/Total Assets', ' Quick Assets/Current Liability',
                  ' Operating Funds to Liability', ' Current Liabilities/Liability', ' Retained Earnings to Total Assets', ' Liability-Assets Flag',
                  ' Net Income to Total Assets', ' Total assets to GNP price', ' Gross Profit to Sales', " Net Income to Stockholder's Equity",
                  ' Liability to Equity', ' Degree of Financial Leverage (DFL)', ' Net Income Flag']

variance_feature = [' Operating Expense Rate', ' Research and development expense rate', ' Interest-bearing debt interest rate',
                   ' Revenue Per Share (Yuan ¥)', ' Total Asset Growth Rate', ' Net Value Growth Rate', ' Current Ratio',
                   ' Quick Ratio', ' Total debt/Total net worth', ' Accounts Receivable Turnover', ' Average Collection Days', ' Inventory Turnover Rate (times)',
                   ' Fixed Assets Turnover Frequency', ' Revenue per person', ' Allocation rate per person', ' Quick Assets/Current Liability', ' Cash/Current Liability',
                   ' Inventory/Current Liability', ' Long-term Liability to Current Assets', ' Current Asset Turnover Rate', ' Quick Asset Turnover Rate',
                   ' Cash Turnover Rate', ' Fixed Assets to Assets', ' Total assets to GNP price']

In [4]:
# 독립변수와 종속변수 설정
X1 = data1[correlation_features]
X2 = data1[forward_features]
X3 = data1[backward_features]
X4 = data1[stepwise_features]
X5 = data1[rfe_features]
X6 = data1[rfecv_features]
X7 = data1[variance_feature]

In [5]:
# 독립변수와 종속변수 설정
y = data1['Bankrupt?']

In [6]:
# 결과를 정리할 DataFrame 생성 함수
def create_results_df():
    return pd.DataFrame(columns=['Model', 'Dataset', 'Accuracy', 'Precision', 'Recall', 'F1'])

In [7]:
# 모델 평가 함수
def evaluate_model(model, model_name, X, y):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    results = create_results_df()
    
    # 모델 학습
    model.fit(X_train, y_train)
    
    # 훈련 데이터 성능 평가
    y_train_pred = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)

    # 결과 저장 (훈련 데이터)
    train_row = pd.DataFrame({
        'Model': [model_name],
        'Dataset': ['Train'],
        'Accuracy': [train_accuracy],
        'Precision': [train_precision],
        'Recall': [train_recall],
        'F1': [train_f1],
    })
    results = pd.concat([results, train_row], ignore_index=True)
    
    # 테스트 데이터 성능 평가
    y_test_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    # 결과 저장 (테스트 데이터)
    test_row = pd.DataFrame({
        'Model': [model_name],
        'Dataset': ['Test'],
        'Accuracy': [test_accuracy],
        'Precision': [test_precision],
        'Recall': [test_recall],
        'F1': [test_f1],
    })
    results = pd.concat([results, test_row], ignore_index=True)
    
    return results

In [43]:
# Logistic Regression

log_reg_model = LogisticRegression(max_iter=1000)

for X_data in enumerate([X1, X2, X3, X4, X5, X6, X7], 1):

log_reg_results = evaluate_model(log_reg_model, 'LogisticRegression', X_train1, y_train1, X_test1, y_test1)
print(log_reg_results)

NameError: name 'X_train1' is not defined

In [None]:
# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=1000,  # 기본 트리의 수
        max_depth=5,  # 트리의 최대 깊이 제한
        min_samples_split=2,  # 노드를 분할하는 데 필요한 최소 샘플 수
        min_samples_leaf=1,  # 리프 노드에 있어야 하는 최소 샘플 수
        max_features='sqrt',  # 각 분할에 사용할 피처의 최대 수
        random_state=111  # 결과 재현성을 위한 랜덤 시드
    )
rf_results = evaluate_model(rf_model, 'RandomForestClassifier', X_train1, y_train1, X_test1, y_test1)
print(rf_results)

In [None]:
# SVC
svc_model = SVC(kernel='linear')
svc_results = evaluate_model(svc_model, 'SVC', X_train1, y_train1, X_test1, y_test1)
print(svc_results)

In [11]:
# Logistic Regression
log_reg_model = LogisticRegression(max_iter=3000)
log_reg_results = evaluate_model(log_reg_model, 'LogisticRegression', X_train2, y_train2, X_test2, y_test2)
print(log_reg_results)

                Model Dataset  Accuracy  Precision    Recall        F1
0  LogisticRegression   Train  0.500189   0.499329  0.990120  0.663864
1  LogisticRegression    Test  0.508712   0.507480  0.990269  0.671063


In [12]:
# Random Forest Classifier
rf_model =  RandomForestClassifier(n_estimators=1000,  # 기본 트리의 수
        max_depth=5,  # 트리의 최대 깊이 제한
        min_samples_split=2,  # 노드를 분할하는 데 필요한 최소 샘플 수
        min_samples_leaf=1,  # 리프 노드에 있어야 하는 최소 샘플 수
        max_features='sqrt',  # 각 분할에 사용할 피처의 최대 수
        random_state=111  # 결과 재현성을 위한 랜덤 시드
    )
rf_results = evaluate_model(rf_model, 'RandomForestClassifier', X_train2, y_train2, X_test2, y_test2)
print(rf_results)

                    Model Dataset  Accuracy  Precision    Recall        F1
0  RandomForestClassifier   Train  0.922144   0.902483  0.946038  0.923748
1  RandomForestClassifier    Test  0.914015   0.903860  0.928892  0.916205


In [None]:
# SVC
svc_model = SVC(kernel='linear')
svc_results = evaluate_model(svc_model, 'SVC', X_train2, y_train2, X_test2, y_test2)
print(svc_results)