In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Location of the input CSV file
file_path = "data.csv"

# Read the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Independent variables (features). The column names are reproduced exactly,
# including the leading space each one carries.
feature_columns = [
    ' ROA(A) before interest and % after tax',
    ' Interest Expense Ratio',
    ' Net profit before tax/Paid-in capital',
    " Net Income to Stockholder's Equity",
    ' Persistent EPS in the Last Four Seasons',
    ' Total debt/Total net worth',
    ' Debt ratio %',
    ' Net Income to Total Assets',
    ' Degree of Financial Leverage (DFL)',
    ' Cash/Total Assets',
    ' Retained Earnings to Total Assets',
]
X = data[feature_columns]

# Dependent variable (target)
y = data['Bankrupt?']

In [4]:
# Display the selected feature matrix as the cell's output (the rendered table is truncated to the first and last rows)
X

Unnamed: 0,ROA(A) before interest and % after tax,Interest Expense Ratio,Net profit before tax/Paid-in capital,Net Income to Stockholder's Equity,Persistent EPS in the Last Four Seasons,Total debt/Total net worth,Debt ratio %,Net Income to Total Assets,Degree of Financial Leverage (DFL),Cash/Total Assets,Retained Earnings to Total Assets
0,0.424389,0.629951,0.137757,0.827890,0.169141,0.021266,0.207576,0.716845,0.026601,0.004094,0.903225
1,0.538214,0.635172,0.168962,0.839969,0.208944,0.012502,0.171176,0.795297,0.264577,0.014948,0.931065
2,0.499019,0.629631,0.148036,0.836774,0.180581,0.021248,0.207516,0.774670,0.026555,0.000991,0.909903
3,0.451265,0.630228,0.147561,0.834697,0.193722,0.009572,0.151465,0.739555,0.026697,0.018851,0.906902
4,0.538432,0.636055,0.167461,0.839973,0.212537,0.005150,0.106509,0.795016,0.024752,0.014161,0.913850
...,...,...,...,...,...,...,...,...,...,...,...
6814,0.539468,0.631415,0.171111,0.840359,0.216602,0.006655,0.124618,0.799927,0.027064,0.099481,0.925611
6815,0.538269,0.631489,0.171805,0.840306,0.216697,0.004623,0.099253,0.799748,0.027009,0.080337,0.932629
6816,0.533744,0.630612,0.172287,0.840138,0.210929,0.001392,0.038939,0.797778,0.026791,0.412885,0.932000
6817,0.559911,0.630731,0.182498,0.841084,0.228326,0.003816,0.086979,0.811808,0.026822,0.112238,0.939613


In [5]:
# Split into training and test sets: 20% of the rows are held out for testing,
# and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=111,
)

In [None]:
# Define the candidate models.
# random_state pins the random forest's bootstrap/feature sampling so that a
# re-run of this cell reproduces the same metrics.
models = {
    'RandomForestClassifier': RandomForestClassifier(random_state=111),
    'SVC': SVC(kernel='linear')
}

# Splits every fitted model is scored on. Training happens on 'Train' only;
# 'Test' is used purely for evaluation.
eval_sets = [('Train', X_train, y_train), ('Test', X_test, y_test)]

# One record per (model, split). The results DataFrame is built once at the
# end instead of being grown with pd.concat inside the loop.
records = []

# Train and evaluate each model
for name, model in models.items():
    # Fit exactly once, on the training split. Fitting again on the test split
    # (as a per-split fit would do) makes the reported 'Test' scores training
    # scores and hides overfitting.
    model.fit(X_train, y_train)

    # The loop unpacks directly into dataset_name/X_data/y_data so that the
    # notebook-level DataFrame named `data` is not overwritten.
    for dataset_name, X_data, y_data in eval_sets:
        # Generate predictions for this split
        y_pred = model.predict(X_data)

        # Evaluate performance and store the result
        records.append({
            'Model': name,
            'Dataset': dataset_name,
            'Accuracy': accuracy_score(y_data, y_pred),
            'Precision': precision_score(y_data, y_pred),
            'Recall': recall_score(y_data, y_pred),
            'F1': f1_score(y_data, y_pred),
        })

# Summary table of the results
results = pd.DataFrame(
    records,
    columns=['Model', 'Dataset', 'Accuracy', 'Precision', 'Recall', 'F1'],
)

print(results)

In [None]:

# Define the candidate models.
# random_state pins the random forest's bootstrap/feature sampling so that a
# re-run of this cell reproduces the same metrics.
models = {
    'LogisticRegression': LogisticRegression(max_iter=3000),
    'RandomForestClassifier': RandomForestClassifier(random_state=111),
    'SVC': SVC(kernel='linear')
}

# Splits every fitted model is scored on. Training happens on 'Train' only;
# 'Test' is used purely for evaluation.
eval_sets = [('Train', X_train, y_train), ('Test', X_test, y_test)]

# One record per (model, split). The results DataFrame is built once at the
# end instead of being grown with pd.concat inside the loop.
records = []

# Train and evaluate each model
for name, model in models.items():
    # Fit exactly once, on the training split. Fitting again on the test split
    # (as a per-split fit would do) makes the reported 'Test' scores training
    # scores and hides overfitting.
    model.fit(X_train, y_train)

    # The loop unpacks directly into dataset_name/X_data/y_data so that the
    # notebook-level DataFrame named `data` is not overwritten.
    for dataset_name, X_data, y_data in eval_sets:
        # Generate predictions for this split
        y_pred = model.predict(X_data)

        # Evaluate performance and store the result
        records.append({
            'Model': name,
            'Dataset': dataset_name,
            'Accuracy': accuracy_score(y_data, y_pred),
            'Precision': precision_score(y_data, y_pred),
            'Recall': recall_score(y_data, y_pred),
            'F1': f1_score(y_data, y_pred),
        })

# Summary table of the results
results = pd.DataFrame(
    records,
    columns=['Model', 'Dataset', 'Accuracy', 'Precision', 'Recall', 'F1'],
)

print(results)