In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix,roc_auc_score
import warnings
warnings.filterwarnings("ignore")


from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score,confusion_matrix,ConfusionMatrixDisplay,roc_curve,roc_auc_score,precision_recall_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Binarizer
from sklearn.model_selection import cross_val_score,GridSearchCV

In [2]:
def model_basic(x_train, y_train, x_test, y_test):
    models = [
        LogisticRegression(),
        DecisionTreeClassifier(),
        SVC(probability=True),
        RandomForestClassifier(),
        XGBClassifier(),
        KNeighborsClassifier(),
        LGBMClassifier()
    ]

    rdict = {'model': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1_score': [], 'auc_score': []}

    for clf in models:
        clf = clf.fit(x_train, y_train)
        pred = clf.predict(x_test)
        pred_prob_rf = clf.predict_proba(x_test)[:, 1].reshape(-1, 1)
        auc_score = roc_auc_score(y_test, pred_prob_rf)
        results = (
            round(accuracy_score(y_test, pred), 4),
            round(precision_score(y_test, pred), 4),
            round(recall_score(y_test, pred), 4),
            round(f1_score(y_test, pred), 4),
            round(auc_score, 4)
        )

        rdict['model'].append(clf)
        rdict['accuracy'].append(results[0])
        rdict['precision'].append(results[1])
        rdict['recall'].append(results[2])
        rdict['f1_score'].append(results[3])
        rdict['auc_score'].append(results[4])

        confusion = confusion_matrix(y_test, pred)
        print(confusion)

    rdf = pd.DataFrame(data=rdict)
    return rdf


In [3]:
def eval(test,pred):
    acc = accuracy_score(test,pred)
    f1 = f1_score(test,pred)
    precision = precision_score(test,pred)
    recall = recall_score(test,pred)
    print('##############\n',confusion_matrix(test,pred),
    "\n############\n",f'acc_score: {acc}\n f1_score: {f1} \n precision: {precision} \n recall: {recall}')


In [4]:
train = pd.read_csv('train.csv',index_col=0)
test = pd.read_csv('test.csv',index_col=0)

map = {'보통주식비율':'대주주지분율'}
train.rename(columns = map,inplace=True)
test.rename(columns = map,inplace=True)


list_int = ['대주주지분변화분', '외국인지분분변화', '자산', '비유동자산증가율', '유동자산증가율', '자기자본증가율',
       '정상영업이익증가율','매출액순이익률', '총자본순이익률', '자기자본순이익률',
       '매출액증가율', '광고선전비대매출액비율', '비유동비율', '당좌비율', '현금비율', '부채비율',
       '이자보상배율(이자비용)', '유보액대납입자본배율', '투자집중도', '1인년간평균급여(직원)', '토빈Q', 'WW지수',
       'RDS', '업력', '산업평균_총자산증가율차분', '대주주지분율', '외국인_주식분포비율']
x_train = train[list_int]
y_train = train[['target']]

x_test = test[list_int]
y_test = test[['target']]



col_int = ['유동자산증가율', '자기자본증가율', '산업평균_총자산증가율차분',
       '정상영업이익증가율', '매출액순이익률', '자기자본순이익률','투자집중도',
         '유보액대납입자본배율', '1인년간평균급여(직원)', '토빈Q',
         '업력', '대주주지분율', '외국인_주식분포비율']

x_train = x_train[col_int]
y_train = y_train[['target']]

x_test = x_test[col_int]
y_test = y_test[['target']]

In [33]:
print(x_train.shape,x_test.shape)

(8368, 13) (1539, 13)


In [34]:
model_basic(x_train,y_train,x_test,y_test)

[[729 197]
 [345 268]]
[[625 301]
 [331 282]]
[[859  67]
 [477 136]]
[[795 131]
 [361 252]]
[[724 202]
 [316 297]]
[[720 206]
 [338 275]]
[[751 175]
 [290 323]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.6478,0.5763,0.4372,0.4972,0.7005
1,DecisionTreeClassifier(),0.5893,0.4837,0.46,0.4716,0.5675
2,SVC(probability=True),0.6465,0.67,0.2219,0.3333,0.6795
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.6803,0.658,0.4111,0.506,0.7508
4,"XGBClassifier(base_score=None, booster=None, c...",0.6634,0.5952,0.4845,0.5342,0.7181
5,KNeighborsClassifier(),0.6465,0.5717,0.4486,0.5027,0.6616
6,LGBMClassifier(),0.6979,0.6486,0.5269,0.5815,0.7452


## GLM

In [6]:
map = {'1인년간평균급여(직원)' : '직원년간평균급여',}
train.rename(columns = map,inplace=True)

In [7]:
train.columns

Index(['대주주지분변화분', '외국인지분분변화', '자산', '비유동자산증가율', '유동자산증가율', '자기자본증가율',
       '정상영업이익증가율', '순이익증가율', '총포괄이익증가율', '매출액순이익률', '총자본순이익률', '자기자본순이익률',
       '매출액증가율', '광고선전비대매출액비율', '비유동비율', '당좌비율', '현금비율', '부채비율',
       '이자보상배율(이자비용)', '유보액대납입자본배율', '투자집중도', '직원년간평균급여', '토빈Q', 'WW지수', 'RDS',
       '업력', '산업평균_총자산증가율차분', '대주주지분율', '외국인_주식분포비율', '도입기', '성숙기', '성장기',
       '쇠퇴기', 'target'],
      dtype='object')

In [8]:
formula = 'target ~ 유동자산증가율+자기자본증가율+투자집중도+정상영업이익증가율+매출액순이익률+자기자본순이익률+산업평균_총자산증가율차분+유보액대납입자본배율+직원년간평균급여+토빈Q+업력+대주주지분율+외국인_주식분포비율'

In [9]:
import statsmodels.formula.api as smf
import statsmodels.api as sm
model = smf.glm(formula, train,family=sm.families.Binomial()).fit()

In [10]:
model.summary()

0,1,2,3
Dep. Variable:,target,No. Observations:,8368.0
Model:,GLM,Df Residuals:,8354.0
Model Family:,Binomial,Df Model:,13.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-4469.1
Date:,"Sat, 25 Mar 2023",Deviance:,8938.2
Time:,17:33:11,Pearson chi2:,8100.0
No. Iterations:,5,Pseudo R-squ. (CS):,0.1255
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-3.5699,0.128,-27.961,0.000,-3.820,-3.320
유동자산증가율,0.3190,0.153,2.078,0.038,0.018,0.620
자기자본증가율,-0.6746,0.180,-3.749,0.000,-1.027,-0.322
투자집중도,-0.4658,0.132,-3.534,0.000,-0.724,-0.207
정상영업이익증가율,1.1359,0.114,9.984,0.000,0.913,1.359
매출액순이익률,0.7786,0.211,3.690,0.000,0.365,1.192
자기자본순이익률,2.4406,0.243,10.045,0.000,1.964,2.917
산업평균_총자산증가율차분,0.2114,0.168,1.259,0.208,-0.118,0.540
유보액대납입자본배율,0.6248,0.104,6.016,0.000,0.421,0.828
