# Prerequisite

Following packages must be installed to run codes below

```bash
$ pip3 install openpyxl 
```


In [1]:
%load_ext autoreload
%autoreload 2

# Add project path

In [2]:
import os
import sys
from pathlib import Path

PROJECT_PATH = Path().resolve().parent.parent
SRC_PATH = PROJECT_PATH / 'src'

sys.path.append(str(SRC_PATH))

In [3]:
exp_name = 'heavy_industry_beavers_model'

# Load data

In [4]:
from data_loader import SectorDataLoader

data_loader = SectorDataLoader()
data_finance = (
    data_loader
    .load_dataset()
    .pipe(data_loader.add_beaver_indicator)
    .pipe(data_loader.add_label)
)
data_finance.head()

Unnamed: 0,BIZ_NO,CMP_PFIX_NM,CMP_NM,CMP_NM1,CMP_SFIX_NM,CMP_ENM,BZ_TYP,CMP_TYP,CMP_SCL,PBCO_GB,...,STAT_OCR_DATE_y,Closed_Year,유동자산/부채총계,당기순이익(손실)/자산총계,부채총계/자산총계,순운전자본/자산총계,유동부채/유동자산,Years_From_Closed_Year_To_FS,Closed_In_1Yr,Closed_In_2Yrs
117,1018135422,(주),한국선박기술,,,Korea Marine Time Service,M,,2,2,...,20200930.0,2020.0,1.031451,0.038773,0.578094,0.543045,0.089273,2.0,0,1
219,1018154206,(주),드림미즈,,,"dreammiz Co., Ltd.",M,,2,2,...,20201231.0,2020.0,1.22004,0.001177,0.662356,0.608449,0.247063,2.0,0,1
279,1018163684,,디에프에스서울,,(주),DFS Seoul Ltd.,M,,2,2,...,20190917.0,2019.0,11.012914,0.046666,0.090794,0.909114,0.090802,1.0,1,1
339,1018178760,(주),대교디앤씨,,,"DAEGYO D & C CO.,LTD.",M,,2,2,...,20191010.0,2019.0,0.916364,-0.026128,1.091269,0.097514,0.902486,1.0,1,1
406,1018194173,,에코에너지,,(주),"Eco Energy Co.,Ltd.",M,,2,2,...,20200831.0,2020.0,1.183355,-0.062095,0.828142,0.194625,0.8014,2.0,0,1


In [5]:
from data_loader import IndexDataLoader
import pandas as pd

index_features = [
    '3YEAR TREASURY',
    '5YEAR TREASURY',
    '10YEAR TREASURY',
    '3YEAR COPORATE BOND',
    'CD 91DAYS',
    'CALL RATE 1DAY',
    'BASE RATE',
    'YEAR_MONTH',
    'DOW_JONES_COMMODITY_INDEX_PRICE',
    'DOW_JONES_COMMODITY_INDEX_CHANGE_RATE',
    'EXCHANGE_RATE_PRICE',
    'EXCHANGE_RATE_CHANGE_RATE',
]

index_data_loader = IndexDataLoader()

index_data = (
    index_data_loader.load_interest_rate()
    .merge(
        (
            index_data_loader.load_dow_jones_commodity_index()
            .rename(columns={'PRICE':'DOW_JONES_COMMODITY_INDEX_PRICE',
                            'CHANGE RATE':'DOW_JONES_COMMODITY_INDEX_CHANGE_RATE'})
        ),
        how='outer',
        left_on='YEAR_MONTH',
        right_on='YEAR_MONTH'
    )
    .merge(
        (
            index_data_loader.load_exchange_rate()
            .rename(columns={'PRICE':'EXCHANGE_RATE_PRICE',
                            'CHANGE RATE':'EXCHANGE_RATE_CHANGE_RATE'})
        ),
        how='outer',
        left_on='YEAR_MONTH',
        right_on='YEAR_MONTH'
    )
    .assign(year=lambda x : x['YEAR_MONTH'].dt.year)
    .assign(month=lambda x : x['YEAR_MONTH'].dt.month)
)

index_data_mean = index_data.groupby(['year'])[index_features].mean()
index_data_mean.columns = [f'{x}_AVERAGE' for x in index_data_mean.columns]
index_data_std = index_data.groupby(['year'])[index_features].std()
index_data_std.columns = [f'{x}_STD' for x in index_data_mean.columns]

index_feature_data = (
    pd.concat([index_data_mean, index_data_std], axis=1)
    .reset_index(drop=False)
)

In [7]:
index_features_stats = index_feature_data.drop(['year'], axis=1).columns.tolist()

In [8]:
data = (
    data_finance
    .assign(year=lambda x : x['결산년월'].astype(str).str[:4].astype(int))
    .merge(
        index_feature_data,
        how='left',
        left_on='year',
        right_on='year'
    )
)

# Baseline classifier using Beaver's features

In [9]:
from project_paths import DATA_PATH

sectors = [
    '제조업(10~34)',
    '부동산업(68)',
    '도매 및 소매업(45~47)',
    '숙박 및 음식점업(55~56)',
    '건설업(41~42)'
]

beaver_features = [
    '유동자산/부채총계',
    '당기순이익(손실)/자산총계',
    '부채총계/자산총계',
    '순운전자본/자산총계',
    '유동부채/유동자산'
]

years_to_close = [
    'Closed_In_1Yr',
    'Closed_In_2Yrs',
]

x_data_year = list(range(2018, 2021))

EXP_RESULT_PATH = DATA_PATH / 'experiment_result' / exp_name

if not os.path.exists(EXP_RESULT_PATH) : 
    os.mkdir(EXP_RESULT_PATH)

In [10]:
import numpy as np
import pandas as pd
from pycaret.classification import ClassificationExperiment
from tqdm import tqdm

In [11]:
exp_result = None

for target_years_to_close in tqdm(years_to_close) : 
    for x_data_target_year in tqdm(x_data_year, leave=True) : 
        for sector in sectors : 

            filename = EXP_RESULT_PATH / f'Beaver_baseline_{sector}_{target_years_to_close}_{x_data_target_year}.csv'

            if not os.path.exists(filename) : 

                data_to_train = (
                    data
                    .loc[~data[beaver_features[0]].isin([np.nan, np.inf, -np.inf]), :]
                    .loc[~data[beaver_features[1]].isin([np.nan, np.inf, -np.inf]), :]
                    .loc[~data[beaver_features[2]].isin([np.nan, np.inf, -np.inf]), :]
                    .loc[~data[beaver_features[3]].isin([np.nan, np.inf, -np.inf]), :]
                    .loc[~data[beaver_features[4]].isin([np.nan, np.inf, -np.inf]), :]
                    .loc[lambda x : pd.to_datetime(x['결산년월'], format='%Y%m%d').dt.year==x_data_target_year]
                    .loc[lambda x : x['대분류']==sector]
                )

                exp = ClassificationExperiment()

                exp.setup(
                    (
                        data_to_train
                        .loc[:, beaver_features+index_features_stats+[target_years_to_close]]
                        .reset_index(drop=True)
                    ),
                    target=target_years_to_close
                )

                models = exp.compare_models()

                result = (
                    exp.pull()
                    .assign(target=target_years_to_close)
                    .assign(x_data_yaer=x_data_target_year)
                    .assign(대분류=sector)
                    .reset_index(drop=False)
                    .rename(columns={'index':'model_name'})
                )
                result.to_csv(filename)

            else : 
                result = pd.read_csv(filename)

            if exp_result is None : 
                exp_result = result
            else : 
                exp_result = pd.concat([exp_result, result], axis=0)

  0%|                                                                                    | 0/2 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 40.94it/s][A

100%|████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 42.09it/s][A
100%|████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 13.12it/s]


In [12]:
aggregated_result_path = EXP_RESULT_PATH / 'result.csv'

if not os.path.exists(aggregated_result_path) :
    exp_result.to_csv(aggregated_result_path)
    
exp_result

Unnamed: 0.1,Unnamed: 0,model_name,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec),target,x_data_yaer,대분류
0,0,rf,Random Forest Classifier,0.8745,0.5738,0.1217,0.6817,0.1972,0.1619,0.2364,0.080,Closed_In_1Yr,2018,제조업(10~34)
1,1,ridge,Ridge Classifier,0.8705,0.0000,0.0062,0.1000,0.0118,0.0104,0.0235,0.006,Closed_In_1Yr,2018,제조업(10~34)
2,2,lr,Logistic Regression,0.8697,0.5194,0.0062,0.1000,0.0118,0.0089,0.0199,0.184,Closed_In_1Yr,2018,제조업(10~34)
3,3,lda,Linear Discriminant Analysis,0.8697,0.5174,0.0062,0.1000,0.0118,0.0089,0.0199,0.006,Closed_In_1Yr,2018,제조업(10~34)
4,4,dummy,Dummy Classifier,0.8697,0.5000,0.0000,0.0000,0.0000,0.0000,0.0000,0.005,Closed_In_1Yr,2018,제조업(10~34)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,5,ada,Ada Boost Classifier,1.0000,0.0000,1.0000,1.0000,1.0000,,0.0000,0.007,Closed_In_2Yrs,2020,건설업(41~42)
6,6,lda,Linear Discriminant Analysis,1.0000,0.0000,1.0000,1.0000,1.0000,,0.0000,0.006,Closed_In_2Yrs,2020,건설업(41~42)
7,7,et,Extra Trees Classifier,1.0000,0.0000,1.0000,1.0000,1.0000,,0.0000,0.047,Closed_In_2Yrs,2020,건설업(41~42)
8,8,lightgbm,Light Gradient Boosting Machine,1.0000,0.0000,1.0000,1.0000,1.0000,,0.0000,0.007,Closed_In_2Yrs,2020,건설업(41~42)
