## 01. 라이브러리 로드

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import VotingClassifier, StackingClassifier, RandomForestClassifier
import lightgbm as lgb
from collections import Counter
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.svm import SVC
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)
# 경고 메시지 무시
import warnings
warnings.filterwarnings('ignore')

## 02. 데이터 로드

In [17]:
# Load the modelling table; '거래소코드' is read as text instead of being parsed as a number.
df = pd.read_excel(
    "data/변수중요도.xlsx",
    dtype={"거래소코드": str},
)

## 03. 제외할 컬럼 및 독립변수 리스트 생성

In [18]:
# Identifier columns and the label are excluded; every other column is a model input.
exclude_cols = ['회사명', '거래소코드', '회계년도', 'target_class']

col_list = list(df.columns)
feature_columns = [name for name in col_list if name not in exclude_cols]

print("사용될 피처 개수:", len(feature_columns))

사용될 피처 개수: 10


## 04. 슬라이딩 윈도우 - create_grouped_sliding_windows 함수 이용
- 회사별로 슬라이딩 윈도우 생성 - 슬라이딩 크기 : 3
- 윈도우의 마지막 회계년도 반환

In [19]:
def create_grouped_sliding_windows(df, feature_cols, target_col, window_size):
    """
    Build per-company sliding-window samples from a long-format table.

    Rows are grouped by the '회사명' column and ordered by '회계년도' inside each
    group. Every run of ``window_size`` adjacent rows becomes one sample whose
    feature rows are flattened, oldest row first, into a single vector. The
    label, year and company of a sample are taken from the last row of its
    window. Adjacency is positional after sorting; gaps between years are not
    checked. Groups with fewer than ``window_size`` rows yield no samples.

    Args:
        df: DataFrame containing '회사명', '회계년도', ``feature_cols`` and
            ``target_col``.
        feature_cols: List of column names used as features.
        target_col: Name of the label column.
        window_size: Number of rows per window; must be at least 1.

    Returns:
        Tuple ``(X, y, years, companies)`` of NumPy arrays with one entry per
        window. ``X`` is always two-dimensional with
        ``window_size * len(feature_cols)`` columns, including the case where
        no window could be built.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    # A non-positive size would index labels with negative offsets and
    # silently return wrong samples, so it is rejected up front.
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    X_windowed_list = []
    y_windowed_list = []
    window_years = []
    window_companies = []

    for name, group in df.groupby('회사명'):
        group = group.sort_values('회계년도')
        group_features = group[feature_cols].values
        group_target = group[target_col].values
        group_years = group['회계년도'].values

        # range() is empty when the group is shorter than the window,
        # so short groups are skipped without an explicit length check.
        for start in range(len(group) - window_size + 1):
            last = start + window_size - 1

            X_windowed_list.append(group_features[start : last + 1].flatten())
            y_windowed_list.append(group_target[last])
            window_years.append(group_years[last])
            window_companies.append(name)

    if X_windowed_list:
        X_windowed = np.array(X_windowed_list)
    else:
        # Keep the feature matrix two-dimensional even without any window so
        # that downstream shape-dependent code sees the expected column count.
        X_windowed = np.empty((0, window_size * len(feature_cols)))

    return (
        X_windowed,
        np.array(y_windowed_list),
        np.array(window_years),
        np.array(window_companies)
    )

WINDOW_SIZE = 3

# One flattened sample per company and window position; labels and years come from each window's last row.
X_windowed, y_windowed, window_years, window_companies = create_grouped_sliding_windows(
    df=df,
    feature_cols=feature_columns,
    target_col='target_class',
    window_size=WINDOW_SIZE,
)

print(f"\n--- {WINDOW_SIZE}년 회사별 슬라이딩 윈도우 적용 후 데이터 형태 ---")
print("X_windowed shape:", X_windowed.shape)
print("y_windowed shape:", y_windowed.shape)


--- 3년 회사별 슬라이딩 윈도우 적용 후 데이터 형태 ---
X_windowed shape: (13814, 30)
y_windowed shape: (13814,)


## 05. train, test 분할
- train : 2013 ~ 2019
- test : 2020 ~ 2023

In [20]:
split_year = 2019

# Windows whose last fiscal year is at most split_year form the training set.
X_train, y_train = (
    values[window_years <= split_year] for values in (X_windowed, y_windowed)
)

# Windows ending after split_year are held out for evaluation.
X_test, y_test = (
    values[window_years > split_year] for values in (X_windowed, y_windowed)
)

## 06. 표준화
- RobustScaler 사용

In [21]:
# Scaling: the scaler is fitted on the training windows only and then reused for the test windows.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 07. 불균형데이터 처리 (SMOTE)
- borderline-2 사용

In [22]:
# Borderline-SMOTE (kind 'borderline-2') oversampling of the scaled training set
min_class_samples = min(Counter(y_train).values())
# k_neighbors is min_class_samples - 1 clamped to the range 1..5.
safe_k = max(1, min(5, min_class_samples - 1))

borderline_smote2 = BorderlineSMOTE(
    kind='borderline-2',
    k_neighbors=safe_k,
    m_neighbors=max(safe_k, 5),
    random_state=42,
)

print("원본데이터 : ", Counter(y_train))
X_train_resampled, y_train_resampled = borderline_smote2.fit_resample(
    X_train_scaled,
    y_train,
)
print("SMOTE 적용 이후 데이터:", Counter(y_train_resampled))

원본데이터 :  Counter({0: 6069, 2: 989, 3: 780, 1: 303})
SMOTE 적용 이후 데이터: Counter({0: 6069, 2: 6069, 1: 6069, 3: 6069})


## 08. 모델링 - 각각 혼동행렬 확인
- Logistic Regression
- Random Forest
- XGBoost
- LightGBM
- Voting(Soft)
- Stacking

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

# Logistic regression built with the selected hyperparameters and fitted on the oversampled training set
lr_model = LogisticRegression(
    C=0.01,
    penalty='l2',
    class_weight='balanced',
    max_iter=5000,
    n_jobs=-1,
    random_state=42,
).fit(X_train_resampled, y_train_resampled)

# Class predictions and class probabilities for the held-out windows
y_pred = lr_model.predict(X_test_scaled)
y_proba = lr_model.predict_proba(X_test_scaled)

# Evaluation metrics
lr_confusion_matrix = confusion_matrix(y_test, y_pred)
lr_accuracy = accuracy_score(y_test, y_pred)
# The three macro-averaged scores share the same call signature.
lr_precision, lr_recall, lr_macro_f1 = (
    metric(y_test, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)
lr_weighted_f1 = f1_score(y_test, y_pred, average='weighted')
lr_auc_roc = roc_auc_score(y_test, y_proba, average='macro', multi_class='ovr')

# Report
print(lr_confusion_matrix)
print(f"Accuracy: {lr_accuracy:.4f}")
print(f"Precision (macro): {lr_precision:.4f}")
print(f"Recall (macro): {lr_recall:.4f}")
print(f"Macro-F1: {lr_macro_f1:.4f}")
print(f"Weighted-F1: {lr_weighted_f1:.4f}")
print(f"AUC-ROC (macro): {lr_auc_roc:.4f}")

[[2871  508  533  108]
 [  50  105   29   63]
 [  69  143  382  119]
 [   7  162  102  422]]


#### Random Forest

In [None]:
# Random forest built with the selected hyperparameters and fitted on the oversampled training set
rf_model = RandomForestClassifier(
    n_estimators=700,
    max_depth=10,
    max_features='log2',
    max_leaf_nodes=None,
    max_samples=1.0,
    min_samples_split=15,
    min_samples_leaf=4,
    min_weight_fraction_leaf=0.01,
    min_impurity_decrease=0.0001,
    ccp_alpha=0.0,
    bootstrap=True,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
).fit(X_train_resampled, y_train_resampled)

# Class predictions and class probabilities for the held-out windows
y_pred = rf_model.predict(X_test_scaled)
y_proba = rf_model.predict_proba(X_test_scaled)

# Evaluation metrics
rf_confusion_matrix = confusion_matrix(y_test, y_pred)
rf_accuracy = accuracy_score(y_test, y_pred)
# The three macro-averaged scores share the same call signature.
rf_precision, rf_recall, rf_macro_f1 = (
    metric(y_test, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)
rf_weighted_f1 = f1_score(y_test, y_pred, average='weighted')
rf_auc_roc = roc_auc_score(y_test, y_proba, average='macro', multi_class='ovr')

# Report
print(rf_confusion_matrix)
print(f"Accuracy: {rf_accuracy:.4f}")
print(f"Precision (macro): {rf_precision:.4f}")
print(f"Recall (macro): {rf_recall:.4f}")
print(f"Macro-F1: {rf_macro_f1:.4f}")
print(f"Weighted-F1: {rf_weighted_f1:.4f}")
print(f"AUC-ROC (macro): {rf_auc_roc:.4f}")

[[3108  420  415   77]
 [   1  187   12   47]
 [  46   46  474  147]
 [   0   62   16  615]]


#### XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

# XGBoost built with the selected hyperparameters and fitted on the oversampled training set.
# NOTE(review): scale_pos_weight is described for binary problems; with the multi-class
# target used here it presumably has no effect — confirm before relying on it.
xgb_model = XGBClassifier(
    booster='gbtree',
    n_estimators=800,
    learning_rate=0.22999999999999998,
    max_depth=8,
    max_delta_step=1,
    min_child_weight=5,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.8,
    colsample_bylevel=0.9,
    reg_alpha=0.1,
    reg_lambda=8,
    scale_pos_weight=4,
    n_jobs=-1,
    random_state=42,
).fit(X_train_resampled, y_train_resampled)

# Class predictions and class probabilities for the held-out windows
y_pred = xgb_model.predict(X_test_scaled)
y_proba = xgb_model.predict_proba(X_test_scaled)

# Evaluation metrics
xgb_confusion_matrix = confusion_matrix(y_test, y_pred)
xgb_accuracy = accuracy_score(y_test, y_pred)
# The three macro-averaged scores share the same call signature.
xgb_precision, xgb_recall, xgb_macro_f1 = (
    metric(y_test, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)
xgb_weighted_f1 = f1_score(y_test, y_pred, average='weighted')
xgb_auc_roc = roc_auc_score(y_test, y_proba, average='macro', multi_class='ovr')

# Report
print(xgb_confusion_matrix)
print(f"Accuracy: {xgb_accuracy:.4f}")
print(f"Precision (macro): {xgb_precision:.4f}")
print(f"Recall (macro): {xgb_recall:.4f}")
print(f"Macro-F1: {xgb_macro_f1:.4f}")
print(f"Weighted-F1: {xgb_weighted_f1:.4f}")
print(f"AUC-ROC (macro): {xgb_auc_roc:.4f}")

[[3619   88  253   60]
 [ 133   53   12   49]
 [ 145   13  436  119]
 [  26   25  105  537]]


#### LightGBM

In [None]:
# LightGBM built with the selected hyperparameters and fitted on the oversampled training set
lgb_model = LGBMClassifier(
    n_estimators=200,
    learning_rate=0.1,
    num_leaves=63,
    max_depth=-1,
    min_child_samples=50,
    subsample=0.8,
    colsample_bytree=1.0,
    reg_alpha=0.1,
    reg_lambda=0.1,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
    verbose=-1,
).fit(X_train_resampled, y_train_resampled)

# Class predictions and class probabilities for the held-out windows
y_pred = lgb_model.predict(X_test_scaled)
y_proba = lgb_model.predict_proba(X_test_scaled)

# Evaluation metrics
lgb_confusion_matrix = confusion_matrix(y_test, y_pred)
lgb_accuracy = accuracy_score(y_test, y_pred)
# The three macro-averaged scores share the same call signature.
lgb_precision, lgb_recall, lgb_macro_f1 = (
    metric(y_test, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)
lgb_weighted_f1 = f1_score(y_test, y_pred, average='weighted')
lgb_auc_roc = roc_auc_score(y_test, y_proba, average='macro', multi_class='ovr')

# Report
print(lgb_confusion_matrix)
print(f"Accuracy: {lgb_accuracy:.4f}")
print(f"Precision (macro): {lgb_precision:.4f}")
print(f"Recall (macro): {lgb_recall:.4f}")
print(f"Macro-F1: {lgb_macro_f1:.4f}")
print(f"Weighted-F1: {lgb_weighted_f1:.4f}")
print(f"AUC-ROC (macro): {lgb_auc_roc:.4f}")

[[3607   73  276   64]
 [ 134   53   13   47]
 [ 116   12  460  125]
 [  29   18   88  558]]


#### Voting(Soft)

In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

# XGBoost base learner for the soft-voting ensemble
xgb_base_model = XGBClassifier(
    objective='multi:softprob',
    colsample_bytree=0.7,
    eval_metric='mlogloss',
    learning_rate=0.08,
    max_depth=7,
    n_estimators=300,
    n_jobs=-1,
    random_state=42,
    reg_alpha=0.1,
    subsample=0.9,
    verbosity=0
)

# LightGBM base learner for the soft-voting ensemble.
# colsample_bytree is an alias of feature_fraction and subsample is an alias of
# bagging_fraction. Each pair must be given only once: when both members are
# passed, LightGBM keeps the primary name (feature_fraction / bagging_fraction)
# and ignores the other, so a second, different value is silently dropped.
# The values below are therefore the ones that actually take effect.
# NOTE(review): subsample_freq is left at its default, so row subsampling is
# presumably inactive regardless of the subsample value — confirm.
lgb_base_model = LGBMClassifier(
    boosting_type='gbdt',
    colsample_bytree=0.8,
    learning_rate=0.1,
    max_depth=8,
    min_child_samples=20,
    n_estimators=200,
    n_jobs=-1,
    num_leaves=50,
    objective='multiclass',
    random_state=42,
    reg_alpha=0.0,
    reg_lambda=0.0,
    subsample=0.85,
    metric='multi_logloss',
    verbosity=-1
)

# Soft-voting ensemble over the two base learners
voting_model = VotingClassifier(
    estimators=[('xgb', xgb_base_model), ('lgb', lgb_base_model)],
    voting='soft'
)

# Fit on the oversampled training set
voting_model.fit(X_train_resampled, y_train_resampled)

# Class predictions and class probabilities for the held-out windows
y_pred = voting_model.predict(X_test_scaled)
y_proba = voting_model.predict_proba(X_test_scaled)

# Evaluation metrics
voting_confusion_matrix = confusion_matrix(y_test, y_pred)
voting_accuracy = accuracy_score(y_test, y_pred)
voting_precision = precision_score(y_test, y_pred, average='macro')
voting_recall = recall_score(y_test, y_pred, average='macro')
voting_macro_f1 = f1_score(y_test, y_pred, average='macro')
voting_weighted_f1 = f1_score(y_test, y_pred, average='weighted')
voting_auc_roc = roc_auc_score(y_test, y_proba, multi_class='ovr', average='macro')

# Report
print(voting_confusion_matrix)
print(f"Accuracy: {voting_accuracy:.4f}")
print(f"Precision (macro): {voting_precision:.4f}")
print(f"Recall (macro): {voting_recall:.4f}")
print(f"Macro-F1: {voting_macro_f1:.4f}")
print(f"Weighted-F1: {voting_weighted_f1:.4f}")
print(f"AUC-ROC (macro): {voting_auc_roc:.4f}")

[[3590   86  283   61]
 [ 137   47   14   49]
 [ 121   14  456  122]
 [  23   24   90  556]]


#### Stacking

In [None]:
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

# Base learners of the stacking ensemble
rf_base_model2 = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=8,
    max_features='sqrt',
    max_leaf_nodes=None,
    max_samples=None,
    min_samples_split=10,
    min_samples_leaf=4,
    min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.0,
    ccp_alpha=0.0,
    bootstrap=True,
    oob_score=False,
    warm_start=False,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
    verbose=0,
)

xgb_base_model2 = XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    n_estimators=100,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=5,
    random_state=42,
    verbosity=0,
)

lgb_base_model2 = LGBMClassifier(
    objective='multiclass',
    n_estimators=100,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=5,
    random_state=42,
    verbosity=-1,
)

# Meta learner that combines the base learners' outputs
meta_model = RidgeClassifier(random_state=42)

# Stacking ensemble; passthrough=True also hands the original features to the meta learner
base_estimators = [
    ('rf', rf_base_model2),
    ('xgb', xgb_base_model2),
    ('lgb', lgb_base_model2),
]
stacking_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_model,
    cv=5,
    passthrough=True,
    n_jobs=-1,
).fit(X_train_resampled, y_train_resampled)

# Class predictions for the held-out windows
y_pred = stacking_model.predict(X_test_scaled)

# Evaluation metrics
stacking_confusion_matrix = confusion_matrix(y_test, y_pred)
stacking_accuracy = accuracy_score(y_test, y_pred)
# The three macro-averaged scores share the same call signature.
stacking_precision, stacking_recall, stacking_macro_f1 = (
    metric(y_test, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)
stacking_weighted_f1 = f1_score(y_test, y_pred, average='weighted')
stacking_auc_roc = None  # the ridge meta learner provides no class probabilities

# Report
print(stacking_confusion_matrix)
print(f"Accuracy: {stacking_accuracy:.4f}")
print(f"Precision (macro): {stacking_precision:.4f}")
print(f"Recall (macro): {stacking_recall:.4f}")
print(f"Macro-F1: {stacking_macro_f1:.4f}")
print(f"Weighted-F1: {stacking_weighted_f1:.4f}")

[[3350  201  395   74]
 [  64  103   22   58]
 [  47   20  502  144]
 [   7   32   73  581]]


## 09. 모델링 결과 확인
- Accuracy
- Precision
- Recall
- F1-score
- AUC-ROC

#### Accuracy

In [29]:
# Accuracy of every model on the held-out windows
for model_name, metric_value in (
    ("LogisticRegression", lr_accuracy),
    ("RandomForest", rf_accuracy),
    ("XGBoost", xgb_accuracy),
    ("LightGBM", lgb_accuracy),
    ("Voting", voting_accuracy),
    ("Stacking", stacking_accuracy),
):
    print(f"{model_name} : ", metric_value)

LogisticRegression :  0.666314119513485
RandomForest :  0.7727833597743698
XGBoost :  0.8187907632645867
LightGBM :  0.8246077912920853
Voting :  0.8194958575709501
Stacking :  0.7995769434161819


#### Precision

In [30]:
# Macro-averaged precision of every model on the held-out windows
for model_name, metric_value in (
    ("LogisticRegression", lr_precision),
    ("RandomForest", rf_precision),
    ("XGBoost", xgb_precision),
    ("LightGBM", lgb_precision),
    ("Voting", voting_precision),
    ("Stacking", stacking_precision),
):
    print(f"{model_name} : ", metric_value)

LogisticRegression :  0.5075586092392792
RandomForest :  0.6144188356309312
XGBoost :  0.6153753455761135
LightGBM :  0.6300750047619808
Voting :  0.6121929406693827
Stacking :  0.6098237948312442


#### Recall

In [31]:
# Macro-averaged recall of every model on the held-out windows
for model_name, metric_value in (
    ("LogisticRegression", lr_recall),
    ("RandomForest", rf_recall),
    ("XGBoost", xgb_recall),
    ("LightGBM", lgb_recall),
    ("Voting", voting_recall),
    ("Stacking", stacking_recall),
):
    print(f"{model_name} : ", metric_value)

LogisticRegression :  0.5709978259689243
RandomForest :  0.7706154674970264
XGBoost :  0.6253040327895943
LightGBM :  0.6405486689737127
Voting :  0.631294555283757
Stacking :  0.6981971353695721


#### Macro F1-score

In [32]:
# Macro-averaged F1 of every model on the held-out windows
for model_name, metric_value in (
    ("LogisticRegression", lr_macro_f1),
    ("RandomForest", rf_macro_f1),
    ("XGBoost", xgb_macro_f1),
    ("LightGBM", lgb_macro_f1),
    ("Voting", voting_macro_f1),
    ("Stacking", stacking_macro_f1),
):
    print(f"{model_name} : ", metric_value)

LogisticRegression :  0.5084013411249104
RandomForest :  0.6539209946028557
XGBoost :  0.6176890729627609
LightGBM :  0.629887898550356
Voting :  0.6179350026060106
Stacking :  0.6437309722254956


#### Weighted F1-score

In [None]:
# Support-weighted F1 of every model on the held-out windows
for model_name, metric_value in (
    ("LogisticRegression", lr_weighted_f1),
    ("RandomForest", rf_weighted_f1),
    ("XGBoost", xgb_weighted_f1),
    ("LightGBM", lgb_weighted_f1),
    ("Voting", voting_weighted_f1),
    ("Stacking", stacking_weighted_f1),
):
    print(f"{model_name} : ", metric_value)

#### AUC-ROC

In [33]:
# Macro one-vs-rest AUC-ROC of every model; the stacking entry is None because no AUC was computed for it
for model_name, metric_value in (
    ("LogisticRegression", lr_auc_roc),
    ("RandomForest", rf_auc_roc),
    ("XGBoost", xgb_auc_roc),
    ("LightGBM", lgb_auc_roc),
    ("Voting", voting_auc_roc),
    ("Stacking", stacking_auc_roc),
):
    print(f"{model_name} : ", metric_value)

LogisticRegression :  0.839962830811422
RandomForest :  0.928567849781225
XGBoost :  0.932792385123473
LightGBM :  0.936946064177827
Voting :  0.9394848685941182
Stacking :  None


## 10. 데이터 저장

In [34]:
first_df = pd.read_excel("data/전처리후데이터셋.xlsx")

# '회계년도' is text containing '/'; only the part before the first '/' is kept, converted to an integer.
first_df['회계년도'] = first_df['회계년도'].str.split('/').str.get(0).astype(int)
first_df = first_df.sort_values(by=["회사명", "회계년도"])

In [35]:
# The windowing helper needs a target column, so a constant placeholder value is inserted.
first_df['target_class'] = 0

# NOTE(review): this rebinds window_years and window_companies, replacing the arrays
# built from the training table — confirm no earlier cell is re-run afterwards.
test_X_windowed, test_y_windowed, window_years, window_companies = create_grouped_sliding_windows(
    df=first_df,
    feature_cols=feature_columns,
    target_col='target_class',
    window_size=WINDOW_SIZE,
)

split_year = 2019

# Keep only windows ending after split_year and scale them with the already fitted scaler.
predict_X = test_X_windowed[window_years > split_year]
scaled_predict_X = scaler.transform(predict_X)

In [36]:
# Predict the class of every post-split window with the random forest model
y_pred = rf_model.predict(scaled_predict_X)

# One row per predicted window: company, last fiscal year of the window, predicted class
predict_y = pd.DataFrame({
    '회사명': window_companies[window_years > split_year],
    '회계년도': window_years[window_years > split_year],
}).assign(target_pred=y_pred)

In [37]:
# Write the predictions to an Excel file without the index column
predict_y.to_excel(excel_writer="data/predict.xlsx", index=False)