<a href="https://colab.research.google.com/github/hwangho-kim/pure-LAD/blob/master/LAD_Bender_LAD_%ED%8A%B9%EC%A7%95_%EA%B3%B5%ED%95%99_%EA%B8%B0%EB%B0%98_%EC%84%B1%EB%8A%A5_%EA%B0%9C%EC%84%A0_%ED%94%84%EB%A0%88%EC%9E%84%EC%9B%8C%ED%81%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- Library installation ---
# Installs the third-party libraries this notebook needs. The leading "!" is
# the notebook shell escape, so the command runs when the cell is executed;
# outside a notebook, run the same pip command in a terminal beforehand.
!pip install scikit-learn pandas numpy matplotlib seaborn lightgbm bayesian-optimization

import numpy as np
import pandas as pd
import random
import time
import warnings
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer, load_digits, load_wine, fetch_california_housing, make_moons
from bayes_opt import BayesianOptimization
import matplotlib.pyplot as plt
import seaborn as sns


# Silence non-essential warning messages for the whole notebook run.
warnings.filterwarnings('ignore')

# Korean font setup (for plots).
try:
    import matplotlib.font_manager as fm
    font_path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
    # Register the font file explicitly. Selecting a font by family name only
    # works for fonts the font manager already knows about, which is not the
    # case when the font was installed after matplotlib built its font cache.
    fm.fontManager.addfont(font_path)
    font_prop = fm.FontProperties(fname=font_path)
    plt.rc('font', family=font_prop.get_name())
    # Render the minus sign with an ASCII hyphen so it does not depend on the
    # selected font providing the Unicode minus glyph.
    plt.rcParams['axes.unicode_minus'] = False
except FileNotFoundError:
    # Best effort only: plots still work, Korean labels may render as boxes.
    print("나눔고딕 폰트가 설치되어 있지 않아, 그래프의 한글이 깨질 수 있습니다.")


class LADBender:
    """
    LAD-Bender: uses LAD (presumably "Logical Analysis of Data") as a
    feature-engineering tool and trains a classifier on top of the mined
    patterns.

    Pipeline run by ``fit``:
      1. binarize every numeric column with a depth-1 decision tree,
      2. keep the most important binary literals (random-forest importance),
      3. mine candidate patterns from LightGBM tree paths and relax them,
      4. encode each row as pattern-membership indicators and train a
         logistic regression on that encoding.

    Labels are expected to be binary 0/1 (purity is computed as ``y.mean()``).

    Parameters
    ----------
    purity_threshold : float
        Minimum share of the target class a relaxed pattern must keep.
    top_features_ratio : float
        Fraction of binary literals kept by the feature-selection step.
    lgbm_params : dict or None
        Keyword arguments forwarded to ``lgb.LGBMClassifier``.
    n_estimators_rf : int or None
        Number of trees of the feature-selection random forest. ``None``
        falls back to ``lgbm_params['n_estimators']`` (default 100), which is
        what earlier versions always used. The value may also be assigned to
        the ``n_estimators_rf`` attribute after construction.
    """

    # A relaxed pattern must still cover at least this many training rows,
    # otherwise its purity estimate is considered too unreliable to accept.
    _MIN_PATTERN_SUPPORT = 3

    def __init__(self, purity_threshold=0.9, top_features_ratio=0.5,
                 lgbm_params=None, n_estimators_rf=None):
        self.purity_threshold = purity_threshold
        self.top_features_ratio = top_features_ratio
        self.n_estimators_rf = n_estimators_rf

        if lgbm_params is None:
            self.lgbm_params = {'objective': 'binary', 'verbose': -1}
        else:
            self.lgbm_params = lgbm_params

        self.literals = []
        self.selected_b_feature_names_ = None
        self.candidate_patterns = []
        # Set by fit() when no pattern could be mined; predict() then returns
        # this constant class instead of calling the final classifier.
        self._fallback_class = None
        # Classification model used for the final prediction.
        self.final_classifier = LogisticRegression(random_state=42, max_iter=1000)

    def _learn_literals(self, X: pd.DataFrame, y: pd.Series):
        """Learn one threshold per column and store its two literals."""
        for col in X.columns:
            # Columns containing missing values are skipped entirely.
            if X[col].isnull().any():
                continue
            stump = DecisionTreeClassifier(max_depth=1, criterion='entropy')
            stump.fit(X[[col]], y)
            # A single-node tree means no split was found for this column.
            if stump.tree_.node_count > 1:
                threshold = stump.tree_.threshold[0]
                for suffix, op in (('le', '<='), ('gt', '>')):
                    self.literals.append({
                        'name': f"{col}_{suffix}_{threshold:.2f}",
                        'feature': col, 'op': op, 'val': threshold,
                    })

    def _apply_literals(self, X: pd.DataFrame, fill_missing=False):
        """
        Evaluate the stored literals on ``X`` and return a 0/1 DataFrame.

        Literals whose source column is absent from ``X`` are skipped, or
        emitted as an all-zero column when ``fill_missing`` is true.
        """
        columns = {}
        for literal in self.literals:
            feature, name = literal['feature'], literal['name']
            if feature in X.columns:
                if literal['op'] == '<=':
                    hit = X[feature] <= literal['val']
                else:
                    hit = X[feature] > literal['val']
                columns[name] = hit.astype(int).to_numpy()
            elif fill_missing:
                columns[name] = np.zeros(len(X), dtype=int)
        # Build the frame in one go instead of inserting column by column.
        return pd.DataFrame(columns, index=X.index)

    def _binarize(self, X: pd.DataFrame, y: pd.Series):
        """
        Turn ``X`` into binary literal columns.

        Thresholds are learned from ``(X, y)`` only when no literals are
        stored yet; otherwise the stored literals are applied as-is.
        """
        if not self.literals:
            self._learn_literals(X, y)
        return self._apply_literals(X)

    def _select_features(self, X_b: pd.DataFrame, y: pd.Series):
        """
        Keep the ``top_features_ratio`` most important binary columns.

        Importance comes from a random forest with ``n_estimators_rf`` trees.
        The selected column names are stored in ``selected_b_feature_names_``
        and the reduced frame is returned.
        """
        if X_b.shape[1] == 0:
            # Nothing to rank (no column produced a literal).
            self.selected_b_feature_names_ = X_b.columns
            return X_b

        n_estimators = self.n_estimators_rf
        if n_estimators is None:
            # Backward-compatible default: reuse the LightGBM tree count.
            n_estimators = self.lgbm_params.get('n_estimators', 100)
        rf = RandomForestClassifier(n_estimators=int(n_estimators), random_state=42, n_jobs=-1)
        rf.fit(X_b, y)
        importances = rf.feature_importances_
        # Always keep at least one literal.
        n_top_features = max(1, int(len(importances) * self.top_features_ratio))
        selected_indices = np.argsort(importances)[::-1][:n_top_features]
        self.selected_b_feature_names_ = X_b.columns[selected_indices]
        return X_b[self.selected_b_feature_names_]

    def _generate_candidate_patterns(self, X_b_min: pd.DataFrame, y: pd.Series):
        """
        Mine patterns for class 1 and class 0.

        For each target class a LightGBM model is trained one-vs-rest on
        ``y == target_class``; tree paths voting for the target become seeds,
        each seed is relaxed by ``_refine_to_prime`` and duplicates dropped.
        Returns a list of ``{literal_name: 0 or 1}`` dicts.
        """
        patterns = []
        seen = set()  # canonical keys of the patterns collected so far
        for target_class in [1, 0]:
            lgbm = lgb.LGBMClassifier(**self.lgbm_params)
            lgbm.fit(X_b_min, (y == target_class))
            trees_df = lgbm.booster_.trees_to_dataframe()
            for seed in self._extract_seeds(trees_df, target_class):
                prime_pattern = self._refine_to_prime(X_b_min, y, seed, target_class)
                if not prime_pattern:
                    continue
                key = tuple(sorted(prime_pattern.items()))
                if key not in seen:
                    seen.add(key)
                    patterns.append(prime_pattern)
        return patterns

    def _extract_seeds(self, trees_df, target_class):
        """
        Collect the root-to-leaf paths that vote for the target class.

        ``trees_df`` is the frame returned by ``Booster.trees_to_dataframe()``
        of a model trained on ``y == target_class``. Because of that
        one-vs-rest labelling, a positive leaf value votes for the target
        class whichever class it is, so the leaf sign is NOT compared with
        ``target_class`` (doing so selected the wrong leaves for class 0).
        ``target_class`` is kept in the signature for compatibility.

        Each seed maps a split feature to 0 (left branch) or 1 (right
        branch). NOTE(review): this assumes every split on a 0/1 column sends
        0 to the left child and 1 to the right child - confirm against the
        thresholds LightGBM reports.
        """
        seeds = []
        for _, tree in trees_df.groupby('tree_index', sort=False):
            records = tree.to_dict('records')
            if not records:
                continue
            nodes = {record['node_index']: record for record in records}
            # The first row of a tree is treated as its root.
            stack = [(records[0]['node_index'], {})]
            while stack:
                node_index, path_rules = stack.pop()
                node = nodes.get(node_index)
                if node is None:
                    continue
                is_leaf = pd.isna(node.get('left_child')) and pd.isna(node.get('right_child'))
                if is_leaf:
                    leaf_value = node.get('leaf_value', node.get('value'))
                    # An empty path (single-leaf tree) carries no condition.
                    if leaf_value is not None and leaf_value > 0 and path_rules:
                        seeds.append(path_rules)
                    continue
                feature = node.get('split_feature')
                if feature is None:
                    continue
                # Push right first so the left subtree is visited first,
                # i.e. the same depth-first order as a recursive walk.
                stack.append((node.get('right_child'), {**path_rules, feature: 1}))
                stack.append((node.get('left_child'), {**path_rules, feature: 0}))
        return seeds

    def _refine_to_prime(self, X_b, y, pattern, target_class):
        """
        Greedily drop literals from ``pattern`` while it stays pure enough.

        A literal is dropped when the shortened pattern still covers at least
        ``_MIN_PATTERN_SUPPORT`` rows and keeps purity: ``y.mean()`` of the
        covered rows is ``>= purity_threshold`` for class 1, or
        ``<= 1 - purity_threshold`` for class 0. After each successful drop
        the scan restarts; the result always keeps at least one literal.
        """
        current_pattern = pattern.copy()
        while len(current_pattern) > 1:
            for literal_col in list(current_pattern):
                shorter = {k: v for k, v in current_pattern.items() if k != literal_col}
                mask = self._get_pattern_mask(X_b, shorter)
                if mask.sum() < self._MIN_PATTERN_SUPPORT:
                    continue
                purity = y.loc[mask].mean()
                if target_class == 1:
                    is_pure_enough = purity >= self.purity_threshold
                else:
                    is_pure_enough = purity <= (1 - self.purity_threshold)
                if is_pure_enough:
                    current_pattern = shorter
                    break
            else:
                # No literal could be dropped: the pattern is final.
                break
        return current_pattern

    def _get_pattern_mask(self, X_b, pattern_dict):
        """
        Return a boolean Series marking the rows of ``X_b`` matched by the
        pattern. Literals whose column is not in ``X_b`` are ignored.
        """
        mask = pd.Series(True, index=X_b.index)
        for col, val in pattern_dict.items():
            if col in X_b.columns:
                mask &= (X_b[col] == val)
        return mask

    def _transform_data_with_patterns(self, X_b_min: pd.DataFrame):
        """Encode each row as 0/1 membership in every candidate pattern."""
        columns = {
            f'pattern_{i}': self._get_pattern_mask(X_b_min, pattern).astype(int).to_numpy()
            for i, pattern in enumerate(self.candidate_patterns)
        }
        return pd.DataFrame(columns, index=X_b_min.index)

    def fit(self, X_train, y_train, verbose=True):
        """
        Run the four pipeline stages on the training data.

        All state learned by a previous ``fit`` is discarded first, so the
        model can be re-fitted on different data. When no pattern can be
        mined, the model falls back to predicting the majority class instead
        of failing inside the final classifier. Returns ``self``.
        """
        self.literals = []
        self.candidate_patterns = []
        self._fallback_class = None

        if verbose: print("\n[LAD-Bender] 1단계: 데이터 이진화")
        X_train_b = self._binarize(X_train, y_train)

        if verbose: print("\n[LAD-Bender] 2단계: 특징 선택")
        X_train_b_min = self._select_features(X_train_b, y_train)

        if verbose: print("\n[LAD-Bender] 3단계: 후보 패턴 생성")
        if X_train_b_min.shape[1] > 0:
            self.candidate_patterns = self._generate_candidate_patterns(X_train_b_min, y_train)
        if verbose: print(f"  > 생성된 총 후보 패턴 수: {len(self.candidate_patterns)}")

        if verbose: print("\n[LAD-Bender] 4단계: 패턴 기반 특징 변환 및 최종 모델 훈련")
        if not self.candidate_patterns:
            # A classifier cannot be trained on zero features.
            self._fallback_class = int(pd.Series(y_train).mode().iloc[0])
            if verbose: print("  > 후보 패턴이 없어 다수 클래스 예측으로 대체합니다.")
            return self

        X_train_transformed = self._transform_data_with_patterns(X_train_b_min)
        self.final_classifier.fit(X_train_transformed, y_train)
        if verbose: print("  > 최종 Logistic Regression 모델 훈련 완료.")
        return self

    def predict(self, X_test):
        """
        Predict class labels for ``X_test``.

        Returns all zeros when the model has no selected literals (e.g. it
        was never fitted), the majority class when ``fit`` found no pattern,
        and the final classifier's prediction otherwise.
        """
        selected = self.selected_b_feature_names_
        fallback = getattr(self, '_fallback_class', None)
        if selected is None or selected.empty or fallback is not None:
            constant = 0 if fallback is None else fallback
            return np.full(len(X_test), constant, dtype=int)

        # Literals whose source column is missing from X_test count as 0.
        X_test_b = self._apply_literals(X_test, fill_missing=True)
        X_test_transformed = self._transform_data_with_patterns(X_test_b[selected])
        return self.final_classifier.predict(X_test_transformed)

# Data slots for the Bayesian optimisation: the objective below reads these
# module-level names, which the main script fills in before optimising.
X_train_opt = y_train_opt = X_val_opt = y_val_opt = None


def black_box_function(purity_threshold, top_features_ratio, n_estimators_rf, n_estimators_lgbm):
    """
    Objective for the Bayesian optimiser: validation accuracy of one
    LADBender configuration.

    ``top_features_ratio`` is clamped to [0.1, 1.0] and ``purity_threshold``
    to [0.8, 1.0]; both estimator counts are truncated to integers. The model
    is fitted on ``X_train_opt``/``y_train_opt`` and scored on
    ``X_val_opt``/``y_val_opt``.
    """
    ratio = max(0.1, min(1.0, top_features_ratio))
    purity = max(0.8, min(1.0, purity_threshold))

    lgbm_config = {
        'n_estimators': int(n_estimators_lgbm),
        'objective': 'binary',
        'verbose': -1,
    }
    candidate = LADBender(
        purity_threshold=purity,
        top_features_ratio=ratio,
        lgbm_params=lgbm_config,
    )
    # The random-forest size is attached to the model as a plain attribute.
    candidate.n_estimators_rf = int(n_estimators_rf)

    candidate.fit(X_train_opt, y_train_opt, verbose=False)
    val_predictions = candidate.predict(X_val_opt)
    return accuracy_score(y_val_opt, val_predictions)

# Dataset loader
def load_dataset(name):
    """
    Print a banner for ``name`` and return ``(X, y)`` for that dataset.

    ``X`` is a DataFrame of features and ``y`` a 0/1 Series named 'target':
    malignant-vs-benign for 'Breast Cancer' (target 0 is mapped to 1), digit
    8 vs rest, wine class 0 vs rest, above-median target for California
    Housing, and the generated labels for 'Moons'.

    Raises ValueError for an unknown dataset name (after printing the banner).
    """
    banner = "#" * 70
    print("\n\n" + banner)
    print(f"# 데이터셋: {name} 처리 시작")
    print(banner)

    if name == 'Breast Cancer':
        bunch = load_breast_cancer()
        features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        labels = pd.Series(np.where(bunch.target == 0, 1, 0), name='target')
        return features, labels

    if name == 'Digits (8 vs Rest)':
        bunch = load_digits()
        pixel_names = [f'pixel_{i}' for i in range(bunch.data.shape[1])]
        features = pd.DataFrame(bunch.data, columns=pixel_names)
        labels = pd.Series((bunch.target == 8).astype(int), name='target')
        return features, labels

    if name == 'Wine (Class 0 vs Rest)':
        bunch = load_wine()
        features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        labels = pd.Series((bunch.target == 0).astype(int), name='target')
        return features, labels

    if name == 'California Housing (High/Low)':
        bunch = fetch_california_housing()
        features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        prices = pd.Series(bunch.target, name='target')
        # Binarize the regression target at its median.
        labels = (prices > prices.median()).astype(int)
        return features, labels

    if name == 'Moons':
        points, moon_labels = make_moons(n_samples=1000, noise=0.3, random_state=42)
        features = pd.DataFrame(points, columns=['feature_0', 'feature_1'])
        labels = pd.Series(moon_labels, name='target')
        return features, labels

    raise ValueError("알 수 없는 데이터셋 이름입니다.")

# Main entry point: for every dataset, tune LAD-Bender with Bayesian
# optimisation, train it with the best parameters, and compare its test
# accuracy with a few baseline classifiers.
if __name__ == '__main__':
    datasets_to_run = [
        'Breast Cancer',
        'Digits (8 vs Rest)',
        'Wine (Class 0 vs Rest)',
        'California Housing (High/Low)',
        'Moons'
    ]
    all_results = {}

    for ds_name in datasets_to_run:
        X, y = load_dataset(ds_name)

        # 70/30 train/test split. Indices are reset by reassignment rather
        # than in place, so the objects returned by the split are never
        # mutated.
        X_train_full, X_test, y_train_full, y_test = (
            part.reset_index(drop=True)
            for part in train_test_split(
                X, y, test_size=0.3, random_state=42, stratify=y
            )
        )

        # Inner 70/30 split used only for hyper-parameter search. These are
        # the module-level names that black_box_function reads.
        X_train_opt, X_val_opt, y_train_opt, y_val_opt = (
            part.reset_index(drop=True)
            for part in train_test_split(
                X_train_full, y_train_full, test_size=0.3, random_state=42,
                stratify=y_train_full
            )
        )

        print("\n--- 베이즈 최적화를 통한 하이퍼파라미터 탐색 ---")
        pbounds = {
            'purity_threshold': (0.8, 1.0),
            'top_features_ratio': (0.1, 0.8),
            'n_estimators_rf': (50, 200),
            'n_estimators_lgbm': (20, 100)
        }
        optimizer = BayesianOptimization(f=black_box_function, pbounds=pbounds, random_state=42, verbose=2)
        optimizer.maximize(init_points=5, n_iter=10)
        best_params_raw = optimizer.max['params']

        # The optimiser works on floats; estimator counts are truncated to int.
        n_estimators_rf = int(best_params_raw['n_estimators_rf'])
        n_estimators_lgbm = int(best_params_raw['n_estimators_lgbm'])
        best_params = {
            'purity_threshold': best_params_raw['purity_threshold'],
            'top_features_ratio': best_params_raw['top_features_ratio'],
            'n_estimators_rf': n_estimators_rf,
            'n_estimators_lgbm': n_estimators_lgbm,
            'lgbm_params': {
                'objective': 'binary', 'verbose': -1,
                'n_estimators': n_estimators_lgbm
            }
        }

        print("\n" + "*"*50)
        print("최적 파라미터:")
        print(f"  - purity_threshold: {best_params['purity_threshold']:.3f}")
        print(f"  - top_features_ratio: {best_params['top_features_ratio']:.3f}")
        print(f"  - n_estimators_rf: {best_params['n_estimators_rf']}")
        print(f"  - n_estimators_lgbm: {best_params['n_estimators_lgbm']}")
        print("*"*50)

        # perf_counter is the monotonic clock intended for measuring durations.
        start_time = time.perf_counter()

        print("\n--- 최적 파라미터로 LAD-Bender 훈련 ---")
        lad_bender_model = LADBender(
            purity_threshold=best_params['purity_threshold'],
            top_features_ratio=best_params['top_features_ratio'],
            lgbm_params=best_params['lgbm_params']
        )
        lad_bender_model.n_estimators_rf = best_params['n_estimators_rf']

        lad_bender_model.fit(X_train_full, y_train_full)
        y_pred_lad = lad_bender_model.predict(X_test)
        lad_acc = accuracy_score(y_test, y_pred_lad)

        # The elapsed time used to be measured but never reported.
        elapsed = time.perf_counter() - start_time
        print(f"  > LAD-Bender 훈련 및 예측 소요 시간: {elapsed:.2f}초")

        # SVM and logistic regression are trained on standardised features;
        # the decision tree uses the raw features.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_full)
        X_test_scaled = scaler.transform(X_test)

        # name -> (estimator, training features, test features)
        baselines = {
            "Decision Tree": (DecisionTreeClassifier(random_state=42), X_train_full, X_test),
            "SVM": (SVC(random_state=42), X_train_scaled, X_test_scaled),
            "Logistic Regression": (LogisticRegression(random_state=42), X_train_scaled, X_test_scaled),
        }

        comparison_results = {"LAD-Bender": lad_acc}
        for name, (estimator, train_features, test_features) in baselines.items():
            estimator.fit(train_features, y_train_full)
            comparison_results[name] = accuracy_score(y_test, estimator.predict(test_features))

        all_results[ds_name] = comparison_results

        print("\n\n" + "#"*70)
        print(f"# {ds_name} 데이터셋에 대한 최종 성능 비교 요약")
        print("#"*70)

        results_df = pd.DataFrame.from_dict(comparison_results, orient='index', columns=['Accuracy'])
        print(results_df.to_string(float_format="%.4f"))

    # Print the summary table over all datasets.
    print("\n\n" + "#"*70)
    print("# 모든 데이터셋에 대한 최종 성능 비교 요약")
    print("#"*70)

    summary_df = pd.DataFrame(all_results).T
    print(summary_df.to_string(float_format="%.4f"))

Collecting bayesian-optimization
  Downloading bayesian_optimization-3.0.0-py3-none-any.whl.metadata (10 kB)
Collecting colorama<1.0.0,>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading bayesian_optimization-3.0.0-py3-none-any.whl (36 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-3.0.0 colorama-0.4.6
나눔고딕 폰트가 설치되어 있지 않아, 그래프의 한글이 깨질 수 있습니다.


######################################################################
# 데이터셋: Breast Cancer 처리 시작
######################################################################

--- 베이즈 최적화를 통한 하이퍼파라미터 탐색 ---
|   iter    |  target   | purity... | top_fe... | n_esti... | n_esti... |
-------------------------------------------------------------------------
| [39m2        [39m | [39m0.625    [39m | [39m0.8749080[39m | [39m0.7655000[39m | [39m159.79909[39m | [39m6