<a href="https://colab.research.google.com/github/kahram-y/AML_project/blob/main/AML(2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

금융결제원 AML 탐지 프로젝트

IBM Transactions for Anti Money Laundering 데이터셋 활용

목표: 자금세탁 계좌 탐지 (노드 분류)

프로젝트 구조:
1. 데이터 탐색 및 전처리
2. 문제 정의
3. 피쳐 생성 (집계 피쳐 + 그래프 피쳐)
4. 모델 학습 및 평가
   - Baseline: XGBoost/CatBoost
   - 시계열 모델 앙상블
   - 그래프 피쳐 추가
   - GNN 모델

In [1]:
pip install pandas numpy scikit-learn xgboost catboost lightgbm networkx matplotlib seaborn imbalanced-learn shap



In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

import xgboost as xgb
from catboost import CatBoostClassifier
import lightgbm as lgb

import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from google.colab import drive
drive.mount('/content/drive')

# 파일 경로
trans_path = '/content/drive/MyDrive/HI-Small_Trans.csv'
accounts_path = '/content/drive/MyDrive/HI-Small_accounts.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# ====================================
# 1. 데이터 로드 및 탐색
# ====================================

class AMLDataLoader:
    """AML 데이터 로드 및 초기 탐색"""

    def __init__(self, trans_path, accounts_path):
        self.trans_path = trans_path
        self.accounts_path = accounts_path

    def load_data(self):
        """데이터 로드"""
        print("=" * 80)
        print("데이터 로딩 중...")

        # 거래 데이터 로드
        self.transactions = pd.read_csv(self.trans_path)

        # 계좌 데이터 로드
        self.accounts = pd.read_csv(self.accounts_path)

        print(f"거래 데이터 shape: {self.transactions.shape}")
        print(f"계좌 데이터 shape: {self.accounts.shape}")

        return self.transactions, self.accounts

    def explore_data(self, df_trans, df_accounts):
        """데이터 탐색"""
        print("\n" + "=" * 80)
        print("데이터 탐색")
        print("=" * 80)

        # 거래 데이터 기본 정보
        print("\n[거래 데이터 샘플]")
        print(df_trans.head())
        print(f"\n컬럼: {df_trans.columns.tolist()}")
        print(f"\n결측치:\n{df_trans.isnull().sum()}")

        # Is Laundering 분포
        if 'Is Laundering' in df_trans.columns:
            laundering_dist = df_trans['Is Laundering'].value_counts()
            print(f"\n[자금세탁 분포]")
            print(laundering_dist)
            print(f"자금세탁 비율: {laundering_dist[1] / len(df_trans) * 100:.4f}%")

        # 시간 정보 파싱 및 분포 확인
        df_trans['Timestamp'] = pd.to_datetime(df_trans['Timestamp'])
        df_trans['Year'] = df_trans['Timestamp'].dt.year
        df_trans['Month'] = df_trans['Timestamp'].dt.month
        df_trans['Day'] = df_trans['Timestamp'].dt.day
        df_trans['Hour'] = df_trans['Timestamp'].dt.hour
        df_trans['DayOfWeek'] = df_trans['Timestamp'].dt.dayofweek

        print(f"\n[시간 범위]")
        print(f"시작: {df_trans['Timestamp'].min()}")
        print(f"종료: {df_trans['Timestamp'].max()}")

        # Day별 자금세탁 분포 확인
        if 'Is Laundering' in df_trans.columns:
            day_dist = df_trans.groupby('Day')['Is Laundering'].agg(['sum', 'count', 'mean'])
            print(f"\n[일별 자금세탁 분포]")
            print(day_dist)

            # 치우침 확인
            if day_dist['mean'].std() > 0.1:
                print("\n⚠️ Day 피쳐의 label 분포가 치우쳐져 있습니다. 제거 고려 필요")

        # 계좌 데이터 정보
        print(f"\n[계좌 데이터 샘플]")
        print(df_accounts.head())

        # Bank Name 분포 확인
        if 'Bank Name' in df_accounts.columns:
            print(f"\n은행 분포:\n{df_accounts['Bank Name'].value_counts().head(10)}")

        return df_trans


# ====================================
# 2. 데이터 전처리 및 샘플링
# ====================================

class AMLPreprocessor:
    """데이터 전처리 및 샘플링"""

    def __init__(self, sample_ratio=0.1, random_state=42):
        self.sample_ratio = sample_ratio
        self.random_state = random_state

    def create_hourly_samples(self, df_trans):
        """시간 단위 배치로 모델 단위 생성
        각 계좌번호 + 시간 단위로 샘플 생성
        """
        print("\n" + "=" * 80)
        print("시간 단위 배치 샘플 생성")
        print("=" * 80)

        # From Account 기준 샘플 생성
        from_samples = df_trans.copy()
        from_samples['Account'] = from_samples['From Bank'].astype(str) + '_' + from_samples['Account'].astype(str)
        from_samples['TimeUnit'] = from_samples['Timestamp'].dt.strftime('%Y-%m-%d %H')
        from_samples['Direction'] = 'OUT'

        # To Account 기준 샘플 생성
        to_samples = df_trans.copy()
        to_samples['Account'] = to_samples['To Bank'].astype(str) + '_' + to_samples['Account.1'].astype(str)
        to_samples['TimeUnit'] = to_samples['Timestamp'].dt.strftime('%Y-%m-%d %H')
        to_samples['Direction'] = 'IN'

        # 합치기
        all_samples = pd.concat([from_samples, to_samples], ignore_index=True)

        # 계좌 + 시간 단위로 그룹화하여 label 결정
        # Is Laundering=1인 거래가 하나라도 있으면 해당 시간의 계좌는 suspicious
        account_time_labels = all_samples.groupby(['Account', 'TimeUnit']).agg({
            'Is Laundering': 'max',  # 하나라도 1이면 1
            'Timestamp': 'min'
        }).reset_index()

        account_time_labels.rename(columns={'Timestamp': 'TimeUnit_Start'}, inplace=True)

        print(f"총 샘플 수 (계좌-시간 단위): {len(account_time_labels)}")
        print(f"자금세탁 샘플: {account_time_labels['Is Laundering'].sum()}")

        return all_samples, account_time_labels

    def stratified_sample(self, df, target_col='Is Laundering'):
        """계층화 샘플링"""
        print(f"\n계층화 샘플링 (비율: {self.sample_ratio})")

        # 클래스별로 샘플링
        sampled_dfs = []
        for label in df[target_col].unique():
            label_df = df[df[target_col] == label]
            n_samples = int(len(label_df) * self.sample_ratio)
            sampled = label_df.sample(n=n_samples, random_state=self.random_state)
            sampled_dfs.append(sampled)

        result = pd.concat(sampled_dfs, ignore_index=True)
        print(f"샘플링 후 크기: {len(result)}")
        print(f"자금세탁 비율: {result[target_col].sum() / len(result) * 100:.4f}%")

        return result

    def analyze_laundering_patterns(self, df_trans):
        """자금세탁 건과 정상 건의 차이 분석"""
        print("\n" + "=" * 80)
        print("자금세탁 패턴 분석")
        print("=" * 80)

        laundering = df_trans[df_trans['Is Laundering'] == 1]
        normal = df_trans[df_trans['Is Laundering'] == 0]

        # 거래 금액 비교
        print(f"\n[거래 금액 통계]")
        print(f"자금세탁 - 평균: ${laundering['Amount Received'].mean():.2f}, "
              f"중앙값: ${laundering['Amount Received'].median():.2f}")
        print(f"정상 거래 - 평균: ${normal['Amount Received'].mean():.2f}, "
              f"중앙값: ${normal['Amount Received'].median():.2f}")

        # 결제 수단 분포
        print(f"\n[결제 수단 분포]")
        print("자금세탁:")
        print(laundering['Receiving Currency'].value_counts().head())
        print("\n정상 거래:")
        print(normal['Receiving Currency'].value_counts().head())

        # 시간대 분포
        print(f"\n[시간대 분포]")
        print("자금세탁 - 시간대별:")
        print(laundering['Hour'].value_counts().sort_index())

        return laundering, normal


# ====================================
# 3. 피쳐 생성
# ====================================

class FeatureEngineer:
    """집계 피쳐 및 그래프 피쳐 생성"""

    def __init__(self):
        self.feature_names = []

    def create_aggregation_features(self, df_trans, account_time_df):
        """50개 이상의 집계 피쳐 생성
        과거 정보만 사용 (Data Leakage 방지)
        """
        print("\n" + "=" * 80)
        print("집계 피쳐 생성")
        print("=" * 80)

        features_list = []

        for idx, row in account_time_df.iterrows():
            if idx % 10000 == 0:
                print(f"진행: {idx}/{len(account_time_df)}")

            account = row['Account']
            time_unit = pd.to_datetime(row['TimeUnit_Start'])

            # 해당 계좌의 과거 거래만 필터링
            past_trans = df_trans[
                (df_trans['Account'] == account) &
                (df_trans['Timestamp'] < time_unit)
            ].copy()

            if len(past_trans) == 0:
                # 과거 거래가 없으면 기본값
                features = self._default_features(account, time_unit)
            else:
                features = self._compute_features(past_trans, account, time_unit)

            features_list.append(features)

            if idx >= 1000:  # 시연용으로 1000건만
                break

        feature_df = pd.DataFrame(features_list)
        self.feature_names = [c for c in feature_df.columns
                             if c not in ['Account', 'TimeUnit']]

        print(f"\n생성된 피쳐 수: {len(self.feature_names)}")
        print(f"피쳐 목록 (처음 10개): {self.feature_names[:10]}")

        return feature_df

    def _compute_features(self, past_trans, account, time_unit):
        """실제 피쳐 계산"""
        features = {'Account': account, 'TimeUnit': str(time_unit)}

        # 시간 윈도우 정의
        windows = {
            '1h': timedelta(hours=1),
            '3h': timedelta(hours=3),
            '1d': timedelta(days=1),
            '7d': timedelta(days=7)
        }

        for window_name, window_delta in windows.items():
            window_start = time_unit - window_delta
            window_trans = past_trans[past_trans['Timestamp'] >= window_start]

            # OUT 거래 (송금)
            out_trans = window_trans[window_trans['Direction'] == 'OUT']
            features[f'out_count_{window_name}'] = len(out_trans)
            features[f'out_amount_sum_{window_name}'] = float(out_trans['Amount Paid'].sum()) if len(out_trans) > 0 else 0.0
            features[f'out_amount_mean_{window_name}'] = float(out_trans['Amount Paid'].mean()) if len(out_trans) > 0 else 0.0
            features[f'out_amount_std_{window_name}'] = float(out_trans['Amount Paid'].std()) if len(out_trans) > 0 else 0.0
            features[f'out_amount_max_{window_name}'] = float(out_trans['Amount Paid'].max()) if len(out_trans) > 0 else 0.0

            # IN 거래 (입금)
            in_trans = window_trans[window_trans['Direction'] == 'IN']
            features[f'in_count_{window_name}'] = len(in_trans)
            features[f'in_amount_sum_{window_name}'] = float(in_trans['Amount Received'].sum()) if len(in_trans) > 0 else 0.0
            features[f'in_amount_mean_{window_name}'] = float(in_trans['Amount Received'].mean()) if len(in_trans) > 0 else 0.0
            features[f'in_amount_std_{window_name}'] = float(in_trans['Amount Received'].std()) if len(in_trans) > 0 else 0.0
            features[f'in_amount_max_{window_name}'] = float(in_trans['Amount Received'].max()) if len(in_trans) > 0 else 0.0

            # 순 흐름
            features[f'net_flow_{window_name}'] = (
                features[f'in_amount_sum_{window_name}'] -
                features[f'out_amount_sum_{window_name}']
            )

            # 외화 거래
            foreign_curr = window_trans[
                window_trans['Payment Currency'] != window_trans['Receiving Currency']
            ]
            features[f'foreign_count_{window_name}'] = len(foreign_curr)
            features[f'foreign_ratio_{window_name}'] = (
                len(foreign_curr) / len(window_trans) if len(window_trans) > 0 else 0
            )

        # 전체 거래 통계
        features['total_trans_count'] = len(past_trans)
        out_total = past_trans[past_trans['Direction'] == 'OUT']['Amount Paid'].sum()
        in_total = past_trans[past_trans['Direction'] == 'IN']['Amount Received'].sum()
        features['total_out_amount'] = float(out_total) if pd.notna(out_total) else 0.0
        features['total_in_amount'] = float(in_total) if pd.notna(in_total) else 0.0

        # 거래 상대방 다양성 - 'Account' 컬럼이 충돌할 수 있으므로 다른 방법 사용
        # From/To 계좌의 유니크 수 계산
        unique_from = past_trans[past_trans['Direction'] == 'OUT']['From Bank'].astype(str) + '_' + past_trans[past_trans['Direction'] == 'OUT']['Account'].astype(str)
        unique_to = past_trans[past_trans['Direction'] == 'IN']['To Bank'].astype(str) + '_' + past_trans[past_trans['Direction'] == 'IN']['Account.1'].astype(str)
        features['unique_counterparties'] = len(set(unique_from.tolist() + unique_to.tolist()))

        # 결제 수단 다양성
        features['unique_currencies'] = past_trans['Payment Currency'].nunique()

        # 시간대 분포
        hour_dist = past_trans['Hour'].value_counts()
        features['night_trans_ratio'] = (
            hour_dist[(hour_dist.index >= 0) & (hour_dist.index < 6)].sum() /
            len(past_trans) if len(past_trans) > 0 else 0
        )

        return features

    def _default_features(self, account, time_unit):
        """과거 거래가 없을 때 기본 피쳐"""
        features = {'Account': account, 'TimeUnit': str(time_unit)}

        # 모든 피쳐를 0으로 초기화
        windows = ['1h', '3h', '1d', '7d']
        for window in windows:
            for prefix in ['out', 'in']:
                for metric in ['count', 'amount_sum', 'amount_mean', 'amount_std', 'amount_max']:
                    features[f'{prefix}_{metric}_{window}'] = 0
            features[f'net_flow_{window}'] = 0
            features[f'foreign_count_{window}'] = 0
            features[f'foreign_ratio_{window}'] = 0

        features['total_trans_count'] = 0
        features['total_out_amount'] = 0
        features['total_in_amount'] = 0
        features['unique_counterparties'] = 0
        features['unique_currencies'] = 0
        features['night_trans_ratio'] = 0

        return features

    def create_graph_features(self, df_trans, account_time_df):
        """그래프 기반 피쳐 생성
        - Centrality 기반 (Degree, Closeness, Betweenness)
        - Path & Flow 패턴
        - Community 구조
        """
        print("\n" + "=" * 80)
        print("그래프 피쳐 생성")
        print("=" * 80)

        # 거래 네트워크 구축
        G = nx.DiGraph()

        for _, row in df_trans.iterrows():
            from_acc = str(row['From Bank']) + '_' + str(row['Account'])
            to_acc = str(row['To Bank']) + '_' + str(row['Account.1'])

            # Amount Paid를 float으로 변환
            try:
                amount = float(row['Amount Paid'])
            except (ValueError, TypeError):
                amount = 0.0

            if G.has_edge(from_acc, to_acc):
                G[from_acc][to_acc]['weight'] += amount
                G[from_acc][to_acc]['count'] += 1
            else:
                G.add_edge(from_acc, to_acc, weight=amount, count=1)

        print(f"그래프 구축 완료 - 노드: {G.number_of_nodes()}, 엣지: {G.number_of_edges()}")

        # Centrality 계산
        print("Centrality 계산 중...")
        degree_centrality = nx.degree_centrality(G)
        in_degree_centrality = nx.in_degree_centrality(G)
        out_degree_centrality = nx.out_degree_centrality(G)

        # Betweenness는 계산량이 많으므로 샘플링
        sample_nodes = list(G.nodes())[:min(1000, len(G.nodes()))]
        betweenness_centrality = nx.betweenness_centrality(
            G.subgraph(sample_nodes),
            weight='weight'
        )

        # PageRank
        pagerank = nx.pagerank(G, weight='weight')

        # 그래프 피쳐를 데이터프레임에 추가
        graph_features = []

        for _, row in account_time_df.iterrows():
            account = row['Account']

            features = {
                'Account': account,
                'TimeUnit': row['TimeUnit'],
                'degree_centrality': degree_centrality.get(account, 0),
                'in_degree_centrality': in_degree_centrality.get(account, 0),
                'out_degree_centrality': out_degree_centrality.get(account, 0),
                'betweenness_centrality': betweenness_centrality.get(account, 0),
                'pagerank': pagerank.get(account, 0),
            }

            # 이웃 노드 정보
            if account in G:
                successors = list(G.successors(account))
                predecessors = list(G.predecessors(account))

                features['num_successors'] = len(successors)
                features['num_predecessors'] = len(predecessors)
                features['total_out_weight'] = float(sum(G[account][s]['weight'] for s in successors))
                features['total_in_weight'] = float(sum(G[p][account]['weight'] for p in predecessors))
            else:
                features['num_successors'] = 0
                features['num_predecessors'] = 0
                features['total_out_weight'] = 0.0
                features['total_in_weight'] = 0.0

            graph_features.append(features)

        graph_feature_df = pd.DataFrame(graph_features)

        print(f"그래프 피쳐 생성 완료 - 피쳐 수: {len(graph_feature_df.columns) - 2}")

        return graph_feature_df


# ====================================
# 4. 모델 학습 및 평가
# ====================================

class AMLModelTrainer:
    """모델 학습 및 평가"""

    def __init__(self, random_state=42):
        self.random_state = random_state
        self.models = {}
        self.results = {}

    def prepare_train_test_split(self, feature_df, label_df, test_size=0.3):
        """시계열 기준 train/test 분할"""
        print("\n" + "=" * 80)
        print("Train/Test 분할")
        print("=" * 80)

        # 피쳐와 라벨 병합
        merged = feature_df.merge(
            label_df[['Account', 'TimeUnit', 'Is Laundering']],
            on=['Account', 'TimeUnit'],
            how='inner'
        )

        # 시간순 정렬
        merged = merged.sort_values('TimeUnit')

        # 시간 기준 분할
        split_idx = int(len(merged) * (1 - test_size))
        train_df = merged.iloc[:split_idx]
        test_df = merged.iloc[split_idx:]

        print(f"Train set: {len(train_df)} (자금세탁: {train_df['Is Laundering'].sum()})")
        print(f"Test set: {len(test_df)} (자금세탁: {test_df['Is Laundering'].sum()})")
        print(f"Train 자금세탁 비율: {train_df['Is Laundering'].mean():.4%}")
        print(f"Test 자금세탁 비율: {test_df['Is Laundering'].mean():.4%}")

        # Feature와 Label 분리
        feature_cols = [c for c in merged.columns
                       if c not in ['Account', 'TimeUnit', 'Is Laundering']]

        X_train = train_df[feature_cols]
        y_train = train_df['Is Laundering']
        X_test = test_df[feature_cols]
        y_test = test_df['Is Laundering']

        return X_train, X_test, y_train, y_test, test_df

    def train_baseline_model(self, X_train, y_train, X_test, y_test,
                            use_smote=False, scale_pos_weight=None):
        """Baseline: XGBoost/CatBoost"""
        print("\n" + "=" * 80)
        print("Baseline 모델 학습 (XGBoost)")
        print("=" * 80)

        # 데이터 검증
        print(f"학습 데이터: {len(X_train)}건")
        print(f"테스트 데이터: {len(X_test)}건")
        print(f"Positive 비율 (Train): {y_train.sum() / len(y_train) * 100:.4f}%")
        print(f"Positive 비율 (Test): {y_test.sum() / len(y_test) * 100:.4f}%")

        # NaN 값 처리
        X_train = X_train.fillna(0)
        X_test = X_test.fillna(0)

        # Inf 값 처리
        X_train = X_train.replace([np.inf, -np.inf], 0)
        X_test = X_test.replace([np.inf, -np.inf], 0)

        # SMOTE 적용 여부
        if use_smote and y_train.sum() > 0:
            print("SMOTE 오버샘플링 적용 중...")
            try:
                smote = SMOTE(random_state=self.random_state)
                X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
                print(f"SMOTE 후 - Positive: {y_train_res.sum()}, Negative: {len(y_train_res) - y_train_res.sum()}")
            except Exception as e:
                print(f"⚠️ SMOTE 실패: {e}")
                print("원본 데이터로 학습합니다.")
                X_train_res, y_train_res = X_train, y_train
        else:
            X_train_res, y_train_res = X_train, y_train

        # scale_pos_weight 계산
        if scale_pos_weight is None:
            if y_train_res.sum() > 0:
                scale_pos_weight = (len(y_train_res) - y_train_res.sum()) / y_train_res.sum()
            else:
                print("⚠️ 학습 데이터에 Positive 샘플이 없습니다!")
                scale_pos_weight = 1.0

        print(f"scale_pos_weight: {scale_pos_weight:.2f}")

        # base_score 계산 (0과 1 사이로 제한)
        positive_ratio = y_train_res.sum() / len(y_train_res)
        base_score = max(0.01, min(0.99, positive_ratio))  # 0.01 ~ 0.99 범위로 제한

        print(f"base_score: {base_score:.4f}")

        # XGBoost 학습
        xgb_model = xgb.XGBClassifier(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            scale_pos_weight=scale_pos_weight,
            base_score=base_score,  # 명시적으로 설정
            random_state=self.random_state,
            eval_metric='logloss',  # auc 대신 logloss 사용
            use_label_encoder=False
        )

        try:
            xgb_model.fit(X_train_res, y_train_res, verbose=False)
        except Exception as e:
            print(f"⚠️ XGBoost 학습 실패: {e}")
            print("CatBoost로 전환합니다...")

            # CatBoost로 대체
            from catboost import CatBoostClassifier
            xgb_model = CatBoostClassifier(
                iterations=100,
                depth=6,
                learning_rate=0.1,
                loss_function='Logloss',
                random_state=self.random_state,
                verbose=False
            )
            xgb_model.fit(X_train_res, y_train_res)

        # 예측
        y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

        # 모델 저장
        self.models['baseline_xgb'] = xgb_model

        print("학습 완료!")

        return xgb_model, y_pred_proba

    def evaluate_topk(self, y_true, y_pred_proba, test_df, k_values=[50, 100, 200, 500]):
        """Top-K 평가"""
        print("\n" + "=" * 80)
        print("Top-K 평가")
        print("=" * 80)

        results = {}

        # 점수 스케일링 (1000점 만점)
        scores = (y_pred_proba - y_pred_proba.min()) / (y_pred_proba.max() - y_pred_proba.min()) * 1000

        for k in k_values:
            # Top K 선택
            top_k_idx = np.argsort(y_pred_proba)[-k:]
            y_pred_topk = np.zeros(len(y_true))
            y_pred_topk[top_k_idx] = 1

            # 메트릭 계산
            precision = precision_score(y_true, y_pred_topk, zero_division=0)
            recall = recall_score(y_true, y_pred_topk, zero_division=0)
            f1 = f1_score(y_true, y_pred_topk, zero_division=0)

            detected_laundering = y_true[top_k_idx].sum()
            total_laundering = y_true.sum()

            results[f'top_{k}'] = {
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'detected': detected_laundering,
                'total': total_laundering
            }

            print(f"\nTop-{k} 결과:")
            print(f"  Precision: {precision:.4f}")
            print(f"  Recall: {recall:.4f}")
            print(f"  F1-Score: {f1:.4f}")
            print(f"  탐지된 자금세탁: {detected_laundering}/{total_laundering}")

        # 점수 구간별 분포
        print("\n" + "=" * 80)
        print("점수 구간별 분포")
        print("=" * 80)

        bins = range(0, 1001, 100)
        score_bins = pd.cut(scores, bins=bins, right=False)

        for bin_range in score_bins.cat.categories:
            mask = score_bins == bin_range
            bin_positive = y_true[mask].sum()
            bin_negative = len(y_true[mask]) - bin_positive

            print(f"{bin_range}: 정상={bin_negative}, 자금세탁={bin_positive}")

        return results

    def explain_with_shap(self, model, X_train, X_test, feature_names):
        """SHAP을 이용한 피쳐 중요도 분석"""
        print("\n" + "=" * 80)
        print("XAI: SHAP 피쳐 중요도 분석")
        print("=" * 80)

        try:
            import shap

            # SHAP 값 계산
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(X_test.iloc[:100])  # 샘플만

            # 피쳐 중요도
            feature_importance = pd.DataFrame({
                'feature': feature_names,
                'importance': np.abs(shap_values).mean(axis=0)
            }).sort_values('importance', ascending=False)

            print("\nTop 20 중요 피쳐:")
            print(feature_importance.head(20))

            return feature_importance
        except ImportError:
            print("SHAP 라이브러리가 설치되지 않았습니다. pip install shap")
            return None


# ====================================
# 5. 메인 실행 파이프라인
# ====================================

def main():
    """전체 파이프라인 실행"""

    print("=" * 80)
    print("금융결제원 AML 탐지 프로젝트 시작")
    print("=" * 80)

    # ========== 1. 데이터 로드 ==========
    loader = AMLDataLoader(
        trans_path=trans_path,
        accounts_path=accounts_path
    )

    df_trans, df_accounts = loader.load_data()
    df_trans = loader.explore_data(df_trans, df_accounts)

    # ========== 2. 전처리 및 샘플링 ==========
    preprocessor = AMLPreprocessor(sample_ratio=0.1)

    # 시간 단위 배치 샘플 생성
    all_trans, account_time_labels = preprocessor.create_hourly_samples(df_trans)

    # 자금세탁 패턴 분석
    laundering, normal = preprocessor.analyze_laundering_patterns(df_trans)

    # 샘플링 (큰 데이터를 작게)
    sampled_labels = preprocessor.stratified_sample(account_time_labels)

    # ========== 3. 피쳐 생성 ==========
    feature_engineer = FeatureEngineer()

    # 집계 피쳐 생성
    print("\n⚠️ 주의: 전체 데이터에 대해 피쳐를 생성하려면 시간이 오래 걸립니다.")
    print("시연을 위해 1000건만 생성합니다.")
    agg_features = feature_engineer.create_aggregation_features(
        all_trans,
        sampled_labels
    )

    # 그래프 피쳐 생성
    graph_features = feature_engineer.create_graph_features(
        all_trans,
        sampled_labels
    )

    # 피쳐 병합
    all_features = agg_features.merge(
        graph_features,
        on=['Account', 'TimeUnit'],
        how='inner'
    )

    print(f"\n전체 피쳐 수: {len([c for c in all_features.columns if c not in ['Account', 'TimeUnit']])}")

    # ========== 4. 모델 학습 ==========
    trainer = AMLModelTrainer()

    # Train/Test 분할
    X_train, X_test, y_train, y_test, test_df = trainer.prepare_train_test_split(
        all_features,
        sampled_labels
    )

    # Baseline 모델 학습
    baseline_model, y_pred_proba = trainer.train_baseline_model(
        X_train, y_train, X_test, y_test,
        use_smote=False,  # SMOTE 사용 여부
        scale_pos_weight=None  # Auto 계산
    )

    # ========== 5. 평가 ==========
    # Top-K 평가
    topk_results = trainer.evaluate_topk(
        y_test.values,
        y_pred_proba,
        test_df,
        k_values=[50, 100, 200]
    )

    # Feature Importance (XAI)
    feature_names = [c for c in X_train.columns]
    feature_importance = trainer.explain_with_shap(
        baseline_model,
        X_train,
        X_test,
        feature_names
    )

    # ========== 6. 결과 저장 ==========
    print("\n" + "=" * 80)
    print("프로젝트 완료!")
    print("=" * 80)

    print("\n다음 단계:")
    print("1. 그래프 피쳐 추가 전후 성능 비교")
    print("2. 시계열 모델 앙상블")
    print("3. GNN 모델 적용")
    print("4. K-Fold Cross Validation으로 안정성 검증")

    return {
        'baseline_model': baseline_model,
        'topk_results': topk_results,
        'feature_importance': feature_importance
    }


# ====================================
# 추가: GNN 모델 (향후 구현)
# ====================================

class GNNModel:
    """Graph Neural Network for AML Detection
    향후 구현 예정:
    - GraphSAGE
    - GAT (Graph Attention Network)
    - Temporal GNN
    """

    def __init__(self):
        print("GNN 모델은 PyTorch Geometric 라이브러리 필요")
        print("pip install torch-geometric")

    def build_model(self):
        """GNN 모델 구축"""
        pass

    def train(self):
        """GNN 학습"""
        pass


if __name__ == "__main__":
    # 실행
    results = main()

    print("\n" + "=" * 80)
    print("사용 예시:")
    print("=" * 80)
    print("""
    # 1. 데이터 로드
    loader = AMLDataLoader('HI-Small_Trans.csv', 'HI-Small_accounts.csv')
    df_trans, df_accounts = loader.load_data()

    # 2. 전처리
    preprocessor = AMLPreprocessor(sample_ratio=0.1)
    all_trans, account_time_labels = preprocessor.create_hourly_samples(df_trans)

    # 3. 피쳐 생성
    feature_engineer = FeatureEngineer()
    features = feature_engineer.create_aggregation_features(all_trans, account_time_labels)

    # 4. 모델 학습
    trainer = AMLModelTrainer()
    X_train, X_test, y_train, y_test, test_df = trainer.prepare_train_test_split(features, account_time_labels)
    model, predictions = trainer.train_baseline_model(X_train, y_train, X_test, y_test)

    # 5. 평가
    results = trainer.evaluate_topk(y_test, predictions, test_df)
    """)

금융결제원 AML 탐지 프로젝트 시작
데이터 로딩 중...
거래 데이터 shape: (5078345, 11)
계좌 데이터 shape: (518581, 5)

데이터 탐색

[거래 데이터 샘플]
          Timestamp  From Bank    Account  To Bank  Account.1  \
0  2022/09/01 00:20         10  8000EBD30       10  8000EBD30   
1  2022/09/01 00:20       3208  8000F4580        1  8000F5340   
2  2022/09/01 00:00       3209  8000F4670     3209  8000F4670   
3  2022/09/01 00:02         12  8000F5030       12  8000F5030   
4  2022/09/01 00:06         10  8000F5200       10  8000F5200   

   Amount Received Receiving Currency  Amount Paid Payment Currency  \
0          3697.34          US Dollar      3697.34        US Dollar   
1             0.01          US Dollar         0.01        US Dollar   
2         14675.57          US Dollar     14675.57        US Dollar   
3          2806.97          US Dollar      2806.97        US Dollar   
4         36682.97          US Dollar     36682.97        US Dollar   

  Payment Format  Is Laundering  
0   Reinvestment              0  
1     

ValueError: zero-size array to reduction operation minimum which has no identity