In [16]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)

In [17]:
def LoadCSV(dir_path: str):
    """
    讀取交易資料與警示帳戶註記
    """
    df_txn = pd.read_csv(os.path.join(dir_path, "filtered_output.csv"))
    df_alert = pd.read_csv(os.path.join(dir_path, "acct_alert.csv"))

    print("(Finish) Load Dataset.")
    return df_txn, df_alert


In [18]:
def PreProcessing(df: pd.DataFrame) -> pd.DataFrame:
    """
    為每個帳戶做特徵工程：
    1. 金額統計：total / max / min / avg (send/recv)
    2. 交易筆數：send_cnt / recv_cnt
    3. unique 對手帳戶數：unique_to_cnt / unique_from_cnt
    4. 衍生旗標：only_send / only_recv
    5. 比例類特徵：avg_per_txn, flow_ratio, cnt_ratio
    6. is_esun (是否玉山帳戶)
    """

    # ---- 1. 基本金額統計 ----
    send = df.groupby("from_acct")["txn_amt"].sum().rename("total_send_amt")
    recv = df.groupby("to_acct")["txn_amt"].sum().rename("total_recv_amt")

    max_send = df.groupby("from_acct")["txn_amt"].max().rename("max_send_amt")
    min_send = df.groupby("from_acct")["txn_amt"].min().rename("min_send_amt")
    avg_send = df.groupby("from_acct")["txn_amt"].mean().rename("avg_send_amt")

    max_recv = df.groupby("to_acct")["txn_amt"].max().rename("max_recv_amt")
    min_recv = df.groupby("to_acct")["txn_amt"].min().rename("min_recv_amt")
    avg_recv = df.groupby("to_acct")["txn_amt"].mean().rename("avg_recv_amt")

    # ---- 2. 交易筆數 (degree-ish) ----
    send_cnt = df.groupby("from_acct").size().rename("send_cnt")
    recv_cnt = df.groupby("to_acct").size().rename("recv_cnt")

    # ---- 3. unique 對手帳戶數（graph 特徵的入門版） ----
    # from_acct 對多少不同 to_acct 匯款
    unique_to_cnt = (
        df.groupby("from_acct")["to_acct"]
        .nunique()
        .rename("unique_to_cnt")
    )
    # to_acct 從多少不同 from_acct 收款
    unique_from_cnt = (
        df.groupby("to_acct")["from_acct"]
        .nunique()
        .rename("unique_from_cnt")
    )

    # ---- 4. 合併帳戶層級特徵 ----
    df_result = (
        pd.concat(
            [
                max_send,
                min_send,
                avg_send,
                max_recv,
                min_recv,
                avg_recv,
                send,
                recv,
                send_cnt,
                recv_cnt,
                unique_to_cnt,
                unique_from_cnt,
            ],
            axis=1,
        )
        .fillna(0)
        .reset_index()
    )
    df_result.rename(columns={"index": "acct"}, inplace=True)

    # ---- 5. 是否玉山帳戶 is_esun ----
    df_from = df[["from_acct", "from_acct_type"]].rename(
        columns={"from_acct": "acct", "from_acct_type": "is_esun"}
    )
    df_to = df[["to_acct", "to_acct_type"]].rename(
        columns={"to_acct": "acct", "to_acct_type": "is_esun"}
    )
    df_acc = (
        pd.concat([df_from, df_to], ignore_index=True)
        .drop_duplicates()
        .reset_index(drop=True)
    )

    df_result = pd.merge(df_result, df_acc, on="acct", how="left")
    df_result["is_esun"] = df_result["is_esun"].fillna(0).astype(int)

    # ---- 6. 衍生特徵 ----
    # 避免除以 0，加 1
    df_result["avg_send_per_txn"] = df_result["total_send_amt"] / (
        df_result["send_cnt"] + 1
    )
    df_result["avg_recv_per_txn"] = df_result["total_recv_amt"] / (
        df_result["recv_cnt"] + 1
    )

    # 收入 / 支出 金額比例
    df_result["flow_ratio"] = (df_result["total_recv_amt"] + 1) / (
        df_result["total_send_amt"] + 1
    )

    # 出 / 入 筆數比例
    df_result["cnt_ratio"] = (df_result["send_cnt"] + 1) / (
        df_result["recv_cnt"] + 1
    )

    # 是否只收不付、只付不收
    df_result["only_recv"] = (df_result["send_cnt"] == 0).astype(int)
    df_result["only_send"] = (df_result["recv_cnt"] == 0).astype(int)

    # 總共接觸過多少不同 counterparty
    df_result["neighbor_cnt"] = (
        df_result["unique_to_cnt"] + df_result["unique_from_cnt"]
    )

    print("(Finish) PreProcessing.")
    return df_result

In [19]:
def TrainTestSplit(df_X: pd.DataFrame, df_alert: pd.DataFrame):
    """
    1. 用 acct_alert 標記 label (0/1)
    2. 僅使用 is_esun == 1 的帳戶
    3. stratified train/test split
    """
    df = df_X.copy()
    df["label"] = df["acct"].isin(df_alert["acct"]).astype(int)

    # 僅玉山戶（符合比賽設定）
    df = df[df["is_esun"] == 1].reset_index(drop=True)

    pos = df["label"].sum()
    neg = len(df) - pos
    print(
        f"Total accounts (esun only): {len(df)}, "
        f"positives={pos}, negatives={neg}, ratio={pos/(pos+neg+1e-9):.6f}"
    )

    train_df, test_df = train_test_split(
        df,
        test_size=0.3,
        random_state=42,
        stratify=df["label"],
    )

    X_train = train_df.drop(columns=["label"])
    y_train = train_df["label"]

    X_test = test_df.drop(columns=["label"])
    y_test = test_df["label"]

    print(
        f"(Finish) Train-Test Split. "
        f"Train={len(X_train)}, Test={len(X_test)}"
    )
    return X_train, X_test, y_train, y_test

In [20]:
def RandomForest_Modeling(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame):
    """
    RandomForest + validation threshold tuning:
    1. 從 training set 再切出 validation
    2. 在 validation 上掃描 threshold，找 F1 最高
    3. 用整個 training 重練，再在 test 上用該 threshold
    """

    feature_cols = [c for c in X_train.columns if c not in ["acct"]]

    # train / val split
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train[feature_cols],
        y_train,
        test_size=0.2,
        random_state=42,
        stratify=y_train,
    )

    model = RandomForestClassifier(
        n_estimators=400,
        max_depth=None,
        n_jobs=-1,
        random_state=42,
        class_weight="balanced_subsample",
    )

    model.fit(X_tr, y_tr)

    # validation 上掃 threshold
    val_proba = model.predict_proba(X_val)[:, 1]

    best_t = 0.5
    best_f1 = 0.0

    for t in np.linspace(0.05, 0.95, 19):  # 0.05, 0.10, ..., 0.95
        y_val_pred = (val_proba >= t).astype(int)
        f1 = f1_score(y_val, y_val_pred)
        if f1 > best_f1:
            best_f1 = f1
            best_t = t

    print(f"[Validation] Best threshold = {best_t:.3f}, F1 = {best_f1:.4f}")

    # 用整個 train 重練
    model.fit(X_train[feature_cols], y_train)

    # 在 test 上用 best threshold
    test_proba = model.predict_proba(X_test[feature_cols])[:, 1]
    y_test_pred = (test_proba >= best_t).astype(int)

    return y_test_pred, best_t


In [21]:
##f1 score 0.38
def GBDT_Modeling(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame):
    """
    進階模型版本：
    GradientBoosting (sklearn GBDT) + 類別權重 + threshold tuning

    步驟：
    1. 從 training 中切出 validation
    2. 用 GBDT 訓練（對正類給比較大的權重）
    3. 在 validation 上掃 threshold → maximize F1
    4. 用整個 training 重訓
    5. 在 test 上用最佳 threshold 預測
    """

    feature_cols = [c for c in X_train.columns if c not in ["acct"]]

    # 1. train / val split
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train[feature_cols],
        y_train,
        test_size=0.2,
        random_state=42,
        stratify=y_train
    )

    # 2. 準備 sample_weight 來處理不平衡
    #    讓少數正類(1)的權重比較大
    pos = (y_tr == 1).sum()
    neg = (y_tr == 0).sum()
    # 反比於出現頻率
    w_pos = neg / (pos + 1e-9)
    w_neg = 1.0

    sample_weight = np.where(y_tr == 1, w_pos, w_neg)

    # 3. 建 GBDT 模型
    model = GradientBoostingClassifier(
        learning_rate=0.05,
        n_estimators=500,
        max_depth=3,
        subsample=0.8,
        max_features=0.8,
        random_state=42
    )

    model.fit(X_tr, y_tr, sample_weight=sample_weight)

    # 4. validation 上掃描 threshold 找最佳 F1
    val_proba = model.predict_proba(X_val)[:, 1]

    best_t, best_f1 = 0.5, 0.0
    for t in np.linspace(0.01, 0.99, 99):
        y_pred_t = (val_proba >= t).astype(int)
        f1 = f1_score(y_val, y_pred_t)
        if f1 > best_f1:
            best_f1 = f1
            best_t = t

    print(f"[Validation] Best threshold = {best_t:.3f}, F1 = {best_f1:.4f}")

    # 5. 用整個 training 重訓
    #    重新算一次 sample_weight（用全部 training）
    pos_all = (y_train == 1).sum()
    neg_all = (y_train == 0).sum()
    w_pos_all = neg_all / (pos_all + 1e-9)
    w_neg_all = 1.0
    sample_weight_all = np.where(y_train == 1, w_pos_all, w_neg_all)

    model.fit(X_train[feature_cols], y_train, sample_weight=sample_weight_all)

    # 6. 在 test 上用 best threshold 預測
    test_proba = model.predict_proba(X_test[feature_cols])[:, 1]
    y_test_pred = (test_proba >= best_t).astype(int)

    return y_test_pred, best_t

In [22]:
##f1 score 0.42
def HGB_Modeling(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame):
    """
    更進階模型：
    HistGradientBoostingClassifier + K-fold OOF threshold tuning

    流程：
    1. 使用 K 折交叉驗證 (StratifiedKFold)，對每一折做訓練與 out-of-fold 預測
    2. 收集所有 OOF 機率，掃描 threshold，找整體 F1-score 最高的門檻
    3. 用最佳超參數與 sample_weight 在「整個 training」上重訓
    4. 對 test 做預測，並用最佳 threshold 轉成 0/1 label
    """

    # 不要把 acct 這種 ID 當特徵
    feature_cols = [c for c in X_train.columns if c not in ["acct"]]

    X = X_train[feature_cols].values
    y = y_train.values

    # ---------- 1. 準備 K 折 ---------- #
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # 存 out-of-fold 預測機率
    oof_proba = np.zeros_like(y, dtype=float)

    for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y), start=1):
        X_tr, X_val = X[tr_idx], X[val_idx]
        y_tr, y_val = y[tr_idx], y[val_idx]

        # 計算 sample_weight：讓正類(1)權重較大，處理不平衡
        pos = (y_tr == 1).sum()
        neg = (y_tr == 0).sum()
        w_pos = neg / (pos + 1e-9)
        w_neg = 1.0
        sample_weight = np.where(y_tr == 1, w_pos, w_neg)

        model = HistGradientBoostingClassifier(
            learning_rate=0.05,
            max_depth=6,
            max_iter=400,
            max_leaf_nodes=63,
            min_samples_leaf=20,
            l2_regularization=1.0,
            validation_fraction=None,  # 我們自己做 K-fold，不用內建 validation
            random_state=42,
        )

        model.fit(X_tr, y_tr, sample_weight=sample_weight)

        # 這裡預測 validation fold 機率（out-of-fold）
        val_proba = model.predict_proba(X_val)[:, 1]
        oof_proba[val_idx] = val_proba

        print(f"[KFold] fold {fold} done.")

    # ---------- 2. 用 OOF 機率找最佳 threshold ---------- #
    best_t, best_f1 = 0.5, 0.0
    for t in np.linspace(0.01, 0.99, 99):
        y_oof_pred = (oof_proba >= t).astype(int)
        f1 = f1_score(y, y_oof_pred)
        if f1 > best_f1:
            best_f1 = f1
            best_t = t

    print(f"[OOF] Best threshold = {best_t:.3f}, F1 = {best_f1:.4f}")

    # ---------- 3. 用整個 training 重訓最終模型 ---------- #
    pos_all = (y == 1).sum()
    neg_all = (y == 0).sum()
    w_pos_all = neg_all / (pos_all + 1e-9)
    w_neg_all = 1.0
    sample_weight_all = np.where(y == 1, w_pos_all, w_neg_all)

    final_model = HistGradientBoostingClassifier(
        learning_rate=0.05,
        max_depth=6,
        max_iter=400,
        max_leaf_nodes=63,
        min_samples_leaf=20,
        l2_regularization=1.0,
        validation_fraction=None,
        random_state=42,
    )

    final_model.fit(X, y, sample_weight=sample_weight_all)

    # ---------- 4. 在 test 上用 best_t 預測 ---------- #
    X_test_arr = X_test[feature_cols].values
    test_proba = final_model.predict_proba(X_test_arr)[:, 1]
    y_test_pred = (test_proba >= best_t).astype(int)

    return y_test_pred, best_t

In [None]:
# f1 score 0.38 
# f1
def Stacking_Ensemble_Modeling(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame):
    """
    超進階模型：
    RandomForest + GBDT + HistGBDT 的 Stacking Ensemble
    + Stratified K-Fold OOF
    + OOF 上調 threshold maximize F1
    """

    feature_cols = [c for c in X_train.columns if c not in ["acct"]]

    X = X_train[feature_cols].values
    y = y_train.values
    X_test_arr = X_test[feature_cols].values

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # OOF 機率（每個模型一個）
    oof_rf   = np.zeros(len(y))
    oof_gbdt = np.zeros(len(y))
    oof_hgb  = np.zeros(len(y))

    # test 機率（每 fold 訓練一次，最後取平均）
    test_rf   = np.zeros((5, len(X_test_arr)))
    test_gbdt = np.zeros((5, len(X_test_arr)))
    test_hgb  = np.zeros((5, len(X_test_arr)))

    for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y), start=1):
        X_tr, X_val = X[tr_idx], X[val_idx]
        y_tr, y_val = y[tr_idx], y[val_idx]

        # --------- 不平衡權重 ---------
        pos = (y_tr == 1).sum()
        neg = (y_tr == 0).sum()
        w_pos = neg / (pos + 1e-9)
        w_neg = 1.0
        sample_weight = np.where(y_tr == 1, w_pos, w_neg)

        # --------- Model 1: RandomForest ---------
        rf = RandomForestClassifier(
            n_estimators=400,
            max_depth=None,
            n_jobs=-1,
            random_state=42,
            class_weight=None
        )
        rf.fit(X_tr, y_tr, sample_weight=sample_weight)
        oof_rf[val_idx] = rf.predict_proba(X_val)[:, 1]
        test_rf[fold-1] = rf.predict_proba(X_test_arr)[:, 1]

        # --------- Model 2: GradientBoosting ---------
        gbdt = GradientBoostingClassifier(
            learning_rate=0.05,
            n_estimators=500,
            max_depth=3,
            subsample=0.8,
            max_features=0.8,
            random_state=42
        )
        gbdt.fit(X_tr, y_tr, sample_weight=sample_weight)
        oof_gbdt[val_idx] = gbdt.predict_proba(X_val)[:, 1]
        test_gbdt[fold-1] = gbdt.predict_proba(X_test_arr)[:, 1]

        # --------- Model 3: HistGradientBoosting ---------
        hgb = HistGradientBoostingClassifier(
            learning_rate=0.05,
            max_depth=6,
            max_iter=400,
            max_leaf_nodes=63,
            min_samples_leaf=20,
            l2_regularization=1.0,
            validation_fraction=None,
            random_state=42
        )
        hgb.fit(X_tr, y_tr, sample_weight=sample_weight)
        oof_hgb[val_idx] = hgb.predict_proba(X_val)[:, 1]
        test_hgb[fold-1] = hgb.predict_proba(X_test_arr)[:, 1]

        print(f"[Stacking] Fold {fold} done.")

    # --------- 第二層資料（OOF 作為新特徵）---------
    X_meta = np.column_stack([oof_rf, oof_gbdt, oof_hgb])

    # --------- 第二層模型（Logistic Regression）---------
    meta_model = LogisticRegression(max_iter=1000, class_weight="balanced")
    meta_model.fit(X_meta, y)

    oof_meta_proba = meta_model.predict_proba(X_meta)[:, 1]

    # --------- OOF 上找最佳 threshold ---------
    best_t, best_f1 = 0.5, 0.0
    for t in np.linspace(0.01, 0.99, 99):
        y_oof_pred = (oof_meta_proba >= t).astype(int)
        f1 = f1_score(y, y_oof_pred)
        if f1 > best_f1:
            best_f1 = f1
            best_t = t

    print(f"[OOF-STACK] Best threshold = {best_t:.3f}, F1 = {best_f1:.4f}")

    # --------- 用全訓練資料 + 平均 test 機率 做最終預測 ---------
    test_rf_mean   = test_rf.mean(axis=0)
    test_gbdt_mean = test_gbdt.mean(axis=0)
    test_hgb_mean  = test_hgb.mean(axis=0)

    X_test_meta = np.column_stack([
        test_rf_mean,
        test_gbdt_mean,
        test_hgb_mean
    ])

    test_meta_proba = meta_model.predict_proba(X_test_meta)[:, 1]
    y_test_pred = (test_meta_proba >= best_t).astype(int)

    return y_test_pred, best_t

In [24]:
if __name__ == "__main__":
    # 資料路徑
    dir_path = "./data/raw"

    # 1. 讀資料
    df_txn, df_alert = LoadCSV(dir_path)

    # 2. 特徵工程
    df_X = PreProcessing(df_txn)

    # 3. 切 train/test
    X_train, X_test, y_train, y_test = TrainTestSplit(df_X, df_alert)

    # 4. 模型訓練 + threshold tuning
    y_pred, best_t =  Stacking_Ensemble_Modeling(X_train, y_train, X_test)

    # 5. 評估
    print("\n=== Evaluation on TEST set ===")
    print(f"Best threshold used on test = {best_t:.3f}")
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:   ", recall_score(y_test, y_pred))
    print("F1-score: ", f1_score(y_test, y_pred))
    print("\nDetailed Report:\n", classification_report(y_test, y_pred))

(Finish) Load Dataset.
(Finish) PreProcessing.
Total accounts (esun only): 317313, positives=1004, negatives=316309, ratio=0.003164
(Finish) Train-Test Split. Train=222119, Test=95194
[Stacking] Fold 1 done.
[Stacking] Fold 2 done.
[Stacking] Fold 3 done.
[Stacking] Fold 4 done.
[Stacking] Fold 5 done.
[OOF-STACK] Best threshold = 0.980, F1 = 0.3867

=== Evaluation on TEST set ===
Best threshold used on test = 0.980
Precision: 0.6268656716417911
Recall:    0.27906976744186046
F1-score:  0.38620689655172413

Detailed Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     94893
           1       0.63      0.28      0.39       301

    accuracy                           1.00     95194
   macro avg       0.81      0.64      0.69     95194
weighted avg       1.00      1.00      1.00     95194

