<a href="https://colab.research.google.com/github/lianjin1014/Data-Literacy-Project-by-lian/blob/main/%E6%AC%A2%E8%BF%8E%E4%BD%BF%E7%94%A8_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os, time, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score,
                             recall_score, f1_score, confusion_matrix, roc_curve)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt

# Directory that holds the input CSVs and receives every output artifact.
DATA_DIR = "."

# Seed shared by the samplers, the train/validation split and the models.
RANDOM_STATE = 42
# Fraction of the capped sample that is held out for validation.
TEST_SIZE    = 0.2

MAX_ROWS     = 120_000    # row cap applied before the train/validation split (LR / RF / XGB)
SVM_MAX_ROWS = 25_000     # SVM is slower to train, so it gets a smaller training cap

# Whether to enable class-imbalance weighting for the linear/kernel methods
USE_CLASS_WEIGHT_BALANCED = True

In [None]:
# -------------------- Input paths --------------------
# All inputs are looked up directly inside DATA_DIR.
TRAIN_TRANS_PATH = os.path.join(DATA_DIR, "train_transaction.csv")
TRAIN_ID_PATH    = os.path.join(DATA_DIR, "train_identity.csv")
TEST_TRANS_PATH  = os.path.join(DATA_DIR, "test_transaction.csv")
TEST_ID_PATH     = os.path.join(DATA_DIR, "test_identity.csv")
SAMPLE_SUB_PATH  = os.path.join(DATA_DIR, "sample_submission.csv")

# -------------------- Output artifacts --------------------
# Written by main(); they land in DATA_DIR next to the inputs.
OUT_METRICS_CSV  = os.path.join(DATA_DIR, "model_metrics.csv")
OUT_ROC_PNG      = os.path.join(DATA_DIR, "roc_curves.png")
OUT_CM_PNG       = os.path.join(DATA_DIR, "confusion_matrix_best.png")
OUT_SUBMISSION   = os.path.join(DATA_DIR, "submission_best_model.csv")


In [None]:
# XGBoost is an optional dependency: record whether it could be imported so
# that the rest of the script can skip that model instead of failing.
xgb_available = True
# Always bound, so reading it never raises NameError; stays None on success.
xgb_import_error = None
try:
    from xgboost import XGBClassifier
except Exception as e:
    # Deliberately broad: any failure while importing the optional package
    # only disables the XGBoost model (presumably because native-library
    # load errors are not always ImportError -- confirm before narrowing).
    xgb_available = False
    xgb_import_error = str(e)

def load_and_merge(trans_path, id_path):
    """Read the transaction and identity CSVs and join them into one frame.

    The join is a left join on ``TransactionID``: every transaction row is
    kept, and identity columns are missing for transactions that have no
    matching identity record.

    Args:
        trans_path: Path (or file-like object) of the transaction CSV.
        id_path: Path (or file-like object) of the identity CSV.

    Returns:
        The merged ``pandas.DataFrame``.
    """
    transactions = pd.read_csv(trans_path)
    identities = pd.read_csv(id_path)
    merged = transactions.merge(identities, how="left", on="TransactionID")
    return merged

def stratified_cap(X, y, cap, seed=RANDOM_STATE):
    """Down-sample ``X``/``y`` to at most ``cap`` rows, stratified on ``y``.

    Args:
        X: Feature table.
        y: Labels aligned with ``X``; also used as the stratification key.
        cap: Maximum number of rows to keep, or ``None`` for no limit.
        seed: Random state forwarded to ``train_test_split``.

    Returns:
        ``(X, y)`` unchanged when no cap applies or the data already fits,
        otherwise a ``(X_sample, y_sample)`` pair with ``cap`` rows.
    """
    needs_sampling = cap is not None and len(X) > cap
    if not needs_sampling:
        return X, y

    # train_test_split returns [X_keep, X_rest, y_keep, y_rest]; only the
    # "keep" halves are wanted here.
    parts = train_test_split(
        X, y, train_size=cap, stratify=y, random_state=seed
    )
    return parts[0], parts[2]

def align_columns_for_inference(X_df, num_cols, cat_cols):
    """Return a copy of ``X_df`` holding exactly the model's feature columns.

    Columns the model expects but ``X_df`` lacks are created filled with
    NaN (numeric ones as plain NaN, categorical ones as an object-dtype
    column). Columns the model does not know are dropped, and the result is
    ordered as ``num_cols`` followed by ``cat_cols``. ``X_df`` itself is
    not modified.

    Args:
        X_df: Input feature frame.
        num_cols: Names of the numeric feature columns.
        cat_cols: Names of the categorical feature columns.

    Returns:
        A new ``pandas.DataFrame`` with columns ``num_cols + cat_cols``.
    """
    ordered = [*num_cols, *cat_cols]
    aligned = X_df.copy()

    # Create every expected column that the input does not provide.
    for name in ordered:
        if name in aligned.columns:
            continue
        if name in num_cols:
            filler = np.nan
        else:
            filler = pd.Series([np.nan] * len(aligned), dtype="object")
        aligned[name] = filler

    # Selecting by the ordered list both drops extras and fixes the order.
    return aligned[ordered]

def evaluate_model(name, pipe, Xtr, ytr, Xva, yva, store_curves):
    """Fit ``pipe`` on the training split and score it on the validation split.

    The positive-class score comes from ``predict_proba`` when available,
    otherwise from a min-max scaled ``decision_function``, otherwise from
    the hard ``predict`` output. Hard labels are obtained by thresholding
    that score at 0.5.

    Args:
        name: Label for the model; used as the key in ``store_curves``.
        pipe: Estimator/pipeline to fit (fitted in place).
        Xtr, ytr: Training features and labels.
        Xva, yva: Validation features and labels.
        store_curves: Dict that receives ``name -> (fpr, tpr, auc)``.

    Returns:
        Dict with keys ``model``, ``ROC_AUC``, ``Accuracy``, ``Precision``,
        ``Recall``, ``F1``, ``cm`` (confusion matrix) and ``pipe``.
    """
    pipe.fit(Xtr, ytr)

    # Choose the most informative continuous score the estimator offers.
    if hasattr(pipe, "predict_proba"):
        scores = pipe.predict_proba(Xva)[:, 1]
    elif hasattr(pipe, "decision_function"):
        margins = pipe.decision_function(Xva)
        lowest, highest = margins.min(), margins.max()
        # The small epsilon avoids division by zero for constant margins.
        scores = (margins - lowest) / (highest - lowest + 1e-9)
    else:
        scores = pipe.predict(Xva)

    labels = (scores >= 0.5).astype(int)

    roc_auc = roc_auc_score(yva, scores)
    false_pos_rate, true_pos_rate, _ = roc_curve(yva, scores)
    store_curves[name] = (false_pos_rate, true_pos_rate, roc_auc)

    result = {"model": name, "ROC_AUC": roc_auc}
    result["Accuracy"] = accuracy_score(yva, labels)
    result["Precision"] = precision_score(yva, labels, zero_division=0)
    result["Recall"] = recall_score(yva, labels, zero_division=0)
    result["F1"] = f1_score(yva, labels, zero_division=0)
    result["cm"] = confusion_matrix(yva, labels)
    result["pipe"] = pipe
    return result


In [None]:
def _positive_class_scores(pipe, X):
    """Return one continuous positive-class score per row of ``X``.

    Uses ``predict_proba`` when the estimator has it, otherwise a min-max
    scaled ``decision_function``, otherwise the hard ``predict`` output.
    """
    if hasattr(pipe, "predict_proba"):
        return pipe.predict_proba(X)[:, 1]
    if hasattr(pipe, "decision_function"):
        dfun = pipe.decision_function(X)
        # The small epsilon avoids division by zero for constant margins.
        return (dfun - dfun.min()) / (dfun.max() - dfun.min() + 1e-9)
    return pipe.predict(X)


def _make_preprocessor(num_cols, cat_cols, scale):
    """Build a fresh, unfitted ColumnTransformer for one model.

    Numeric columns are median-imputed (and standardized when ``scale`` is
    true); categorical columns are imputed with the most frequent value and
    ordinal-encoded, with unseen categories mapped to -1. A new object is
    returned on every call so that no two pipelines share fitted state.
    """
    num_steps = [("imputer", SimpleImputer(strategy="median"))]
    if scale:
        num_steps.append(("scaler", StandardScaler()))
    cat_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("enc", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ])
    return ColumnTransformer([
        ("num", Pipeline(num_steps), num_cols),
        ("cat", cat_pipe,            cat_cols),
    ], remainder="drop")


def main():
    """Run the whole experiment: load, train, evaluate, plot and predict.

    Steps:
        1. Load and merge the train/test CSVs.
        2. Cap the training data (stratified) and split train/validation.
        3. Train LogisticRegression, RandomForest, SVM and, if importable,
           XGBoost, and collect validation metrics and ROC curves.
        4. Save the metrics table, the ROC plot and the validation
           confusion matrix of the best model (by ROC AUC).
        5. Refit the best model on the whole capped sample and write the
           test-set predictions as a submission file.
        6. Save a metric comparison bar plot and a feature correlation
           heatmap.

    Raises:
        ValueError: If the training data lacks ``isFraud`` or either data
            set lacks ``TransactionID``.
    """
    t0 = time.time()
    print("Loading & merging...")
    train = load_and_merge(TRAIN_TRANS_PATH, TRAIN_ID_PATH)
    test  = load_and_merge(TEST_TRANS_PATH, TEST_ID_PATH)
    # NOTE(review): if the test CSVs spell feature columns differently from
    # the train CSVs (e.g. "id-01" vs "id_01"), those features are silently
    # replaced by NaN during alignment below -- verify against the data.

    id_col = "TransactionID"
    # Explicit raises instead of assert: asserts vanish under `python -O`.
    if "isFraud" not in train.columns:
        raise ValueError("训练集必须包含目标列 isFraud")

    y_full = train["isFraud"].astype(int)
    X_full = train.drop(columns=["isFraud"])
    if id_col not in X_full.columns or id_col not in test.columns:
        raise ValueError(f"Both train and test data must contain {id_col!r}")

    # Stratified down-sampling of the whole training set
    X_cap, y_cap = stratified_cap(X_full, y_full, MAX_ROWS)

    # Train / validation split
    X_tr, X_va, y_tr, y_va = train_test_split(
        X_cap, y_cap, test_size=TEST_SIZE, stratify=y_cap, random_state=RANDOM_STATE
    )

    # Column type detection (the id column is never used as a feature)
    num_cols = X_tr.select_dtypes(include=[np.number]).columns.tolist()
    if id_col in num_cols:
        num_cols.remove(id_col)
    num_col_set = set(num_cols)
    cat_cols = [c for c in X_tr.columns if c not in num_col_set and c != id_col]

    # Feature frames without the id column, computed once for all models
    X_tr_feat = X_tr.drop(columns=[id_col], errors="ignore")
    X_va_feat = X_va.drop(columns=[id_col], errors="ignore")

    class_weight = "balanced" if USE_CLASS_WEIGHT_BALANCED else None

    # The four models. Linear/kernel models get standardized inputs, tree
    # models do not. Every pipeline gets its OWN preprocessor: Pipeline fits
    # its steps in place, so a shared instance would be overwritten by
    # whichever model is fitted last.
    models = {}
    models["LogisticRegression"] = Pipeline([
        ("prep", _make_preprocessor(num_cols, cat_cols, scale=True)),
        ("clf", LogisticRegression(
            solver="saga", penalty="l2", C=1.0, max_iter=300,
            class_weight=class_weight, random_state=RANDOM_STATE))
    ])
    models["RandomForest"] = Pipeline([
        ("prep", _make_preprocessor(num_cols, cat_cols, scale=False)),
        ("clf", RandomForestClassifier(
            n_estimators=200, n_jobs=-1, random_state=RANDOM_STATE))
    ])

    # The SVM trains on a smaller stratified subsample of the training split
    X_svm_tr, y_svm_tr = stratified_cap(X_tr_feat, y_tr, SVM_MAX_ROWS)

    models["SVM"] = Pipeline([
        ("prep", _make_preprocessor(num_cols, cat_cols, scale=True)),
        ("clf", SVC(kernel="rbf", C=1.0, gamma="scale",
                    probability=True, class_weight=class_weight,
                    random_state=RANDOM_STATE))
    ])

    if xgb_available:
        # Set scale_pos_weight from the training class ratio (helps recall)
        pos = (y_tr == 1).sum()
        neg = (y_tr == 0).sum()
        spw = max((neg / max(pos, 1)), 1.0)

        models["XGBoost"] = Pipeline([
            ("prep", _make_preprocessor(num_cols, cat_cols, scale=False)),
            ("clf", XGBClassifier(
                n_estimators=300, learning_rate=0.08, max_depth=8,
                subsample=0.8, colsample_bytree=0.8,
                eval_metric="logloss", tree_method="hist",
                scale_pos_weight=spw,
                n_jobs=-1, random_state=RANDOM_STATE))
        ])
    else:
        print("XGBoost 未安装，已跳过：", xgb_import_error)

    # Training: every model is validated on the same validation split;
    # only the SVM uses a different (smaller) training set.
    print("Training & evaluating...")
    train_sets = {"SVM": (X_svm_tr, y_svm_tr)}
    curves, rows, fitted, val_cms = {}, [], {}, {}
    for name, pipe in models.items():
        X_fit, y_fit = train_sets.get(name, (X_tr_feat, y_tr))
        res = evaluate_model(name, pipe, X_fit, y_fit, X_va_feat, y_va, curves)
        fitted[name] = res["pipe"]
        # Keep the validation confusion matrix: it is the only one computed
        # on data the model has not been trained on.
        val_cms[name] = res["cm"]
        rows.append({k: v for k, v in res.items() if k not in ("pipe", "cm")})

    metrics_df = pd.DataFrame(rows).sort_values("ROC_AUC", ascending=False)
    metrics_df.to_csv(OUT_METRICS_CSV, index=False)
    print("\nMetrics:\n", metrics_df)

    # ROC curves
    plt.figure()
    for name, (fpr, tpr, auc) in curves.items():
        plt.plot(fpr, tpr, label=f"{name} (AUC={auc:.3f})")
    plt.plot([0, 1], [0, 1], linestyle="--")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curves (Validation)")
    plt.legend()
    plt.tight_layout()
    plt.savefig(OUT_ROC_PNG)
    plt.close()

    # Confusion matrix of the best model, taken from the held-out
    # validation predictions made BEFORE the refit below. Predicting the
    # validation rows after refitting on them would leak and overstate
    # the model's quality.
    best_name = metrics_df.iloc[0]["model"]
    best_pipe = fitted[best_name]
    cm_best = val_cms[best_name]

    plt.figure()
    plt.imshow(cm_best, interpolation="nearest")
    plt.title(f"Confusion Matrix: {best_name}")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    for (i, j), z in np.ndenumerate(cm_best):
        plt.text(j, i, f"{z}", ha="center", va="center")
    plt.tight_layout()
    plt.savefig(OUT_CM_PNG)
    plt.close()

    # Refit the best model on the whole capped sample (train + validation)
    # NOTE(review): when the SVM wins this refit uses every capped row, not
    # SVM_MAX_ROWS -- confirm the longer training time is acceptable.
    X_cap_features = X_cap.drop(columns=[id_col], errors="ignore")
    X_cap_aligned = align_columns_for_inference(X_cap_features, num_cols, cat_cols)

    print(f"\nBest model by AUC: {best_name}  Retraining on capped full train...")
    best_pipe.fit(X_cap_aligned, y_cap)

    # Test-set predictions
    test_ids = test[id_col].values
    X_test_raw = test.drop(columns=[id_col])
    X_test     = align_columns_for_inference(X_test_raw, num_cols, cat_cols)
    test_proba = _positive_class_scores(best_pipe, X_test)

    # Save the submission file, reusing the sample file's column names
    sample = pd.read_csv(SAMPLE_SUB_PATH)
    submission = pd.DataFrame({
        sample.columns[0]: test_ids,
        sample.columns[1]: test_proba
    })
    submission.to_csv(OUT_SUBMISSION, index=False)

    # 1. Threshold-based metrics per model
    metrics_summary = metrics_df.set_index('model')[['Accuracy', 'Precision', 'Recall', 'F1']]
    print("模型对比指标：\n", metrics_summary)

    # 2. Bar chart comparing the models
    out_bar_png = os.path.join(DATA_DIR, "model_comparison_barplot.png")
    metrics_summary.plot(kind='bar', figsize=(10,6))
    plt.title('Models Performance Comparison')
    plt.ylabel('Score')
    plt.tight_layout()
    plt.savefig(out_bar_png)
    plt.close()

    # 3. (Optional) correlation heatmap over the numeric features.
    # Non-numeric columns are excluded first: DataFrame.corr() raises on
    # object columns in pandas >= 2.0.
    import seaborn as sns

    out_heatmap_png = os.path.join(DATA_DIR, "feature_correlation_heatmap.png")
    numeric_features = X_full.drop(columns=[id_col]).select_dtypes(include=[np.number])
    plt.figure(figsize=(12,10))
    sns.heatmap(numeric_features.corr(), annot=False, cmap='coolwarm')
    plt.title("Feature Correlation Heatmap")
    plt.tight_layout()
    plt.savefig(out_heatmap_png)
    plt.close()

    print("\nArtifacts saved:")
    print(f"- {OUT_METRICS_CSV}")
    print(f"- {OUT_ROC_PNG}")
    print(f"- {OUT_CM_PNG}")
    print(f"- {OUT_SUBMISSION}")
    print(f"- {out_bar_png}")
    print(f"- {out_heatmap_png}")
    print(f"Done in {time.time()-t0:.1f}s")


In [None]:

# Run the full pipeline only when this file is executed as a script (or as a
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()