<a href="https://colab.research.google.com/github/lianjin1014/Data-Literacy-Project-by-lian/blob/main/%E6%AC%A2%E8%BF%8E%E4%BD%BF%E7%94%A8_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os, time, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score,
                             recall_score, f1_score, confusion_matrix, roc_curve)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
# Directory that holds the input CSVs and receives the output artifacts.
# NOTE(review): a later cell rebinds DATA_DIR to "/content/" before main() runs.
DATA_DIR = "."

# Seed shared by sampling, splitting, SMOTE and the models; fraction of the
# capped data held out for validation.
RANDOM_STATE = 42
TEST_SIZE    = 0.2

MAX_ROWS     = 120_000    # row cap applied (stratified) before the train/validation split used by LR / RF / XGB
SVM_MAX_ROWS = 25_000     # SVM is slower to train, so it gets a smaller row cap

# Whether to enable class-imbalance weighting (class_weight="balanced") for the linear / kernel models
USE_CLASS_WEIGHT_BALANCED = True

In [2]:
# -------------------- Input paths --------------------
# Built from the DATA_DIR value in effect when this cell runs.
# NOTE(review): main() recomputes its own local copies of all of these paths
# from the DATA_DIR set in a later cell, so the module-level values below are
# not what main() reads.
TRAIN_TRANS_PATH = os.path.join(DATA_DIR, "train_transaction.csv")
TRAIN_ID_PATH    = os.path.join(DATA_DIR, "train_identity.csv")
TEST_TRANS_PATH  = os.path.join(DATA_DIR, "test_transaction.csv")
TEST_ID_PATH     = os.path.join(DATA_DIR, "test_identity.csv")
SAMPLE_SUB_PATH  = os.path.join(DATA_DIR, "sample_submission.csv")

# -------------------- Output paths --------------------
OUT_METRICS_CSV  = os.path.join(DATA_DIR, "model_metrics.csv")
OUT_ROC_PNG      = os.path.join(DATA_DIR, "roc_curves.png")
OUT_CM_PNG       = os.path.join(DATA_DIR, "confusion_matrix_best.png")
OUT_SUBMISSION   = os.path.join(DATA_DIR, "submission_best_model.csv")


In [3]:
# XGBoost is optional: record whether it could be imported and, when it could
# not, keep the error text so the skip message can explain why.
try:
    from xgboost import XGBClassifier
except Exception as import_failure:
    xgb_available = False
    xgb_import_error = str(import_failure)
else:
    xgb_available = True

def load_and_merge(trans_path, id_path):
    """Read the transaction and identity CSVs and left-join them.

    Args:
        trans_path: Path of the transaction CSV (left side of the join).
        id_path: Path of the identity CSV (right side of the join).

    Returns:
        pandas.DataFrame with every transaction row; identity columns are
        filled where a matching "TransactionID" exists and missing otherwise.
    """
    transactions = pd.read_csv(trans_path)
    identities = pd.read_csv(id_path)
    return pd.merge(transactions, identities, on="TransactionID", how="left")

def stratified_cap(X, y, cap, seed=RANDOM_STATE):
    """Down-sample (X, y) to at most `cap` rows, stratified on `y`.

    Args:
        X: Feature table.
        y: Labels aligned with X; also used as the stratification key.
        cap: Maximum number of rows to keep, or None for no limit.
        seed: Random state forwarded to train_test_split.

    Returns:
        (X, y) unchanged when no cap applies or the data already fits;
        otherwise a stratified sample of exactly `cap` rows.
    """
    needs_sampling = cap is not None and len(X) > cap
    if not needs_sampling:
        return X, y

    # Only the "train" side of the split is kept; the remainder is discarded.
    kept_X, _rest_X, kept_y, _rest_y = train_test_split(
        X, y, train_size=cap, stratify=y, random_state=seed
    )
    return kept_X, kept_y

def align_columns_for_inference(X_df, num_cols, cat_cols):
    """Return a copy of `X_df` restricted to the training columns, in order.

    Columns expected by the model but absent from `X_df` are created as
    all-missing (NaN for numerical columns, object-dtype NaN otherwise);
    columns not listed in `num_cols` or `cat_cols` are dropped. The input
    frame is not modified.

    Args:
        X_df: Frame to align (e.g. the raw test features).
        num_cols: Numerical column names, in training order.
        cat_cols: Categorical column names, in training order.

    Returns:
        pandas.DataFrame whose columns are exactly num_cols followed by cat_cols.
    """
    aligned = X_df.copy()
    expected = [*num_cols, *cat_cols]

    for col in expected:
        if col in aligned.columns:
            continue
        # Create the missing column with a dtype matching its declared kind.
        if col in num_cols:
            aligned[col] = np.nan
        else:
            aligned[col] = pd.Series([np.nan] * len(aligned), dtype="object")

    # Selecting by the expected list both drops extras and fixes the order.
    return aligned[expected]

def evaluate_model(name, pipe, Xtr, ytr, Xva, yva, store_curves):
    """Fit `pipe` on the training data and score it on the validation data.

    Args:
        name: Label for the model; used as the key in `store_curves` and as
            the "model" entry of the returned dict.
        pipe: Estimator/pipeline exposing `fit` and at least one of
            `predict_proba`, `decision_function` or `predict`.
        Xtr, ytr: Training features and labels.
        Xva, yva: Validation features and labels.
        store_curves: Dict mutated in place; receives
            `store_curves[name] = (fpr, tpr, auc)`.

    Returns:
        dict with keys "model", "ROC_AUC", "Accuracy", "Precision", "Recall",
        "F1", "cm" (confusion matrix) and "pipe" (the fitted estimator).

    Note:
        Hard predictions assume binary labels encoded as 0/1 with 1 as the
        positive class.
    """
    pipe.fit(Xtr, ytr)

    if hasattr(pipe, "predict_proba"):
        proba = pipe.predict_proba(Xva)[:, 1]
        preds = (proba >= 0.5).astype(int)
    elif hasattr(pipe, "decision_function"):
        dfun = pipe.decision_function(Xva)
        # The classifier's decision boundary is at 0 on the raw scores.
        # Thresholding the min-max scaled scores at 0.5 would instead cut at
        # the midpoint of the observed score range, which is generally a
        # different point, so take hard labels from the raw scores.
        preds = (dfun > 0).astype(int)
        # The scaled scores are only used for ranking (ROC / AUC); the
        # epsilon guards against a zero range when all scores are equal.
        proba = (dfun - dfun.min()) / (dfun.max() - dfun.min() + 1e-9)
    else:
        # Last resort: hard labels double as scores.
        proba = pipe.predict(Xva)
        preds = (proba >= 0.5).astype(int)

    auc  = roc_auc_score(yva, proba)
    acc  = accuracy_score(yva, preds)
    prec = precision_score(yva, preds, zero_division=0)
    rec  = recall_score(yva, preds, zero_division=0)
    f1   = f1_score(yva, preds, zero_division=0)
    cm   = confusion_matrix(yva, preds)
    fpr, tpr, _ = roc_curve(yva, proba)
    store_curves[name] = (fpr, tpr, auc)

    return {"model": name, "ROC_AUC": auc, "Accuracy": acc,
            "Precision": prec, "Recall": rec, "F1": f1, "cm": cm, "pipe": pipe}


In [4]:
from sklearn.preprocessing import OneHotEncoder

def build_preprocessing_pipeline(num_imputation_strategy="median", cat_encoding_strategy="ordinal", numerical_scaling=True, num_cols=None, cat_cols=None):
    """
    Assemble an (unfitted) ColumnTransformer for numerical and categorical columns.

    Numerical columns are imputed and optionally standardized; categorical
    columns are imputed with the most frequent value and then encoded.
    Columns not listed in either group are dropped.

    Args:
        num_imputation_strategy (str): SimpleImputer strategy for numerical
            columns (e.g. 'mean' or 'median').
        cat_encoding_strategy (str): 'ordinal' or 'onehot'.
        numerical_scaling (bool): Append a StandardScaler to the numerical branch.
        num_cols (list): Numerical column names (required).
        cat_cols (list): Categorical column names (required).

    Returns:
        ColumnTransformer: The constructed preprocessing pipeline.

    Raises:
        ValueError: If num_cols or cat_cols is None, or if
            cat_encoding_strategy is not 'ordinal' or 'onehot'.
    """
    if num_cols is None or cat_cols is None:
        raise ValueError("num_cols and cat_cols must be provided.")

    # Numerical branch: impute first, scale only when requested.
    numeric_steps = [("imputer", SimpleImputer(strategy=num_imputation_strategy))]
    if numerical_scaling:
        numeric_steps.append(("scaler", StandardScaler()))

    # Categorical branch: most-frequent imputation followed by the chosen encoder.
    categorical_imputer = SimpleImputer(strategy="most_frequent")
    if cat_encoding_strategy == "onehot":
        # Dense output; categories unseen at fit time encode as all zeros.
        categorical_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    elif cat_encoding_strategy == "ordinal":
        # Categories unseen at fit time map to -1.
        categorical_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    else:
        raise ValueError("Invalid cat_encoding_strategy. Use 'ordinal' or 'onehot'.")

    categorical_steps = [
        ("imputer", categorical_imputer),
        ("encoder", categorical_encoder),
    ]

    return ColumnTransformer(
        [
            ("num", Pipeline(numeric_steps), num_cols),
            ("cat", Pipeline(categorical_steps), cat_cols),
        ],
        remainder="drop",
    )

**Reasoning**:
Import the SMOTE class and update the main function to include the SMOTE application based on a flag.



In [None]:
# Update the DATA_DIR variable to the correct path where the data files are located
# NOTE(review): this rebinds the DATA_DIR set in the first cell to an absolute
# Colab path; main() reads this global when it builds its input/output paths.
DATA_DIR = "/content/"
# When True, main() oversamples the training data with SMOTE.
USE_SMOTE = True
def main():
    """Train and compare the fraud classifiers, then write all artifacts.

    Steps:
      1. Load and merge the train/test transaction and identity CSVs.
      2. Cap the training data (stratified) and split it into train/validation.
      3. Fit one preprocessor per model family (scaled for LogisticRegression,
         unscaled for the tree models, and a separate scaled one for the
         smaller SVM subset).
      4. Optionally oversample the tree-model training data with SMOTE.
      5. Train and evaluate every model; save metrics, confusion matrices and
         ROC curves.
      6. Predict the test set with the model that has the best validation
         ROC_AUC and write the submission file.
      7. Save a metric comparison bar plot and a feature correlation heatmap.

    Reads the module-level settings DATA_DIR, USE_SMOTE, MAX_ROWS,
    SVM_MAX_ROWS, TEST_SIZE, RANDOM_STATE and USE_CLASS_WEIGHT_BALANCED.

    Returns:
        None. All results are printed or written under DATA_DIR.
    """
    # A function-level import binds `sns` as a local name, so it has to run
    # before the first heatmap call; importing it later in the body raises
    # UnboundLocalError at the first use.
    import seaborn as sns

    t0 = time.time()
    print("Loading & merging...")

    # Paths are rebuilt here so that they follow the current DATA_DIR.
    TRAIN_TRANS_PATH = os.path.join(DATA_DIR, "train_transaction.csv")
    TRAIN_ID_PATH    = os.path.join(DATA_DIR, "train_identity.csv")
    TEST_TRANS_PATH  = os.path.join(DATA_DIR, "test_transaction.csv")
    TEST_ID_PATH     = os.path.join(DATA_DIR, "test_identity.csv")
    SAMPLE_SUB_PATH  = os.path.join(DATA_DIR, "sample_submission.csv")

    # -------------------- Output Paths --------------------
    OUT_METRICS_CSV  = os.path.join(DATA_DIR, "model_metrics.csv")
    OUT_ROC_PNG      = os.path.join(DATA_DIR, "roc_curves.png")
    OUT_CM_PNG       = os.path.join(DATA_DIR, "confusion_matrix_best.png")
    OUT_SUBMISSION   = os.path.join(DATA_DIR, "submission_best_model.csv")

    train = load_and_merge(TRAIN_TRANS_PATH, TRAIN_ID_PATH)
    test  = load_and_merge(TEST_TRANS_PATH, TEST_ID_PATH)

    assert "isFraud" in train.columns, "训练集必须包含目标列 isFraud"
    id_col = "TransactionID"

    y_full = train["isFraud"].astype(int)
    X_full = train.drop(columns=["isFraud"])
    assert id_col in X_full.columns and id_col in test.columns

    # Stratified down-sampling of the whole training set.
    X_cap, y_cap = stratified_cap(X_full, y_full, MAX_ROWS)

    # Train / validation split.
    X_tr, X_va, y_tr, y_va = train_test_split(
        X_cap, y_cap, test_size=TEST_SIZE, stratify=y_cap, random_state=RANDOM_STATE
    )

    # Column type detection: numeric dtypes are numerical, everything else
    # (except the id) is treated as categorical.
    num_cols = X_tr.select_dtypes(include=[np.number]).columns.tolist()
    if id_col in num_cols:
        num_cols.remove(id_col)
    cat_cols = [c for c in X_tr.columns if c not in num_cols and c != id_col]

    def make_preprocessor(scale):
        """Build an unfitted preprocessor; `scale` toggles StandardScaler."""
        return build_preprocessing_pipeline(
            num_imputation_strategy="median",
            cat_encoding_strategy="ordinal",
            numerical_scaling=scale,
            num_cols=num_cols,
            cat_cols=cat_cols,
        )

    preproc_linear = make_preprocessor(True)
    preproc_tree   = make_preprocessor(False)  # tree models don't need scaling
    # The SVM trains on a smaller subset, so it gets its own preprocessor.
    # Re-fitting preproc_linear on that subset would silently change the
    # transform later applied to the test set for LogisticRegression.
    preproc_svm    = make_preprocessor(True)

    X_tr_features = X_tr.drop(columns=[id_col], errors="ignore")
    X_va_features = X_va.drop(columns=[id_col], errors="ignore")

    # Each preprocessor is fitted exactly once, on the data its model trains on.
    X_tr_processed_linear = preproc_linear.fit_transform(X_tr_features)
    X_va_processed_linear = preproc_linear.transform(X_va_features)

    X_tr_processed_tree = preproc_tree.fit_transform(X_tr_features)
    X_va_processed_tree = preproc_tree.transform(X_va_features)

    # The linear model always trains on the un-resampled data; only the tree
    # models receive SMOTE-resampled data. Validation data is never resampled.
    X_tr_linear_final, y_tr_linear_final = X_tr_processed_linear, y_tr
    if USE_SMOTE:
        # NOTE(review): SMOTE interpolates between rows, which on
        # ordinal-encoded categorical columns yields non-integer codes;
        # consider a categorical-aware sampler.
        print("Applying SMOTE to preprocessed tree data...")
        smote = SMOTE(random_state=RANDOM_STATE)
        X_tr_tree_final, y_tr_tree_final = smote.fit_resample(X_tr_processed_tree, y_tr)
        print(f"Original training size: {len(y_tr)}, SMOTE resampled size: {len(y_tr_tree_final)}")
    else:
        X_tr_tree_final, y_tr_tree_final = X_tr_processed_tree, y_tr

    # SVM: stratified subset of the training split, full validation set.
    X_svm_tr, y_svm_tr = stratified_cap(X_tr_features, y_tr, SVM_MAX_ROWS)
    X_tr_svm_final, y_tr_svm_final = preproc_svm.fit_transform(X_svm_tr), y_svm_tr
    X_va_svm_final, y_va_svm_final = preproc_svm.transform(X_va_features), y_va

    class_weight = "balanced" if USE_CLASS_WEIGHT_BALANCED else None

    # Models. Preprocessing is applied outside the pipelines, so each pipeline
    # holds only its classifier.
    models = {}
    models["LogisticRegression"] = Pipeline([
        ("clf", LogisticRegression(
            solver="saga", penalty="l2", C=1.0, max_iter=300,
            class_weight=class_weight, random_state=RANDOM_STATE))
    ])
    models["RandomForest"] = Pipeline([
        ("clf", RandomForestClassifier(
            n_estimators=200, n_jobs=-1, random_state=RANDOM_STATE))
    ])
    models["SVM"] = Pipeline([
        ("clf", SVC(kernel="rbf", C=1.0, gamma="scale",
                    probability=True, class_weight=class_weight,
                    random_state=RANDOM_STATE))
    ])

    if xgb_available:
        # scale_pos_weight follows the class balance of the data XGBoost
        # actually trains on (after SMOTE this is the resampled data).
        pos = (y_tr_tree_final == 1).sum()
        neg = (y_tr_tree_final == 0).sum()
        spw = max((neg / max(pos, 1)), 1.0)
        smote_label = "after" if USE_SMOTE else "without"
        print(f"XGBoost scale_pos_weight {smote_label} SMOTE: {spw}")

        models["XGBoost"] = Pipeline([
            ("clf", XGBClassifier(
                n_estimators=300, learning_rate=0.08, max_depth=8,
                subsample=0.8, colsample_bytree=0.8,
                eval_metric="logloss", tree_method="hist",
                scale_pos_weight=spw,
                n_jobs=-1, random_state=RANDOM_STATE))
        ])
    else:
        print("XGBoost error，已跳过：", xgb_import_error)

    # Per-model training/validation data and the preprocessor that produced
    # it. Models not listed here (RandomForest, XGBoost) use the tree variants.
    model_data = {
        "LogisticRegression": (X_tr_linear_final, y_tr_linear_final, X_va_processed_linear, y_va),
        "SVM": (X_tr_svm_final, y_tr_svm_final, X_va_svm_final, y_va_svm_final),
    }
    tree_data = (X_tr_tree_final, y_tr_tree_final, X_va_processed_tree, y_va)
    model_preprocessors = {
        "LogisticRegression": preproc_linear,
        "SVM": preproc_svm,
    }

    # Training
    print("Training & evaluating...")
    curves, rows, fitted = {}, [], {}
    cms = {}  # confusion matrix per model
    for name, pipe in models.items():
        print(f"Training {name}...")
        Xtr_m, ytr_m, Xva_m, yva_m = model_data.get(name, tree_data)
        res = evaluate_model(name, pipe, Xtr_m, ytr_m, Xva_m, yva_m, curves)
        fitted[name] = res["pipe"]
        cms[name] = res["cm"]
        rows.append({k: v for k, v in res.items() if k not in ("pipe", "cm")})

    metrics_df = pd.DataFrame(rows).sort_values("ROC_AUC", ascending=False)
    metrics_df.to_csv(OUT_METRICS_CSV, index=False)
    print("\nMetrics:\n", metrics_df)

    # metrics_df is sorted by ROC_AUC (descending), so the first row is the
    # best model.
    best_name = metrics_df.iloc[0]["model"]
    best_pipe = fitted[best_name]
    print(f"\nBest model by validation ROC_AUC: {best_name}")

    def save_confusion_matrix(cm, title, path):
        """Render `cm` as an annotated heatmap and save it to `path`."""
        plt.figure()
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(title)
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.tight_layout()
        plt.savefig(path)
        plt.close()

    # Print and save the confusion matrix of every model.
    print("\nConfusion Matrices:")
    for name, cm in cms.items():
        print(f"\n{name}:\n{cm}")
        save_confusion_matrix(
            cm,
            f"Confusion Matrix: {name}",
            os.path.join(DATA_DIR, f"confusion_matrix_{name}.png"),
        )
    # The best model's matrix is also written to OUT_CM_PNG, which is listed
    # among the saved artifacts below.
    save_confusion_matrix(cms[best_name], f"Confusion Matrix: {best_name}", OUT_CM_PNG)

    # ROC curves
    plt.figure()
    for name, (fpr, tpr, auc) in curves.items():
        plt.plot(fpr, tpr, label=f"{name} (AUC={auc:.3f})")
    plt.plot([0, 1], [0, 1], linestyle="--")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curves (Validation)")
    plt.legend()
    plt.tight_layout()
    plt.savefig(OUT_ROC_PNG)
    plt.close()

    # Prepare the test data: align its columns with the training columns,
    # then apply the preprocessor fitted for the best model.
    test_ids = test[id_col].values
    X_test_raw = test.drop(columns=[id_col])
    X_test_aligned = align_columns_for_inference(X_test_raw, num_cols, cat_cols)
    best_preproc = model_preprocessors.get(best_name, preproc_tree)
    X_test_processed = best_preproc.transform(X_test_aligned)

    if hasattr(best_pipe, "predict_proba"):
        test_proba = best_pipe.predict_proba(X_test_processed)[:, 1]
    elif hasattr(best_pipe, "decision_function"):
        dfun = best_pipe.decision_function(X_test_processed)
        # Min-max scale the decision values into [0, 1].
        test_proba = (dfun - dfun.min()) / (dfun.max() - dfun.min() + 1e-9)
    else:
        test_proba = best_pipe.predict(X_test_processed)

    # Save the submission file; column names come from the sample submission.
    sample = pd.read_csv(SAMPLE_SUB_PATH)
    submission = pd.DataFrame({
        sample.columns[0]: test_ids,
        sample.columns[1]: test_proba
    })
    submission.to_csv(OUT_SUBMISSION, index=False)

    # 1. Extract the threshold-based metrics for comparison.
    metrics_summary = metrics_df.set_index('model')[['Accuracy', 'Precision', 'Recall', 'F1']]
    print("模型对比指标：\n", metrics_summary)

    # 2. Bar chart comparing the models.
    metrics_summary.plot(kind='bar', figsize=(10,6))
    plt.title('Models Performance Comparison')
    plt.ylabel('Score')
    plt.tight_layout()
    plt.savefig(os.path.join(DATA_DIR, "model_comparison_barplot.png"))
    plt.close()

    # Correlation heatmap of the median-imputed numerical features. Imputing
    # with pandas keeps the frame's shape even if a column is entirely missing
    # (its correlations are then NaN), so the column labels always line up.
    X_cap_num_only = X_cap[num_cols]
    X_cap_imputed_num_df = X_cap_num_only.fillna(X_cap_num_only.median())

    plt.figure(figsize=(12,10))
    sns.heatmap(X_cap_imputed_num_df.corr(), annot=False, cmap='coolwarm')
    plt.title("Numerical Feature Correlation Heatmap (after Median Imputation)")
    plt.tight_layout()
    plt.savefig(os.path.join(DATA_DIR, "feature_correlation_heatmap.png"))
    plt.close()

    print("\nArtifacts saved:")
    print(f"- {OUT_METRICS_CSV}")
    print(f"- {OUT_ROC_PNG}")
    print(f"- {OUT_CM_PNG}")
    print(f"- {OUT_SUBMISSION}")
    print(f"- {os.path.join(DATA_DIR, 'model_comparison_barplot.png')}")
    print(f"- {os.path.join(DATA_DIR, 'feature_correlation_heatmap.png')}")
    # List the per-model confusion matrix files.
    for name in cms.keys():
        print(f"- {os.path.join(DATA_DIR, f'confusion_matrix_{name}.png')}")

    print(f"Done in {time.time()-t0:.1f}s")

# Run the whole pipeline when this cell (or the file as a script) is executed
# directly; importing the code as a module does not trigger a run.
if __name__ == "__main__":
    main()

Loading & merging...
Applying SMOTE...
Applying SMOTE to preprocessed tree data...
Original training size: 85434, SMOTE resampled size: 166576
XGBoost scale_pos_weight after SMOTE: 1.0
Training & evaluating...
Training LogisticRegression...
Training RandomForest...
Training SVM...


In [None]:
# NOTE(review): this cell repeats the test-preparation step from main() at
# module level. In the code visible here, `test`, `id_col`, `num_cols`,
# `cat_cols`, `preproc_linear` and `preproc_tree` are only ever assigned
# inside main(), and `best_name` is not assigned anywhere, so on a fresh
# kernel this cell presumably fails with NameError unless another cell (not
# shown) defines them. TODO confirm, then remove the cell or have main()
# expose what it needs.
test_ids = test[id_col].values
X_test_raw = test.drop(columns=[id_col])

# Align test data columns with training data columns before preprocessing
X_test_aligned = align_columns_for_inference(X_test_raw, num_cols, cat_cols)


# Use the appropriate preprocessor for the best model to transform test data
if best_name == "SVM" or best_name == "LogisticRegression":
    X_test_processed = preproc_linear.transform(X_test_aligned)
else: # RandomForest, XGBoost
    X_test_processed = preproc_tree.transform(X_test_aligned)