In [1]:
from train import load_config, load_dataset, add_label_from_metadata, train_and_evaluate, select_feature_columns, get_feature_names
from pathlib import Path
import sys
sys.path.append("/workspaces/tcr_structure_embedding/")
import pandas as pd
import numpy as np
import json
import joblib
import shap
import matplotlib.pyplot as plt
from typing import List

In [2]:
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score

def load_preds(path: Path):
    """Read a prediction CSV and check that it has the columns the curve plots need.

    Args:
        path: Location of a CSV file holding at least the columns ``y_true``
            and ``y_prob``; any additional columns are kept untouched.

    Returns:
        The parsed file as a ``pandas.DataFrame``.

    Raises:
        ValueError: If ``y_true`` or ``y_prob`` is absent from the header.
    """
    required_columns = {"y_true", "y_prob"}
    preds = pd.read_csv(path)
    if required_columns <= set(preds.columns):
        return preds
    raise ValueError(f"Pred file {path} missing y_true/y_prob columns")

def plot_curves(paths: List[Path], labels: List[str], output: Path):
    """Overlay ROC and precision-recall curves of several prediction files and save the figure.

    The figure has two stacked panels: ROC on top (with the chance diagonal)
    and precision-recall below. Each curve's legend entry carries its AUC /
    average precision.

    Args:
        paths: Prediction CSVs readable by ``load_preds`` (each needs ``y_true``
            and ``y_prob`` columns). ``str`` paths are accepted as well.
        labels: One legend label per entry of ``paths``, in the same order.
        output: Destination image file; missing parent directories are created.

    Raises:
        ValueError: If ``paths`` and ``labels`` differ in length, or (raised by
            ``load_preds``) if a prediction file lacks the required columns.
    """
    paths = list(paths)
    labels = list(labels)
    # zip() would silently drop the surplus entries, producing a figure that
    # looks complete but is missing curves, so reject mismatches up front.
    if len(paths) != len(labels):
        raise ValueError(
            f"paths and labels must have the same length, got {len(paths)} paths and {len(labels)} labels"
        )
    output = Path(output)

    # Create figure and two subplots: ax1 = ROC, ax2 = PR
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(6, 10), sharex=False)
    try:
        for path, label in zip(paths, labels):
            df = load_preds(path)  # expects columns "y_true", "y_prob"

            # Precision–Recall
            precision, recall, _ = precision_recall_curve(df["y_true"], df["y_prob"])
            ap = average_precision_score(df["y_true"], df["y_prob"])

            # ROC
            fpr, tpr, _ = roc_curve(df["y_true"], df["y_prob"])
            roc_auc = auc(fpr, tpr)

            ax1.plot(fpr, tpr, label=f"{label} (AUC={roc_auc:.3f})")
            ax2.plot(recall, precision, label=f"{label} (AP={ap:.3f})")

        # ROC plot formatting
        ax1.plot([0, 1], [0, 1], "k--", alpha=0.5)
        ax1.set_xlabel("False Positive Rate")
        ax1.set_ylabel("True Positive Rate")
        ax1.set_title("ROC comparison")
        ax1.legend()

        # PR plot formatting
        ax2.set_xlabel("Recall")
        ax2.set_ylabel("Precision")
        ax2.set_title("Precision–Recall comparison")
        ax2.legend()

        # Lay out this figure explicitly rather than whichever one is "current".
        fig.tight_layout()
        output.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(output, dpi=300)
    finally:
        # Release the figure even when loading or plotting fails part-way.
        plt.close(fig)
    print(f"Saved PR/ROC plot to {output}")

## Specificity

In [4]:
# Load the pipeline configuration that the training cells below read and modify.
cfg = load_config(Path("/workspaces/tcr_structure_embedding/pipelines/maura_hnncc/config.yaml"))
# Store a fixed seed in the config — presumably consumed by train_and_evaluate; confirm in train.py.
cfg["seed"] = 42
# NOTE(review): `tmp` is not referenced by any cell visible in this section — confirm it is still needed.
tmp='Nils.cdr3b_aa.patient_id.stimulation_category_0307'

In [None]:
# Specificity / XGBoost: train one classifier per feature-set combination,
# persist the fitted pipeline, metrics, test predictions and SHAP attributions
# of every run, then overlay the ROC / PR curves of all runs in one figure.
feature_dir = "/workspaces/tcr_structure_embedding/outputs/maura_hnncc/features"
feature_files = {
    "struct": f"{feature_dir}/maura_hnncc_struct_features.csv",
    "inter": f"{feature_dir}/maura_hnncc_interface_analyzer.csv",
    "res": f"{feature_dir}/maura_hnncc_residue_energy_cdr.csv",
}
# A run name is the "_"-joined list of the feature tables it combines.
feature_sets = ["struct", "inter", "res", "struct_inter", "struct_res", "inter_res", "struct_inter_res"]

# Kept separate from `model`, which holds the fitted pipeline further down.
model_name = "xgboost"
# Prediction CSVs written by this cell. Stored as strings because other cells
# may derive their own prediction paths from this list via str.replace.
pred_lists = []

for name in feature_sets:
    feature = [feature_files[part] for part in name.split("_")]
    model_suffix = f"specific_{model_name}_{name}"

    cfg["feature_files"] = feature

    df = load_dataset(cfg)
    df = add_label_from_metadata(df, cfg)
    # exclude columns listed in cfg["feature_columns"]
    df = df.loc[:, ~df.columns.isin(cfg["feature_columns"])]

    label_col = cfg.get("label_column")
    if not label_col or label_col not in df.columns:
        raise ValueError("label_column must be set in config and exist in the feature table.")

    if cfg.get("filter_na_labels", True) is True:
        df = df.loc[~df[label_col].isna()]
    # Report the dataset size whether or not unlabeled rows were filtered.
    print(f"Loaded dataset with {df.shape[0]} samples and {df.shape[1]} features.")

    cols = select_feature_columns(df, label_col)

    result = train_and_evaluate(df, label_col, cfg, model_name=model_name)

    model_dir = Path(cfg["output_model_dir"])
    report_dir = Path(cfg["output_report_dir"])
    model_dir.mkdir(parents=True, exist_ok=True)
    report_dir.mkdir(parents=True, exist_ok=True)

    model_path = model_dir / f"maura_hnncc_{model_suffix}.joblib"
    joblib.dump(result["model"], model_path)

    metrics_path = report_dir / f"maura_hnncc_{model_suffix}_metric.json"
    with metrics_path.open("w") as f:
        json.dump(result["metrics"], f, indent=2)

    test_index = result["X_test"].index.tolist()
    predictions_path = report_dir / f"maura_hnncc_{model_suffix}_metric_pred.csv"
    pd.DataFrame({
        "row_idx": test_index,
        "y_true": result["y_test"],
        "y_prob": result["test_prob"],
    }).to_csv(predictions_path, index=False)
    pred_lists.append(str(predictions_path))

    print(f"Saved model to {model_path}")
    print(f"Saved metrics to {metrics_path}")
    print(f"Saved predictions to {predictions_path}")

    # Split the fitted pipeline into its preprocessing part and the final
    # classifier so SHAP explains the classifier on the features it was fed.
    model = result["model"]
    if "preprocess" in model.named_steps:
        preproc = model.named_steps["preprocess"]
    elif len(model.steps) > 1:
        preproc = model[:-1]
    else:
        preproc = None
    clf = model.named_steps["clf"]
    X_test_proc = preproc.transform(result["X_test"]) if preproc is not None else result["X_test"]

    explainer = shap.TreeExplainer(clf)
    shap_values = explainer.shap_values(X_test_proc)

    shap_vals_array = np.array(shap_values)
    # NOTE(review): [0] assumes a (classes, samples, features) layout; shap
    # releases that return (samples, features, classes) would need [..., k]
    # instead — confirm against the installed shap version.
    if shap_vals_array.ndim == 3:  # rare multiclass case
        shap_vals_array = shap_vals_array[0]
    mean_abs = np.abs(shap_vals_array).mean(axis=0)
    feature_names = get_feature_names(preproc, cols["categorical"], cols["numerical"])

    # First CSV row: mean |SHAP| per feature; remaining rows: one per test sample.
    shap_df = pd.concat([pd.DataFrame(mean_abs.tolist(), index=feature_names).T,
                         pd.DataFrame(shap_vals_array.tolist(), columns=feature_names)], ignore_index=True)
    shap_df.insert(0, "row_idx", ["mean_abs_shap"] + test_index)
    shap_df.to_csv(report_dir / f"maura_hnncc_{model_suffix}_shap.csv", index=False)

    shap.plots.violin(shap_values, features = X_test_proc, feature_names=feature_names, max_display=50, sort=True, plot_size=(25, 15), show=False)
    plt.savefig(report_dir / f"maura_hnncc_{model_suffix}_shap.png", dpi=500) #.png,.pdf will also support here
    plt.close()

labels = [f"xgb_{x}" for x in feature_sets]

# Compare all runs using exactly the prediction files written above.
plot_curves(pred_lists, labels, report_dir / "maura_hnncc_specific_xgboost_all_roc.png")

Loaded dataset with 1500 samples and 268 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hnncc_specific_xgboost_struct.joblib
Saved metrics to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_specific_xgboost_struct_metric.json
Saved predictions to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_specific_xgboost_struct_metric_pred.csv
Loaded dataset with 1500 samples and 22 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hnncc_specific_xgboost_inter.joblib
Saved metrics to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_specific_xgboost_inter_metric.json
Saved predictions to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_specific_xgboost_inter_metric_pred.csv
Loaded dataset with 1500 samples and 221 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models

In [None]:
# Specificity / logistic regression: train one classifier per feature-set
# combination, persist the fitted pipeline, metrics, test predictions and SHAP
# attributions of every run, then overlay the ROC / PR curves of all runs.
feature_dir = "/workspaces/tcr_structure_embedding/outputs/maura_hnncc/features"
feature_files = {
    "struct": f"{feature_dir}/maura_hnncc_struct_features.csv",
    "inter": f"{feature_dir}/maura_hnncc_interface_analyzer.csv",
    "res": f"{feature_dir}/maura_hnncc_residue_energy_cdr.csv",
}
# A run name is the "_"-joined list of the feature tables it combines.
feature_sets = ["struct", "inter", "res", "struct_inter", "struct_res", "inter_res", "struct_inter_res"]

# Kept separate from `model`, which holds the fitted pipeline further down.
model_name = "logreg"
# Prediction CSVs written by this cell, so the comparison plot does not rely on
# a path list defined in a different cell.
prediction_paths = []

for name in feature_sets:
    feature = [feature_files[part] for part in name.split("_")]
    model_suffix = f"specific_{model_name}_{name}"

    cfg["feature_files"] = feature

    df = load_dataset(cfg)
    df = add_label_from_metadata(df, cfg)
    # exclude columns listed in cfg["feature_columns"]
    df = df.loc[:, ~df.columns.isin(cfg["feature_columns"])]

    label_col = cfg.get("label_column")
    if not label_col or label_col not in df.columns:
        raise ValueError("label_column must be set in config and exist in the feature table.")

    if cfg.get("filter_na_labels", True) is True:
        df = df.loc[~df[label_col].isna()]
    # Report the dataset size whether or not unlabeled rows were filtered.
    print(f"Loaded dataset with {df.shape[0]} samples and {df.shape[1]} features.")

    cols = select_feature_columns(df, label_col)

    result = train_and_evaluate(df, label_col, cfg, model_name=model_name)

    model_dir = Path(cfg["output_model_dir"])
    report_dir = Path(cfg["output_report_dir"])
    model_dir.mkdir(parents=True, exist_ok=True)
    report_dir.mkdir(parents=True, exist_ok=True)

    model_path = model_dir / f"maura_hnncc_{model_suffix}.joblib"
    joblib.dump(result["model"], model_path)

    metrics_path = report_dir / f"maura_hnncc_{model_suffix}_metric.json"
    with metrics_path.open("w") as f:
        json.dump(result["metrics"], f, indent=2)

    test_index = result["X_test"].index.tolist()
    predictions_path = report_dir / f"maura_hnncc_{model_suffix}_metric_pred.csv"
    pd.DataFrame({
        "row_idx": test_index,
        "y_true": result["y_test"],
        "y_prob": result["test_prob"],
    }).to_csv(predictions_path, index=False)
    prediction_paths.append(predictions_path)

    print(f"Saved model to {model_path}")
    print(f"Saved metrics to {metrics_path}")
    print(f"Saved predictions to {predictions_path}")

    # Split the fitted pipeline into its preprocessing part and the final
    # classifier so SHAP explains the classifier on the features it was fed.
    model = result["model"]
    if "preprocess" in model.named_steps:
        preproc = model.named_steps["preprocess"]
    elif len(model.steps) > 1:
        preproc = model[:-1]
    else:
        preproc = None
    clf = model.named_steps["clf"]
    X_train_proc = preproc.transform(result["X_train"]) if preproc is not None else result["X_train"]
    X_test_proc = preproc.transform(result["X_test"]) if preproc is not None else result["X_test"]

    # The preprocessed training set is handed to the explainer as its reference data.
    explainer = shap.LinearExplainer(clf, X_train_proc)
    shap_values = explainer.shap_values(X_test_proc)

    shap_vals_array = np.array(shap_values)
    # NOTE(review): [0] assumes a (classes, samples, features) layout; shap
    # releases that return (samples, features, classes) would need [..., k]
    # instead — confirm against the installed shap version.
    if shap_vals_array.ndim == 3:  # rare multiclass case
        shap_vals_array = shap_vals_array[0]
    mean_abs = np.abs(shap_vals_array).mean(axis=0)
    feature_names = get_feature_names(preproc, cols["categorical"], cols["numerical"])

    # First CSV row: mean |SHAP| per feature; remaining rows: one per test sample.
    shap_df = pd.concat([pd.DataFrame(mean_abs.tolist(), index=feature_names).T,
                         pd.DataFrame(shap_vals_array.tolist(), columns=feature_names)], ignore_index=True)
    shap_df.insert(0, "row_idx", ["mean_abs_shap"] + test_index)
    shap_df.to_csv(report_dir / f"maura_hnncc_{model_suffix}_shap.csv", index=False)

    shap.plots.violin(shap_values, features = X_test_proc, feature_names=feature_names, max_display=50, sort=True, plot_size=(25, 15), show=False)
    plt.savefig(report_dir / f"maura_hnncc_{model_suffix}_shap.png", dpi=500) #.png,.pdf will also support here
    plt.close()

labels = [f"logreg_{x}" for x in feature_sets]
# Compare all runs using exactly the prediction files written above.
plot_curves(prediction_paths, labels, report_dir / "maura_hnncc_specific_logreg_all_roc.png")

Loaded dataset with 1500 samples and 268 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hnncc_specific_logreg_struct.joblib
Saved metrics to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_specific_logreg_struct_metric.json
Saved predictions to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_specific_logreg_struct_metric_pred.csv
Loaded dataset with 1500 samples and 22 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hnncc_specific_logreg_inter.joblib
Saved metrics to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_specific_logreg_inter_metric.json
Saved predictions to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_specific_logreg_inter_metric_pred.csv
Loaded dataset with 1500 samples and 221 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura

In [None]:
# Specificity / random forest: train one classifier per feature-set
# combination, persist the fitted pipeline, metrics, test predictions and SHAP
# attributions of every run, then overlay the ROC / PR curves of all runs.
feature_dir = "/workspaces/tcr_structure_embedding/outputs/maura_hnncc/features"
feature_files = {
    "struct": f"{feature_dir}/maura_hnncc_struct_features.csv",
    "inter": f"{feature_dir}/maura_hnncc_interface_analyzer.csv",
    "res": f"{feature_dir}/maura_hnncc_residue_energy_cdr.csv",
}
# A run name is the "_"-joined list of the feature tables it combines.
feature_sets = ["struct", "inter", "res", "struct_inter", "struct_res", "inter_res", "struct_inter_res"]

# Name passed to train_and_evaluate; output files use the shorter "rf" tag.
model_name = "random_forest"
# Prediction CSVs written by this cell, so the comparison plot does not rely on
# a path list defined in a different cell.
prediction_paths = []

for name in feature_sets:
    feature = [feature_files[part] for part in name.split("_")]
    model_suffix = f"specific_rf_{name}"

    cfg["feature_files"] = feature

    df = load_dataset(cfg)
    df = add_label_from_metadata(df, cfg)
    # exclude columns listed in cfg["feature_columns"]
    df = df.loc[:, ~df.columns.isin(cfg["feature_columns"])]

    label_col = cfg.get("label_column")
    if not label_col or label_col not in df.columns:
        raise ValueError("label_column must be set in config and exist in the feature table.")

    if cfg.get("filter_na_labels", True) is True:
        df = df.loc[~df[label_col].isna()]
    # Report the dataset size whether or not unlabeled rows were filtered.
    print(f"Loaded dataset with {df.shape[0]} samples and {df.shape[1]} features.")

    cols = select_feature_columns(df, label_col)

    result = train_and_evaluate(df, label_col, cfg, model_name=model_name)

    model_dir = Path(cfg["output_model_dir"])
    report_dir = Path(cfg["output_report_dir"])
    model_dir.mkdir(parents=True, exist_ok=True)
    report_dir.mkdir(parents=True, exist_ok=True)

    model_path = model_dir / f"maura_hnncc_{model_suffix}.joblib"
    joblib.dump(result["model"], model_path)

    metrics_path = report_dir / f"maura_hnncc_{model_suffix}_metric.json"
    with metrics_path.open("w") as f:
        json.dump(result["metrics"], f, indent=2)

    test_index = result["X_test"].index.tolist()
    predictions_path = report_dir / f"maura_hnncc_{model_suffix}_metric_pred.csv"
    pd.DataFrame({
        "row_idx": test_index,
        "y_true": result["y_test"],
        "y_prob": result["test_prob"],
    }).to_csv(predictions_path, index=False)
    prediction_paths.append(predictions_path)

    print(f"Saved model to {model_path}")
    print(f"Saved metrics to {metrics_path}")
    print(f"Saved predictions to {predictions_path}")

    # Split the fitted pipeline into its preprocessing part and the final
    # classifier so SHAP explains the classifier on the features it was fed.
    # Only the test set is transformed: TreeExplainer is built from the
    # classifier alone here, so no training data is needed.
    model = result["model"]
    if "preprocess" in model.named_steps:
        preproc = model.named_steps["preprocess"]
    elif len(model.steps) > 1:
        preproc = model[:-1]
    else:
        preproc = None
    clf = model.named_steps["clf"]
    X_test_proc = preproc.transform(result["X_test"]) if preproc is not None else result["X_test"]

    explainer = shap.TreeExplainer(clf)
    shap_values = explainer.shap_values(X_test_proc)

    feature_names = get_feature_names(preproc, cols["categorical"], cols["numerical"])

    # shap_values is indexed on its last axis by class: 0 is reported as the
    # negative class and 1 as the positive class.
    # NOTE(review): presumably shaped (samples, features, 2) — confirm for the installed shap version.
    mean_abs_neg = np.abs(shap_values[..., 0]).mean(axis=0)
    mean_abs_pos = np.abs(shap_values[..., 1]).mean(axis=0)
    # First two CSV rows: mean |SHAP| per feature for each class; the remaining
    # rows hold one entry per test sample, each cell being the per-class values.
    shap_df = pd.concat([pd.DataFrame(mean_abs_neg.tolist(), index=feature_names).T,
                         pd.DataFrame(mean_abs_pos.tolist(), index=feature_names).T,
                         pd.DataFrame(shap_values.tolist(), columns=feature_names)], ignore_index=True)
    shap_df.insert(0, "row_idx", ["neg_mean_abs_shap","pos_mean_abs_shap"] + test_index)
    shap_df.to_csv(report_dir / f"maura_hnncc_{model_suffix}_shap.csv", index=False)

    shap.plots.violin(shap_values[..., 1], features = X_test_proc, feature_names=feature_names, max_display=50, sort=True, plot_size=(25, 15), show=False)
    plt.savefig(report_dir / f"maura_hnncc_{model_suffix}_shap.png", dpi=500) #.png,.pdf will also support here
    plt.close()

labels = [f"rf_{x}" for x in feature_sets]
# Compare all runs using exactly the prediction files written above.
plot_curves(prediction_paths, labels, report_dir / "maura_hnncc_specific_rf_all_roc.png")

Loaded dataset with 1500 samples and 268 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hnncc_specific_rf_struct.joblib
Saved metrics to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_specific_rf_struct_metric.json
Saved predictions to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_specific_rf_struct_metric_pred.csv
Loaded dataset with 1500 samples and 22 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hnncc_specific_rf_inter.joblib
Saved metrics to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_specific_rf_inter_metric.json
Saved predictions to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_specific_rf_inter_metric_pred.csv
Loaded dataset with 1500 samples and 221 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hnncc_specific_rf_res.j

Saved PR/ROC plot to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_specific_rf_all_roc.png


## Cytotoxicity

In [22]:
# Reload the pipeline configuration so the cytotoxicity runs start from the file's settings.
cfg = load_config(Path("/workspaces/tcr_structure_embedding/pipelines/maura_hnncc/config.yaml"))
# Store a fixed seed in the config — presumably consumed by train_and_evaluate; confirm in train.py.
cfg["seed"] = 42
# NOTE(review): `tmp` is not referenced by any cell visible in this section — confirm it is still needed.
tmp='Nils.cdr3b_aa.patient_id.stimulation_category_0307'

# Switch the prediction target to the "killing" label and drop rows that lack it
# (the training cells filter on label NaNs when filter_na_labels is True).
cfg["label_column"] = "killing"
cfg["filter_na_labels"] = True

In [23]:
# Cytotoxicity / XGBoost: train one classifier per feature-set combination,
# persist the fitted pipeline, metrics, test predictions and SHAP attributions
# of every run, then overlay the ROC / PR curves of all runs in one figure.
feature_dir = "/workspaces/tcr_structure_embedding/outputs/maura_hnncc/features"
feature_files = {
    "struct": f"{feature_dir}/maura_hnncc_struct_features.csv",
    "inter": f"{feature_dir}/maura_hnncc_interface_analyzer.csv",
    "res": f"{feature_dir}/maura_hnncc_residue_energy_cdr.csv",
}
# A run name is the "_"-joined list of the feature tables it combines.
feature_sets = ["struct", "inter", "res", "struct_inter", "struct_res", "inter_res", "struct_inter_res"]

# Kept separate from `model`, which holds the fitted pipeline further down.
model_name = "xgboost"
# Prediction CSVs written by this cell, so the comparison plot does not rely on
# a path list defined in a different cell.
prediction_paths = []

for name in feature_sets:
    feature = [feature_files[part] for part in name.split("_")]
    model_suffix = f"killing_{model_name}_{name}"

    cfg["feature_files"] = feature

    df = load_dataset(cfg)
    df = add_label_from_metadata(df, cfg)
    # exclude columns listed in cfg["feature_columns"]
    df = df.loc[:, ~df.columns.isin(cfg["feature_columns"])]

    label_col = cfg.get("label_column")
    if not label_col or label_col not in df.columns:
        raise ValueError("label_column must be set in config and exist in the feature table.")

    if cfg.get("filter_na_labels", True) is True:
        df = df.loc[~df[label_col].isna()]
    # Report the dataset size whether or not unlabeled rows were filtered.
    print(f"Loaded dataset with {df.shape[0]} samples and {df.shape[1]} features.")

    cols = select_feature_columns(df, label_col)

    result = train_and_evaluate(df, label_col, cfg, model_name=model_name)

    model_dir = Path(cfg["output_model_dir"])
    report_dir = Path(cfg["output_report_dir"])
    model_dir.mkdir(parents=True, exist_ok=True)
    report_dir.mkdir(parents=True, exist_ok=True)

    model_path = model_dir / f"maura_hnncc_{model_suffix}.joblib"
    joblib.dump(result["model"], model_path)

    metrics_path = report_dir / f"maura_hnncc_{model_suffix}_metric.json"
    with metrics_path.open("w") as f:
        json.dump(result["metrics"], f, indent=2)

    test_index = result["X_test"].index.tolist()
    predictions_path = report_dir / f"maura_hnncc_{model_suffix}_metric_pred.csv"
    pd.DataFrame({
        "row_idx": test_index,
        "y_true": result["y_test"],
        "y_prob": result["test_prob"],
    }).to_csv(predictions_path, index=False)
    prediction_paths.append(predictions_path)

    print(f"Saved model to {model_path}")
    print(f"Saved metrics to {metrics_path}")
    print(f"Saved predictions to {predictions_path}")

    # Split the fitted pipeline into its preprocessing part and the final
    # classifier so SHAP explains the classifier on the features it was fed.
    model = result["model"]
    if "preprocess" in model.named_steps:
        preproc = model.named_steps["preprocess"]
    elif len(model.steps) > 1:
        preproc = model[:-1]
    else:
        preproc = None
    clf = model.named_steps["clf"]
    X_test_proc = preproc.transform(result["X_test"]) if preproc is not None else result["X_test"]

    explainer = shap.TreeExplainer(clf)
    shap_values = explainer.shap_values(X_test_proc)

    shap_vals_array = np.array(shap_values)
    # NOTE(review): [0] assumes a (classes, samples, features) layout; shap
    # releases that return (samples, features, classes) would need [..., k]
    # instead — confirm against the installed shap version.
    if shap_vals_array.ndim == 3:  # rare multiclass case
        shap_vals_array = shap_vals_array[0]
    mean_abs = np.abs(shap_vals_array).mean(axis=0)
    feature_names = get_feature_names(preproc, cols["categorical"], cols["numerical"])

    # First CSV row: mean |SHAP| per feature; remaining rows: one per test sample.
    shap_df = pd.concat([pd.DataFrame(mean_abs.tolist(), index=feature_names).T,
                         pd.DataFrame(shap_vals_array.tolist(), columns=feature_names)], ignore_index=True)
    shap_df.insert(0, "row_idx", ["mean_abs_shap"] + test_index)
    shap_df.to_csv(report_dir / f"maura_hnncc_{model_suffix}_shap.csv", index=False)

    shap.plots.violin(shap_values, features = X_test_proc, feature_names=feature_names, max_display=50, sort=True, plot_size=(25, 15), show=False)
    plt.savefig(report_dir / f"maura_hnncc_{model_suffix}_shap.png", dpi=500) #.png,.pdf will also support here
    plt.close()

labels = [f"xgb_{x}" for x in feature_sets]

# Compare all runs using exactly the prediction files written above.
plot_curves(prediction_paths, labels, report_dir / "maura_hnncc_killing_xgboost_all_roc.png")

Loaded dataset with 275 samples and 268 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hnncc_killing_xgboost_struct.joblib
Saved metrics to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_killing_xgboost_struct_metric.json
Saved predictions to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_killing_xgboost_struct_metric_pred.csv
Loaded dataset with 275 samples and 22 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hnncc_killing_xgboost_inter.joblib
Saved metrics to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_killing_xgboost_inter_metric.json
Saved predictions to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_killing_xgboost_inter_metric_pred.csv
Loaded dataset with 275 samples and 221 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hn

In [24]:
# Cytotoxicity / logistic regression: train one classifier per feature-set
# combination, persist the fitted pipeline, metrics, test predictions and SHAP
# attributions of every run, then overlay the ROC / PR curves of all runs.
feature_dir = "/workspaces/tcr_structure_embedding/outputs/maura_hnncc/features"
feature_files = {
    "struct": f"{feature_dir}/maura_hnncc_struct_features.csv",
    "inter": f"{feature_dir}/maura_hnncc_interface_analyzer.csv",
    "res": f"{feature_dir}/maura_hnncc_residue_energy_cdr.csv",
}
# A run name is the "_"-joined list of the feature tables it combines.
feature_sets = ["struct", "inter", "res", "struct_inter", "struct_res", "inter_res", "struct_inter_res"]

# Kept separate from `model`, which holds the fitted pipeline further down.
model_name = "logreg"
# Prediction CSVs written by this cell, so the comparison plot does not rely on
# a path list defined in a different cell.
prediction_paths = []

for name in feature_sets:
    feature = [feature_files[part] for part in name.split("_")]
    model_suffix = f"killing_{model_name}_{name}"

    cfg["feature_files"] = feature

    df = load_dataset(cfg)
    df = add_label_from_metadata(df, cfg)
    # exclude columns listed in cfg["feature_columns"]
    df = df.loc[:, ~df.columns.isin(cfg["feature_columns"])]

    label_col = cfg.get("label_column")
    if not label_col or label_col not in df.columns:
        raise ValueError("label_column must be set in config and exist in the feature table.")

    if cfg.get("filter_na_labels", True) is True:
        df = df.loc[~df[label_col].isna()]
    # Report the dataset size whether or not unlabeled rows were filtered.
    print(f"Loaded dataset with {df.shape[0]} samples and {df.shape[1]} features.")

    cols = select_feature_columns(df, label_col)

    result = train_and_evaluate(df, label_col, cfg, model_name=model_name)

    model_dir = Path(cfg["output_model_dir"])
    report_dir = Path(cfg["output_report_dir"])
    model_dir.mkdir(parents=True, exist_ok=True)
    report_dir.mkdir(parents=True, exist_ok=True)

    model_path = model_dir / f"maura_hnncc_{model_suffix}.joblib"
    joblib.dump(result["model"], model_path)

    metrics_path = report_dir / f"maura_hnncc_{model_suffix}_metric.json"
    with metrics_path.open("w") as f:
        json.dump(result["metrics"], f, indent=2)

    test_index = result["X_test"].index.tolist()
    predictions_path = report_dir / f"maura_hnncc_{model_suffix}_metric_pred.csv"
    pd.DataFrame({
        "row_idx": test_index,
        "y_true": result["y_test"],
        "y_prob": result["test_prob"],
    }).to_csv(predictions_path, index=False)
    prediction_paths.append(predictions_path)

    print(f"Saved model to {model_path}")
    print(f"Saved metrics to {metrics_path}")
    print(f"Saved predictions to {predictions_path}")

    # Split the fitted pipeline into its preprocessing part and the final
    # classifier so SHAP explains the classifier on the features it was fed.
    model = result["model"]
    if "preprocess" in model.named_steps:
        preproc = model.named_steps["preprocess"]
    elif len(model.steps) > 1:
        preproc = model[:-1]
    else:
        preproc = None
    clf = model.named_steps["clf"]
    X_train_proc = preproc.transform(result["X_train"]) if preproc is not None else result["X_train"]
    X_test_proc = preproc.transform(result["X_test"]) if preproc is not None else result["X_test"]

    # The preprocessed training set is handed to the explainer as its reference data.
    explainer = shap.LinearExplainer(clf, X_train_proc)
    shap_values = explainer.shap_values(X_test_proc)

    shap_vals_array = np.array(shap_values)
    # NOTE(review): [0] assumes a (classes, samples, features) layout; shap
    # releases that return (samples, features, classes) would need [..., k]
    # instead — confirm against the installed shap version.
    if shap_vals_array.ndim == 3:  # rare multiclass case
        shap_vals_array = shap_vals_array[0]
    mean_abs = np.abs(shap_vals_array).mean(axis=0)
    feature_names = get_feature_names(preproc, cols["categorical"], cols["numerical"])

    # First CSV row: mean |SHAP| per feature; remaining rows: one per test sample.
    shap_df = pd.concat([pd.DataFrame(mean_abs.tolist(), index=feature_names).T,
                         pd.DataFrame(shap_vals_array.tolist(), columns=feature_names)], ignore_index=True)
    shap_df.insert(0, "row_idx", ["mean_abs_shap"] + test_index)
    shap_df.to_csv(report_dir / f"maura_hnncc_{model_suffix}_shap.csv", index=False)

    shap.plots.violin(shap_values, features = X_test_proc, feature_names=feature_names, max_display=50, sort=True, plot_size=(25, 15), show=False)
    plt.savefig(report_dir / f"maura_hnncc_{model_suffix}_shap.png", dpi=500) #.png,.pdf will also support here
    plt.close()

labels = [f"logreg_{x}" for x in feature_sets]
# Compare all runs using exactly the prediction files written above.
plot_curves(prediction_paths, labels, report_dir / "maura_hnncc_killing_logreg_all_roc.png")

Loaded dataset with 275 samples and 268 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hnncc_killing_logreg_struct.joblib
Saved metrics to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_killing_logreg_struct_metric.json
Saved predictions to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_killing_logreg_struct_metric_pred.csv
Loaded dataset with 275 samples and 22 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hnncc_killing_logreg_inter.joblib
Saved metrics to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_killing_logreg_inter_metric.json
Saved predictions to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_killing_logreg_inter_metric_pred.csv
Loaded dataset with 275 samples and 221 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hnncc_ki

In [25]:
# Random-forest runs saved under the "killing_rf" suffix: for every feature-set
# combination, train and evaluate a model, persist the fitted pipeline, its
# metrics and its test-set predictions, and export SHAP attributions (CSV plus
# violin plot). Afterwards the ROC/PR curves of all feature sets are overlaid.
# Depends on `cfg` and `pred_lists` created by earlier cells.
feature_dir = "/workspaces/tcr_structure_embedding/outputs/maura_hnncc/features"
feature_csv = {
    "struct": f"{feature_dir}/maura_hnncc_struct_features.csv",
    "inter": f"{feature_dir}/maura_hnncc_interface_analyzer.csv",
    "res": f"{feature_dir}/maura_hnncc_residue_energy_cdr.csv",
}
feature_set_names = ["struct", "inter", "res", "struct_inter", "struct_res", "inter_res", "struct_inter_res"]

# Kept separate from `model` (the fitted pipeline) so one name never holds two kinds of object.
model_name = "random_forest"

# Output locations do not change between feature sets, so resolve them once.
model_dir = Path(cfg["output_model_dir"])
report_dir = Path(cfg["output_report_dir"])
model_dir.mkdir(parents=True, exist_ok=True)
report_dir.mkdir(parents=True, exist_ok=True)

for name in feature_set_names:
    # A combined name such as "struct_inter" lists its component tables in order.
    feature = [feature_csv[part] for part in name.split("_")]
    model_suffix = f"killing_rf_{name}"

    cfg["feature_files"] = feature

    df = load_dataset(cfg)
    df = add_label_from_metadata(df, cfg)
    # exclude columns listed in cfg["feature_columns"]
    df = df.loc[:, ~df.columns.isin(cfg["feature_columns"])]

    label_col = cfg.get("label_column")
    if not label_col or label_col not in df.columns:
        raise ValueError("label_column must be set in config and exist in the feature table.")

    if cfg.get("filter_na_labels", True) is True:
        df = df.loc[~df[label_col].isna()]
        print(f"Loaded dataset with {df.shape[0]} samples and {df.shape[1]} features.")

    cols = select_feature_columns(df, label_col)

    result = train_and_evaluate(df, label_col, cfg, model_name=model_name)

    # --- persist model, metrics and test predictions -----------------------
    model_path = model_dir / f"maura_hnncc_{model_suffix}.joblib"
    joblib.dump(result["model"], model_path)

    metrics_path = report_dir / f"maura_hnncc_{model_suffix}_metric.json"
    with metrics_path.open("w") as f:
        json.dump(result["metrics"], f, indent=2)

    predictions_path = report_dir / f"maura_hnncc_{model_suffix}_metric_pred.csv"
    pd.DataFrame({
        "row_idx": result["X_test"].index.tolist(),
        "y_true": result["y_test"],
        "y_prob": result["test_prob"],
    }).to_csv(predictions_path, index=False)

    print(f"Saved model to {model_path}")
    print(f"Saved metrics to {metrics_path}")
    print(f"Saved predictions to {predictions_path}")

    # --- SHAP explanation of the fitted classifier -------------------------
    model = result["model"]
    # Use the named "preprocess" step if present, otherwise every step before the classifier.
    if "preprocess" in model.named_steps:
        preproc = model.named_steps["preprocess"]
    elif len(model.steps) > 1:
        preproc = model[:-1]
    else:
        preproc = None
    clf = model.named_steps["clf"]
    # TreeExplainer is built without background data, so only the test set is transformed.
    X_test_proc = preproc.transform(result["X_test"]) if preproc is not None else result["X_test"]

    explainer = shap.TreeExplainer(clf)
    shap_values = explainer.shap_values(X_test_proc)
    # Older shap releases return one array per class as a list; stack them so
    # the class index is always the last axis, as the indexing below expects.
    if isinstance(shap_values, list):
        shap_values = np.stack(shap_values, axis=-1)

    feature_names = get_feature_names(preproc, cols["categorical"], cols["numerical"])

    # Mean |SHAP| per feature for class index 0 ("neg") and class index 1 ("pos").
    mean_abs_neg = np.abs(shap_values[..., 0]).mean(axis=0)
    mean_abs_pos = np.abs(shap_values[..., 1]).mean(axis=0)
    # NOTE(review): shap_values is 3-D here, so each per-sample cell below holds
    # the per-class values as a list while the two summary rows hold scalars.
    # Layout kept as-is; confirm that downstream readers of the CSV expect it.
    shap_df = pd.concat([pd.DataFrame(mean_abs_neg.tolist(), index=feature_names).T,
                         pd.DataFrame(mean_abs_pos.tolist(), index=feature_names).T,
                         pd.DataFrame(shap_values.tolist(), columns=feature_names)], ignore_index=True)
    shap_df.insert(0, "row_idx", ["neg_mean_abs_shap", "pos_mean_abs_shap"] + result["X_test"].index.tolist())
    shap_df.to_csv(report_dir / f"maura_hnncc_{model_suffix}_shap.csv", index=False)

    # The violin plot shows the attributions for class index 1 only.
    shap.plots.violin(shap_values[..., 1], features=X_test_proc, feature_names=feature_names, max_display=50, sort=True, plot_size=(25, 15), show=False)
    plt.savefig(report_dir / f"maura_hnncc_{model_suffix}_shap.png", dpi=500)  # other extensions such as .pdf work here too
    plt.close()

# Compare all feature sets in one ROC/PR figure. The prediction paths are derived
# from `pred_lists` by swapping the run tag in each file name.
labels = [f"rf_{x}" for x in feature_set_names]
plot_curves(
    [Path(str(p).replace("_specific_xgboost_", "_killing_rf_")) for p in pred_lists],
    labels,
    Path("/workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_killing_rf_all_roc.png"),
)

Loaded dataset with 275 samples and 268 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hnncc_killing_rf_struct.joblib
Saved metrics to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_killing_rf_struct_metric.json
Saved predictions to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_killing_rf_struct_metric_pred.csv
Loaded dataset with 275 samples and 22 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hnncc_killing_rf_inter.joblib
Saved metrics to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_killing_rf_inter_metric.json
Saved predictions to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_killing_rf_inter_metric_pred.csv
Loaded dataset with 275 samples and 221 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hnncc_killing_rf_res.joblib
Save

## Reactivity

In [26]:
# Reactivity experiments: reload the pipeline config, then override the seed,
# the target label column, and the flag that later cells read to decide whether
# rows with a missing label are dropped.
config_path = Path("/workspaces/tcr_structure_embedding/pipelines/maura_hnncc/config.yaml")
cfg = load_config(config_path)

reactivity_overrides = {
    "seed": 42,
    "label_column": "Nils.cdr3b_aa.patient_id.stimulation_category_0307",
    "filter_na_labels": False,
}
for key, value in reactivity_overrides.items():
    cfg[key] = value

In [29]:
# XGBoost runs saved under the "reactive_xgboost" suffix: for every feature-set
# combination, train and evaluate a model, persist the fitted pipeline, its
# metrics and its test-set predictions, and export SHAP attributions (CSV plus
# violin plot). Afterwards the ROC/PR curves of all feature sets are overlaid.
# Depends on `cfg` and `pred_lists` created by earlier cells.
feature_dir = "/workspaces/tcr_structure_embedding/outputs/maura_hnncc/features"
feature_csv = {
    "struct": f"{feature_dir}/maura_hnncc_struct_features.csv",
    "inter": f"{feature_dir}/maura_hnncc_interface_analyzer.csv",
    "res": f"{feature_dir}/maura_hnncc_residue_energy_cdr.csv",
}
feature_set_names = ["struct", "inter", "res", "struct_inter", "struct_res", "inter_res", "struct_inter_res"]

# Kept separate from `model` (the fitted pipeline) so one name never holds two kinds of object.
model_name = "xgboost"

# Output locations do not change between feature sets, so resolve them once.
model_dir = Path(cfg["output_model_dir"])
report_dir = Path(cfg["output_report_dir"])
model_dir.mkdir(parents=True, exist_ok=True)
report_dir.mkdir(parents=True, exist_ok=True)

for name in feature_set_names:
    # A combined name such as "struct_inter" lists its component tables in order.
    feature = [feature_csv[part] for part in name.split("_")]
    model_suffix = f"reactive_{model_name}_{name}"

    cfg["feature_files"] = feature

    df = load_dataset(cfg)
    df = add_label_from_metadata(df, cfg)
    # exclude columns listed in cfg["feature_columns"]
    df = df.loc[:, ~df.columns.isin(cfg["feature_columns"])]

    label_col = cfg.get("label_column")
    if not label_col or label_col not in df.columns:
        raise ValueError("label_column must be set in config and exist in the feature table.")

    if cfg.get("filter_na_labels", True) is True:
        # Drop rows that have no label.
        df = df.loc[~df[label_col].isna()]
    else:
        # Keep unlabeled rows and give them the explicit label 'miss'.
        df[label_col] = df[label_col].fillna('miss')
    print(f"Loaded dataset with {df.shape[0]} samples and {df.shape[1]} features.")

    cols = select_feature_columns(df, label_col)

    result = train_and_evaluate(df, label_col, cfg, model_name=model_name)

    # --- persist model, metrics and test predictions -----------------------
    model_path = model_dir / f"maura_hnncc_{model_suffix}.joblib"
    joblib.dump(result["model"], model_path)

    metrics_path = report_dir / f"maura_hnncc_{model_suffix}_metric.json"
    with metrics_path.open("w") as f:
        json.dump(result["metrics"], f, indent=2)

    predictions_path = report_dir / f"maura_hnncc_{model_suffix}_metric_pred.csv"
    pd.DataFrame({
        "row_idx": result["X_test"].index.tolist(),
        "y_true": result["y_test"],
        "y_prob": result["test_prob"],
    }).to_csv(predictions_path, index=False)

    print(f"Saved model to {model_path}")
    print(f"Saved metrics to {metrics_path}")
    print(f"Saved predictions to {predictions_path}")

    # --- SHAP explanation of the fitted classifier -------------------------
    model = result["model"]
    # Use the named "preprocess" step if present, otherwise every step before the classifier.
    if "preprocess" in model.named_steps:
        preproc = model.named_steps["preprocess"]
    elif len(model.steps) > 1:
        preproc = model[:-1]
    else:
        preproc = None
    clf = model.named_steps["clf"]
    X_test_proc = preproc.transform(result["X_test"]) if preproc is not None else result["X_test"]

    explainer = shap.TreeExplainer(clf)
    shap_values = explainer.shap_values(X_test_proc)

    # Reduce the explainer output to a single 2-D matrix of attributions.
    if isinstance(shap_values, list):
        # Per-class list (older shap releases): keep the first class.
        shap_vals_array = np.asarray(shap_values[0])
    else:
        shap_vals_array = np.asarray(shap_values)
        if shap_vals_array.ndim == 3:  # rare multiclass case
            # Presumably the class index is the last axis for array output
            # (newer shap releases) - TODO confirm; keep the first class.
            shap_vals_array = shap_vals_array[..., 0]
    mean_abs = np.abs(shap_vals_array).mean(axis=0)
    feature_names = get_feature_names(preproc, cols["categorical"], cols["numerical"])

    # First row: mean |SHAP| per feature; remaining rows: one per test sample.
    shap_df = pd.concat([pd.DataFrame(mean_abs.tolist(), index=feature_names).T,
                         pd.DataFrame(shap_vals_array.tolist(), columns=feature_names)], ignore_index=True)
    shap_df.insert(0, "row_idx", ["mean_abs_shap"] + result["X_test"].index.tolist())
    shap_df.to_csv(report_dir / f"maura_hnncc_{model_suffix}_shap.csv", index=False)

    # Plot the same matrix that was written to the CSV so both outputs agree.
    shap.plots.violin(shap_vals_array, features=X_test_proc, feature_names=feature_names, max_display=50, sort=True, plot_size=(25, 15), show=False)
    plt.savefig(report_dir / f"maura_hnncc_{model_suffix}_shap.png", dpi=500)  # other extensions such as .pdf work here too
    plt.close()

# Compare all feature sets in one ROC/PR figure. The prediction paths are derived
# from `pred_lists` by swapping the run tag in each file name.
labels = [f"xgb_{x}" for x in feature_set_names]
plot_curves(
    [Path(str(p).replace("_specific_", "_reactive_")) for p in pred_lists],
    labels,
    Path("/workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_reactive_xgboost_all_roc.png"),
)

Loaded dataset with 1500 samples and 268 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hnncc_reactive_xgboost_struct.joblib
Saved metrics to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_reactive_xgboost_struct_metric.json
Saved predictions to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_reactive_xgboost_struct_metric_pred.csv
Loaded dataset with 1500 samples and 22 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hnncc_reactive_xgboost_inter.joblib
Saved metrics to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_reactive_xgboost_inter_metric.json
Saved predictions to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_reactive_xgboost_inter_metric_pred.csv
Loaded dataset with 1500 samples and 221 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models

In [30]:
# Logistic-regression runs saved under the "reactive_logreg" suffix: for every
# feature-set combination, train and evaluate a model, persist the fitted
# pipeline, its metrics and its test-set predictions, and export SHAP
# attributions (CSV plus violin plot). Afterwards the ROC/PR curves of all
# feature sets are overlaid.
# Depends on `cfg` and `pred_lists` created by earlier cells.
feature_dir = "/workspaces/tcr_structure_embedding/outputs/maura_hnncc/features"
feature_csv = {
    "struct": f"{feature_dir}/maura_hnncc_struct_features.csv",
    "inter": f"{feature_dir}/maura_hnncc_interface_analyzer.csv",
    "res": f"{feature_dir}/maura_hnncc_residue_energy_cdr.csv",
}
feature_set_names = ["struct", "inter", "res", "struct_inter", "struct_res", "inter_res", "struct_inter_res"]

# Kept separate from `model` (the fitted pipeline) so one name never holds two kinds of object.
model_name = "logreg"

# Output locations do not change between feature sets, so resolve them once.
model_dir = Path(cfg["output_model_dir"])
report_dir = Path(cfg["output_report_dir"])
model_dir.mkdir(parents=True, exist_ok=True)
report_dir.mkdir(parents=True, exist_ok=True)

for name in feature_set_names:
    # A combined name such as "struct_inter" lists its component tables in order.
    feature = [feature_csv[part] for part in name.split("_")]
    model_suffix = f"reactive_{model_name}_{name}"

    cfg["feature_files"] = feature

    df = load_dataset(cfg)
    df = add_label_from_metadata(df, cfg)
    # exclude columns listed in cfg["feature_columns"]
    df = df.loc[:, ~df.columns.isin(cfg["feature_columns"])]

    label_col = cfg.get("label_column")
    if not label_col or label_col not in df.columns:
        raise ValueError("label_column must be set in config and exist in the feature table.")

    if cfg.get("filter_na_labels", True) is True:
        # Drop rows that have no label.
        df = df.loc[~df[label_col].isna()]
    else:
        # Keep unlabeled rows and give them the explicit label 'miss'.
        df[label_col] = df[label_col].fillna('miss')
    print(f"Loaded dataset with {df.shape[0]} samples and {df.shape[1]} features.")

    cols = select_feature_columns(df, label_col)

    result = train_and_evaluate(df, label_col, cfg, model_name=model_name)

    # --- persist model, metrics and test predictions -----------------------
    model_path = model_dir / f"maura_hnncc_{model_suffix}.joblib"
    joblib.dump(result["model"], model_path)

    metrics_path = report_dir / f"maura_hnncc_{model_suffix}_metric.json"
    with metrics_path.open("w") as f:
        json.dump(result["metrics"], f, indent=2)

    predictions_path = report_dir / f"maura_hnncc_{model_suffix}_metric_pred.csv"
    pd.DataFrame({
        "row_idx": result["X_test"].index.tolist(),
        "y_true": result["y_test"],
        "y_prob": result["test_prob"],
    }).to_csv(predictions_path, index=False)

    print(f"Saved model to {model_path}")
    print(f"Saved metrics to {metrics_path}")
    print(f"Saved predictions to {predictions_path}")

    # --- SHAP explanation of the fitted classifier -------------------------
    model = result["model"]
    # Use the named "preprocess" step if present, otherwise every step before the classifier.
    if "preprocess" in model.named_steps:
        preproc = model.named_steps["preprocess"]
    elif len(model.steps) > 1:
        preproc = model[:-1]
    else:
        preproc = None
    clf = model.named_steps["clf"]
    X_train_proc = preproc.transform(result["X_train"]) if preproc is not None else result["X_train"]
    X_test_proc = preproc.transform(result["X_test"]) if preproc is not None else result["X_test"]

    # The preprocessed training set is handed to the explainer as its data argument.
    explainer = shap.LinearExplainer(clf, X_train_proc)
    shap_values = explainer.shap_values(X_test_proc)

    # Reduce the explainer output to a single 2-D matrix of attributions.
    if isinstance(shap_values, list):
        # Per-class list (older shap releases): keep the first class.
        shap_vals_array = np.asarray(shap_values[0])
    else:
        shap_vals_array = np.asarray(shap_values)
        if shap_vals_array.ndim == 3:  # rare multiclass case
            # Presumably the class index is the last axis for array output
            # (newer shap releases) - TODO confirm; keep the first class.
            shap_vals_array = shap_vals_array[..., 0]
    mean_abs = np.abs(shap_vals_array).mean(axis=0)
    feature_names = get_feature_names(preproc, cols["categorical"], cols["numerical"])

    # First row: mean |SHAP| per feature; remaining rows: one per test sample.
    shap_df = pd.concat([pd.DataFrame(mean_abs.tolist(), index=feature_names).T,
                         pd.DataFrame(shap_vals_array.tolist(), columns=feature_names)], ignore_index=True)
    shap_df.insert(0, "row_idx", ["mean_abs_shap"] + result["X_test"].index.tolist())
    shap_df.to_csv(report_dir / f"maura_hnncc_{model_suffix}_shap.csv", index=False)

    # Plot the same matrix that was written to the CSV so both outputs agree.
    shap.plots.violin(shap_vals_array, features=X_test_proc, feature_names=feature_names, max_display=50, sort=True, plot_size=(25, 15), show=False)
    plt.savefig(report_dir / f"maura_hnncc_{model_suffix}_shap.png", dpi=500)  # other extensions such as .pdf work here too
    plt.close()

# Compare all feature sets in one ROC/PR figure. The prediction paths are derived
# from `pred_lists` by swapping the run tag in each file name.
labels = [f"logreg_{x}" for x in feature_set_names]
plot_curves(
    [Path(str(p).replace("_specific_xgboost_", "_reactive_logreg_")) for p in pred_lists],
    labels,
    Path("/workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_reactive_logreg_all_roc.png"),
)

Loaded dataset with 1500 samples and 268 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hnncc_reactive_logreg_struct.joblib
Saved metrics to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_reactive_logreg_struct_metric.json
Saved predictions to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_reactive_logreg_struct_metric_pred.csv
Loaded dataset with 1500 samples and 22 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hnncc_reactive_logreg_inter.joblib
Saved metrics to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_reactive_logreg_inter_metric.json
Saved predictions to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_reactive_logreg_inter_metric_pred.csv
Loaded dataset with 1500 samples and 221 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura

In [31]:
# Random-forest runs saved under the "reactive_rf" suffix: for every feature-set
# combination, train and evaluate a model, persist the fitted pipeline, its
# metrics and its test-set predictions, and export SHAP attributions (CSV plus
# violin plot). Afterwards the ROC/PR curves of all feature sets are overlaid.
# Depends on `cfg` and `pred_lists` created by earlier cells.
feature_dir = "/workspaces/tcr_structure_embedding/outputs/maura_hnncc/features"
feature_csv = {
    "struct": f"{feature_dir}/maura_hnncc_struct_features.csv",
    "inter": f"{feature_dir}/maura_hnncc_interface_analyzer.csv",
    "res": f"{feature_dir}/maura_hnncc_residue_energy_cdr.csv",
}
feature_set_names = ["struct", "inter", "res", "struct_inter", "struct_res", "inter_res", "struct_inter_res"]

# Kept separate from `model` (the fitted pipeline) so one name never holds two kinds of object.
model_name = "random_forest"

# Output locations do not change between feature sets, so resolve them once.
model_dir = Path(cfg["output_model_dir"])
report_dir = Path(cfg["output_report_dir"])
model_dir.mkdir(parents=True, exist_ok=True)
report_dir.mkdir(parents=True, exist_ok=True)

for name in feature_set_names:
    # A combined name such as "struct_inter" lists its component tables in order.
    feature = [feature_csv[part] for part in name.split("_")]
    model_suffix = f"reactive_rf_{name}"

    cfg["feature_files"] = feature

    df = load_dataset(cfg)
    df = add_label_from_metadata(df, cfg)
    # exclude columns listed in cfg["feature_columns"]
    df = df.loc[:, ~df.columns.isin(cfg["feature_columns"])]

    label_col = cfg.get("label_column")
    if not label_col or label_col not in df.columns:
        raise ValueError("label_column must be set in config and exist in the feature table.")

    if cfg.get("filter_na_labels", True) is True:
        # Drop rows that have no label.
        df = df.loc[~df[label_col].isna()]
    else:
        # Keep unlabeled rows and give them the explicit label 'miss'.
        df[label_col] = df[label_col].fillna('miss')
    print(f"Loaded dataset with {df.shape[0]} samples and {df.shape[1]} features.")

    cols = select_feature_columns(df, label_col)

    result = train_and_evaluate(df, label_col, cfg, model_name=model_name)

    # --- persist model, metrics and test predictions -----------------------
    model_path = model_dir / f"maura_hnncc_{model_suffix}.joblib"
    joblib.dump(result["model"], model_path)

    metrics_path = report_dir / f"maura_hnncc_{model_suffix}_metric.json"
    with metrics_path.open("w") as f:
        json.dump(result["metrics"], f, indent=2)

    predictions_path = report_dir / f"maura_hnncc_{model_suffix}_metric_pred.csv"
    pd.DataFrame({
        "row_idx": result["X_test"].index.tolist(),
        "y_true": result["y_test"],
        "y_prob": result["test_prob"],
    }).to_csv(predictions_path, index=False)

    print(f"Saved model to {model_path}")
    print(f"Saved metrics to {metrics_path}")
    print(f"Saved predictions to {predictions_path}")

    # --- SHAP explanation of the fitted classifier -------------------------
    model = result["model"]
    # Use the named "preprocess" step if present, otherwise every step before the classifier.
    if "preprocess" in model.named_steps:
        preproc = model.named_steps["preprocess"]
    elif len(model.steps) > 1:
        preproc = model[:-1]
    else:
        preproc = None
    clf = model.named_steps["clf"]
    # TreeExplainer is built without background data, so only the test set is transformed.
    X_test_proc = preproc.transform(result["X_test"]) if preproc is not None else result["X_test"]

    explainer = shap.TreeExplainer(clf)
    shap_values = explainer.shap_values(X_test_proc)
    # Older shap releases return one array per class as a list; stack them so
    # the class index is always the last axis, as the indexing below expects.
    if isinstance(shap_values, list):
        shap_values = np.stack(shap_values, axis=-1)

    feature_names = get_feature_names(preproc, cols["categorical"], cols["numerical"])

    # Mean |SHAP| per feature for class index 0 ("neg") and class index 1 ("pos").
    mean_abs_neg = np.abs(shap_values[..., 0]).mean(axis=0)
    mean_abs_pos = np.abs(shap_values[..., 1]).mean(axis=0)
    # NOTE(review): shap_values is 3-D here, so each per-sample cell below holds
    # the per-class values as a list while the two summary rows hold scalars.
    # Layout kept as-is; confirm that downstream readers of the CSV expect it.
    shap_df = pd.concat([pd.DataFrame(mean_abs_neg.tolist(), index=feature_names).T,
                         pd.DataFrame(mean_abs_pos.tolist(), index=feature_names).T,
                         pd.DataFrame(shap_values.tolist(), columns=feature_names)], ignore_index=True)
    shap_df.insert(0, "row_idx", ["neg_mean_abs_shap", "pos_mean_abs_shap"] + result["X_test"].index.tolist())
    shap_df.to_csv(report_dir / f"maura_hnncc_{model_suffix}_shap.csv", index=False)

    # The violin plot shows the attributions for class index 1 only.
    shap.plots.violin(shap_values[..., 1], features=X_test_proc, feature_names=feature_names, max_display=50, sort=True, plot_size=(25, 15), show=False)
    plt.savefig(report_dir / f"maura_hnncc_{model_suffix}_shap.png", dpi=500)  # other extensions such as .pdf work here too
    plt.close()

# Compare all feature sets in one ROC/PR figure. The prediction paths are derived
# from `pred_lists` by swapping the run tag in each file name.
labels = [f"rf_{x}" for x in feature_set_names]
plot_curves(
    [Path(str(p).replace("_specific_xgboost_", "_reactive_rf_")) for p in pred_lists],
    labels,
    Path("/workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_reactive_rf_all_roc.png"),
)

Loaded dataset with 1500 samples and 268 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hnncc_reactive_rf_struct.joblib
Saved metrics to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_reactive_rf_struct_metric.json
Saved predictions to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_reactive_rf_struct_metric_pred.csv
Loaded dataset with 1500 samples and 22 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hnncc_reactive_rf_inter.joblib
Saved metrics to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_reactive_rf_inter_metric.json
Saved predictions to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/reports/maura_hnncc_reactive_rf_inter_metric_pred.csv
Loaded dataset with 1500 samples and 221 features.
Saved model to /workspaces/tcr_structure_embedding/outputs/maura_hnncc/models/maura_hnncc_reactive_rf_res.j