In [4]:
!git clone https://github.com/BioAI-kits/AttentionMOI.git

Cloning into 'AttentionMOI'...
remote: Enumerating objects: 828, done.[K
remote: Counting objects: 100% (181/181), done.[K
remote: Compressing objects: 100% (142/142), done.[K
remote: Total 828 (delta 56), reused 118 (delta 27), pack-reused 647 (from 1)[K
Receiving objects: 100% (828/828), 269.73 MiB | 16.99 MiB/s, done.
Resolving deltas: 100% (364/364), done.


In [5]:
%cd /kaggle/working/AttentionMOI

/kaggle/working/AttentionMOI


In [6]:
%%writefile '/kaggle/working/AttentionMOI/setup.py'
from setuptools import setup, find_packages


install_packages = [
    'captum>=0.4.1',
    'mygene>=3.2.2',
    'openpyxl>=3.0.9',
    'packaging>=21.3',
    'pandas>=1.2.5',
    'pandocfilters>=1.5.0',
    'seaborn>=0.11.2',
    'torch>=2',
    'scikit-learn>=1.2.2',
    'numpy>=1.23.5',
    'matplotlib>=3.6.2',
    'xgboost>=1.7.4',
    'livelossplot', 
    'tensorboardX',
    'tqdm',
    'ipython',
]

setup(
    # 应用名
    name='AttentionMOI',
    # 作者名
    author='Billy',
    # 作者邮箱
    author_email='liangbilin0324@163.com',
    # 版本号
    version='0.1.2',
    # 要求python版本
    python_requires=">=3.9",
    # 找到本目录下的所有python包
    packages=find_packages(),
    # 自动安装依赖
    install_requires=install_packages,
    dependency_links=[
        "https://pypi.org/simple/",
        "https://download.pytorch.org/whl/cpu#egg=torch",
        ],
    # 程序网站
    url='https://github.com/BioAI-kits/AttentionMOI',
    # 程序简单描述
    description="A Denoised Multi-omics Integration Framework for Cancer Subtype Classification and Survival Prediction.",
    # 开源许可
    license='Apache License 2.0',
    # 包含的数据
    data_files=['AttentionMOI/example/cnv.csv.gz', 'AttentionMOI/example/met.csv.gz', 'AttentionMOI/example/rna.csv.gz', 'AttentionMOI/example/label.csv'],
    # 命令行
    entry_points={
        'console_scripts': ['moi = AttentionMOI.moi:run_main',
                            ],
    },
)


Overwriting /kaggle/working/AttentionMOI/setup.py


In [7]:
%%writefile '/kaggle/working/AttentionMOI/AttentionMOI/deepmoi.py'
import argparse, warnings, sys
import numpy as np
from .src.main import run

warnings.filterwarnings('ignore')
np.random.seed(1234)


def get_args():
    parser = argparse.ArgumentParser(
                                     prog='AttentionMOI',
                                     usage="The program is used to build machine/deep learning model with single/multi omics dataset.",
                                     description="", 
                                     epilog="Example (Data can be downloaded from https://github.com/BioAI-kits/AttentionMOI ): \nmoi -f GBM_exp.csv.gz -f GBM_met.csv.gz -f GBM_logRatio.csv.gz -n rna -n met -n cnv -l GBM_label.csv --FSD -m all -o GBM_Result \n ",
                                     formatter_class=argparse.RawTextHelpFormatter
                                     )

    # config
    parser.add_argument('-f', '--omic_file', action='append', help='REQUIRED: File path for omics files (should be matrix)', required=True)
    parser.add_argument('-n', '--omic_name', action='append',
                        help='REQUIRED: Omic names for omics files, should be the same order as the omics file', required=True)
    parser.add_argument('-l', '--label_file', help='REQUIRED: File path for label file', required=True)
    parser.add_argument('-o', '--outdir', help='OPTIONAL: Setting output file path, default=./output', type=str, default='./output')
    parser.add_argument('--clin_file', type=str, required=False, help='Path to the clinical data file (optional).')

    # feature selection with distribution
    parser.add_argument('-i', '--iteration', help='OPTIONAL: The number of FSD iterations (integer), default=10.', type=int, default=10)
    parser.add_argument('-s', '--seed', help='OPTIONAL: Random seed for FSD (integer), default=0', type=int, default=0)
    parser.add_argument('--threshold',
                        help='OPTIONAL: FSD threshold to select features (float), default=0.8 (select features that are selected in 80 percent FSD iterations)',
                        type=float, default=0.8)

    # feature selection
    parser.add_argument('--method', help='OPTIONAL: Method of feature selection, choosing from ANOVA, RFE, LASSO, PCA, default is no feature selection', type=str, default=None)
    parser.add_argument('--percentile', help='OPTIONAL: Percent of features to keep for ANOVA (integer between 1-100), only used when using ANOVA, default=30', type=int, default=30)
    parser.add_argument('--num_pc', help='OPTIONAL: Number of PCs to keep for PCA (integer), only used when using PCA, default=50', type=int, default=50)

    # whether using FSD
    parser.add_argument('--FSD', action="store_true", help='OPTIONAL: Whether to use FSD to mitigate noise of omics. Default is not using FSD, and set --FSD to use FSD')

    # building model
    parser.add_argument('-t', '--test_size', help='OPTIONAL: Testing dataset proportion when split train test dataset (float), default=0.3 (30 percent data for testing)', type=float, default=0.3)
    parser.add_argument('-b', '--batch', help='OPTIONAL: Mini-batch number for model training (integer), default=32', type=int, default=32)
    parser.add_argument('-e', '--epoch', help='OPTIONAL: Epoch number for model training (integer), default=300', type=int, default=300)
    parser.add_argument('-r', '--lr', help='OPTIONAL: Learning rate for model training(float), default=0.0001.', type=float, default=0.0001)
    parser.add_argument('-w', '--weight_decay', help='OPTIONAL: weight_decay parameter for model training (float), default=0.0001', type=float,
                        default=0.0001)

    # different models
    parser.add_argument('-m', '--model', help='OPTIONAL: Model names, choosing from DNN, Net (Net for AttentionMOI), RF, XGboost, svm, mogonet, moanna, default=DNN.', type=str, default="DNN")

    args = parser.parse_args()
    return args



def main():
    args = get_args()
    
    if len(set((args.omic_name))) < 2 and args.model in ['Net', 'all']:
        print('Single omic data cannot be used to construct the AttentionMOI model.')
        sys.exit(1)
        
    run(args)


if __name__ == "__main__":
    main()



Overwriting /kaggle/working/AttentionMOI/AttentionMOI/deepmoi.py


In [8]:
!pip install -e .

Obtaining file:///kaggle/working/AttentionMOI
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting captum>=0.4.1 (from AttentionMOI==0.1.2)
  Downloading captum-0.8.0-py3-none-any.whl.metadata (26 kB)
Collecting mygene>=3.2.2 (from AttentionMOI==0.1.2)
  Downloading mygene-3.2.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting livelossplot (from AttentionMOI==0.1.2)
  Downloading livelossplot-0.5.6-py3-none-any.whl.metadata (8.9 kB)
Collecting tensorboardX (from AttentionMOI==0.1.2)
  Downloading tensorboardx-2.6.4-py3-none-any.whl.metadata (6.2 kB)
Collecting biothings-client>=0.2.6 (from mygene>=3.2.2->AttentionMOI==0.1.2)
  Downloading biothings_client-0.4.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2->AttentionMOI==0.1.2)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2->AttentionMOI==0.1.2)
  Downloading nvidia_cu

In [6]:
!moi -h

usage: The program is used to build machine/deep learning model with single/multi omics dataset.

options:
  -h, --help            show this help message and exit
  -f OMIC_FILE, --omic_file OMIC_FILE
                        REQUIRED: File path for omics files (should be matrix)
  -n OMIC_NAME, --omic_name OMIC_NAME
                        REQUIRED: Omic names for omics files, should be the same order as the omics file
  -l LABEL_FILE, --label_file LABEL_FILE
                        REQUIRED: File path for label file
  -o OUTDIR, --outdir OUTDIR
                        OPTIONAL: Setting output file path, default=./output
  --clin_file CLIN_FILE
                        Path to the clinical data file (optional).
  -i ITERATION, --iteration ITERATION
                        OPTIONAL: The number of FSD iterations (integer), default=10.
  -s SEED, --seed SEED  OPTIONAL: Random seed for FSD (integer), default=0
  --threshold THRESHOLD
                        OPTIONAL: FSD threshold to select

In [7]:
%cd /kaggle/working/AttentionMOI/dataset/GBM

/kaggle/working/AttentionMOI/dataset/GBM


In [9]:
import os

# Get the current working directory
current_directory = os.getcwd()

# Print the current working directory
print(f"The current working directory is: {current_directory}")

The current working directory is: /kaggle/working/AttentionMOI


In [1]:
# === Master driver for GBM permutations (AttentionMOI CLI only) ===
# - Runs all omic subsets (rna, met, cnv), with/without FSD, all FS methods, all models
# - Emulates K-fold via repeated runs with different seeds (since repo uses train_test_split)
# - Collects per-epoch/per-run metrics from outdir/log.txt and outdir/evaluation.txt into CSVs

import os,  re, shlex, subprocess
#import sys, itertools,json, time
from pathlib import Path
import pandas as pd

# ----------------------------
# 0) Paths & basic parameters
# ----------------------------
BASE = Path("/Users/kaushikrajnadar/Downloads/AttentionMOI-master/AttentionMOI/example")  # you said this is where you are
assert BASE.exists(), f"Not found: {BASE}"

FILES = {
    "rna": str(BASE / "GBM_exp.csv.gz"),
    "met": str(BASE / "GBM_met.csv.gz"),
    "cnv": str(BASE / "GBM_cnv_logRatio.csv.gz"),
}
LABEL = str(BASE / "GBM_label.csv")

# Global output bucket that will contain one subfolder per run
ROOT_OUT = BASE / "GBM_Result_AllRuns"
ROOT_OUT.mkdir(parents=True, exist_ok=True)

# Controls
#K_FOLDS   = 5          # emulate 5-fold by 5 seeds per trial
N_TRIALS  = 30         # set to 30 for the paper-style violin distribution; smaller for quick pass
TEST_SIZE = 0.30
EPOCHS_DNN    = 100    # epochs for DNN/Net (ignored by RF/XGB/SVM/MOANNA/MOGONET)
BASE_SEED = 2025

# Choose omic subsets, FS methods, FSD on/off, and models
OMIC_SUBSETS = [
    ["rna"], ["met"], ["cnv"],
    ["rna","met"], ["rna","cnv"], ["met","cnv"],
    ["rna","met","cnv"],
]
FS_METHODS = [None, "ANOVA", "RFE", "LASSO", "PCA"]
FSD_FLAGS  = [False, True]
MODELS = ["DNN", "Net", "RF", "XGboost", "svm", "moanna", "mogonet"]

# Utility: skip invalid combos (Net requires >=2 omics)
def valid_combo(model, omics):
    if model == "Net" and len(set(omics)) < 2:
        return False
    return True

# Utility: build a unique outdir name for one run
def make_outdir(root_out, omics, fsd, method, model, trial_idx, fold_idx, seed):
    omic_tag = "-".join(omics)
    fsd_tag  = "FSD" if fsd else "NoFSD"
    mth_tag  = "None" if method is None else method
    run_tag  = f"trial{trial_idx:02d}_fold{fold_idx:02d}_seed{seed}"
    outdir   = root_out / omic_tag / fsd_tag / mth_tag / model / run_tag
    outdir.mkdir(parents=True, exist_ok=True)
    return outdir

# Utility: assemble the moi command respecting repo’s flags
def build_cmd(omics, method, fsd, model, outdir, test_size, seed, epochs):
    cmd = ["moi"]
    # -f / -n (files and names must be in the same order)
    for om in omics:
        cmd += ["-f", FILES[om]]
    for om in omics:
        cmd += ["-n", om]
    # labels
    cmd += ["-l", LABEL]
    # common settings
    cmd += ["-o", str(outdir)]
    cmd += ["-t", str(test_size)]
    cmd += ["-s", str(seed)]                 # seed controls FSD and train_test_split()
    cmd += ["-e", str(epochs)]               # epochs (used by DNN/Net)
    # FSD
    if fsd:
        cmd += ["--FSD", "-i", "10", "--threshold", "0.8"]  # default FSD knobs
    # FS method
    if method is not None:
        cmd += ["--method", method]
        if method == "ANOVA":
            cmd += ["--percentile", "30"]
        if method == "PCA":
            cmd += ["--num_pc", "50"]
    # Model
    cmd += ["-m", model]
    return cmd

# ----------------------------
# 1) Parsers for repo logs
# ----------------------------

# DNN/Net epoch line examples (train.py):
# Train: "Epoch 10 | Train Loss 0.2965718934 | Train_ACC 0.932 | Train_AUC 0.985 | Train_F1_score 0.909 | Train_Recall 0.925 | Train_Precision 0.897"
# Test:  "Epoch 10 | Test Loss  0.1234567890 | Test_ACC  0.900 | Test_AUC  0.920 | Test_F1_score  0.880 | Test_Recall  0.860 | Test_Precision  0.905"

re_epoch_train = re.compile(
    r"Epoch\s+(\d+)\s+\|\s+Train Loss\s+([0-9\.Ee+-]+)\s+\|\s+Train_ACC\s+([0-9\.]+)\s+\|\s+Train_AUC\s+([0-9\.]+)\s+\|\s+Train_F1_score\s+([0-9\.]+)\s+\|\s+Train_Recall\s+([0-9\.]+)\s+\|\s+Train_Precision\s+([0-9\.]+)"
)
re_epoch_test  = re.compile(
    r"Epoch\s+(\d+)\s+\|\s+Test Loss\s+([0-9\.Ee+-]+)\s+\|\s+Test_ACC\s+([0-9\.]+)\s+\|\s+Test_AUC\s+([0-9\.]+)\s+\|\s+Test_F1_score\s+([0-9\.]+)\s+\|\s+Test_Recall\s+([0-9\.]+)\s+\|\s+Test_Precision\s+([0-9\.]+)"
)

# ML one-shot metrics (RF/XGboost/SVM) also appear in log.txt:
# "Train_ACC 0.932 | Train_AUC 0.985 | Train_F1_score 0.909 | Train_Recall 0.925 | Train_Precision 0.897"
# "Test_ACC  0.900 | Test_AUC  0.920 | Test_F1_score  0.880 | Test_Recall  0.860 | Test_Precision  0.905"
re_ml_train = re.compile(
    r"Train_ACC\s+([0-9\.]+)\s+\|\s+Train_AUC\s+([0-9\.]+)\s+\|\s+Train_F1_score\s+([0-9\.]+)\s+\|\s+Train_Recall\s+([0-9\.]+)\s+\|\s+Train_Precision\s+([0-9\.]+)"
)
re_ml_test = re.compile(
    r"Test_ACC\s+([0-9\.]+)\s+\|\s+Test_AUC\s+([0-9\.]+)\s+\|\s+Test_F1_score\s+([0-9\.]+)\s+\|\s+Test_Recall\s+([0-9\.]+)\s+\|\s+Test_Precision\s+([0-9\.]+)"
)

def parse_log_epochs(log_path, model):
    """
    Returns a dataframe of per-epoch metrics when available (DNN/Net),
    or single-row entries for ML models (RF/XGboost/SVM/MOANNA/MOGONET).
    """
    rows = []
    if not Path(log_path).exists():
        return pd.DataFrame(rows)

    with open(log_path, "r") as f:
        for line in f:
            line = line.strip()
            # DNN/Net epoch lines
            m = re_epoch_train.search(line)
            if m:
                ep, loss, acc, auc, f1, rec, prec = m.groups()
                rows.append(dict(
                    epoch=int(ep), split="Train",
                    loss=float(loss), acc=float(acc), auc=float(auc),
                    f1=float(f1), recall=float(rec), precision=float(prec)
                ))
                continue
            m = re_epoch_test.search(line)
            if m:
                ep, loss, acc, auc, f1, rec, prec = m.groups()
                rows.append(dict(
                    epoch=int(ep), split="Test",
                    loss=float(loss), acc=float(acc), auc=float(auc),
                    f1=float(f1), recall=float(rec), precision=float(prec)
                ))
                continue
            # ML one-shot lines
            if model in ["RF", "XGboost", "svm", "moanna", "mogonet"]:
                mt = re_ml_train.search(line)
                if mt:
                    acc, auc, f1, rec, prec = mt.groups()
                    rows.append(dict(
                        epoch=1, split="Train",
                        loss=None, acc=float(acc), auc=float(auc),
                        f1=float(f1), recall=float(rec), precision=float(prec)
                    ))
                    continue
                mt = re_ml_test.search(line)
                if mt:
                    acc, auc, f1, rec, prec = mt.groups()
                    rows.append(dict(
                        epoch=1, split="Test",
                        loss=None, acc=float(acc), auc=float(auc),
                        f1=float(f1), recall=float(rec), precision=float(prec)
                    ))
                    continue
    return pd.DataFrame(rows)

# evaluation.txt lines look like:
# "{model}\t{FS_desc}\t{omic_names}\t{acc}\t{prec}\t{f1}\t{auc}\t{recall}"
def parse_evaluation_txt(eval_path):
    if not Path(eval_path).exists():
        return None
    lines = [l.strip() for l in open(eval_path, "r").read().splitlines() if l.strip()]
    if not lines:
        return None
    # take the LAST non-empty line (this run)
    parts = lines[-1].split("\t")
    if len(parts) < 8:
        return None
    model, fs_desc, omic_desc, acc, prec, f1, auc, recall = parts[:8]
    return dict(model=model, fs_desc=fs_desc, omic_desc=omic_desc,
                acc=float(acc), prec=float(prec), f1=float(f1),
                auc=float(auc), recall=float(recall))

# ----------------------------
# 2) Main sweep
# ----------------------------
all_epoch_rows = []
all_run_rows   = []

# Configure a base seed so trial t, fold k seed is reproducible but distinct
BASE_SEED = 2025

total_plans = 0
for omics in OMIC_SUBSETS:
    for fsd in FSD_FLAGS:
        for method in FS_METHODS:
            for model in MODELS:
                if not valid_combo(model, omics):
                    continue
                total_plans += 1

print(f"Planned unique (omics,fsd,method,model) combos: {total_plans}")
print(f"Each combo runs {N_TRIALS} trials (single hold-out split per trial).")

for omics in OMIC_SUBSETS:
    for fsd in FSD_FLAGS:
        for method in FS_METHODS:
            for model in MODELS:
                if not valid_combo(model, omics):
                    continue

                for trial_idx in range(1, N_TRIALS+1):
                   
                        seed = BASE_SEED + (trial_idx) 

                        outdir = make_outdir(ROOT_OUT, omics, fsd, method, model, trial_idx, seed)
                        cmd    = build_cmd(omics, method, fsd, model, outdir, TEST_SIZE, seed, EPOCHS)

                        # Run the AttentionMOI CLI
                        print("\n==> RUN:", " ".join(shlex.quote(c) for c in cmd))
                        res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
                        # Persist raw stdout for debugging
                        (outdir/"stdout.txt").write_text(res.stdout)

                        # Parse logs to per-epoch CSV
                        log_path  = outdir/"log.txt"
                        eval_path = outdir/"evaluation.txt"

                        ep_df = parse_log_epochs(log_path, model)
                        if not ep_df.empty:
                            # annotate with run metadata
                            ep_df.insert(0, "model", model)
                            ep_df.insert(0, "method", "None" if method is None else method)
                            ep_df.insert(0, "fsd", int(fsd))
                            ep_df.insert(0, "omics", "-".join(omics))
                            ep_df.insert(0, "trial", trial_idx)
                            ep_df.insert(0, "fold", fold_idx)
                            ep_df.to_csv(outdir/"epoch_metrics.csv", index=False)
                            all_epoch_rows.append(ep_df)

                        # Parse evaluation.txt to per-run summary CSV
                        run_rec = parse_evaluation_txt(eval_path)
                        if run_rec is not None:
                            run_rec.update(dict(
                                omics="-".join(omics),
                                fsd=int(fsd),
                                method="None" if method is None else method,
                                model=model,
                                trial=trial_idx,
                                seed=seed,
                                outdir=str(outdir),
                            ))
                            # write a small run_summary.csv in the outdir
                            pd.DataFrame([run_rec]).to_csv(outdir/"run_summary.csv", index=False)
                            all_run_rows.append(run_rec)

# ----------------------------
# 3) Write global aggregators
# ----------------------------
if all_epoch_rows:
    epochs_all = pd.concat(all_epoch_rows, ignore_index=True)
    epochs_all.to_csv(ROOT_OUT/"ALL_epoch_metrics.csv", index=False)
    print("Wrote:", ROOT_OUT/"ALL_epoch_metrics.csv")

if all_run_rows:
    runs_all = pd.DataFrame(all_run_rows)
    runs_all.to_csv(ROOT_OUT/"ALL_run_summaries.csv", index=False)
    print("Wrote:", ROOT_OUT/"ALL_run_summaries.csv")

print("DONE.")


Planned unique (omics,fsd,method,model) combos: 460
Each combo runs 30 trials (single hold-out split per trial).


TypeError: make_outdir() missing 1 required positional argument: 'seed'