In [1]:
# =========================
# Cell 1) Imports + Paths
# =========================
from pathlib import Path
import pandas as pd
from functools import reduce

# Directory the kernel was started in. When the notebook lives in "code/",
# this is ".../code".
CODE_DIR = Path.cwd()

# The project root is the parent folder only when the working directory is
# literally named "code" (case-insensitive); otherwise it is the cwd itself.
if CODE_DIR.name.lower() == "code":
    PROJECT_ROOT = CODE_DIR.parent
else:
    PROJECT_ROOT = CODE_DIR

# Conventional layout: raw CSV inputs and generated outputs are kept apart.
RAW_DIR = PROJECT_ROOT.joinpath("preprocessing_data")   # raw CSVs live here
OUT_DIR = PROJECT_ROOT.joinpath("data")                 # outputs live here

# Create both folders up front so later reads/writes never fail on a missing directory.
RAW_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"PROJECT_ROOT: {PROJECT_ROOT}")
print(f"RAW_DIR: {RAW_DIR}")
print(f"OUT_DIR: {OUT_DIR}")


PROJECT_ROOT: c:\Users\junse\Documents\research\UPENN-GBM\sample_code
RAW_DIR: c:\Users\junse\Documents\research\UPENN-GBM\sample_code\preprocessing_data
OUT_DIR: c:\Users\junse\Documents\research\UPENN-GBM\sample_code\data


In [2]:
# =========================
# Cell 2) Helper functions
# =========================

def detect_id_col(df: pd.DataFrame) -> str:
    """
    Detect the ID column of a DataFrame.

    Resolution order (first hit wins):
      1. Exact match against a list of known ID column names, in priority order.
      2. Case-insensitive (and whitespace-trimmed) match against the same list,
         e.g. 'SUBJECTID' or ' id '.
      3. Any column in which 'id' is a standalone word of the name,
         e.g. 'case_id', 'Scan ID', 'scanId', 'CaseID'.
      4. Legacy last resort: any column whose name merely contains 'id'.

    Steps 2-3 run before the substring fallback so that feature columns such as
    'Width' or 'Solidity' (which only happen to contain the letters 'id') can
    no longer shadow a real ID column that appears later in the file.

    Parameters
    ----------
    df : DataFrame whose column labels are inspected (cell values are not read).

    Returns
    -------
    The matching column label, exactly as it appears in ``df.columns``.

    Raises
    ------
    ValueError
        If no column qualifies under any of the rules above.
    """
    import re

    candidates = ["ID", "Id", "id", "SubjectID", "subject_id", "BraTS21ID", "PatientID", "patient_id"]
    columns = list(df.columns)

    # 1) Exact match, honouring the candidate priority order.
    for cand in candidates:
        if cand in df.columns:
            return cand

    # 2) Case-insensitive match. Labels are passed through str() so that
    #    non-string labels (e.g. integer column names) do not raise.
    by_lower = {}
    for col in columns:
        by_lower.setdefault(str(col).strip().lower(), col)
    for cand in candidates:
        key = cand.lower()
        if key in by_lower:
            return by_lower[key]

    # 3) 'id' as its own word. The pattern splits snake_case, space-separated,
    #    camelCase and trailing-acronym names into words:
    #    'BraTS21ID' -> Bra, TS, 21, ID ; 'scanId' -> scan, Id ; 'Width' -> Width
    word_re = re.compile(r"[A-Z]+(?![a-z])|[A-Z]?[a-z]+|[0-9]+")
    for col in columns:
        if any(word.lower() == "id" for word in word_re.findall(str(col))):
            return col

    # 4) Legacy best-effort fallback: any column containing 'id'.
    for col in columns:
        if "id" in str(col).lower():
            return col

    raise ValueError(f"Could not detect an ID column. Columns={list(df.columns)[:20]} ...")


def normalize_id_series(s: pd.Series) -> pd.Series:
    """
    Normalize IDs as strings: cast to str and strip surrounding whitespace.
    (Keep it conservative; do NOT change hyphen/underscore unless you are sure.)

    Missing entries (NaN / None) are kept as missing instead of being turned
    into the literal strings 'nan' / 'None'. A stringified 'nan' would
    otherwise behave like a real ID in later groupby / merge steps.

    Parameters
    ----------
    s : Series of raw ID values (any dtype).

    Returns
    -------
    Series with the same index and name; non-missing values are stripped
    strings, missing values stay missing.
    """
    cleaned = s.astype(str).str.strip()
    # Restore missingness that astype(str) erased.
    return cleaned.where(s.notna())


def load_radiomics_one(path: Path) -> pd.DataFrame:
    """
    Load one radiomics CSV and return one row per ID.

    Steps:
    - Detect the ID column and rename it to 'ID'.
    - Drop rows whose ID is missing (they cannot be matched to any patient and
      would otherwise be stringified and pooled into a single bogus ID).
    - Normalize the remaining IDs (string, stripped).
    - Prefix every feature column with the file stem ('<stem>__<col>') so that
      columns from different files never collide when merged.
    - Collapse duplicated IDs by taking the mean of numeric columns.

    Note: because the collapse uses ``mean(numeric_only=True)``, non-numeric
    feature columns are dropped from the result, and rows come back sorted by ID.

    Parameters
    ----------
    path : location of the radiomics CSV.

    Returns
    -------
    DataFrame with an 'ID' column followed by the prefixed numeric features.

    Raises
    ------
    ValueError
        Propagated from detect_id_col when no ID column can be found.
    """
    df = pd.read_csv(path)

    id_col = detect_id_col(df)
    if id_col != "ID":
        df = df.rename(columns={id_col: "ID"})

    # Remove ID-less rows BEFORE normalization: once cast to str a missing ID
    # can no longer be told apart from a real one.
    df = df.dropna(subset=["ID"])
    df = df.assign(ID=normalize_id_series(df["ID"]))

    prefix = path.stem  # filename without .csv
    rename = {c: f"{prefix}__{c}" for c in df.columns if c != "ID"}
    df = df.rename(columns=rename)

    # If duplicated IDs exist, average numeric columns (rare but safe)
    df = df.groupby("ID", as_index=False).mean(numeric_only=True)
    return df


def build_y_from_clinical(clin_path: Path) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Build MGMT label y from the clinical CSV.

    Only rows whose MGMT value is 'Methylated' or 'Unmethylated' are kept
    (compared after trimming whitespace and ignoring case, so values such as
    'Methylated ' are not silently lost). Rows with a missing ID are excluded
    from y because they cannot be joined to any feature row. If an ID occurs
    more than once among the labeled rows, the first occurrence wins.

    Parameters
    ----------
    clin_path : location of the clinical CSV.

    Returns
    -------
    (y, clin_all)
      - y columns: ['ID', 'y_mgmt'] where y_mgmt=1 means Methylated and
        y_mgmt=0 means Unmethylated; one row per ID.
      - clin_all: the full clinical table with the ID column renamed to 'ID'
        and normalized.

    Raises
    ------
    ValueError
        If the clinical CSV has no 'MGMT' column, or (from detect_id_col) no
        ID column can be found.
    """
    clin = pd.read_csv(clin_path)

    id_col = detect_id_col(clin)
    if id_col != "ID":
        clin = clin.rename(columns={id_col: "ID"})

    # Record missingness before normalization, which may stringify NaN.
    has_id = clin["ID"].notna()
    clin["ID"] = normalize_id_series(clin["ID"])

    if "MGMT" not in clin.columns:
        raise ValueError(f"Clinical CSV does not contain 'MGMT'. Columns={list(clin.columns)}")

    # Keep only labeled cases for supervised learning; anything outside the
    # two known statuses maps to NaN and is filtered out.
    label_map = {"methylated": 1, "unmethylated": 0}
    mgmt_key = clin["MGMT"].astype(str).str.strip().str.lower()
    encoded = mgmt_key.map(label_map)
    keep = has_id & encoded.notna()

    y = (
        clin.loc[keep, ["ID"]]
        .assign(y_mgmt=encoded[keep].astype(int))
        .drop_duplicates("ID")
        .reset_index(drop=True)
    )
    return y, clin


In [3]:
# =========================
# Cell 3) Build X from ALL radiomics CSVs + Build y + Merge
# =========================

# --- 1) Locate clinical file ---
# Put the clinical CSV in RAW_DIR (recommended). If not there, try OUT_DIR as fallback.
CLIN_NAME = "UPENN-GBM_clinical_info_v2.1.csv"

clin_path = RAW_DIR / CLIN_NAME
if not clin_path.exists():
    clin_path = OUT_DIR / CLIN_NAME

if not clin_path.exists():
    raise FileNotFoundError(f"Cannot find clinical file: {CLIN_NAME} in RAW_DIR or OUT_DIR")


# --- 2) + 3) Find the radiomics CSVs ---
def _select_radiomics_csvs(csv_paths):
    """Return, sorted, only the Radiomic_Features_*.csv paths (clinical and fe_params files excluded)."""
    selected = []
    for p in csv_paths:
        lowered = p.name.lower()
        # Exclude clinical and any meta/parameter files
        if p.name == CLIN_NAME or lowered.startswith("fe_params"):
            continue
        # keep only radiomics CSVs
        if lowered.startswith("radiomic_features_") and lowered.endswith(".csv"):
            selected.append(p)
    return sorted(selected)


# Primary: RAW_DIR. The fallback to OUT_DIR is decided on the *filtered* list,
# so a RAW_DIR that only holds the clinical CSV (or other non-radiomics CSVs)
# still triggers it instead of ending in "No radiomics CSVs found".
raw_candidates = list(RAW_DIR.glob("*.csv"))
radiomics_paths = _select_radiomics_csvs(raw_candidates)
if len(radiomics_paths) == 0:
    print("[Warning] RAW_DIR has no radiomics CSVs. Falling back to OUT_DIR for radiomics CSVs.")
    raw_candidates = list(OUT_DIR.glob("*.csv"))
    radiomics_paths = _select_radiomics_csvs(raw_candidates)

print(f"Found radiomics CSVs: {len(radiomics_paths)}")
print("First 5 files:")
for p in radiomics_paths[:5]:
    print(" -", p.name)

if len(radiomics_paths) == 0:
    raise RuntimeError("No radiomics CSVs found. Put Radiomic_Features_*.csv into preprocessing_data/ (recommended).")


# --- 4) Build y ---
# y holds one MGMT label per ID; clin_all is the full clinical table.
y, clin_all = build_y_from_clinical(clin_path)

print("\n=== Clinical / y summary ===")
for _label, _value in (
    ("Clinical rows (all):", len(clin_all)),
    ("y rows (MGMT labeled):", len(y)),
    ("Unique IDs in y:", y["ID"].nunique()),
    ("y=1 proportion (methylated):", y["y_mgmt"].mean()),
):
    print(_label, _value)

# Preview of the first ten labels.
print("\n=== y.head(10) ===")
print(y.head(10))


# --- 5) Load radiomics, then merge to X ---
# radiomics_dfs[k] is the frame loaded from the file named loaded_files[k].
print("\n=== Loading radiomics files ===")
radiomics_dfs, loaded_files = [], []
for path in radiomics_paths:
    df = load_radiomics_one(path)
    radiomics_dfs.append(df)
    loaded_files.append(path.name)
    frame_shape, id_count = df.shape, df["ID"].nunique()
    print(f"[Loaded] {path.name:55s} shape={frame_shape}, unique_IDs={id_count}")

print("\n=== Merge strategy: OUTER join to keep as many patients as possible ===")
# Seed X with the first file; every further file is outer-joined onto it.
X = radiomics_dfs[0]
start_n, start_cols = X["ID"].nunique(), X.shape[1]
print(f"Start: {loaded_files[0]} -> n={start_n}, cols={start_cols}")

# One record per merge step, later written out as merge_report.csv.
merge_report = [
    {"step": 0, "file_added": loaded_files[0], "n_unique_ids": start_n, "n_cols": start_cols}
]

for i in range(1, len(radiomics_dfs)):
    fn, df = loaded_files[i], radiomics_dfs[i]
    before_n, before_cols = X["ID"].nunique(), X.shape[1]
    # OUTER join = union of IDs; missing values will appear (you can impute later)
    X = pd.merge(X, df, on="ID", how="outer")
    after_n, after_cols = X["ID"].nunique(), X.shape[1]
    print(f"After + {fn}: n {before_n} -> {after_n}, cols {before_cols} -> {after_cols}")
    merge_report.append(
        {"step": i, "file_added": fn, "n_unique_ids": after_n, "n_cols": after_cols}
    )

# Overall size of the merged feature table.
print("\n=== Final X summary ===")
print("X shape (rows, cols):", X.shape)
print("Unique IDs in X:", X["ID"].nunique())
n_features = len(X.columns) - 1  # every column except ID
print("Number of feature columns (excluding ID):", n_features)

# Preview: ID plus the first ten feature columns, first five rows.
print("\n=== X.head(5) (ID + first 10 feature columns) ===")
show_cols = ["ID"] + list(X.columns[X.columns != "ID"][:10])
print(X[show_cols].head(5))

# Column-name samples from both ends of the table.
print("\nFirst 20 column names:")
print(X.columns[:20].tolist())
print("\nLast 20 column names:")
print(X.columns[-20:].tolist())


# --- 6) Merge X with y (this defines your supervised dataset) ---
# INNER join: only IDs that have both radiomics features and an MGMT label survive.
XY = pd.merge(X, y, on="ID", how="inner")

print("\n=== Final XY summary ===")
print("XY shape (rows, cols):", XY.shape)
print("Unique IDs in XY:", XY["ID"].nunique())
print("Label distribution in XY:")
label_counts = XY["y_mgmt"].value_counts(dropna=False)
print(label_counts)
print("y=1 proportion in XY:", XY["y_mgmt"].mean())

# Preview: ID, label, and the first ten feature columns (taken from X's column order).
print("\n=== XY.head(5) (ID, y, + first 10 features) ===")
first_ten_features = [c for c in X.columns if c != "ID"][:10]
xy_show_cols = ["ID", "y_mgmt", *first_ten_features]
print(XY[xy_show_cols].head(5))


# --- 7) Missingness report (features only) ---
# Fraction of missing values per feature column of X, highest first.
feat_cols = [col for col in X.columns if col != "ID"]
is_missing = X[feat_cols].isna()
miss_rate = is_missing.mean().sort_values(ascending=False)

print("\n=== Top missing-rate features (first 15) ===")
print(miss_rate.head(15))


# --- 8) Save outputs ---
# All four artifacts are written to OUT_DIR as CSV without the index column.
X_out, y_out, XY_out, report_out = (
    OUT_DIR / file_name
    for file_name in ("X_all_radiomics.csv", "y_mgmt.csv", "XY_mgmt.csv", "merge_report.csv")
)

merge_report_df = pd.DataFrame(merge_report)
for _frame, _target in ((X, X_out), (y, y_out), (XY, XY_out), (merge_report_df, report_out)):
    _frame.to_csv(_target, index=False)

print("\n=== Saved files ===")
for _label, _target in (("X:", X_out), ("y:", y_out), ("XY:", XY_out), ("merge report:", report_out)):
    print(_label, _target)


Found radiomics CSVs: 33
First 5 files:
 - Radiomic_Features_CaPTk_segm_DSC_ap-rCBV_ED.csv
 - Radiomic_Features_CaPTk_segm_DSC_ap-rCBV_ET.csv
 - Radiomic_Features_CaPTk_segm_DSC_ap-rCBV_NC.csv
 - Radiomic_Features_CaPTk_segm_DSC_PH_ED.csv
 - Radiomic_Features_CaPTk_segm_DSC_PH_ET.csv

=== Clinical / y summary ===
Clinical rows (all): 671
y rows (MGMT labeled): 291
Unique IDs in y: 291
y=1 proportion (methylated): 0.41580756013745707

=== y.head(10) ===
                   ID  y_mgmt
0  UPENN-GBM-00022_11       0
1  UPENN-GBM-00034_11       0
2  UPENN-GBM-00088_11       0
3  UPENN-GBM-00091_11       0
4  UPENN-GBM-00092_11       0
5  UPENN-GBM-00093_11       0
6  UPENN-GBM-00093_21       0
7  UPENN-GBM-00094_11       0
8  UPENN-GBM-00095_11       1
9  UPENN-GBM-00098_11       1

=== Loading radiomics files ===
[Loaded] Radiomic_Features_CaPTk_segm_DSC_ap-rCBV_ED.csv         shape=(207, 145), unique_IDs=207
[Loaded] Radiomic_Features_CaPTk_segm_DSC_ap-rCBV_ET.csv         shape=(206, 145),

In [4]:
# -------------------------
# Quick peek: X / y / XY
# -------------------------
# Read-only sanity check of the three tables built in the previous cell.

print("=== Shapes ===")
print("X :", X.shape, " | unique IDs:", X["ID"].nunique())
print("y :", y.shape, " | unique IDs:", y["ID"].nunique())
print("XY:", XY.shape, "| unique IDs:", XY["ID"].nunique(), end="\n\n")

print("=== Column counts ===")
x_total_cols = X.shape[1]
print("X total cols:", x_total_cols, " (features =", x_total_cols - 1, "+ ID)")
print("y total cols:", y.shape[1], " (should be 2: ID + y_mgmt)")
print("XY total cols:", XY.shape[1], "(features + ID + y)", end="\n\n")

print("=== y head ===")
print(y.head(10), end="\n\n")

# Feature columns of X (everything except ID), in table order.
feat_cols = [col for col in X.columns if col != "ID"]

print("=== X head (ID + first 10 feature columns) ===")
x_preview_cols = ["ID", *feat_cols[:10]]
print(X[x_preview_cols].head(5), end="\n\n")

# "tail" refers to the last ten feature COLUMNS; the rows shown are still the first five.
print("=== X tail (ID + last 10 feature columns) ===")
x_preview_cols2 = ["ID", *feat_cols[-10:]]
print(X[x_preview_cols2].head(5), end="\n\n")

print("=== XY head (ID + y + first 10 features) ===")
xy_feature_cols = [col for col in XY.columns if col not in ["ID", "y_mgmt"]]
xy_preview_cols = ["ID", "y_mgmt", *xy_feature_cols[:10]]
print(XY[xy_preview_cols].head(5), end="\n\n")

print("=== Column name samples ===")
print("First 20 feature colnames:")
print(feat_cols[:20], end="\n\n")
print("Last 20 feature colnames:")
print(feat_cols[-20:])


=== Shapes ===
X : (232, 4753)  | unique IDs: 232
y : (291, 2)  | unique IDs: 291
XY: (59, 4754) | unique IDs: 59

=== Column counts ===
X total cols: 4753  (features = 4752 + ID)
y total cols: 2  (should be 2: ID + y_mgmt)
XY total cols: 4754 (features + ID + y)

=== y head ===
                   ID  y_mgmt
0  UPENN-GBM-00022_11       0
1  UPENN-GBM-00034_11       0
2  UPENN-GBM-00088_11       0
3  UPENN-GBM-00091_11       0
4  UPENN-GBM-00092_11       0
5  UPENN-GBM-00093_11       0
6  UPENN-GBM-00093_21       0
7  UPENN-GBM-00094_11       0
8  UPENN-GBM-00095_11       1
9  UPENN-GBM-00098_11       1

=== X head (ID + first 10 feature columns) ===
                   ID  \
0  UPENN-GBM-00001_11   
1  UPENN-GBM-00002_11   
2  UPENN-GBM-00003_11   
3  UPENN-GBM-00004_11   
4  UPENN-GBM-00005_11   

   Radiomic_Features_CaPTk_segm_DSC_ap-rCBV_ED__DSC_ap-rCBV_ED_Intensity_CoefficientOfVariation  \
0                                           0.753788                                        