In [6]:
# Build X (radiomics features) and y (MGMT label)
# - Uses segmentation-based radiomics CSV (CaPTk_segm_*.csv)
# - Uses UPENN-GBM_clinical_info_v2.1.csv for labels
# - Does NOT save anything

import numpy as np
import pandas as pd
import numpy as np
from xgboost import XGBClassifier

# Input locations are relative to the working directory; the paths below
# assume the notebook is executed from .../MRI/code/.
FEATURES_PATH = "../preprocessing_data/Radiomic_Features_CaPTk_segm_FLAIR_NC.csv"
CLINICAL_PATH = "../preprocessing_data/UPENN-GBM_clinical_info_v2.1.csv"

# --- Load -------------------------------------------------------------------
df_feat = pd.read_csv(FEATURES_PATH)   # radiomics table, keyed by SubjectID
df_clin = pd.read_csv(CLINICAL_PATH)   # clinical table, keyed by ID; carries MGMT

# --- Merge ------------------------------------------------------------------
# Inner join: only subjects present in BOTH tables survive. Just the key and
# the label are taken from the clinical side.
df = df_feat.merge(
    df_clin[["ID", "MGMT"]],
    how="inner",
    left_on="SubjectID",
    right_on="ID",
)

# --- Labels -----------------------------------------------------------------
# Restrict to the two definitive MGMT outcomes; any other value
# (e.g. "Not Available", "Indeterminate") is discarded.
df = df.loc[df["MGMT"].isin(["Methylated", "Unmethylated"])].copy()

# With only two values left, "is Methylated" is exactly the 1/0 encoding
# (Methylated -> 1, Unmethylated -> 0).
y = (df["MGMT"] == "Methylated").astype(int)

# --- Features ---------------------------------------------------------------
# Strip the key/label columns (ignored if absent), then keep numeric columns only.
X_all = df.drop(columns=["SubjectID", "ID", "MGMT"], errors="ignore")
X = X_all.select_dtypes(include="number").copy()

# Cleanup: discard columns that are entirely NaN, then columns that hold a
# single distinct non-NaN value (nunique ignores NaN by default).
X = X.dropna(axis="columns", how="all")
X = X.loc[:, X.nunique() > 1]

# --- Summary ----------------------------------------------------------------
print("Merged rows with usable MGMT:", len(df))
print("X shape:", X.shape, "y shape:", y.shape)
print("Class balance (y):", y.value_counts().to_dict())

# Available after this cell:
#   X : pandas DataFrame of numeric features (same row index as df)
#   y : pandas Series of 0/1 labels (same row index as df)

Merged rows with usable MGMT: 59
X shape: (59, 135) y shape: (59,)
Class balance (y): {0: 37, 1: 22}


In [7]:
# Next step: verify test IDs exist in this dataframe, then split into train/test
# - Assumes you already ran the previous block and have: df, X, y
# - Assumes you already created a fixed test-id CSV (15/15) somewhere
# - Does NOT save anything

import pandas as pd

# Location of the fixed test-ID list (edit if needed).
TEST_IDS_PATH = "../data/test_ids_core10_plus_extra20_seed340.csv"  # columns: ID, y_mgmt (or ID,y)

# --- Load the test IDs --------------------------------------------------------
test_ids_df = pd.read_csv(TEST_IDS_PATH)

# Accept either key column name, preferring "ID"; fail loudly if neither exists.
for id_col in ("ID", "SubjectID"):
    if id_col in test_ids_df.columns:
        test_ids = set(test_ids_df[id_col].astype(str))
        break
else:
    raise ValueError(f"Test IDs file must contain 'ID' or 'SubjectID'. Columns: {list(test_ids_df.columns)}")

# --- Compare against the current df -------------------------------------------
# df only holds subjects that made it through the merge and the MGMT filter,
# so some requested test IDs may be absent here.
df_ids = df["ID"].astype(str)
in_test_mask = df_ids.isin(test_ids)   # boolean Series on df's index

n_total = len(df)
n_overlap = int(in_test_mask.sum())
print("=== Split verification ===")
print("Total rows in current df:", n_total)
print("Test IDs requested (file):", len(test_ids))
print("Overlap (test IDs found in this df):", n_overlap)

# Requested test IDs that have no row in df.
missing = sorted(test_ids.difference(df_ids))
print("Missing test IDs (not present in this df):", len(missing))
print("First 10 missing:", missing[:10])

# --- Split --------------------------------------------------------------------
# Rows flagged by the mask form the test set; every other row is training data.
# The test set can therefore be smaller than the ID list when IDs are missing.
X_test, y_test = X[in_test_mask].copy(), y[in_test_mask].copy()
X_train, y_train = X[~in_test_mask].copy(), y[~in_test_mask].copy()

print("\n=== Split sizes ===")
print("Train:", X_train.shape, y_train.shape)
print("Test :", X_test.shape, y_test.shape)
print("Test class balance:", y_test.value_counts().to_dict())

# Available after this cell:
#   X_train, y_train, X_test, y_test
# (train for cross-validation, test for the final evaluation)


=== Split verification ===
Total rows in current df: 59
Test IDs requested (file): 30
Overlap (test IDs found in this df): 12
Missing test IDs (not present in this df): 18
First 10 missing: ['UPENN-GBM-00301_11', 'UPENN-GBM-00381_11', 'UPENN-GBM-00405_11', 'UPENN-GBM-00424_11', 'UPENN-GBM-00445_11', 'UPENN-GBM-00446_11', 'UPENN-GBM-00447_11', 'UPENN-GBM-00452_11', 'UPENN-GBM-00455_11', 'UPENN-GBM-00458_11']

=== Split sizes ===
Train: (47, 135) (47,)
Test : (12, 135) (12,)
Test class balance: {1: 6, 0: 6}


In [8]:
# --- Fit a gradient-boosted model on the training split only ------------------
# eval_metric is set explicitly (the original note says this avoids a warning).
xgb = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    eval_metric="logloss",
    random_state=340,
    n_jobs=-1,
)
xgb.fit(X_train, y_train)

# --- Keep every feature the model actually used --------------------------------
# feature_importances_ is positional, in the same order as X_train.columns.
importances = xgb.feature_importances_
selected_mask = importances > 0
selected_features = list(X_train.columns[selected_mask])

# Apply the same column subset to both splits.
X_train_sel = X_train.loc[:, selected_features].copy()
X_test_sel = X_test.loc[:, selected_features].copy()

# --- Report ---------------------------------------------------------------------
print("=== XGBoost feature filtering ===")
print("Original #features:", X_train.shape[1])
print("#features with importance > 0:", len(selected_features))

# Column positions ordered by descending importance; keep at most the first 20
# whose importance is non-zero.
top_idx = np.argsort(importances)[::-1]
top_nonzero = [pos for pos in top_idx if selected_mask[pos]][:20]
print("\nTop features (up to 20):")
for pos in top_nonzero:
    print(f"{X_train.columns[pos]}  importance={importances[pos]:.6f}")

# Available after this cell:
#   X_train_sel, y_train
#   X_test_sel,  y_test


=== XGBoost feature filtering ===
Original #features: 135
#features with importance > 0: 106

Top features (up to 20):
FLAIR_NC_Histogram_Bins-16_Bins-16_Kurtosis  importance=0.048880
FLAIR_NC_Intensity_TenthPercentile  importance=0.036599
FLAIR_NC_Histogram_Bins-16_Bins-16_QuartileCoefficientOfVariation  importance=0.026606
FLAIR_NC_Intensity_StandardDeviation  importance=0.025830
FLAIR_NC_Histogram_Bins-16_Bins-16_Bin-13_Frequency  importance=0.024289
FLAIR_NC_Intensity_QuartileCoefficientOfVariation  importance=0.023323
FLAIR_NC_Morphologic_EllipseDiameter_Axis-2  importance=0.023193
FLAIR_NC_Histogram_Bins-16_Bins-16_NinetyFifthPercentile  importance=0.022022
FLAIR_NC_Intensity_InterQuartileRange  importance=0.021736
FLAIR_NC_Histogram_Bins-16_Bins-16_Bin-6_Frequency  importance=0.018740
FLAIR_NC_Intensity_Energy  importance=0.018419
FLAIR_NC_Histogram_Bins-16_Bins-16_Entropy  importance=0.018372
FLAIR_NC_Histogram_Bins-16_Bins-16_Bin-14_Probability  importance=0.018338
FLAIR_NC_Mo

In [9]:
# Save filtered (selected) features + labels into MRI/Net/
# Creates 4 files:
#   - Net/X_train.csv
#   - Net/y_train.csv
#   - Net/X_test.csv
#   - Net/y_test.csv

from pathlib import Path
import pandas as pd

# Inputs expected from earlier cells:
#   X_train_sel, y_train, X_test_sel, y_test

# Output directory: "<parent of the current working directory>/Net".
# NOTE: Path("..") is resolved against the current working directory, so this
# points at MRI/Net/ only when the notebook is run from MRI/code/.
base_dir = Path("..").resolve()
net_dir = base_dir / "Net"
net_dir.mkdir(parents=True, exist_ok=True)

# Write features and labels for each split. Both files are written without the
# pandas index, so row i of X_<split>.csv pairs with row i of y_<split>.csv.
# Labels go out as a single column named "y".
saved_files = []
for split_name, features, labels in (
    ("train", X_train_sel, y_train),
    ("test", X_test_sel, y_test),
):
    x_path = net_dir / f"X_{split_name}.csv"
    y_path = net_dir / f"y_{split_name}.csv"
    features.to_csv(x_path, index=False)
    pd.Series(labels).reset_index(drop=True).to_csv(y_path, index=False, header=["y"])
    saved_files += [x_path, y_path]

print("Saved to:", net_dir)
print("Files:", *(path.name for path in saved_files))
print("Shapes:",
      "X_train", X_train_sel.shape,
      "y_train", len(y_train),
      "X_test", X_test_sel.shape,
      "y_test", len(y_test))


Saved to: C:\Users\junse\Documents\research\UPENN-GBM\MRI\Net
Files: X_train.csv y_train.csv X_test.csv y_test.csv
Shapes: X_train (47, 106) y_train 47 X_test (12, 106) y_test 12
