In [6]:
import glob
import os
import pandas as pd

In [19]:
def _read_without_saved_index(csv_path):
    """Read ``csv_path`` with pandas and return it without its "Unnamed: 0" column.

    The "Unnamed: 0" column is presumably the row index written by an earlier
    ``to_csv`` call -- TODO confirm against the script that produced the files.
    A missing column raises ``KeyError``.
    """
    loaded = pd.read_csv(csv_path)
    return loaded.drop(columns="Unnamed: 0")


# Tables stored as all_cancers_*_df.csv.
cna_df = _read_without_saved_index('data/feature_eng/all_cancers_cna_df.csv')
tsv_df = _read_without_saved_index('data/feature_eng/all_cancers_tsv_df.csv')
vcf_df = _read_without_saved_index('data/feature_eng/all_cancers_vcf_df.csv')

# Feature tables stored as *_features.csv (merged on sample_id further down).
snv_features = _read_without_saved_index('data/feature_eng/snv_features.csv')
cna_features = _read_without_saved_index('data/feature_eng/cna_features.csv')
sv_features = _read_without_saved_index('data/feature_eng/sv_features.csv')

In [8]:
# Label lookup: the distinct (sample_id, cancer_type) pairs found in vcf_df,
# renumbered from 0. This yields one row per sample only if every sample_id
# maps to a single cancer_type in vcf_df.
labels = vcf_df.loc[:, ["sample_id", "cancer_type"]].drop_duplicates(ignore_index=True)

labels

Unnamed: 0,sample_id,cancer_type
0,sim3_Lymph-CLL,Lymph-CLL
1,sim54_Lymph-CLL,Lymph-CLL
2,sim40_Lymph-CLL,Lymph-CLL
3,sim68_Lymph-CLL,Lymph-CLL
4,sim97_Lymph-CLL,Lymph-CLL
...,...,...
795,sim49_Breast-AdenoCa,Breast-AdenoCa
796,sim48_Breast-AdenoCa,Breast-AdenoCa
797,sim60_Breast-AdenoCa,Breast-AdenoCa
798,sim74_Breast-AdenoCa,Breast-AdenoCa


In [22]:
# Name of the label column shared by both feature tables.
target_col = "cancer_type"

# --- Baseline (SNV-only) feature table ---
# Inner join on sample_id: only samples that have a label are kept.
baseline_features = snv_features.merge(labels, on="sample_id", how="inner")

# --- Multi-omic (SNV + CNA + SV) ---
# CNA and SV features are left-joined so a sample missing from either table
# keeps its SNV row. Labels are inner-joined, exactly as for the baseline, so
# both tables cover the same samples and no unlabeled row can reach a model.
multi_omic = (
    snv_features
    .merge(cna_features, on="sample_id", how="left")
    .merge(sv_features,  on="sample_id", how="left")
    .merge(labels,       on="sample_id", how="inner")
)

# no CNA/SV → treat as 0 burden.
# Only feature columns are filled: a blanket fillna(0) would also rewrite a
# missing sample_id or cancer_type to 0 and create a bogus class.
feature_cols = [
    col for col in multi_omic.columns if col not in ("sample_id", target_col)
]
multi_omic = multi_omic.fillna({col: 0 for col in feature_cols})

# Design matrices (every column except the id and the label) and targets.
X_base = baseline_features.drop(columns=["sample_id", target_col])
y_base = baseline_features[target_col]

X_multi = multi_omic.drop(columns=["sample_id", target_col])
y_multi = multi_omic[target_col]

print("Baseline X/y:", X_base.shape, len(y_base))
print("Multi-omic X/y:", X_multi.shape, len(y_multi))


Baseline X/y: (800, 224) 800
Multi-omic X/y: (800, 235) 800


In [23]:
# Display the merged SNV + CNA + SV table, label column included
# (pandas elides the middle rows and columns in the rendered output).
multi_omic

Unnamed: 0,sample_id,snv_total,snv_chr_1,snv_chr_10,snv_chr_11,snv_chr_12,snv_chr_13,snv_chr_14,snv_chr_15,snv_chr_16,...,cna_total_len_Mb,cna_gain_Mb,cna_loss_Mb,sv_total,sv_DEL,sv_DUP,sv_TRA,sv_h2hINV,sv_t2tINV,cancer_type
0,sim100_Breast-AdenoCa,5590,470.0,215.0,256.0,265.0,142.0,159.0,137.0,136.0,...,3036.303649,1440.570770,666.314749,339,66.0,209.0,25.0,13.0,26.0,Breast-AdenoCa
1,sim100_CNS-PiloAstro,149,15.0,5.0,3.0,7.0,5.0,3.0,3.0,7.0,...,3095.677385,295.039418,0.000000,5,1.0,4.0,0.0,0.0,0.0,CNS-PiloAstro
2,sim100_Eso-AdenoCa,24535,1606.0,1149.0,1284.0,945.0,1244.0,797.0,601.0,632.0,...,3036.303610,1826.282458,640.523251,367,85.0,246.0,15.0,16.0,5.0,Eso-AdenoCa
3,sim100_Kidney-RCC,18624,1463.0,762.0,855.0,923.0,725.0,546.0,499.0,514.0,...,3095.677378,170.550246,620.019032,40,16.0,9.0,11.0,1.0,3.0,Kidney-RCC
4,sim100_Liver-HCC,15206,1084.0,721.0,683.0,712.0,590.0,497.0,399.0,354.0,...,3095.677316,634.121198,524.289882,108,28.0,63.0,4.0,6.0,7.0,Liver-HCC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,sim9_Kidney-RCC,18635,1397.0,807.0,841.0,851.0,753.0,516.0,470.0,497.0,...,3036.303817,583.233911,837.317002,24,10.0,7.0,3.0,1.0,3.0,Kidney-RCC
796,sim9_Liver-HCC,11591,862.0,543.0,517.0,518.0,473.0,372.0,300.0,257.0,...,3095.677358,1454.263472,529.172519,83,18.0,47.0,9.0,6.0,3.0,Liver-HCC
797,sim9_Lymph-CLL,1968,143.0,90.0,87.0,100.0,79.0,66.0,60.0,46.0,...,3095.677380,134.439932,128.800105,10,4.0,4.0,1.0,0.0,1.0,Lymph-CLL
798,sim9_Panc-Endocrine,1987,123.0,68.0,57.0,112.0,82.0,79.0,39.0,36.0,...,3095.677345,1147.262446,1527.820996,95,39.0,52.0,3.0,0.0,1.0,Panc-Endocrine


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

random_state = 73


def _fit_and_score(model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training split and evaluate it on the validation split.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
    X_train, y_train : training features and labels
    X_val, y_val : validation features and labels

    Returns
    -------
    tuple
        ``(y_pred, accuracy, macro_f1)`` where ``y_pred`` are the predictions
        for ``X_val``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    macro_f1 = f1_score(y_val, y_pred, average="macro")
    return y_pred, accuracy, macro_f1


# stratified splits to preserve cancer_type distribution
# Both calls use the same seed, test size and stratification. They select the
# same samples only when X_base and X_multi hold the same rows in the same
# order -- NOTE(review): confirm, the comparison below relies on it.
Xb_train, Xb_val, yb_train, yb_val = train_test_split(
    X_base, y_base, test_size=0.2, random_state=random_state, stratify=y_base
)

Xm_train, Xm_val, ym_train, ym_val = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=random_state, stratify=y_multi
)

# scaling for linear models
# Each scaler is fitted on its training split only, then applied to validation.
scaler_base = StandardScaler()
Xb_train_scaled = scaler_base.fit_transform(Xb_train)
Xb_val_scaled   = scaler_base.transform(Xb_val)

scaler_multi = StandardScaler()
Xm_train_scaled = scaler_multi.fit_transform(Xm_train)
Xm_val_scaled   = scaler_multi.transform(Xm_val)

# models
# One shared configuration so the two logistic regressions differ only in their
# input features. `multi_class` is omitted: it is deprecated in recent
# scikit-learn and lbfgs already fits a multinomial model for multiclass
# targets. `n_jobs` is omitted because it only parallelizes one-vs-rest fits.
logreg_params = dict(
    solver="lbfgs",
    max_iter=1000,
    random_state=random_state,
)

logreg_base = LogisticRegression(**logreg_params)
logreg_multi = LogisticRegression(**logreg_params)

rf_multi = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    n_jobs=-1,
    random_state=random_state
)

# --- train & eval ---

# 1) Baseline SNV-only
yb_pred, base_acc, base_f1 = _fit_and_score(
    logreg_base, Xb_train_scaled, yb_train, Xb_val_scaled, yb_val
)

# 2) Multi-omic, same model
ym_pred_lr, multi_lr_acc, multi_lr_f1 = _fit_and_score(
    logreg_multi, Xm_train_scaled, ym_train, Xm_val_scaled, ym_val
)

# 3) Multi-omic, Random Forest
# Trained on the standardized matrix too: trees do not need scaling, but this
# keeps the inputs identical across the two multi-omic models.
ym_pred_rf, multi_rf_acc, multi_rf_f1 = _fit_and_score(
    rf_multi, Xm_train_scaled, ym_train, Xm_val_scaled, ym_val
)

# comparison table
# NOTE(review): scores come from a single validation split of 20% of the
# samples, so small gaps between rows may be only a few samples.
results = pd.DataFrame([
    {
        "Features": "SNV-only",
        "Model": "LogisticRegression",
        "Accuracy": base_acc,
        "Macro-F1": base_f1,
    },
    {
        "Features": "SNV + CNA + SV",
        "Model": "LogisticRegression",
        "Accuracy": multi_lr_acc,
        "Macro-F1": multi_lr_f1,
    },
    {
        "Features": "SNV + CNA + SV",
        "Model": "RandomForest",
        "Accuracy": multi_rf_acc,
        "Macro-F1": multi_rf_f1,
    },
])

print(results)


         Features               Model  Accuracy  Macro-F1
0        SNV-only  LogisticRegression   0.98125  0.981242
1  SNV + CNA + SV  LogisticRegression   0.98750  0.987637
2  SNV + CNA + SV        RandomForest   0.99375  0.993746


In [27]:

# Display the label column of the merged table to inspect its values and dtype.
multi_omic['cancer_type']

0      Breast-AdenoCa
1       CNS-PiloAstro
2         Eso-AdenoCa
3          Kidney-RCC
4           Liver-HCC
            ...      
795        Kidney-RCC
796         Liver-HCC
797         Lymph-CLL
798    Panc-Endocrine
799     Prost-AdenoCA
Name: cancer_type, Length: 800, dtype: object