In [62]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

In [63]:
cna_df = pd.read_csv('data/inputs/all_cancers_cna_df.csv')
tsv_df = pd.read_csv('data/inputs/all_cancers_tsv_df.csv')
vcf_df = pd.read_csv('data/inputs/all_cancers_vcf_df.csv')

In [64]:
vcf_df["donor_id"] = vcf_df["ID"].astype(str)
vcf_df["sample_id"] = vcf_df["donor_id"] + "_" + vcf_df["cancer_type"].astype(str)

cna_df["donor_id"] = cna_df["donor_id"].astype(str)
cna_df["sample_id"] = cna_df["donor_id"] + "_" + cna_df["cancer_type"].astype(str)

tsv_df["donor_id"] = tsv_df["donor_id"].astype(str)
tsv_df["sample_id"] = tsv_df["donor_id"] + "_" + tsv_df["cancer_type"].astype(str)

## VCF

In [65]:
vcf_df[['AF', 'MS']] = vcf_df['INFO'].str.split(';', expand=True)
vcf_df = vcf_df.drop("INFO", axis = 1)

vcf_df['AF'] = vcf_df['AF'].str.replace('AF=', '', regex=False).astype(float)
vcf_df['MS'] = vcf_df['MS'].str.replace('MS=', '', regex=False)

In [79]:
def build_snv_features(vcf_df: pd.DataFrame) -> pd.DataFrame:
    """
    Features:
      - snv_total: total SNVs per donor
      - snv_chr_*: counts per chromosome
      - snv_ms_*: counts per mutation signature (MS column)
    """
    total = vcf_df.groupby("sample_id").size().rename("snv_total")

    # per-chromosome counts
    chr_counts = (
        vcf_df.groupby(["sample_id", "CHROM"])
          .size().rename("count")
          .reset_index()
          .pivot(index="sample_id", columns="CHROM", values="count")
          .fillna(0)
    )
    chr_counts.columns = [f"snv_chr_{c}" for c in chr_counts.columns]

    # mutation signature counts (MS column: SBS5, DEL, etc.)
    if "MS" in vcf_df.columns:
        ms_counts = (
            vcf_df.groupby(["sample_id", "MS"])
              .size().rename("count")
              .reset_index()
              .pivot(index="sample_id", columns="MS", values="count")
              .fillna(0)
        )
        ms_counts.columns = [f"snv_ms_{m}" for m in ms_counts.columns]
        snv_features = pd.concat([total, chr_counts, ms_counts], axis=1)
    else:
        snv_features = pd.concat([total, chr_counts], axis=1)

    return snv_features.reset_index()

snv_features = build_snv_features(vcf_df)

In [80]:
vcf_df

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,cancer_type,donor_id,sample_id,AF,MS
0,1,3768230,sim3,G,T,.,.,Lymph-CLL,sim3,sim3_Lymph-CLL,0.27,SBS5
1,1,3770069,sim3,T,G,.,.,Lymph-CLL,sim3,sim3_Lymph-CLL,0.59,SBS5
2,1,5143136,sim3,A,G,.,.,Lymph-CLL,sim3,sim3_Lymph-CLL,0.21,SBS5
3,1,7284496,sim3,T,C,.,.,Lymph-CLL,sim3,sim3_Lymph-CLL,0.19,SBS5
4,1,17592191,sim3,G,A,.,.,Lymph-CLL,sim3,sim3_Lymph-CLL,0.13,SBS5
...,...,...,...,...,...,...,...,...,...,...,...,...
6719326,X,151793044,sim4,G,A,.,.,Breast-AdenoCa,sim4,sim4_Breast-AdenoCa,0.27,SBS5
6719327,X,152214102,sim4,G,T,.,.,Breast-AdenoCa,sim4,sim4_Breast-AdenoCa,0.44,SBS5
6719328,X,152930953,sim4,CTTTTTCTATT,C,.,.,Breast-AdenoCa,sim4,sim4_Breast-AdenoCa,0.18,DEL
6719329,X,154930673,sim4,T,A,.,.,Breast-AdenoCa,sim4,sim4_Breast-AdenoCa,0.26,SBS5


In [81]:
snv_features

Unnamed: 0,sample_id,snv_total,snv_chr_1,snv_chr_10,snv_chr_11,snv_chr_12,snv_chr_13,snv_chr_14,snv_chr_15,snv_chr_16,...,snv_ms_driver_TRIO_Intron,snv_ms_driver_TSC1_Intron,snv_ms_driver_TSC2_Intron,snv_ms_driver_VHL_Coding,snv_ms_driver_WHSC1L1_Intron,snv_ms_driver_WT1_Intron,snv_ms_driver_WWOX_Intron,snv_ms_driver_ZC3H11A_Intron,snv_ms_driver_ZFHX3_Intron,snv_ms_driver_ZNF292_Coding
0,sim100_Breast-AdenoCa,5590,470.0,215.0,256.0,265.0,142.0,159.0,137.0,136.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,sim100_CNS-PiloAstro,149,15.0,5.0,3.0,7.0,5.0,3.0,3.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,sim100_Eso-AdenoCa,24535,1606.0,1149.0,1284.0,945.0,1244.0,797.0,601.0,632.0,...,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0
3,sim100_Kidney-RCC,18624,1463.0,762.0,855.0,923.0,725.0,546.0,499.0,514.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,sim100_Liver-HCC,15206,1084.0,721.0,683.0,712.0,590.0,497.0,399.0,354.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,sim9_Kidney-RCC,18635,1397.0,807.0,841.0,851.0,753.0,516.0,470.0,497.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
796,sim9_Liver-HCC,11591,862.0,543.0,517.0,518.0,473.0,372.0,300.0,257.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
797,sim9_Lymph-CLL,1968,143.0,90.0,87.0,100.0,79.0,66.0,60.0,46.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
798,sim9_Panc-Endocrine,1987,123.0,68.0,57.0,112.0,82.0,79.0,39.0,36.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## CNA

In [69]:
def build_cna_features(cna: pd.DataFrame) -> pd.DataFrame:
    """
    Features:
      - cna_n_segments
      - cna_gain_Mb, cna_loss_Mb
      - cna_frac_altered
    Uses major_cn + minor_cn as total copy number.
    """
    cna["seg_len"] = cna["end"] - cna["start"]
    cna["total_cn"] = cna["major_cn"] + cna["minor_cn"]

    # simple thresholds, should be tweaked further
    cna["is_gain"] = cna["total_cn"] > 2.5
    cna["is_loss"] = cna["total_cn"] < 1.5

    def sum_where(x, mask):
        # x is seg_len series, mask is from full df
        return x[mask.loc[x.index]].sum()

    agg = cna.groupby("sample_id").agg(
        cna_n_segments=("seg_len", "size"),
        cna_total_len_bp=("seg_len", "sum"),
        cna_gain_bp=("seg_len", lambda x: sum_where(x, cna["is_gain"])),
        cna_loss_bp=("seg_len", lambda x: sum_where(x, cna["is_loss"])),
    )

    genome_size_bp = 3_000_000_000  # rough est of actual size of genome

    agg["cna_frac_altered"] = (agg["cna_gain_bp"] + agg["cna_loss_bp"]) / genome_size_bp

    # convert to Mb
    for col in ["cna_total_len_bp", "cna_gain_bp", "cna_loss_bp"]:
        agg[col.replace("_bp", "_Mb")] = agg[col] / 1e6
        agg.drop(columns=[col], inplace=True)

    return agg.reset_index()

cna_features = build_cna_features(cna_df)


In [74]:
cna_df

Unnamed: 0,chrom,start,end,major_cn,minor_cn,donor_id,study,id,cancer_type,sample_id,seg_len,total_cn,is_gain,is_loss
0,1,1,249250621,1,1,sim87,Lymph-CLL,cna0,Lymph-CLL,sim87_Lymph-CLL,249250620,2,False,False
1,2,1,243199373,1,1,sim87,Lymph-CLL,cna1,Lymph-CLL,sim87_Lymph-CLL,243199372,2,False,False
2,3,1,198022430,1,1,sim87,Lymph-CLL,cna2,Lymph-CLL,sim87_Lymph-CLL,198022429,2,False,False
3,4,1,191154276,1,1,sim87,Lymph-CLL,cna3,Lymph-CLL,sim87_Lymph-CLL,191154275,2,False,False
4,5,1,180915260,1,1,sim87,Lymph-CLL,cna4,Lymph-CLL,sim87_Lymph-CLL,180915259,2,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67714,X,146636776,149293459,2,2,sim51,Breast-AdenoCa,cna404,Breast-AdenoCa,sim51_Breast-AdenoCa,2656683,4,True,False
67715,X,149293460,152607683,2,0,sim51,Breast-AdenoCa,cna405,Breast-AdenoCa,sim51_Breast-AdenoCa,3314223,2,False,False
67716,X,152607684,155144332,2,1,sim51,Breast-AdenoCa,cna406,Breast-AdenoCa,sim51_Breast-AdenoCa,2536648,3,True,False
67717,X,155144333,155151791,4,1,sim51,Breast-AdenoCa,cna407,Breast-AdenoCa,sim51_Breast-AdenoCa,7458,5,True,False


In [70]:
cna_features

Unnamed: 0,sample_id,cna_n_segments,cna_frac_altered,cna_total_len_Mb,cna_gain_Mb,cna_loss_Mb
0,sim100_Breast-AdenoCa,197,0.702295,3036.303649,1440.570770,666.314749
1,sim100_CNS-PiloAstro,27,0.098346,3095.677385,295.039418,0.000000
2,sim100_Eso-AdenoCa,236,0.822269,3036.303610,1826.282458,640.523251
3,sim100_Kidney-RCC,34,0.263523,3095.677378,170.550246,620.019032
4,sim100_Liver-HCC,96,0.386137,3095.677316,634.121198,524.289882
...,...,...,...,...,...,...
795,sim9_Kidney-RCC,29,0.473517,3036.303817,583.233911,837.317002
796,sim9_Liver-HCC,54,0.661145,3095.677358,1454.263472,529.172519
797,sim9_Lymph-CLL,32,0.087747,3095.677380,134.439932,128.800105
798,sim9_Panc-Endocrine,67,0.891694,3095.677345,1147.262446,1527.820996


## SV

In [71]:
def build_sv_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Features:
      - sv_total: total SVs per donor
      - svclass: counts per svclass (DEL, DUP, h2hINV, t2tINV, ...)
    """
    # total SV count
    total = df.groupby("sample_id").size().rename("sv_total")

    # counts per type
    type_counts = (
        df.groupby(["sample_id", "svclass"])
          .size().rename("count")
          .reset_index()
          .pivot(index="sample_id", columns="svclass", values="count")
          .fillna(0)
    )
    type_counts.columns = [f"sv_{t}" for t in type_counts.columns]

    sv_features = pd.concat([total, type_counts], axis=1)
    return sv_features.reset_index()

sv_features = build_sv_features(tsv_df)


In [75]:
tsv_df

Unnamed: 0,chrom1,start1,end1,chrom2,start2,end2,strand1,strand2,svclass,id,allele,donor_id,tumor,cancer_type,sample_id
0,2,239593892,239593893,2,243199373,243199374,+,-,DEL,cna2,minor,sim11,Lymph-CLL,Lymph-CLL,sim11_Lymph-CLL
1,5,9097605,9097606,5,12331150,12331151,-,+,DUP,cna6,major,sim11,Lymph-CLL,Lymph-CLL,sim11_Lymph-CLL
2,5,12331151,12331152,5,180915260,180915261,+,-,DEL,cna7,minor,sim11,Lymph-CLL,Lymph-CLL,sim11_Lymph-CLL
3,6,141477017,141477018,6,171115067,171115068,+,-,DEL,cna9,minor,sim11,Lymph-CLL,Lymph-CLL,sim11_Lymph-CLL
4,9,140763827,140763828,9,141213431,141213432,-,+,DUP,cna13,major,sim11,Lymph-CLL,Lymph-CLL,sim11_Lymph-CLL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90886,22,51174528,51174529,22,51304566,51304567,-,+,DUP,cna234,major,sim76,Breast-AdenoCa,Breast-AdenoCa,sim76_Breast-AdenoCa
90887,X,3752614,3752615,X,18814659,18814660,+,+,h2hINV,cna235,major,sim76,Breast-AdenoCa,Breast-AdenoCa,sim76_Breast-AdenoCa
90888,X,42956969,42956970,X,50230425,50230426,-,+,DUP,cna236,major,sim76,Breast-AdenoCa,Breast-AdenoCa,sim76_Breast-AdenoCa
90889,X,122122165,122122166,X,155270560,155270561,-,+,DUP,cna238,minor,sim76,Breast-AdenoCa,Breast-AdenoCa,sim76_Breast-AdenoCa


In [72]:
sv_features

Unnamed: 0,sample_id,sv_total,sv_DEL,sv_DUP,sv_TRA,sv_h2hINV,sv_t2tINV
0,sim100_Breast-AdenoCa,339,66.0,209.0,25.0,13.0,26.0
1,sim100_CNS-PiloAstro,5,1.0,4.0,0.0,0.0,0.0
2,sim100_Eso-AdenoCa,367,85.0,246.0,15.0,16.0,5.0
3,sim100_Kidney-RCC,40,16.0,9.0,11.0,1.0,3.0
4,sim100_Liver-HCC,108,28.0,63.0,4.0,6.0,7.0
...,...,...,...,...,...,...,...
795,sim9_Kidney-RCC,24,10.0,7.0,3.0,1.0,3.0
796,sim9_Liver-HCC,83,18.0,47.0,9.0,6.0,3.0
797,sim9_Lymph-CLL,10,4.0,4.0,1.0,0.0,1.0
798,sim9_Panc-Endocrine,95,39.0,52.0,3.0,0.0,1.0


In [86]:
snv_features.to_csv('data/feature_eng/snv_features.csv')
cna_features.to_csv('data/feature_eng/cna_features.csv')
sv_features.to_csv('data/feature_eng/sv_features.csv')