In [2]:
import glob
import os
import pandas as pd

In [6]:
cancers = [
    'Lymph-CLL',
    'Prost-AdenoCA',
    'Panc-Endocrine', 
    'Liver-HCC',
    'Kidney-RCC',
    'CNS-PiloAstro',
    'Eso-AdenoCa',
    'Breast-AdenoCa'
]

cols = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]

all_dfs = []   # collect all cancer dataframes here

init_fp = os.getcwd()
cancer_dfs = []
for folder in cancers:
    cancer_vcf_files = glob.glob(f"data/input/OncoGAN_synthetic_VCFs/{folder}/*.vcf")
    for file in cancer_vcf_files:
        dt = pd.read_csv(file, sep="\t", comment="#", header=None, names=cols)
        df = pd.DataFrame(dt)
        df["cancer_type"] = folder
        cancer_dfs.append(df)

    # concat all files for this cancer
    cancer_df = pd.concat(cancer_dfs, ignore_index=True)
    all_dfs.append(cancer_df)

# now concat across all cancers
vcf_df = pd.concat(all_dfs, ignore_index=True)

out_dir = init_fp + "/data/interim"

out_path = os.path.join(out_dir, "all_cancers_vcf_df.csv")
vcf_df.to_csv(out_path, index=False)

print("Saved to:", out_path)


  dt = pd.read_csv(file, sep="\t", comment="#", header=None, names=cols)
  dt = pd.read_csv(file, sep="\t", comment="#", header=None, names=cols)


Saved to: /Users/lauren_myers/code/Explainable_AI/Explainable_AI_Genomics_Study/data/interim/all_cancers_vcf_df.csv


In [16]:
vcf_df

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,cancer_type,verify
0,1,3768230,sim3,G,T,.,.,AF=0.27;MS=SBS5,Lymph-CLL,sim3Lymph-CLL
1,1,3770069,sim3,T,G,.,.,AF=0.59;MS=SBS5,Lymph-CLL,sim3Lymph-CLL
2,1,5143136,sim3,A,G,.,.,AF=0.21;MS=SBS5,Lymph-CLL,sim3Lymph-CLL
3,1,7284496,sim3,T,C,.,.,AF=0.19;MS=SBS5,Lymph-CLL,sim3Lymph-CLL
4,1,17592191,sim3,G,A,.,.,AF=0.13;MS=SBS5,Lymph-CLL,sim3Lymph-CLL
...,...,...,...,...,...,...,...,...,...,...
25069930,X,151793044,sim4,G,A,.,.,AF=0.27;MS=SBS5,Breast-AdenoCa,sim4Breast-AdenoCa
25069931,X,152214102,sim4,G,T,.,.,AF=0.44;MS=SBS5,Breast-AdenoCa,sim4Breast-AdenoCa
25069932,X,152930953,sim4,CTTTTTCTATT,C,.,.,AF=0.18;MS=DEL,Breast-AdenoCa,sim4Breast-AdenoCa
25069933,X,154930673,sim4,T,A,.,.,AF=0.26;MS=SBS5,Breast-AdenoCa,sim4Breast-AdenoCa


In [21]:
## Load CNAs
all_dfs = []

for folder in cancers:
    cancer_cna_files = glob.glob(f"data/input/OncoGAN_synthetic_CNA_SV/{folder}/*cna.tsv")
    cancer_dfs = []
    for file in cancer_cna_files:
        dt = pd.read_csv(file, sep="\t")
        df = pd.DataFrame(dt)
        df["cancer_type"] = folder
        cancer_dfs.append(df)

    cancer_df = pd.concat(cancer_dfs, ignore_index=True)
    all_dfs.append(cancer_df)

# now concat across all cancers
cna_df = pd.concat(all_dfs, ignore_index=True)

out_dir = init_fp + "/data/interim"

out_path = os.path.join(out_dir, "all_cancers_cna_df.csv")
cna_df.to_csv(out_path, index=False)

print(cna_df)


      chrom      start        end  major_cn  minor_cn donor_id  \
0         1          1  249250621         1         1    sim87   
1         2          1  243199373         1         1    sim87   
2         3          1  198022430         1         1    sim87   
3         4          1  191154276         1         1    sim87   
4         5          1  180915260         1         1    sim87   
...     ...        ...        ...       ...       ...      ...   
67714     X  146636776  149293459         2         2    sim51   
67715     X  149293460  152607683         2         0    sim51   
67716     X  152607684  155144332         2         1    sim51   
67717     X  155144333  155151791         4         1    sim51   
67718     X  155151792  155270560         2         2    sim51   

                study      id     cancer_type  
0           Lymph-CLL    cna0       Lymph-CLL  
1           Lymph-CLL    cna1       Lymph-CLL  
2           Lymph-CLL    cna2       Lymph-CLL  
3           Lym

In [22]:
## Load SVs
all_dfs = []

for folder in cancers:
    cancer_sv_files = glob.glob(f"data/input/OncoGAN_synthetic_CNA_SV/{folder}/*sv.tsv")
    cancer_dfs = []
    for file in cancer_sv_files:
        dt = pd.read_csv(file, sep="\t")
        df = pd.DataFrame(dt)
        df["cancer_type"] = folder
        cancer_dfs.append(df)

    cancer_df = pd.concat(cancer_dfs, ignore_index=True)
    all_dfs.append(cancer_df)

# now concat across all cancers
tsv_dict = pd.concat(all_dfs, ignore_index=True)

out_dir = init_fp + "/data/interim"

out_path = os.path.join(out_dir, "all_cancers_tsv_df.csv")
tsv_dict.to_csv(out_path, index=False)

print(tsv_dict)


      chrom1     start1       end1 chrom2     start2       end2 strand1  \
0          2  239593892  239593893      2  243199373  243199374       +   
1          5    9097605    9097606      5   12331150   12331151       -   
2          5   12331151   12331152      5  180915260  180915261       +   
3          6  141477017  141477018      6  171115067  171115068       +   
4          9  140763827  140763828      9  141213431  141213432       -   
...      ...        ...        ...    ...        ...        ...     ...   
90886     22   51174528   51174529     22   51304566   51304567       -   
90887      X    3752614    3752615      X   18814659   18814660       +   
90888      X   42956969   42956970      X   50230425   50230426       -   
90889      X  122122165  122122166      X  155270560  155270561       -   
90890      X  142307314  142307315      X  142388067  142388068       -   

      strand2 svclass      id allele donor_id           tumor     cancer_type  
0           -     D