In [2]:
import glob
import os
import pandas as pd

In [73]:
path = "/Users/lauren_myers/code/OncoGAN/data/OncoGAN_synthetic_VCFs/"
os.chdir(path)

cancers = [
    'Lymph-CLL',
    'Prost-AdenoCA',
    'Panc-Endocrine', 
    'Liver-HCC',
    'Kidney-RCC',
    'CNS-PiloAstro',
    'Eso-AdenoCa',
    'Breast-AdenoCa'
]

cols = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]

all_dfs = []   # collect all cancer dataframes here

for folder in cancers:
    print(folder)
    fp = path + folder
    os.chdir(fp)
    cancer_vcf_files = glob.glob("*.vcf")

    cancer_dfs = []
    for file in cancer_vcf_files:
        cfp = fp + '/' + file
        dt = pd.read_csv(cfp, sep="\t", comment="#", header=None, names=cols)
        df = pd.DataFrame(dt)
        df["cancer_type"] = folder
        cancer_dfs.append(df)

    # concat all files for this cancer
    cancer_df = pd.concat(cancer_dfs, ignore_index=True)
    all_dfs.append(cancer_df)

    os.chdir("..")

# now concat across all cancers
vcf_df = pd.concat(all_dfs, ignore_index=True)

out_dir = "/Users/lauren_myers/code/Pipeline/data/inputs"
os.makedirs(out_dir, exist_ok=True) 

out_path = os.path.join(out_dir, "all_cancers_vcf_df.csv")
vcf_df.to_csv(out_path, index=False)

print("Saved to:", out_path)


Lymph-CLL
Prost-AdenoCA
Panc-Endocrine
Liver-HCC
Kidney-RCC
CNS-PiloAstro
Eso-AdenoCa


  dt = pd.read_csv(cfp, sep="\t", comment="#", header=None, names=cols)
  dt = pd.read_csv(cfp, sep="\t", comment="#", header=None, names=cols)


Breast-AdenoCa
Saved to: /Users/lauren_myers/code/Pipeline/data/inputs/all_cancers_vcf_df.csv


In [78]:
## Load CNAs
path = "/Users/lauren_myers/code/OncoGAN/data/OncoGAN_synthetic_CNA_SV/"
os.chdir(path)

all_dfs = []

for folder in cancers:
    fp = path + folder
    os.chdir(fp)
    
    cancer_cna_files = glob.glob("*cna.tsv")

    cancer_dfs = []
    for file in cancer_cna_files:
        cfp = fp + '/' + file
        dt = pd.read_csv(cfp, sep="\t")
        df = pd.DataFrame(dt)
        df["cancer_type"] = folder
        cancer_dfs.append(df)

    cancer_df = pd.concat(cancer_dfs, ignore_index=True)
    all_dfs.append(cancer_df)

    os.chdir("..")

# now concat across all cancers
cna_df = pd.concat(all_dfs, ignore_index=True)

out_path = os.path.join(out_dir, "all_cancers_cna_df.csv")
cna_df.to_csv(out_path, index=False)

print(cna_df)


      chrom      start        end  major_cn  minor_cn donor_id  \
0         1          1  249250621         1         1    sim87   
1         2          1  243199373         1         1    sim87   
2         3          1  198022430         1         1    sim87   
3         4          1  191154276         1         1    sim87   
4         5          1  180915260         1         1    sim87   
...     ...        ...        ...       ...       ...      ...   
67714     X  146636776  149293459         2         2    sim51   
67715     X  149293460  152607683         2         0    sim51   
67716     X  152607684  155144332         2         1    sim51   
67717     X  155144333  155151791         4         1    sim51   
67718     X  155151792  155270560         2         2    sim51   

                study      id     cancer_type  
0           Lymph-CLL    cna0       Lymph-CLL  
1           Lymph-CLL    cna1       Lymph-CLL  
2           Lymph-CLL    cna2       Lymph-CLL  
3           Lym

In [77]:
## Load TSVs
path = "/Users/lauren_myers/code/OncoGAN/data/OncoGAN_synthetic_CNA_SV/"
os.chdir(path)

all_dfs = []

for folder in cancers:
    fp = path + folder
    os.chdir(fp)
    cancer_sv_files = glob.glob("*sv.tsv")

    cancer_dfs = []
    for file in cancer_sv_files:
        cfp = fp + '/' + file
        dt = pd.read_csv(cfp, sep="\t")
        df = pd.DataFrame(dt)
        df["cancer_type"] = folder
        cancer_dfs.append(df)

    cancer_df = pd.concat(cancer_dfs, ignore_index=True)
    all_dfs.append(cancer_df)

    os.chdir("..")

# now concat across all cancers
tsv_dict = pd.concat(all_dfs, ignore_index=True)

out_path = os.path.join(out_dir, "all_cancers_tsv_df.csv")
tsv_dict.to_csv(out_path, index=False)

print(tsv_dict)


      chrom1     start1       end1 chrom2     start2       end2 strand1  \
0          2  239593892  239593893      2  243199373  243199374       +   
1          5    9097605    9097606      5   12331150   12331151       -   
2          5   12331151   12331152      5  180915260  180915261       +   
3          6  141477017  141477018      6  171115067  171115068       +   
4          9  140763827  140763828      9  141213431  141213432       -   
...      ...        ...        ...    ...        ...        ...     ...   
90886     22   51174528   51174529     22   51304566   51304567       -   
90887      X    3752614    3752615      X   18814659   18814660       +   
90888      X   42956969   42956970      X   50230425   50230426       -   
90889      X  122122165  122122166      X  155270560  155270561       -   
90890      X  142307314  142307315      X  142388067  142388068       -   

      strand2 svclass      id allele donor_id           tumor     cancer_type  
0           -     D