In [1]:
import pandas, seaborn, scipy, numpy, matplotlib, collections, itertools, math, functools, sys, sklearn, os



In [2]:
# File names listed in the first column of the EGA manifest, with every
# ".aes" occurrence removed (presumably the encryption suffix - TODO confirm).
ega_filenames = pandas.Series(
    [
        listed_name.replace(".aes", "")
        for listed_name in pandas.read_table(
            "../data/ega_contents.aspera.tsv", header=None
        )[0]
    ]
)

# First column of the headerless listing of downloaded file paths.
downloaded_file_paths = pandas.read_table(
    "../data/downloaded_file_paths.txt", header=None
)[0]

# EGA file names for which no downloaded path could be found; filled in by
# lookup_filename() as a side effect.
ISSUE_EGA_FILENAMES = set()
def lookup_filename(substring):
    """
    Resolve an identifier to the single downloaded file path containing it.

    Reads the module-level series ``downloaded_file_paths`` and
    ``ega_filenames``. ``substring`` is matched literally (not as a regular
    expression), so characters such as "." in file names only match
    themselves.

    When several downloaded paths match, the candidates are narrowed to those
    that also contain one of the EGA file names matching ``substring``. If no
    EGA file name matches, the candidates are left as they are.

    Parameters
    ----------
    substring : str
        Text expected to occur in exactly one downloaded path. A missing
        value (None / NaN) is reported as not found.

    Returns
    -------
    str
        The matching path, or the sentinel ``"NOT_FOUND:<substring>"`` /
        ``"MULTIPLE_FOUND:<substring>"`` when there is not exactly one match.

    Side effects
    ------------
    Prints a diagnostic line for both sentinel cases. On NOT_FOUND the
    matching EGA file names are added to ``ISSUE_EGA_FILENAMES``.
    """
    if pandas.isnull(substring):
        # A missing identifier cannot match anything; str.contains would
        # raise on it, so treat it as an ordinary "not found".
        results = []
        ega_results = set()
    else:
        path_hits = downloaded_file_paths.str.contains(substring, regex=False, na=False)
        ega_hits = ega_filenames.str.contains(substring, regex=False, na=False)
        results = list(downloaded_file_paths[path_hits])
        ega_results = set(ega_filenames[ega_hits])

    if len(results) > 1 and ega_results:
        # Narrow by *any* matching EGA name: picking one arbitrary element of
        # the set would make the outcome depend on set iteration order.
        results = [
            path for path in results
            if any(ega_name in path for ega_name in ega_results)
        ]

    if len(results) == 0:
        print("NOT_FOUND: %s, EGA results: %s" % (substring, ega_results))
        ISSUE_EGA_FILENAMES.update(ega_results)
        return "NOT_FOUND:%s" % substring
    if len(results) > 1:
        print("MULTIPLE_FOUND: %s %s, EGA results: %s" % (substring, sorted(results), ega_results))
        return "MULTIPLE_FOUND:%s" % substring
    return results[0]


In [3]:
# Input tables: sample and specimen TSVs plus the identifier spreadsheet.
samples = pandas.read_table("../data/sample.tsv")
specimens = pandas.read_table("../data/specimen.tsv")
ids = pandas.read_csv("../data/ICGC_IDs_19Oct2015_External_modified.csv")

# Join ids -> samples on the DNA biospecimen id, then -> specimens on the
# ICGC specimen id. Inner joins drop rows without a partner.
sources = ids.merge(
    samples,
    left_on="DNA_biospecimen",
    right_on="submitted_sample_id",
    how="inner",
)
sources = sources.merge(specimens, on="icgc_specimen_id", how="inner").copy()

# Both sides of the join must agree on the donor identifiers.
assert (sources["submitted_donor_id_x"] == sources["submitted_donor_id_y"]).all()
assert (sources["icgc_donor_id_x"] == sources["icgc_donor_id_y"]).all()

# Derived columns (creation order determines column order in the full CSV).
sources["cohort"] = "AOCS"
sources["donor"] = sources["submitted_donor_id_x"]
sources["tissue_type"] = sources["SpecimenType"].map(
    {"Tumour": "solid", "Ascites": "ascites"}
)
sources["timepoint"] = sources["CollectionPoint"].map(
    {
        "Primary": "primary",
        "Recurrence": "recurrence",
        "Autopsy": "recurrence",  # autopsy samples are grouped with recurrences
    }
)

# Each row is keyed by its DNA biospecimen id, kept both as column and index.
sources["source_id"] = sources["DNA_biospecimen"]
sources.index = sources["source_id"]

sources["treated"] = sources["specimen_donor_treatment_type"].eq("other therapy")
sources["metastasis"] = sources["specimen_type_description"].str.contains("metastasis")
sources["interval_days"] = sources["specimen_interval"]

# Resolve the BAM path for every row; lookup_filename prints a line for each
# identifier it cannot resolve to exactly one path.
print('looking for dna')
sources["bam_path_tumor_dna"] = [
    lookup_filename(dna_name) for dna_name in sources["DNA_sample_string"]
]

print("looking for rna")
sources["bam_path_tumor_rna"] = [
    lookup_filename(rna_name) for rna_name in sources["RNA data file"]
]

# weird incorrect data point based on email from Elizabeth Christie
sources.loc["AOCS-170-1-8", "treated"] = False

# Exactly 114 rows are expected, each with a distinct source id.
assert sources["source_id"].nunique() == 114
assert len(sources["source_id"]) == 114

# Reduced view holding only the columns written to the "simple" CSV.
simple_sources = sources[
    [
        "source_id",
        "donor",
        "cohort",
        "treated",
        "timepoint",
        "metastasis",
        "tissue_type",
        "interval_days",
        "bam_path_tumor_dna",
        "bam_path_tumor_rna",
    ]
]

sources.to_csv("../data/sources.full.csv", index=False)
simple_sources.to_csv("../data/sources.simple.csv", index=False)


looking for dna
looking for rna
