In [None]:
import pandas, seaborn, scipy, numpy, matplotlib, collections, itertools, math, functools, sys, sklearn

In [39]:
ega_filenames = pandas.read_table("../data/ega_contents.aspera.tsv", header=None)[0]

def lookup_filename(substring):
    results = ega_filenames[ega_filenames.str.contains(substring)]
    if len(results) == 0:
        print("NOT_FOUND:%s" % substring)
        return "NOT_FOUND:%s" % substring
    if len(results) > 1:
        print("MULTIPLE_FOUND:%s" % substring)
        return "MULTIPLE_FOUND:%s" % substring
    return list(results)[0]


In [58]:
samples = pandas.read_table("../data/sample.tsv")
specimens = pandas.read_table("../data/specimen.tsv")
ids = pandas.read_csv("../data/ICGC_IDs_19Oct2015_External_modified.csv")

sources = pandas.merge(ids, samples, left_on='DNA_biospecimen', right_on="submitted_sample_id", how='inner')
sources = pandas.merge(sources, specimens, on="icgc_specimen_id", how='inner').copy()


assert all(sources.submitted_donor_id_x == sources.submitted_donor_id_y)
assert all(sources.icgc_donor_id_x == sources.icgc_donor_id_y)

sources["cohort"] = "AOCS"
sources["donor"] = sources.submitted_donor_id_x
sources["tissue_type"] = sources.SpecimenType.map({"Tumour": "solid", "Ascites": "ascites"})
sources["timepoint"] = sources.CollectionPoint.map(
    {"Primary": "primary", "Recurrence": "recurrence", "Autopsy": "recurrence"})

sources["source_id"] = sources.DNA_biospecimen
sources.index = sources.source_id

sources["treated"] = sources.specimen_donor_treatment_type == "other therapy"
sources["metastasis"] = sources.specimen_type_description.str.contains("metastasis")
sources["interval_days"] = sources.specimen_interval

print('looking for dna')
sources["bam_path_tumor_dna"] = [lookup_filename(row.DNA_sample_string) for (i,row) in sources.iterrows()]

print("looking for rna")
sources["bam_path_tumor_rna"] = [lookup_filename(row["RNA data file"]) for (i,row) in sources.iterrows()]

sources.loc["AOCS-170-1-8", "treated"] = False  # weird incorrect data point based on email from Elizabeth Christie


assert sources["source_id"].nunique() == 114
assert len(sources["source_id"]) == 114

simple_sources = sources[
    "source_id,donor,cohort,treated,timepoint,metastasis,tissue_type,interval_days,bam_path_tumor_dna,bam_path_tumor_rna".split(",")
]

sources.to_csv("../data/sources.full.csv", index=False)
simple_sources.to_csv("../data/sources.simple.csv", index=False)


looking for dna
looking for rna
NOT_FOUND:140414_EXTERN_0192_C42UFACXX.nopd.AOCS_057_EXTERNAOCS20140414001
NOT_FOUND:140414_EXTERN_0192_C42UFACXX.nopd.AOCS_117_EXTERNAOCS20140414002
NOT_FOUND:131206_EXTERN_0046_BC2GHAACXX.nopd.AOCS_139_ICGCDBDE20131122037
NOT_FOUND:131206_EXTERN_0046_BC2GHAACXX.nopd.AOCS_139_ICGCDBDE20131122039
NOT_FOUND:131206_EXTERN_0046_BC2GHAACXX.nopd.AOCS_139_ICGCDBDE20131122041
NOT_FOUND:140414_EXTERN_0192_C42UFACXX.nopd.AOCS_141_EXTERNAOCS20140414003
NOT_FOUND:140414_EXTERN_0192_C42UFACXX.nopd.AOCS_150_EXTERNAOCS20140414004
NOT_FOUND:140414_EXTERN_0192_C42UFACXX.nopd.AOCS_157_EXTERNAOCS20140414005
NOT_FOUND:140414_EXTERN_0192_C42UFACXX.nopd.AOCS_158_EXTERNAOCS20140414006
NOT_FOUND:140414_EXTERN_0192_C42UFACXX.nopd.AOCS_160_EXTERNAOCS20140414007
NOT_FOUND:140227_EXTERN_0128_AC2V5YACXX.nopd.AOCS_167_EXTERNAOCS20140227009
NOT_FOUND:140227_EXTERN_0128_AC2V5YACXX.nopd.AOCS_167_EXTERNAOCS20140227011
NOT_FOUND:140414_EXTERN_0192_C42UFACXX.nopd.AOCS_168_EXTERNAOCS201404

In [50]:
simple_sources.tail()

Unnamed: 0_level_0,source_id,donor,cohort,treated,timepoint,metastasis,tissue_type,interval_days,bam_path_tumor_dna,bam_path_tumor_rna
source_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AOCS-169-1-0,AOCS-169-1-0,AOCS-169,AOCS,True,primary,False,solid,108,EGAZ00001018818_2014_ICGC_IcgcOvarian_AOCS169_...,NOT_FOUND:140414_EXTERN_0192_C42UFACXX.nopd.AO...
AOCS-170-1-8,AOCS-170-1-8,AOCS-170,AOCS,False,primary,False,solid,61,EGAZ00001018689_2014_ICGC_IcgcOvarian_AOCS170_...,EGAZ00001018763_2014_ICGC_140227_EXTERN_0129_B...
AOCS-170-3-5,AOCS-170-3-5,AOCS-170,AOCS,False,primary,False,ascites,0,EGAZ00001018606_2014_ICGC_IcgcOvarian_AOCS170_...,EGAZ00001018531_2014_ICGC_140227_EXTERN_0129_B...
AOCS-171-1-0,AOCS-171-1-0,AOCS-171,AOCS,False,primary,False,solid,0,EGAZ00001018682_2014_ICGC_IcgcOvarian_AOCS171_...,EGAZ00001018642_2014_ICGC_140227_EXTERN_0128_A...
AOCS-171-3-8,AOCS-171-3-8,AOCS-171,AOCS,False,primary,False,ascites,-23,EGAZ00001018724_2014_ICGC_IcgcOvarian_AOCS171_...,EGAZ00001018785_2014_ICGC_140227_EXTERN_0128_A...


In [47]:
original_sources = pandas.read_csv("../data/sources.aocs_only.csv")
original_sources
original_sources.treated.value_counts()

False    79
True     37
Name: treated, dtype: int64

In [52]:
original_sources.donor.nunique()
simple_sources.donor.nunique()

92

In [54]:
original_sources.shape, simple_sources.shape

((116, 10), (114, 10))

In [55]:
simple_sources.treated.value_counts(), original_sources.treated.value_counts()

(False    79
 True     35
 Name: treated, dtype: int64, False    79
 True     37
 Name: treated, dtype: int64)

In [56]:
simple_sources.metastasis.value_counts(), original_sources.metastasis.value_counts()

(False    109
 True       5
 Name: metastasis, dtype: int64, False    113
 True       3
 Name: metastasis, dtype: int64)