Query 2 was given as "find cases with proteomic and genomic data". Note that the query was formulated as "data in PDC and GDC", which is not the same thing as a query on data type.

Today we do have examples of "find cases with proteomic and imaging data" that would cross nodes.


In [1]:
from cdapython import Q, columns, unique_terms
# List the distinct File.data_category values, to see which categories could
# be used to pick out proteomic or imaging files.
unique_terms('ResearchSubject.Specimen.File.data_category')

SELECT DISTINCT(_File.data_category) FROM `gdc-bq-sample.cda_mvp.v3`, UNNEST(ResearchSubject) AS _ResearchSubject,UNNEST(_ResearchSubject.Specimen) AS _Specimen,UNNEST(_Specimen.File) AS _File ORDER BY _File.data_category


['Biospecimen',
 'Combined Nucleotide Variation',
 'Copy Number Variation',
 'DNA Methylation',
 'Peptide Spectral Matches',
 'Processed Mass Spectra',
 'Raw Mass Spectra',
 'Sequencing Reads',
 'Simple Nucleotide Variation',
 'Somatic Structural Variation',
 'Structural Variation',
 'Transcriptome Profiling']

None of these data categories covers imaging.


In [2]:
# List the distinct File.data_type values, to check whether data_type (rather
# than data_category) distinguishes image files.
unique_terms('ResearchSubject.Specimen.File.data_type')

SELECT DISTINCT(_File.data_type) FROM `gdc-bq-sample.cda_mvp.v3`, UNNEST(ResearchSubject) AS _ResearchSubject,UNNEST(_ResearchSubject.Specimen) AS _Specimen,UNNEST(_Specimen.File) AS _File ORDER BY _File.data_type


['Aggregated Somatic Mutation',
 'Aligned Reads',
 'Allele-specific Copy Number Segment',
 'Annotated Somatic Mutation',
 'Copy Number Segment',
 'Differential Gene Expression',
 'Gene Expression Quantification',
 'Gene Level Copy Number',
 'Gene Level Copy Number Scores',
 'Isoform Expression Quantification',
 'Masked Annotated Somatic Mutation',
 'Masked Copy Number Segment',
 'Masked Somatic Mutation',
 'Methylation Beta Value',
 'Open Standard',
 'Proprietary',
 'Raw CGI Variant',
 'Raw Simple Somatic Mutation',
 'Single Cell Analysis',
 'Slide Image',
 'Splice Junction Quantification',
 'Structural Rearrangement',
 'Text',
 'Transcript Fusion',
 'miRNA Expression Quantification']

In [3]:
# List the distinct File.file_format values.
unique_terms('ResearchSubject.Specimen.File.file_format')

SELECT DISTINCT(_File.file_format) FROM `gdc-bq-sample.cda_mvp.v3`, UNNEST(ResearchSubject) AS _ResearchSubject,UNNEST(_ResearchSubject.Specimen) AS _Specimen,UNNEST(_Specimen.File) AS _File ORDER BY _File.file_format


[None, 'mzIdentML', 'mzML', 'tsv', 'vendor-specific']

In [4]:
# List the distinct File.type values.
unique_terms('ResearchSubject.Specimen.File.type')

SELECT DISTINCT(_File.type) FROM `gdc-bq-sample.cda_mvp.v3`, UNNEST(ResearchSubject) AS _ResearchSubject,UNNEST(_ResearchSubject.Specimen) AS _Specimen,UNNEST(_Specimen.File) AS _File ORDER BY _File.type


[None]

In [5]:
import cda_client
# Build a QueryApi client against the CDA dev host so that hand-written SQL
# can be submitted directly.
host='https://cda.cda-dev.broadinstitute.org'
api_client = cda_client.ApiClient(configuration=cda_client.Configuration(host=host))
api_instance = cda_client.QueryApi(api_client)

# Count distinct patients that have at least one 'Slide Image' file (f1) and
# at least one file in one of the three mass-spectra data categories (f2).
# The ResearchSubject -> Specimen -> File arrays are unnested twice so that the
# two conditions can be met by different files of the same patient.
datatypeQueryCount =  '''SELECT COUNT(DISTINCT p.id) patientCount FROM 
                    gdc-bq-sample.cda_mvp.v3 p,
                    
                    UNNEST(p.ResearchSubject) AS su1, 
                    UNNEST(su1.Specimen) AS sp1, 
                    UNNEST(sp1.File) AS f1, 
                    

                    UNNEST(p.ResearchSubject) AS su2, 
                    UNNEST(su2.Specimen) AS sp2, 
                    UNNEST(sp2.File) AS f2
                    
                    
                    WHERE (
                    ( f1.data_type in ( 'Slide Image'  ) )
                    AND
                    ( f2.data_category in ('Peptide Spectral Matches',
                                            'Processed Mass Spectra',
                                             'Raw Mass Spectra'                                         )
                    )
                    )'''

# Submit the raw SQL; the first argument 'v3' presumably names the table
# version the service should use (confirm against the cda_client docs).
api_instance.sql_query('v3',datatypeQueryCount)

{'next_url': None,
 'previous_url': None,
 'query_sql': 'SELECT COUNT(DISTINCT p.id) patientCount FROM \n'
              '                    gdc-bq-sample.cda_mvp.v3 p,\n'
              '                    \n'
              '                    UNNEST(p.ResearchSubject) AS su1, \n'
              '                    UNNEST(su1.Specimen) AS sp1, \n'
              '                    UNNEST(sp1.File) AS f1, \n'
              '                    \n'
              '\n'
              '                    UNNEST(p.ResearchSubject) AS su2, \n'
              '                    UNNEST(su2.Specimen) AS sp2, \n'
              '                    UNNEST(sp2.File) AS f2\n'
              '                    \n'
              '                    \n'
              '                    WHERE (\n'
              "                    ( f1.data_type in ( 'Slide Image'  ) )\n"
              '                    AND\n'
              "                    ( f2.data_category in ('Peptide Spectral "
   

The disconcerting thing about this result is that it gives exactly the same number of patients as the query in Query 2.

In [20]:
def queryModV3(api_instance, query, limit=None):
    """Run ``query`` through the CDA API with a ``new_v3`` CTE prepended.

    The CTE selects every column of ``gdc-bq-sample.cda_mvp.v3`` plus
    ``d1.Treatment`` (exposed as ``treatment``). It unnests ResearchSubject,
    its identifiers and its Diagnosis entries, and keeps only the rows whose
    identifier system is 'GDC'. ``query`` is placed after the CTE, so it can
    select from ``new_v3``.

    Args:
        api_instance: object with a ``sql_query`` method (a
            ``cda_client.QueryApi`` in this notebook).
        query: SQL text to append after the CTE.
        limit: forwarded unchanged as the ``limit`` keyword of ``sql_query``.

    Returns:
        Whatever ``api_instance.sql_query`` returns.
    """
    # The request text is assembled line by line so that its exact layout is
    # explicit: a leading newline, a 4-space indent on every line, and one
    # trailing space after the CTE's closing parenthesis.
    sql_lines = (
        "",
        "    with new_v3 as (select p.*,  d1.Treatment as treatment",
        "    FROM gdc-bq-sample.cda_mvp.v3 p,",
        "    unnest(ResearchSubject) as s1,",
        "    UNNEST(s1.identifier) as i1,",
        "    UNNEST(s1.Diagnosis) as d1",
        "    where i1.system = 'GDC') ",
        "    {}".format(query),
        "    ",
    )
    wrapped_sql = "\n".join(sql_lines)
    return api_instance.sql_query('v3', wrapped_sql, limit=limit)



In [29]:
# Count distinct patients in new_v3 (the CTE that queryModV3 prepends) that
# have a 'Radiation Therapy, NOS' treatment and whose research subjects carry
# both a GDC identifier (i1) and a PDC identifier (i2).
tq2 = '''
select count(distinct new_v3.id) patientCount
from new_v3, unnest(new_v3.treatment) as tx,

unnest(ResearchSubject) as s1,
UNNEST(s1.identifier) as i1,

unnest(ResearchSubject) as s2,
UNNEST(s2.identifier) as i2

where tx.type = 'Radiation Therapy, NOS'
and i1.system = 'GDC'
and i2.system = 'PDC'

'''
res2 = queryModV3(api_instance, tq2)
res2

{'next_url': None,
 'previous_url': None,
 'query_sql': '\n'
              '    with new_v3 as (select p.*,  d1.Treatment as treatment\n'
              '    FROM gdc-bq-sample.cda_mvp.v3 p,\n'
              '    unnest(ResearchSubject) as s1,\n'
              '    UNNEST(s1.identifier) as i1,\n'
              '    UNNEST(s1.Diagnosis) as d1\n'
              "    where i1.system = 'GDC') \n"
              '    \n'
              'select count(distinct new_v3.id) patientCount\n'
              'from new_v3, unnest(new_v3.treatment) as tx,\n'
              '\n'
              'unnest(ResearchSubject) as s1,\n'
              'UNNEST(s1.identifier) as i1,\n'
              '\n'
              'unnest(ResearchSubject) as s2,\n'
              'UNNEST(s2.identifier) as i2\n'
              '\n'
              "where tx.type = 'Radiation Therapy, NOS'\n"
              "and i1.system = 'GDC'\n"
              "and i2.system = 'PDC'\n"
              '\n'
              '\n'
              '    ',
 'result'

In [52]:
# Same join and filter as datatypeQueryCount, but returning the patient ids
# (ordered by id) instead of a count, so the result can be compared with the
# ids returned by other queries.
datatypeQueryIds =  '''SELECT DISTINCT p.id FROM 
                    gdc-bq-sample.cda_mvp.v3 p,
                    
                    UNNEST(p.ResearchSubject) AS su1, 
                    UNNEST(su1.Specimen) AS sp1, 
                    UNNEST(sp1.File) AS f1, 
                    

                    UNNEST(p.ResearchSubject) AS su2, 
                    UNNEST(su2.Specimen) AS sp2, 
                    UNNEST(sp2.File) AS f2
                    
                    
                    WHERE (
                    ( f1.data_type in ( 'Slide Image'  ) )
                    AND
                    ( f2.data_category in ('Peptide Spectral Matches',
                                            'Processed Mass Spectra',
                                             'Raw Mass Spectra'                                         )
                    )
                    ) order by id'''

# limit=500 is presumably meant to be large enough to return every id in a
# single response -- confirm against the service's paging behaviour.
res = api_instance.sql_query('v3',datatypeQueryIds, limit=500)

In [53]:
# Number of rows (patient ids) returned in res.
print(len(res.result))

369


In [54]:
# Patient ids for the radiation-therapy + GDC/PDC identifier filter (the same
# WHERE clause as tq2), ordered by id so the list can be compared element by
# element with res.result.
datatypeQuery2Ids = '''
select distinct new_v3.id
from new_v3, unnest(new_v3.treatment) as tx,

unnest(ResearchSubject) as s1,
UNNEST(s1.identifier) as i1,

unnest(ResearchSubject) as s2,
UNNEST(s2.identifier) as i2

where tx.type = 'Radiation Therapy, NOS'
and i1.system = 'GDC'
and i2.system = 'PDC'
order by id
'''
res2 = queryModV3(api_instance, datatypeQuery2Ids, limit=500)
print(len(res2.result))

369


In [55]:
# True only if both queries returned the same ids in the same order.
res2.result == res.result

True

In [51]:
# Display the rows currently held in res.
res.result

[{'id': 'TCGA-AA-3554'},
 {'id': 'TCGA-AA-3552'},
 {'id': 'TCGA-AA-A00K'},
 {'id': 'TCGA-AA-3561'},
 {'id': 'TCGA-AA-A01Z'},
 {'id': 'TCGA-AA-3818'},
 {'id': 'TCGA-AA-3710'},
 {'id': 'TCGA-AA-A00F'},
 {'id': 'TCGA-AA-3518'},
 {'id': 'TCGA-AA-3534'},
 {'id': 'TCGA-AA-A01F'},
 {'id': 'TCGA-AA-A004'},
 {'id': 'TCGA-AA-A01C'},
 {'id': 'TCGA-AA-3864'},
 {'id': 'TCGA-AA-3986'},
 {'id': 'TCGA-AA-A02Y'},
 {'id': 'TCGA-AA-3526'},
 {'id': 'TCGA-AA-3695'},
 {'id': 'TCGA-A6-3807'},
 {'id': 'TCGA-AA-3664'},
 {'id': 'TCGA-AA-A01V'},
 {'id': 'TCGA-AA-A02R'},
 {'id': 'TCGA-AA-A02H'},
 {'id': 'TCGA-AA-A022'},
 {'id': 'TCGA-A6-3810'},
 {'id': 'TCGA-AG-3593'},
 {'id': 'TCGA-AG-A014'},
 {'id': 'TCGA-AG-3580'},
 {'id': 'TCGA-AG-4007'},
 {'id': 'TCGA-AG-A015'},
 {'id': 'TCGA-AG-A011'},
 {'id': 'TCGA-AG-A00Y'},
 {'id': 'TCGA-AG-A00H'},
 {'id': 'TCGA-AG-3574'},
 {'id': 'TCGA-AG-A026'},
 {'id': 'TCGA-AG-A032'},
 {'id': 'TCGA-AG-3584'},
 {'id': 'TCGA-AG-A00C'},
 {'id': 'TCGA-42-2588'},
 {'id': 'TCGA-25-2404'},


In [59]:
# Count distinct patients in new_v3 that carry both a GDC and a PDC
# identifier, with no filter on treatment type.
# NOTE(review): new_v3.treatment is still unnested as tx although nothing
# references tx. In BigQuery a comma join against UNNEST of an empty array
# yields no rows, so patients without any treatment entries would be dropped
# from this count -- confirm whether that is intended.
allGDC_PDC = '''
select count (distinct new_v3.id) patientCount
from new_v3, unnest(new_v3.treatment) as tx,

unnest(ResearchSubject) as s1,
UNNEST(s1.identifier) as i1,

unnest(ResearchSubject) as s2,
UNNEST(s2.identifier) as i2

where i1.system = 'GDC'
and i2.system = 'PDC'
'''
res2 = queryModV3(api_instance, allGDC_PDC)
print(res2)



{'next_url': None,
 'previous_url': None,
 'query_sql': '\n'
              '    with new_v3 as (select p.*,  d1.Treatment as treatment\n'
              '    FROM gdc-bq-sample.cda_mvp.v3 p,\n'
              '    unnest(ResearchSubject) as s1,\n'
              '    UNNEST(s1.identifier) as i1,\n'
              '    UNNEST(s1.Diagnosis) as d1\n'
              "    where i1.system = 'GDC') \n"
              '    \n'
              'select count (distinct new_v3.id) patientCount\n'
              'from new_v3, unnest(new_v3.treatment) as tx,\n'
              '\n'
              'unnest(ResearchSubject) as s1,\n'
              'UNNEST(s1.identifier) as i1,\n'
              '\n'
              'unnest(ResearchSubject) as s2,\n'
              'UNNEST(s2.identifier) as i2\n'
              '\n'
              "where i1.system = 'GDC'\n"
              "and i2.system = 'PDC'\n"
              '\n'
              '    ',
 'result': [{'patientCount': 369}]}


In [61]:
# Count the rows produced by the new_v3 CTE. count(*) counts CTE rows, so the
# patientCount alias is a misnomer here if a patient appears in more than one
# row (the CTE unnests identifier and Diagnosis).
newv3count = '''
select count (*) patientCount
from new_v3
'''
res2 = queryModV3(api_instance, newv3count)
print(res2)

{'next_url': None,
 'previous_url': None,
 'query_sql': '\n'
              '    with new_v3 as (select p.*,  d1.Treatment as treatment\n'
              '    FROM gdc-bq-sample.cda_mvp.v3 p,\n'
              '    unnest(ResearchSubject) as s1,\n'
              '    UNNEST(s1.identifier) as i1,\n'
              '    UNNEST(s1.Diagnosis) as d1\n'
              "    where i1.system = 'GDC') \n"
              '    \n'
              'select count (*) patientCount\n'
              'from new_v3\n'
              '\n'
              '    ',
 'result': [{'patientCount': 84206}]}


In [62]:
# Count the distinct patient ids present in the new_v3 CTE.
newv3count = '''
select count (distinct new_v3.id) patientCount
from new_v3
'''
res2 = queryModV3(api_instance, newv3count)
print(res2)

{'next_url': None,
 'previous_url': None,
 'query_sql': '\n'
              '    with new_v3 as (select p.*,  d1.Treatment as treatment\n'
              '    FROM gdc-bq-sample.cda_mvp.v3 p,\n'
              '    unnest(ResearchSubject) as s1,\n'
              '    UNNEST(s1.identifier) as i1,\n'
              '    UNNEST(s1.Diagnosis) as d1\n'
              "    where i1.system = 'GDC') \n"
              '    \n'
              'select count (distinct new_v3.id) patientCount\n'
              'from new_v3\n'
              '\n'
              '    ',
 'result': [{'patientCount': 84152}]}
