Run a query which returns two columns

In [2]:
import cda_client
# Base URL of the CDA deployment that the cda_client objects below are configured with.
host='https://cda.cda-dev.broadinstitute.org'
api_client = cda_client.ApiClient(configuration=cda_client.Configuration(host=host))
api_instance = cda_client.QueryApi(api_client)

# The select list names two columns that are both called `id` (p.id and s1.id).
# This name collision is the case the rest of the notebook investigates.
# The query text is reused by later cells, so it is kept in a module-level name.
two_id_query = '''select p.id ,  s1.id
    FROM gdc-bq-sample.cda_mvp.v3 p,
    unnest(ResearchSubject) as s1,
    UNNEST(s1.identifier) as i1,
    UNNEST(s1.Diagnosis) as d1
    where i1.system ='GDC' 
    order by p.id '''
# 'v3' is passed as the first argument (presumably the dataset version -- confirm
# against the cda_client documentation). Kept as the final expression so the
# notebook displays the response object.
api_instance.sql_query('v3',two_id_query,
                      limit=10)

{'next_url': None,
 'previous_url': None,
 'query_sql': 'select p.id ,  s1.id\n'
              '    FROM gdc-bq-sample.cda_mvp.v3 p,\n'
              '    unnest(ResearchSubject) as s1,\n'
              '    UNNEST(s1.identifier) as i1,\n'
              '    UNNEST(s1.Diagnosis) as d1\n'
              "    where i1.system ='GDC' \n"
              '    order by p.id ',
 'result': [{'id': 'be37f1f7-2f98-4f74-bc04-6dd2ae2afcad'},
            {'id': 'e6915db0-7c89-484d-8f9f-15cca68b82fc'},
            {'id': '16614d46-172b-479c-992b-e80a8e9a2c59'},
            {'id': '567fc9e3-17a6-42b1-a896-5e9a9507d1d8'},
            {'id': '54e89878-a1bc-4f5a-9d68-4842a469586e'},
            {'id': 'a1c7b7b9-b8c8-48c3-9420-55497f9318fd'},
            {'id': 'ce3c8b98-e275-4cfd-a379-940d675a564b'},
            {'id': 'e4ce89ef-bcaa-418a-8a6b-3602793b9bbf'},
            {'id': '19d3c861-8a5f-49a2-acc0-b55b25465c35'},
            {'id': 'afae8dce-294a-4108-bb28-376f804ae5c4'}]}

Try the API directly. This demonstrates that the issue occurs within the API, not the Python client.

In [3]:
import requests
def runAPIQuery(querystring, limit=None):
    """POST a SQL query to the CDA ``sql-query/v3`` REST endpoint.

    Args:
        querystring: SQL text, sent as the plain-text request body.
        limit: Optional value for the ``limit`` query parameter. When ``None``
            the parameter is left off the URL.

    Returns:
        The response body as decoded by ``response.json()``. Note that objects
        in the body with repeated keys are collapsed to a single key by the
        decoder, which is the effect this notebook demonstrates.

    Raises:
        Exception: If the HTTP status code is not 200. The message carries the
            status code and the query that was sent.
    """
    cdaURL = 'https://cda.cda-dev.broadinstitute.org/api/v1/sql-query/v3'
    # Using a limit:
    if limit is not None:
        cdaURL = "{}?limit={}".format(cdaURL, str(limit))

    headers = {'accept' : 'application/json', 'Content-Type' : 'text/plain'}

    response = requests.post(cdaURL, headers=headers, data=querystring)

    if response.status_code == 200:
        return response.json()

    # Report the query that was actually sent. The previous code formatted the
    # message with an undefined name, so a failed request surfaced as a
    # NameError instead of this error.
    raise Exception("Query failed code {}. {}".format(response.status_code, querystring))
        
# Send the same two-`id` query through the plain HTTP helper instead of cda_client;
# left as the final expression so the notebook displays the decoded response.
runAPIQuery(two_id_query, limit=10)

{'query_sql': "select p.id ,  s1.id\n    FROM gdc-bq-sample.cda_mvp.v3 p,\n    unnest(ResearchSubject) as s1,\n    UNNEST(s1.identifier) as i1,\n    UNNEST(s1.Diagnosis) as d1\n    where i1.system ='GDC' \n    order by p.id ",
 'result': [{'id': 'be37f1f7-2f98-4f74-bc04-6dd2ae2afcad'},
  {'id': 'e6915db0-7c89-484d-8f9f-15cca68b82fc'},
  {'id': '16614d46-172b-479c-992b-e80a8e9a2c59'},
  {'id': '567fc9e3-17a6-42b1-a896-5e9a9507d1d8'},
  {'id': '54e89878-a1bc-4f5a-9d68-4842a469586e'},
  {'id': 'a1c7b7b9-b8c8-48c3-9420-55497f9318fd'},
  {'id': 'ce3c8b98-e275-4cfd-a379-940d675a564b'},
  {'id': 'e4ce89ef-bcaa-418a-8a6b-3602793b9bbf'},
  {'id': '19d3c861-8a5f-49a2-acc0-b55b25465c35'},
  {'id': 'afae8dce-294a-4108-bb28-376f804ae5c4'}],
 'previous_url': None,
 'next_url': None}

The SQL is legal, as demonstrated by the fact that it runs. In this case, what BigQuery actually returns can't be seen. However, the following comparable BigQuery query also has two 'id' columns, and BigQuery returns these as 'id' and 'id_1'.

` SELECT f.id, r.*, sa.id, f.object.aliquot_value.*
FROM isbcgc-216220.1000Genomes.BDC_1000Genomes f, unnest (relations) as r
join isbcgc-216220.1000Genomes.BDC_1000Genomes sa on r.dst_id = sa.id
where f.name = 'aliquot' `


In [4]:
# Six columns are selected but only three distinct output names result
# (id, system, value): p.id/su.id, su_id.system/sp_id.system and
# su_id.value/sp_id.value share a name once the table aliases are dropped.
test_query = '''select p.id ,  su.id, su_id.system, su_id.value, sp_id.system, sp_id.value
    FROM gdc-bq-sample.cda_mvp.v3 p,
    unnest(ResearchSubject) as su,
    UNNEST(su.Identifier) as su_id,
    unnest(Specimen) as sp,
    UNNEST(sp.Identifier) as sp_id
    order by p.id '''
# Goes through the runAPIQuery HTTP helper rather than cda_client; left as the
# final expression so the notebook displays the decoded response.
runAPIQuery(test_query,
                      limit=10)

{'query_sql': 'select p.id ,  su.id, su_id.system, su_id.value, sp_id.system, sp_id.value\n    FROM gdc-bq-sample.cda_mvp.v3 p,\n    unnest(ResearchSubject) as su,\n    UNNEST(su.Identifier) as su_id,\n    unnest(Specimen) as sp,\n    UNNEST(sp.Identifier) as sp_id\n    order by p.id ',
 'result': [{'id': 'be37f1f7-2f98-4f74-bc04-6dd2ae2afcad',
   'system': 'GDC',
   'value': '786c67dd-7425-5804-97ce-7b64e96569c1'},
  {'id': 'be37f1f7-2f98-4f74-bc04-6dd2ae2afcad',
   'system': 'GDC',
   'value': 'dcc36b6e-7743-40b4-98eb-06580a694e0e'},
  {'id': 'be37f1f7-2f98-4f74-bc04-6dd2ae2afcad',
   'system': 'GDC',
   'value': '721645ba-916b-5481-bf76-63891cb79836'},
  {'id': 'be37f1f7-2f98-4f74-bc04-6dd2ae2afcad',
   'system': 'GDC',
   'value': '49602e48-0ccf-5f3c-9529-255fc3f3368f'},
  {'id': 'be37f1f7-2f98-4f74-bc04-6dd2ae2afcad',
   'system': 'GDC',
   'value': '3ae5c716-7061-4236-8a90-bddc4e6e4834'},
  {'id': 'be37f1f7-2f98-4f74-bc04-6dd2ae2afcad',
   'system': 'GDC',
   'value': 'e0c6965b-f

What happens if we look at the raw content of the API response?

In [13]:
# Same endpoint and headers that runAPIQuery uses, with limit=10 written
# directly into the URL.
cdaURL = 'https://cda.cda-dev.broadinstitute.org/api/v1/sql-query/v3?limit=10'

headers = {'accept' : 'application/json', 'Content-Type' : 'text/plain'}

# Keep the response object in `request`: the next cell calls request.json() on it.
request = requests.post(cdaURL, headers = headers, data = two_id_query)
# Print the response body as raw bytes, before any JSON decoding is applied.
print (request.content)

b'{"query_sql":"select p.id ,  s1.id\\n    FROM gdc-bq-sample.cda_mvp.v3 p,\\n    unnest(ResearchSubject) as s1,\\n    UNNEST(s1.identifier) as i1,\\n    UNNEST(s1.Diagnosis) as d1\\n    where i1.system =\'GDC\' \\n    order by p.id ","result":[{\n  "id": "01BR001",\n  "id": "be37f1f7-2f98-4f74-bc04-6dd2ae2afcad"\n}, {\n  "id": "01BR008",\n  "id": "e6915db0-7c89-484d-8f9f-15cca68b82fc"\n}, {\n  "id": "01BR009",\n  "id": "16614d46-172b-479c-992b-e80a8e9a2c59"\n}, {\n  "id": "01BR010",\n  "id": "567fc9e3-17a6-42b1-a896-5e9a9507d1d8"\n}, {\n  "id": "01BR015",\n  "id": "54e89878-a1bc-4f5a-9d68-4842a469586e"\n}, {\n  "id": "01BR017",\n  "id": "a1c7b7b9-b8c8-48c3-9420-55497f9318fd"\n}, {\n  "id": "01BR018",\n  "id": "ce3c8b98-e275-4cfd-a379-940d675a564b"\n}, {\n  "id": "01BR020",\n  "id": "e4ce89ef-bcaa-418a-8a6b-3602793b9bbf"\n}, {\n  "id": "01BR023",\n  "id": "19d3c861-8a5f-49a2-acc0-b55b25465c35"\n}, {\n  "id": "01BR025",\n  "id": "afae8dce-294a-4108-bb28-376f804ae5c4"\n}],"previous_url":

The multiple ids are present in the raw content. As shown below, it is request.json() that eliminates them.

In [14]:
# Decode the body of the response fetched in the previous cell, for comparison
# with the raw bytes printed there.
request.json()

{'query_sql': "select p.id ,  s1.id\n    FROM gdc-bq-sample.cda_mvp.v3 p,\n    unnest(ResearchSubject) as s1,\n    UNNEST(s1.identifier) as i1,\n    UNNEST(s1.Diagnosis) as d1\n    where i1.system ='GDC' \n    order by p.id ",
 'result': [{'id': 'be37f1f7-2f98-4f74-bc04-6dd2ae2afcad'},
  {'id': 'e6915db0-7c89-484d-8f9f-15cca68b82fc'},
  {'id': '16614d46-172b-479c-992b-e80a8e9a2c59'},
  {'id': '567fc9e3-17a6-42b1-a896-5e9a9507d1d8'},
  {'id': '54e89878-a1bc-4f5a-9d68-4842a469586e'},
  {'id': 'a1c7b7b9-b8c8-48c3-9420-55497f9318fd'},
  {'id': 'ce3c8b98-e275-4cfd-a379-940d675a564b'},
  {'id': 'e4ce89ef-bcaa-418a-8a6b-3602793b9bbf'},
  {'id': '19d3c861-8a5f-49a2-acc0-b55b25465c35'},
  {'id': 'afae8dce-294a-4108-bb28-376f804ae5c4'}],
 'previous_url': None,
 'next_url': None}

That is a problem in the Python requests module. The content is not legal JSON, so it would be better for the requests module to raise an error rather than pass on an incomplete version of the content.

In [None]:
The CDA REST API issue is that it should avoid passing back illegal j