## Using SQL to explore CDA
First, a couple of cdapython queries to check some of its less obvious aspects.

In [7]:
from cdapython import Q, columns, unique_terms
# List the distinct tumor_stage values; no system filter is applied here.
unique_terms("ResearchSubject.Diagnosis.tumor_stage")

SELECT DISTINCT(_Diagnosis.tumor_stage) FROM `gdc-bq-sample.cda_mvp.v3`, UNNEST(ResearchSubject) AS _ResearchSubject,UNNEST(_ResearchSubject.Diagnosis) AS _Diagnosis ORDER BY _Diagnosis.tumor_stage


[None,
 '',
 '1B',
 '2A',
 '2B',
 '3',
 '4',
 'Adverse',
 'Favorable',
 'FavorableOrIntermediate',
 'I',
 'I b',
 'IA',
 'IB',
 'IC',
 'II',
 'II b',
 'IIA',
 'IIB',
 'III',
 'IIIA',
 'IIIB',
 'IIIC',
 'IPI:0',
 'IPI:1',
 'IPI:12',
 'IPI:13',
 'IPI:14',
 'IPI:15',
 'IPI:2',
 'IPI:23',
 'IPI:24',
 'IPI:25',
 'IPI:3',
 'IPI:34',
 'IPI:35',
 'IPI:4',
 'IPI:45',
 'IPI:5',
 'IV',
 'IVA',
 'IVB',
 'IVa',
 'IVb',
 'Intermediate',
 'IntermediateOrAdverse',
 'N/A',
 'Normal',
 'Not Performed',
 'Not Reported',
 'Not Reported/ Unknown',
 'Not Reported/Unknown',
 'PT4apN0',
 'Stage 1B',
 'Stage I',
 'Stage IA',
 'Stage IA3',
 'Stage IB',
 'Stage IC',
 'Stage II',
 'Stage IIA',
 'Stage IIB',
 'Stage III',
 'Stage IIIA',
 'Stage IIIB',
 'Stage IIIC',
 'Stage IV',
 'Stage IVA',
 'Stage IVB',
 'Stage1',
 'T1N0Mx',
 'T1aN0M0',
 'T2',
 'TxNxM1',
 'Unknown',
 'i',
 'i/ii nos',
 'ii',
 'ii/v',
 'iii',
 'iii/v',
 'iiib',
 'iiib/v',
 'is',
 'iv',
 'iv/v',
 'na',
 'no resection',
 'not reported',
 'pT1',
 '

I asked whether we can get unique terms for a project. Well, it turns out we can do it for a 'system'.

In [12]:
# Same lookup, restricted via the `system` keyword to records identified by the PDC system.
unique_terms("ResearchSubject.Diagnosis.tumor_stage", system='PDC')

SELECT DISTINCT(_Diagnosis.tumor_stage) FROM `gdc-bq-sample.cda_mvp.v3`, UNNEST(ResearchSubject) AS _ResearchSubject,UNNEST(_ResearchSubject.Diagnosis) AS _Diagnosis,UNNEST(_ResearchSubject.identifier) AS _identifier WHERE _identifier.system="PDC" ORDER BY _Diagnosis.tumor_stage


['',
 'IA',
 'IB',
 'IIA',
 'IIB',
 'IIIA',
 'IIIB',
 'IV',
 'IVB',
 'N/A',
 'Not Performed',
 'Not Reported',
 'Not Reported/ Unknown',
 'Not Reported/Unknown',
 'Stage 1B',
 'Stage I',
 'Stage IA',
 'Stage IA3',
 'Stage IB',
 'Stage IC',
 'Stage II',
 'Stage IIA',
 'Stage IIB',
 'Stage III',
 'Stage IIIA',
 'Stage IIIB',
 'Stage IIIC',
 'Stage IV',
 'Stage IVA',
 'Stage1',
 'Unknown',
 'no resection',
 'pT1',
 'pT1a',
 'pT1b',
 'pT2 N0',
 'pT2, pN2, pM not applicable',
 'pT2N1',
 'pT2N2',
 'pT3 N2 M(Not Applicable)',
 'pT3a',
 'pT4',
 'stage iiic']

But to explore at a project level it looks like we must resort to SQL.

### Passing Direct SQL Queries
It would be useful to pass some direct SQL queries. Using the Swagger UI is possible, but not really scriptable. If I want to call the REST API directly from Python I'll probably end up writing a client. How does cdapython call the REST API? It turns out a pre-existing client already exists, in the cda_client module, so there is no need to write our own. Even better, it's already installed as a cdapython dependency.

First some set up.

In [1]:
import cda_client
# Base URL of the CDA REST service used by every query below.
host = 'https://cda.cda-dev.broadinstitute.org'

# Build the generated client against that host, then the query endpoint wrapper on top of it.
api_client = cda_client.ApiClient(
    configuration=cda_client.Configuration(host=host),
)
api_instance = cda_client.QueryApi(api_client)

Now we can run some SQL.

One oddity is that version is a mandatory parameter to sql_query, but the version is actually specified as part of the table name. Currently you can pass anything to version. That may not always be the case, so we should respect the intent of the interface and use a proper version number (the following cell is the one exception, purely to illustrate the point).

In [4]:
sql = '''SELECT DISTINCT(_Diagnosis.tumor_stage) 
FROM `gdc-bq-sample.cda_mvp.v2`, 
UNNEST(ResearchSubject) AS _ResearchSubject,
UNNEST(_ResearchSubject.Diagnosis) AS _Diagnosis,
UNNEST(_ResearchSubject.identifier) AS _identifier 
WHERE _identifier.system="PDC" ORDER BY _Diagnosis.tumor_stage'''
# The first argument is the mandatory version parameter. A placeholder string is
# passed here on purpose, to show that the service currently accepts any value.
api_response = api_instance.sql_query('anything_goes_in',sql)
print(api_response)

{'next_url': None,
 'previous_url': None,
 'query_sql': 'SELECT DISTINCT(_Diagnosis.tumor_stage) \n'
              'FROM `gdc-bq-sample.cda_mvp.v2`, \n'
              'UNNEST(ResearchSubject) AS _ResearchSubject,\n'
              'UNNEST(_ResearchSubject.Diagnosis) AS _Diagnosis,\n'
              'UNNEST(_ResearchSubject.identifier) AS _identifier \n'
              'WHERE _identifier.system="PDC" ORDER BY _Diagnosis.tumor_stage',
 'result': [{'tumor_stage': ''},
            {'tumor_stage': 'IA'},
            {'tumor_stage': 'IB'},
            {'tumor_stage': 'IIA'},
            {'tumor_stage': 'IIB'},
            {'tumor_stage': 'IIIA'},
            {'tumor_stage': 'IIIB'},
            {'tumor_stage': 'IV'},
            {'tumor_stage': 'IVB'},
            {'tumor_stage': 'N/A'},
            {'tumor_stage': 'Not Performed'},
            {'tumor_stage': 'Not Reported'},
            {'tumor_stage': 'Not Reported/ Unknown'},
            {'tumor_stage': 'Not Reported/Unknown'},
            

Now we can explore the use of tumor_stage across projects.

In [5]:
sql = '''SELECT _Diagnosis.tumor_stage, _ResearchSubject.associated_project , count(*) row_count
FROM `gdc-bq-sample.cda_mvp.v3`, 
UNNEST(ResearchSubject) AS _ResearchSubject,
UNNEST(_ResearchSubject.Diagnosis) AS _Diagnosis 
WHERE _Diagnosis.tumor_stage in ('IIIC', 'Stage IIIC', 'stage iiic')
group BY _ResearchSubject.associated_project, _Diagnosis.tumor_stage
order BY _ResearchSubject.associated_project, _Diagnosis.tumor_stage '''

# Run the grouped query; the 'v3' version argument matches the v3 table named in the SQL.
api_response = api_instance.sql_query('v3',sql)
print(api_response)



{'next_url': None,
 'previous_url': None,
 'query_sql': 'SELECT _Diagnosis.tumor_stage, '
              '_ResearchSubject.associated_project , count(*) row_count\n'
              'FROM `gdc-bq-sample.cda_mvp.v3`, \n'
              'UNNEST(ResearchSubject) AS _ResearchSubject,\n'
              'UNNEST(_ResearchSubject.Diagnosis) AS _Diagnosis \n'
              "WHERE _Diagnosis.tumor_stage in ('IIIC', 'Stage IIIC', 'stage "
              "iiic')\n"
              'group BY _ResearchSubject.associated_project, '
              '_Diagnosis.tumor_stage\n'
              'order BY _ResearchSubject.associated_project, '
              '_Diagnosis.tumor_stage ',
 'result': [{'associated_project': 'CPTAC-2',
             'row_count': 66,
             'tumor_stage': 'IIIC'},
            {'associated_project': 'CPTAC-2',
             'row_count': 92,
             'tumor_stage': 'Stage IIIC'},
            {'associated_project': 'CPTAC-TCGA',
             'row_count': 13,
             'tumor_stage': '

It's hard to see what's going on when looking at JSON. A DataFrame is more compact and makes a visual scan easier. Fortunately, the list of dicts returned by the client is tailor-made for a DataFrame.

In [6]:
import pandas as pd
# The result is a list of dicts, one per row, so it maps directly onto a DataFrame.
pd.DataFrame(api_response.result)

Unnamed: 0,tumor_stage,associated_project,row_count
0,IIIC,CPTAC-2,66
1,Stage IIIC,CPTAC-2,92
2,Stage IIIC,CPTAC-TCGA,13
3,stage iiic,CPTAC-TCGA,1
4,stage iiic,TCGA-BRCA,65
5,stage iiic,TCGA-COAD,41
6,stage iiic,TCGA-ESCA,7
7,stage iiic,TCGA-LIHC,9
8,stage iiic,TCGA-READ,14
9,stage iiic,TCGA-SKCM,68


### Exploring the Subject 09CO022 example via SQL
Start with the same SQL that was generated by cdapython.

In [7]:
import json
query = '''SELECT * FROM gdc-bq-sample.cda_mvp.v3, 
UNNEST(ResearchSubject) AS _ResearchSubject 
WHERE (_ResearchSubject.id = 'c5421e34-e5c7-4ba5-aed9-146a5575fd8d')'''
# Dump only the result rows as indented JSON so the nested structure is readable.
api_response = api_instance.sql_query('v3',query)
print(json.dumps(api_response.result,indent=3))

[
   {
      "days_to_birth": null,
      "race": "black or african american",
      "sex": "female",
      "ethnicity": "not hispanic or latino",
      "id": "c5421e34-e5c7-4ba5-aed9-146a5575fd8d",
      "ResearchSubject": [
         {
            "Diagnosis": [
               {
                  "morphology": "8140/3",
                  "tumor_stage": "Stage IIB",
                  "tumor_grade": "Not Reported",
                  "Treatment": [],
                  "id": "7b8d36ba-ab84-48ad-ac2c-11ac40d3d0eb",
                  "primary_diagnosis": "Adenocarcinoma, NOS",
                  "age_at_diagnosis": null
               }
            ],
            "Specimen": [
               {
                  "File": [
                     {
                        "label": "fbc0c313-d356-4ad9-8257-57e90fb7f26b.wxs.Pindel.somatic_annotation.vcf.gz",
                        "associated_project": [
                           "CPTAC-2"
                        ],
                        "drs_u

Two things are of note from that. One is strategic; the other is more fine-grained, but has strategic implications.

First, it brings home that the duplication referred to in the 09CO022 example is baked into the cda_mvp.v3 table, and is a product of the ETL. The key to data aggregation is how that mapping is done - specifically, how the mappings are created, managed and used. There is also the question of whether one mapping works for all. A follow-on from that is the choice between ETL to a static store and on-the-fly mapping as needed.

Second, careful inspection of the result above shows that, for this subject at least, the Stage was used consistently across both PDC and GDC. Historical knowledge suggests that the subject record in CPTAC-2 (the data in PDC) was copied across from TCGA (the data now in GDC).

Besides visual review, we can check the above programmatically.

In [29]:
def printSubject(subject, indent=0):
    """Print a short summary of one subject record.

    Writes the first identifier (system and value), then the tumor stage and
    age at diagnosis of the first diagnosis, followed by a blank line. Each
    printed line is prefixed with ``indent + 1`` tab characters.

    Args:
        subject: dict with an 'identifier' list (entries having 'system' and
            'value') and a 'Diagnosis' list (entries having 'tumor_stage' and
            'age_at_diagnosis'). Used for both top-level result rows and
            nested ResearchSubject entries.
        indent: extra indentation level for nested records.

    A missing or empty 'identifier' / 'Diagnosis' list is reported with a
    placeholder line instead of raising, and a ``None`` tumor stage is printed
    as 'None' (the dataset contains null tumor_stage values).
    """
    prefix = '\t' * (indent + 1)

    identifiers = subject.get('identifier') or []
    if identifiers:
        ident = identifiers[0]
        print('{}system:{} id:{}'.format(prefix, ident['system'], ident['value']))
    else:
        print(prefix + '(no identifier)')

    diagnoses = subject.get('Diagnosis') or []
    if diagnoses:
        diag = diagnoses[0]
        # str() keeps a null tumor_stage from breaking the string concatenation.
        print(prefix + str(diag['tumor_stage']))
        print(prefix + str(diag['age_at_diagnosis']))
    else:
        print(prefix + '(no diagnosis)')
    print()

    
# Print each top-level result row, then each of its nested ResearchSubject
# entries one indent level deeper, so the two can be compared side by side.
for subject in api_response.result:
    printSubject(subject)
    for nestedSubject in subject['ResearchSubject']:
        printSubject(nestedSubject, indent=1)
    

	system:GDC id:c5421e34-e5c7-4ba5-aed9-146a5575fd8d
	Stage IIB
	None

		system:GDC id:c5421e34-e5c7-4ba5-aed9-146a5575fd8d
		Stage IIB
		None

		system:PDC id:459e3b69-63d6-11e8-bcf1-0a2705229b82
		Stage IIB
		None



That said, we can see that 'IIIC' was also used for tumor_stage in CPTAC-2. What if we look at those records?



In [33]:
sql = '''SELECT * FROM gdc-bq-sample.cda_mvp.v3, 
UNNEST(ResearchSubject) AS _ResearchSubject ,
UNNEST(_ResearchSubject.Diagnosis) AS _Diagnosis 
WHERE _Diagnosis.tumor_stage ='IIIC'
'''

# Run the 'IIIC' query, then print every returned row followed by its nested
# ResearchSubject entries (indented) to compare the stage recorded by each system.
api_response = api_instance.sql_query('v3',sql)
for subject in api_response.result:
    printSubject(subject)
    for nestedSubject in subject['ResearchSubject']:
        printSubject(nestedSubject, indent=1)

	system:GDC id:b295d458-952a-42af-9f7f-2c07e895990c
	IIIC
	None

		system:GDC id:b295d458-952a-42af-9f7f-2c07e895990c
		IIIC
		None

	system:GDC id:c3a430fe-641e-4b7b-884e-9735908692d6
	IIIC
	None

		system:GDC id:c3a430fe-641e-4b7b-884e-9735908692d6
		IIIC
		None

		system:PDC id:526a3f3c-0881-11eb-bc0e-0aad30af8a83
		Stage IIIC
		0

	system:GDC id:dc8e3901-b845-4419-8f76-d31750eacf60
	IIIC
	None

		system:GDC id:dc8e3901-b845-4419-8f76-d31750eacf60
		IIIC
		None

		system:PDC id:526a3b90-0881-11eb-bc0e-0aad30af8a83
		Stage IIIC
		0

	system:GDC id:6f493081-5bd6-4d07-9a36-b775eda105e1
	IIIC
	None

		system:GDC id:6f493081-5bd6-4d07-9a36-b775eda105e1
		IIIC
		None

		system:PDC id:526a3d60-0881-11eb-bc0e-0aad30af8a83
		Stage IIIC
		0

	system:GDC id:9d33308f-f7cc-420e-9116-b9a95068d045
	IIIC
	None

		system:GDC id:9d33308f-f7cc-420e-9116-b9a95068d045
		IIIC
		None

		system:PDC id:33f2e3fb-63d7-11e8-bcf1-0a2705229b82
		Stage IIIC
		0

	system:GDC id:01026cf5-8582-4864-8e89-111c05660eb6

In [36]:
def printSubject2(subject, indent=0):
    """Print a subject's first identifier and its complete first diagnosis.

    The identifier line is prefixed with ``indent + 1`` tabs; the diagnosis
    dict is printed as-is (unindented), followed by a blank line.

    Args:
        subject: dict with non-empty 'identifier' and 'Diagnosis' lists.
        indent: extra indentation level for the identifier line.
    """
    first_identifier = subject['identifier'][0]
    prefix = '\t' * (indent + 1)
    header = '{}system:{} id:{}'.format(
        prefix, first_identifier['system'], first_identifier['value']
    )
    print(header)
    # Show the whole diagnosis record rather than selected fields.
    print(subject['Diagnosis'][0])
    print()

# Same walk over rows and nested ResearchSubject entries as before, but
# printing the full first diagnosis record for each.
for subject in api_response.result:
    printSubject2(subject)
    for nestedSubject in subject['ResearchSubject']:
        printSubject2(nestedSubject, indent=1)


	system:GDC id:b295d458-952a-42af-9f7f-2c07e895990c
{'morphology': '8441/3', 'tumor_stage': 'IIIC', 'tumor_grade': 'Not Reported', 'Treatment': [], 'id': '69bccafb-3215-4e8d-b392-447e4f8696cd', 'primary_diagnosis': 'Serous adenocarcinoma, NOS', 'age_at_diagnosis': None}

		system:GDC id:b295d458-952a-42af-9f7f-2c07e895990c
{'morphology': '8441/3', 'tumor_stage': 'IIIC', 'tumor_grade': 'Not Reported', 'Treatment': [], 'id': '69bccafb-3215-4e8d-b392-447e4f8696cd', 'primary_diagnosis': 'Serous adenocarcinoma, NOS', 'age_at_diagnosis': None}

	system:GDC id:c3a430fe-641e-4b7b-884e-9735908692d6
{'morphology': '8441/3', 'tumor_stage': 'IIIC', 'tumor_grade': 'Not Reported', 'Treatment': [], 'id': 'e1a040fa-d089-46c3-b20d-cf0ea981df4b', 'primary_diagnosis': 'Serous adenocarcinoma, NOS', 'age_at_diagnosis': None}

		system:GDC id:c3a430fe-641e-4b7b-884e-9735908692d6
{'morphology': '8441/3', 'tumor_stage': 'IIIC', 'tumor_grade': 'Not Reported', 'Treatment': [], 'id': 'e1a040fa-d089-46c3-b20d-cf0