In [1]:
import json
import pprint
import pandas as pd

In [2]:
import requests

fields = [
    "file_name",
    "cases.case_id",
    "cases.samples.sample_id",
    "cases.project.project_id", 
    "cases.demographic.vital_status"
    ]

fields = ",".join(fields)

files_endpt = "https://api.gdc.cancer.gov/files"

# This set of filters is nested under an 'and' operator.
filters = {
    "op": "and",
    "content":[
        {
        "op": "in",
        "content":{
            "field": "cases.diagnoses.prior_treatment",
            "value": ["no"]
            }
        },
        {
        "op": "in",
        "content":{
            "field": "cases.samples.sample_type",
            "value": ["primary tumor","recurrent tumor","tumor"]
            }
        },
        {
        "op": "in",
        "content":{
            "field": "files.data_type",
            "value": ["Gene Expression Quantification"]
            }
        }
    ]
}


# A POST is used, so the filter parameters can be passed directly as a Dict object.
pages = []
for x in range(0, 11001, 1000):
    params = {
        "filters": filters,
        "fields": fields,
        "format": "JSON",
        "size": "1000",
        "from": x
        }

    # The parameters are passed to 'json' rather than 'params' in this case
    response = requests.post(files_endpt, headers = {"Content-Type": "application/json"}, json = params)
    
    pages.append(response.json())




In [3]:
params = {
        "filters": filters,
        "fields": fields,
        "format": "JSON",
        "size": "1000",
        "from": 10000
        }

    # The parameters are passed to 'json' rather than 'params' in this case
response = requests.post(files_endpt, headers = {"Content-Type": "application/json"}, json = params)
    
pages.append(response.json())

In [6]:
pages[0]

{'data': {'hits': [{'id': '6b62d6dd-bb6f-4f39-9041-992c47b875f3',
    'cases': [{'case_id': 'c3148e68-1739-4334-abda-1dcbc2166846',
      'project': {'project_id': 'TCGA-BRCA'},
      'samples': [{'sample_id': '5559a7ea-5db7-4af8-b491-24bb9871b89d'}],
      'demographic': {'vital_status': 'Dead'}}],
    'file_name': 'd1f7678a-8106-46c9-95b7-5ec5d940f08c.rna_seq.augmented_star_gene_counts.tsv'},
   {'id': '90eefb33-6808-45c3-9ab1-6eca0decdc9d',
    'cases': [{'case_id': 'c348a9b3-c901-4384-a222-144387bac0c5',
      'project': {'project_id': 'TCGA-BRCA'},
      'samples': [{'sample_id': '438152c8-2d26-4b35-9f68-a4b6c473916e'}],
      'demographic': {'vital_status': 'Dead'}}],
    'file_name': '71c9a6fc-0538-4533-af51-f7c9defe7770.rna_seq.augmented_star_gene_counts.tsv'},
   {'id': 'eb166054-ff70-4a86-883b-9c25a7d2b0e5',
    'cases': [{'case_id': 'c364e81c-eb1e-4870-ab37-9c661f5f2e3d',
      'project': {'project_id': 'TCGA-BRCA'},
      'samples': [{'sample_id': 'd5adaf35-04b1-4198-b308-f

In [5]:
hits = [x for page in pages for x in page['data']['hits']]
metadata = pd.DataFrame(columns = ['id', 'filename', 'case_id', 'project_id', 'sample_id', 'disease_type', 'gender', 'race', 'ethnicity'])
ambiguous = []
for x in hits:
    case = x['cases'][0]
    sample  = case['samples'][0]
    demographic  = case['demographic']
    if len(x['cases']) == 1 and len(case['samples']) == 1:
        row = [x['id'], 
               x['file_name'], 
               case['case_id'], 
               case['project']['project_id'],
               sample['sample_id'], 
               case['disease_type'],
               demographic['gender'],
               demographic['race'],
               demographic['ethnicity']
              ]
        metadata.loc[len(metadata)] = row
    else:
        ambiguous.append(x)



KeyError: 'disease_type'

In [73]:
metadata.to_csv('')