In [1]:
# Sample request from GDC website
### Error: 
#{
#  "message": "d853e541-f16a-4345-9f00-88e03c2dc0bc not found"
#}
import requests
import json

file_endpt = 'https://api.gdc.cancer.gov/files/'
file_uuid = 'd853e541-f16a-4345-9f00-88e03c2dc0bc'
response = requests.get(file_endpt + file_uuid)

# OUTPUT METHOD 1: Write to a file.
file = open("sample_request.json", "w")
file.write(response.text)
file.close()

# OUTPUT METHOD 2: View on screen.
print(json.dumps(response.json(), indent=2))

{
  "message": "d853e541-f16a-4345-9f00-88e03c2dc0bc not found"
}


In [34]:
# Simple Query Request
import requests
import json

cases_endpt = 'https://api.gdc.cancer.gov/cases'

# The 'fields' parameter is passed as a comma-separated string of single names.
fields = [
    "submitter_id",
    "case_id",
    "primary_site",
    "disease_type",
    "diagnoses.vital_status"
    ]

fields = ','.join(fields)

params = {
    "fields": fields,
    "format": "TSV",
    "size": "100"
    }

response = requests.get(cases_endpt, params = params)

print(response.content)

b'case_id\tdisease_type\tid\tprimary_site\tsubmitter_id\r\n3479c5b5-51ce-57e8-b6e9-bbd99e5b10eb\tComplex Mixed and Stromal Neoplasms\t3479c5b5-51ce-57e8-b6e9-bbd99e5b10eb\tKidney\tTARGET-52-PAKHTL\r\n917e2af4-8687-5c3f-9696-0611ff6801ed\tComplex Mixed and Stromal Neoplasms\t917e2af4-8687-5c3f-9696-0611ff6801ed\tKidney\tTARGET-52-PAUCGJ\r\n8afef6e5-753f-5272-b9e1-b491d883316a\tComplex Mixed and Stromal Neoplasms\t8afef6e5-753f-5272-b9e1-b491d883316a\tKidney\tTARGET-52-PATFXW\r\n51edacf9-41a8-5940-b84d-1f078dd0c9c5\tComplex Mixed and Stromal Neoplasms\t51edacf9-41a8-5940-b84d-1f078dd0c9c5\tKidney\tTARGET-52-PATENH\r\nea300498-806d-56cf-9cca-adb64f869e7b\tComplex Mixed and Stromal Neoplasms\tea300498-806d-56cf-9cca-adb64f869e7b\tKidney\tTARGET-52-PARUGK\r\nef42d20f-5f3e-58c6-9d52-0a2aa48a162c\tComplex Mixed and Stromal Neoplasms\tef42d20f-5f3e-58c6-9d52-0a2aa48a162c\tKidney\tTARGET-52-PAJNFZ\r\nb12f8793-6f99-5a82-a821-a8be6c723dcb\tComplex Mixed and Stromal Neoplasms\tb12f8793-6f99-5a82-a

In [35]:
# A Filtered Query
import requests
import json

fields = [
    "submitter_id",
    "case_id",
    "primary_site",
    "disease_type",
    "diagnoses.vital_status"
    ]

fields = ",".join(fields)

cases_endpt = "https://api.gdc.cancer.gov/cases"

filters = {
    "op": "in",
    "content":{
        "field": "primary_site",
        "value": ["Kidney"]
        }
    }

# With a GET request, the filters parameter needs to be converted
# from a dictionary to JSON-formatted string

params = {
    "filters": json.dumps(filters),
    "fields": fields,
    "format": "TSV",
    "size": "100"
    }

response = requests.get(cases_endpt, params = params)

print(response.content)

b'case_id\tdisease_type\tid\tprimary_site\tsubmitter_id\r\n3479c5b5-51ce-57e8-b6e9-bbd99e5b10eb\tComplex Mixed and Stromal Neoplasms\t3479c5b5-51ce-57e8-b6e9-bbd99e5b10eb\tKidney\tTARGET-52-PAKHTL\r\n917e2af4-8687-5c3f-9696-0611ff6801ed\tComplex Mixed and Stromal Neoplasms\t917e2af4-8687-5c3f-9696-0611ff6801ed\tKidney\tTARGET-52-PAUCGJ\r\n8afef6e5-753f-5272-b9e1-b491d883316a\tComplex Mixed and Stromal Neoplasms\t8afef6e5-753f-5272-b9e1-b491d883316a\tKidney\tTARGET-52-PATFXW\r\n51edacf9-41a8-5940-b84d-1f078dd0c9c5\tComplex Mixed and Stromal Neoplasms\t51edacf9-41a8-5940-b84d-1f078dd0c9c5\tKidney\tTARGET-52-PATENH\r\nea300498-806d-56cf-9cca-adb64f869e7b\tComplex Mixed and Stromal Neoplasms\tea300498-806d-56cf-9cca-adb64f869e7b\tKidney\tTARGET-52-PARUGK\r\nef42d20f-5f3e-58c6-9d52-0a2aa48a162c\tComplex Mixed and Stromal Neoplasms\tef42d20f-5f3e-58c6-9d52-0a2aa48a162c\tKidney\tTARGET-52-PAJNFZ\r\nb12f8793-6f99-5a82-a821-a8be6c723dcb\tComplex Mixed and Stromal Neoplasms\tb12f8793-6f99-5a82-a

In [42]:
# Complex Filters

import requests
import json

fields = [
    "file_name",
    "cases.submitter_id",
    "cases.samples.sample_type",
    "cases.disease_type",
    "cases.project.project_id"
    ]

fields = ",".join(fields)

files_endpt = "https://api.gdc.cancer.gov/files"

# This set of filters is nested under an 'and' operator.
filters = {
    "op": "and",
    "content":[
        {
        "op": "in",
        "content":{
            "field": "cases.project.primary_site",
            "value": ["Lung"]
            }
        },
        {
        "op": "in",
        "content":{
            "field": "files.experimental_strategy",
            "value": ["RNA-Seq"]
            }
        },
        {
        "op": "in",
        "content":{
            "field": "files.data_format",
            "value": ["BAM"]
            }
        }
    ]
}

# A POST is used, so the filter parameters can be passed directly as a Dict object.
params = {
    "filters": filters,
    "fields": fields,
    "format": "TSV",
    "size": "2000"
    }

# The parameters are passed to 'json' rather than 'params' in this case
response = requests.post(files_endpt, headers = {"Content-Type": "application/json"}, json = params)

print(response.content.decode("utf-8"))

cases.0.disease_type	cases.0.project.project_id	cases.0.samples.0.sample_type	cases.0.submitter_id	file_name	id
Squamous Cell Neoplasms	TCGA-LUSC	Primary Tumor	TCGA-37-4133	8fa46159-7340-44ad-894f-f611ecead93d.rna_seq.transcriptome.gdc_realn.bam	74ef7f3e-ca6a-4d65-ad2c-746204daa06e
Squamous Cell Neoplasms	TCGA-LUSC	Primary Tumor	TCGA-37-4133	8fa46159-7340-44ad-894f-f611ecead93d.rna_seq.chimeric.gdc_realn.bam	299809c8-7700-4b45-9c3e-f199ad7a156f
Squamous Cell Neoplasms	TCGA-LUSC	Primary Tumor	TCGA-NK-A5CR	93f8d34b-ef94-4875-9cfb-7f2e5124c972.rna_seq.transcriptome.gdc_realn.bam	a6dd905e-e77b-4cc1-8aea-a98edd46dfcc
Squamous Cell Neoplasms	TCGA-LUSC	Solid Tissue Normal	TCGA-43-6647	c0859fd9-6d8e-4037-86d6-3f2703e480e9.rna_seq.chimeric.gdc_realn.bam	9d85b1ec-d9a0-4899-8c41-71a5fe3ecba4
Squamous Cell Neoplasms	TCGA-LUSC	Primary Tumor	TCGA-98-A538	31eef80b-438a-4bad-aee6-3aca3ab59f9e.rna_seq.genomic.gdc_realn.bam	867ab64a-ff06-4443-be4d-0617c1cd7e3d
Squamous Cell Neoplasms	TCGA-LUSC	Pri

In [None]:
# DOWNLOADING FILES

In [41]:
# Simple Download Request

import requests
import json
import re

file_id = "b658d635-258a-4f6f-8377-767a43771fe4"

data_endpt = "https://api.gdc.cancer.gov/data/{}".format(file_id)

response = requests.get(data_endpt, headers = {"Content-Type": "application/json"})

# The file name can be found in the header within the Content-Disposition key.
response_head_cd = response.headers["Content-Disposition"]

file_name = re.findall("filename=(.+)", response_head_cd)[0]

with open(file_name, "wb") as output_file:
    output_file.write(response.content)

In [39]:
# Passing a Token to Download a Controlled-Acces File

import requests
import json

'''
 This script will not work until $TOKEN_FILE_PATH
 is replaced with an actual path.
'''
token_file = "$TOKEN_FILE_PATH"

file_id = "11443f3c-9b8b-4e47-b5b7-529468fec098"

data_endpt = "https://api.gdc.cancer.gov/slicing/view/{}".format(file_id)

with open(token_file,"r") as token:
    token_string = str(token.read().strip())

params = {"gencode": ["BRCA1", "BRCA2"]}

response = requests.post(data_endpt,
                        data = json.dumps(params),
                        headers = {
                            "Content-Type": "application/json",
                            "X-Auth-Token": token_string
                            })

file_name = "brca_slices.bam"

with open(file_name, "wb") as output_file:
    output_file.write(response.content)

In [6]:
# Post Request to Download Multiple Files

import os.path
import requests
import json
import re

data_endpt = "https://api.gdc.cancer.gov/data"

ids = [
    "b658d635-258a-4f6f-8377-767a43771fe4",
    "3968213d-b293-4b3d-8033-5b5a0ca07b6c"
    ]

params = {"ids": ids}

response = requests.post(data_endpt,
                        data = json.dumps(params),
                        headers={
                            "Content-Type": "application/json"
                            })

response_head_cd = response.headers["Content-Disposition"]

file_name = re.findall("filename=(.+)", response_head_cd)[0]

save_path = 'data/'

completeName = os.path.join(save_path, file_name)


with open(completeName, "wb") as output_file:
    output_file.write(response.content)

In [9]:
# Downloading a Set of Files Based on a Filter

import requests
import json
import re

files_endpt = "https://api.gdc.cancer.gov/files"

filters = {
    "op": "and",
    "content":[
        {
        "op": "in",
        "content":{
            "field": "cases.project.primary_site",
            "value": ["Lung"]
            }
        },
        {
        "op": "in",
        "content":{
            "field": "cases.demographic.race",
            "value": ["white"]
            }
        },
        {
        "op": "in",
        "content":{
            "field": "cases.demographic.gender",
            "value": ["female"]
            }
        },
        {
        "op": "in",
        "content":{
            "field": "files.analysis.workflow_type",
            "value": ["HTSeq - FPKM"]
            }
        }
    ]
}

# Here a GET is used, so the filter parameters should be passed as a JSON string.

params = {
    "filters": json.dumps(filters),
    "fields": "file_id",
    "format": "JSON",
    "size": "1000"
    }

response = requests.get(files_endpt, params = params)

file_uuid_list = []

# This step populates the download list with the file_ids from the previous query
for file_entry in json.loads(response.content.decode("utf-8"))["data"]["hits"]:
    file_uuid_list.append(file_entry["file_id"])

data_endpt = "https://api.gdc.cancer.gov/data"

params = {"ids": file_uuid_list}

response = requests.post(data_endpt, data = json.dumps(params), headers = {"Content-Type": "application/json"})

response_head_cd = response.headers["Content-Disposition"]

file_name = re.findall("filename=(.+)", response_head_cd)[0]

with open(file_name, "wb") as output_file:
    output_file.write(response.content)

KeyError: 'content-disposition'

In [None]:
# BAM Slicing

import requests
import json

'''
 This script will not work until $TOKEN_FILE_PATH
 is replaced with an actual path.
'''
token_file = "$TOKEN_FILE_PATH"

file_id = "11443f3c-9b8b-4e47-b5b7-529468fec098"

data_endpt = "https://api.gdc.cancer.gov/slicing/view/{}".format(file_id)

with open(token_file,"r") as token:
    token_string = str(token.read().strip())

params = {"gencode": ["BRCA1", "BRCA2"]}

response = requests.post(data_endpt,
                        data = json.dumps(params),
                        headers = {
                            "Content-Type": "application/json",
                            "X-Auth-Token": token_string
                            })

file_name = "brca_slices.bam"

with open(file_name, "wb") as output_file:
    output_file.write(response.content)

In [None]:
import requests
import json

'''
 This script will not work until $TOKEN_FILE_PATH
 is replaced with an actual path.
'''
token_file = "$TOKEN_FILE_PATH"

file_ids = [
    "11443f3c-9b8b-4e47-b5b7-529468fec098",
    "1f103620-bb34-46f1-b565-94f0027e396d",
    "ca549554-a244-4209-9086-92add7bb7109"
    ]

for file_id in file_ids:

    data_endpt = "https://api.gdc.cancer.gov/slicing/view/{}".format(file_id)

    with open(token_file, "r") as token:
        token_string = str(token.read().strip())

    params = {
        "regions": ["chr1:1-20000", "chr10:129000-160000"]
        }

    response = requests.post(data_endpt,
                            data = json.dumps(params),
                            headers = {
                                "Content-Type": "application/json",
                                "X-Auth-Token": token_string
                                })

    file_name = "{}_region_slices.bam".format(file_id)

    with open(file_name, "wb") as output_file:
        output_file.write(response.content)

In [36]:
# Trouble Shooting
import requests
status_endpt = "https://api.gdc.cancer.gov/status"
response = requests.get(status_endpt)

# OUTPUT METHOD 1: Write to a file.
file = open("api_status.json", "w")
file.write(response.text)
file.close()

# OUTPUT METHOD 2: View on screen.
print(response.content)

# Pretty print JSON
ugly_json = response.content
parsed_json = json.loads(ugly_json)
pretty_json = json.dumps(parsed_json, indent = 2)

print(pretty_json)

b'{"commit":"4dd3680528a19ed33cfc83c7d049426c97bb903b","data_release":"Data Release 36.0 - December 12, 2022","status":"OK","tag":"3.0.0","version":1}\n'
{
  "commit": "4dd3680528a19ed33cfc83c7d049426c97bb903b",
  "data_release": "Data Release 36.0 - December 12, 2022",
  "status": "OK",
  "tag": "3.0.0",
  "version": 1
}
