In [1]:
import requests
import json
import io
import pickle

import pandas as pd

In [2]:
# URL of the GDC API "files" endpoint; both metadata requests below POST to it.
files_endpt = "https://api.gdc.cancer.gov/files"

# Field Groups

https://docs.gdc.cancer.gov/API/Users_Guide/Appendix_A_Available_Fields/#field-group-listing-by-endpoint

https://docs.gdc.cancer.gov/Encyclopedia/pages/TCGA_Barcode/

https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables

## JSON

In [3]:
# Query filter sent to the files endpoint: both clauses must hold ("and").
filters = {
    "op": "and",
    "content": [
        # Clause 1: the case's primary site is one of the listed organs.
        {
            "op": "in",
            "content": {
                "field": "cases.primary_site",
                "value": [
                    "Breast",
                    "Brain",
                    "Bronchus and lung",
                    "Colon",
                    "Liver and intrahepatic bile ducts",
                ],
            },
        },
        # Clause 2: the file's data format is SVS.
        {
            "op": "=",
            "content": {
                "field": "files.data_format",
                "value": ["SVS"],
            },
        },
    ],
}

In [4]:
# Request body for the JSON export of the matching file records.
json_params = {
    # The filter is sent as a JSON-encoded string.
    "filters": json.dumps(filters),
    # Nested entities to expand, joined into one comma-separated string.
    # NOTE(review): "cases.project" is listed twice — looks redundant; confirm
    # before trimming, the string is kept exactly as it was.
    "expand": ",".join([
        "cases.project",
        "cases.demographic",
        "cases.diagnoses",
        "cases.samples",
        "cases.tissue_source_site",
        "cases.project",
        "cases",
    ]),
    "format": "json",
    # NOTE(review): presumably chosen to exceed the number of matching files so
    # a single request returns everything — TODO confirm.
    "size": "40000",
}

In [5]:
# POST the query to the files endpoint and cache the decoded JSON body on disk.
response = requests.post(files_endpt, headers={"Content-Type": "application/json"}, json=json_params)
# Stop on an HTTP error status; otherwise an error payload would be written to
# all_metadata.json as if it were metadata.
response.raise_for_status()
with open('all_metadata.json', 'w') as f:
    json.dump(response.json(), f)

## CSV

In [6]:
# Request body for the CSV export; same query as the JSON export, only the
# "format" value differs.
csv_params = {
    # The filter is sent as a JSON-encoded string.
    "filters": json.dumps(filters),
    # Nested entities to expand, joined into one comma-separated string.
    # NOTE(review): "cases.project" is listed twice — looks redundant; confirm
    # before trimming, the string is kept exactly as it was.
    "expand": ",".join([
        "cases.project",
        "cases.demographic",
        "cases.diagnoses",
        "cases.samples",
        "cases.tissue_source_site",
        "cases.project",
        "cases",
    ]),
    "format": "csv",
    # NOTE(review): presumably chosen to exceed the number of matching files so
    # a single request returns everything — TODO confirm.
    "size": "40000",
}

In [7]:
# POST the query to the files endpoint, asking for the CSV rendering.
response = requests.post(files_endpt, headers={"Content-Type": "application/json"}, json=csv_params)
# Stop on an HTTP error status; otherwise the error body would be parsed as
# CSV and cached as all_metadata.csv.
response.raise_for_status()
# Every column is read as dtype 'object' so pandas does no numeric inference.
df = pd.read_csv(io.StringIO(response.content.decode('utf-8')), dtype='object')
df.to_csv('all_metadata.csv', index=False)

### CSV Refinement

In [8]:
# Maps column names of the full CSV export to the short names used in
# metadata.csv. Insertion order is significant: it is the column order of the
# refined table.
fields = {
    # File-level columns keep their own name.
    **{column: column for column in (
        "id",
        "data_format",
        "access",
        "file_name",
        "submitter_id",
        "data_category",
        "type",
        "file_size",
        "created_datetime",
        "md5sum",
        "updated_datetime",
        "file_id",
        "data_type",
        "state",
        "experimental_strategy",
        "version",
        "data_release",
    )},
    # Attributes of the first case ("cases.0.") lose the prefix.
    **{"cases.0." + column: column for column in (
        "primary_site",
        "disease_type",
        "case_id",
    )},
    # Project attributes of the first case get a "project_" prefix.
    **{"cases.0.project." + column: "project_" + column for column in (
        "name",
        "primary_site",
        "disease_type",
    )},
    # Attributes of the first diagnosis of the first case get a "diag_" prefix.
    **{"cases.0.diagnoses.0." + column: "diag_" + column for column in (
        "ajcc_pathologic_stage",
        "synchronous_malignancy",
        "tissue_or_organ_of_origin",
        "primary_diagnosis",
        "prior_malignancy",
        "ajcc_pathologic_t",
        "morphology",
        "ajcc_pathologic_n",
        "ajcc_pathologic_m",
        "classification_of_tumor",
        "icd_10_code",
        "site_of_resection_or_biopsy",
        "progression_or_recurrence",
    )},
    # Attributes of the first sample of the first case lose the prefix.
    **{"cases.0.samples.0." + column: column for column in (
        "sample_type_id",
        "sample_type",
        "tissue_type",
    )},
}

In [9]:
# Keep only the curated columns, in the order given by `fields`, and give them
# their short names. `rename` returns a new frame, so `df` is left untouched.
metadata = df[list(fields)].rename(columns=fields)
# patient_id is the third dash-separated token of submitter_id.
# NOTE(review): assumes TCGA-barcode-style ids (e.g. "TCGA-XX-YYYY-...") — confirm.
# The vectorised `.str` accessor yields NaN for a missing id or one with fewer
# than three tokens, where the former per-row lambda raised
# AttributeError / IndexError and aborted the whole cell.
metadata["patient_id"] = metadata["submitter_id"].str.split("-").str.get(2)

In [10]:
# Persist the refined table; the row index is not written.
metadata.to_csv(
    path_or_buf="metadata.csv",
    index=False,
)

# Manifest

In [11]:
# Renames applied to the full CSV export before the manifest columns are picked.
# Note: 'primary_site' is renamed here but is not among `cols`, so it does not
# end up in the manifest.
name_dict = {
    "cases.0.primary_site": "primary_site",
    "file_name": "filename",
    "file_size": "size",
    "md5sum": "md5",
}

# Manifest columns, in output order.
cols = [
    "id",
    "filename",
    "md5",
    "size",
    "state",
]

In [12]:
# Rename the export's columns to the manifest names, then keep only the
# manifest columns in the order given by `cols`.
manifest = df.rename(columns=name_dict)[cols]

In [13]:
# Write the manifest as a tab-separated file; the row index is not written.
manifest.to_csv(
    path_or_buf="download_manifest.txt",
    sep="\t",
    index=False,
)