# Creating manifest

The GDC API (<a href="https://docs.gdc.cancer.gov/API/Users_Guide/Getting_Started/">user guide</a>) can be used to create a manifest of files to download. The files can be downloaded directly using the API, but since we will be downloading quite a few files, we will use the <a href="https://gdc.cancer.gov/access-data/gdc-data-transfer-tool">GDC data transfer tool</a> instead. More information about the use of this client can be found <a href="https://docs.gdc.cancer.gov/Data_Transfer_Tool/Users_Guide/Preparing_for_Data_Download_and_Upload/">here</a>. 

First, we can verify we can connect to the GDC API and check that it is up.

In [2]:
import json
import requests

def check_tcga():
    """
        Queries the GDC status endpoint and reports whether the API is up

        Returns True when the "status" field of the JSON reply is "OK",
        otherwise False.
    """
    payload = requests.get('https://api.gdc.cancer.gov/status').json()
    return payload[u"status"] == u"OK"

In [3]:
check_tcga()

True

In [None]:
import json
import requests
def _gdc_eq_filter(field, value):
    """Builds a single GDC "=" filter clause matching field against value."""
    return {"op": "=", "content": {"field": field, "value": value}}


def fetch_manifest(project, data_category, out_dir, data_type=None, workflow_type=None):
    """
        Fetches the manifest and metadata of the files matching the passed in
        project, data category and (optionally) data type, and writes both to out_dir

        project: TCGA project (ie TCGA-STAD, TCGA-CESC)
        data_category: file category (ie Clinical, Transcriptome Profiling)
        out_dir: directory to output manifest and metadata files (must already exist)
        data_type: file experiment type (ie Gene Expression Quantification);
                   leave as None for clinical data
        workflow_type: analysis workflow (ie HTSeq - Counts); only applied when
                       data_type is "Gene Expression Quantification"

        Returns a tuple (manifest file path, metadata file path)
    """
    file_endpt = 'https://api.gdc.cancer.gov/files'

    clauses = [
        _gdc_eq_filter("data_category", data_category),
        _gdc_eq_filter("cases.project.project_id", project),
    ]
    # Output files are named after the most specific selector available, so
    # clinical data (no data_type) is labelled with its data_category instead
    # of the literal string "None".
    label = data_category
    # for all except clinical data
    if data_type is not None:
        label = data_type
        clauses.append(_gdc_eq_filter("data_type", data_type))
        # mRNA-seq data have multiple analysis methods, only retrieve one specified by workflow_type.
        # Skip the clause when no workflow was given rather than filtering on a null value.
        if data_type == "Gene Expression Quantification" and workflow_type is not None:
            clauses.append(_gdc_eq_filter("analysis.workflow_type", workflow_type))
    filters = json.dumps({"op": "and", "content": clauses})

    # fetch manifest
    params = {'filters': filters, "return_type": "manifest", "size": "1500"}
    responseRAW = requests.get(file_endpt, params=params)
    # remove spaces
    file_name = "{} {} Manifest.txt".format(project, label).replace(" ", "_")
    file_name = "{}/{}".format(out_dir, file_name)
    # response.content is raw bytes, so write in binary mode (works on Python 2 and 3)
    with open(file_name, "wb") as out_file:
        out_file.write(responseRAW.content)

    # fetch metadata
    field_list = [
        "file_name",
        "file_id",
        "associated_entities.case_id",
        "cases.samples.sample_type",
        "cases.samples.sample_type_id"
    ]
    fields = ",".join(field_list)

    meta_params = {'filters': filters, "size": "1500", "pretty": True, "fields": fields}
    meta_responseRAW = requests.get(file_endpt, params=meta_params)
    # remove spaces
    meta_file_name = "{} {} Metadata.json".format(project, label).replace(" ", "_")
    meta_file_name = "{}/{}".format(out_dir, meta_file_name)
    with open(meta_file_name, "wb") as out_file:
        out_file.write(meta_responseRAW.content)

    return file_name, meta_file_name

# File download

We can then download the files using the <a href="https://gdc.cancer.gov/access-data/gdc-data-transfer-tool">GDC data transfer tool</a>. The tool was downloaded and the location of the program was added to the PATH environment variable.

In [None]:
import subprocess

def fetch_files(manifest_file, file_dest_dir):
    """
        Fetches the files from TCGA using the GDC data transfer tool

        manifest_file: location of the manifest file
        file_dest_dir: directory to download the files to

        Returns the exit code of gdc-client (0 on success); a non-zero code is
        also returned when gdc-client could not be started at all
    """
    # Pass the arguments as a list and avoid the shell so that paths containing
    # quotes, spaces or shell metacharacters are handed to gdc-client verbatim.
    cmd = ["gdc-client", "download", "-m", manifest_file, "-d", file_dest_dir]
    run_str = 'gdc-client download -m "{}" -d "{}"'.format(manifest_file, file_dest_dir)
    print("Running command: {}".format(run_str))
    try:
        return subprocess.call(cmd)
    except OSError as err:
        # Without a shell a missing executable raises instead of yielding an
        # exit code; keep the return-code contract for callers by mapping it
        # to 127 (the conventional POSIX shell "command not found" status).
        print("Unable to run gdc-client: {}".format(err))
        return 127

# Fetch all files for Project

The function below uses the previously defined functions to download all of the mRNA-seq HTSeq-count, miRNA-seq Isoform Expression Quantification, and clinical data files for a given project.

In [None]:
import os

def _fetch_category(project, out_dir, data_category, success_msg, error_msg, **manifest_kwargs):
    """
        Creates out_dir if needed, fetches the manifest for one kind of data into
        it, downloads the listed files and prints the outcome

        error_msg must contain one "{}" placeholder for the gdc-client return code;
        manifest_kwargs are forwarded to fetch_manifest (data_type, workflow_type)
    """
    # Check if directory exists, if not create directory
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    manifest_file, _meta_file = fetch_manifest(project, data_category, out_dir, **manifest_kwargs)
    return_code = fetch_files(manifest_file, out_dir)
    if return_code == 0:
        print(success_msg)
    else:
        print(error_msg.format(return_code))


def fetch_all(project, output_dir):
    """
        Downloads the mRNA-seq HTSeq count, miRNA-seq isoform expression and
        clinical data files of a project into sub-directories of output_dir

        project: TCGA project (ie TCGA-STAD, TCGA-CESC)
        output_dir: directory under which the mRNA-seq, miRNA-seq and Clinical
                    directories are created

        Returns "TCGA API did not respond" when the status check fails,
        otherwise None. A failed download is reported on stdout and does not
        stop the remaining downloads.
    """
    if not check_tcga():
        return "TCGA API did not respond"

    # fetch mRNA files
    _fetch_category(project, "{}/mRNA-seq".format(output_dir), "Transcriptome Profiling",
                    "mRNA gene expression files downloaded successfully",
                    "Error {}: error when downloading mRNA gene expression files",
                    data_type="Gene Expression Quantification", workflow_type="HTSeq - Counts")

    # fetch miRNA files
    _fetch_category(project, "{}/miRNA-seq/Isoform Expression Quantification".format(output_dir),
                    "Transcriptome Profiling",
                    "miRNA isoform expression files downloaded successfully",
                    "Error {}: error when downloading miRNA isoform expression files",
                    data_type="Isoform Expression Quantification")

    # fetch clinical data files
    _fetch_category(project, "{}/Clinical".format(output_dir), "Clinical",
                    "Clinical data files downloaded successfully",
                    "Error {}: error when downloading clinical data files")

The below code can be used to download all of these files for all 33 TCGA projects.

In [None]:
import os

def get_all_tcga(out_dir):
    """
        Loops over all 33 TCGA projects and calls fetch_all

        out_dir: base directory; each project is downloaded into its own
                 sub-directory out_dir/<project abbreviation>
    """
    projects = ("CHOL", "UCS", "DLBC", "UVM", "MESO", "ACC", "KICH", "THYM", "TGCT",
                "READ", "BRCA", "GBM", "LUAD", "KIRC", "HNSC", "LGG", "LUSC", "PRAD",
                "SKCM", "COAD", "BLCA", "LIHC", "KIRP", "SARC", "LAML", "ESCA", "PAAD",
                "PCPG", "OV", "UCEC", "THCA", "STAD", "CESC")
    for x in projects:
        # Use a separate name for the per-project directory: reassigning out_dir
        # here would nest every project inside the previous project's directory.
        project_dir = "{}/{}".format(out_dir, x)
        # Check if directory exists, if not create directory
        if not os.path.exists(project_dir):
            os.makedirs(project_dir)
        fetch_all("TCGA-{}".format(x), project_dir)