In [1]:
import os
import requests
import json
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sn
from tqdm import tqdm, tqdm_notebook, tnrange
import pickle
import gc

In [2]:
# Hook tqdm into pandas via a notebook progress-bar instance; the row-wise
# helpers in this notebook call `DataFrame.progress_apply`.
# NOTE(review): the exact registration call differs between tqdm versions - confirm
# against the installed one before changing it.
tqdm_notebook().pandas()
# tqdm.pandas(tqdm_notebook)




In [26]:
# Make sure '../raw_tables' exists as a directory. If the path exists but is
# not a directory, it is removed first and then the directory is created.
if not os.path.isdir('../raw_tables'):
    if os.path.exists('../raw_tables'):
        os.remove('../raw_tables')
    os.makedirs('../raw_tables', exist_ok=True)

In [3]:
import urlpath
def generate_endpoints():
    """Build the table of api.gdc.cancer.gov endpoint URLs.

    Returns
    -------
    dict
        Maps a short name to a ``urlpath.URL``. For each of 'files', 'cases',
        'projects' and 'annotations' there is an entry for the resource itself
        and a '<name>_mapping' entry for its '_mapping' sub-path; 'data',
        'legacy_data' and 'manifest' are added as plain resources.
    """
    base = urlpath.URL("https://api.gdc.cancer.gov")
    table = {}

    # Queryable resources: the resource URL plus its "_mapping" companion.
    for name in ('files', 'cases', 'projects', 'annotations'):
        resource = base / name
        table[name] = resource
        table["%s_mapping" % name] = resource / "_mapping"

    # Remaining endpoints have no "_mapping" companion.
    for key, path in (('data', "data"),
                      ('legacy_data', "legacy/data"),
                      ('manifest', "manifest")):
        table[key] = base / path

    return table
# Build the endpoint table once and keep it in a notebook-level variable.
endpoints = generate_endpoints()
# Bare expression: show the mapping as this cell's output.
endpoints

{'annotations': 'https://api.gdc.cancer.gov/annotations/',
 'annotations_mapping': 'https://api.gdc.cancer.gov/annotations/_mapping/',
 'cases': 'https://api.gdc.cancer.gov/cases/',
 'cases_mapping': 'https://api.gdc.cancer.gov/cases/_mapping/',
 'files': 'https://api.gdc.cancer.gov/files/',
 'files_mapping': 'https://api.gdc.cancer.gov/files/_mapping/',
 'projects': 'https://api.gdc.cancer.gov/projects/',
 'projects_mapping': 'https://api.gdc.cancer.gov/projects/_mapping/'}

In [4]:
def get_response_all(endpoint, add_fields=None):
    """Fetch every record of an endpoint in one (unpaginated) response.

    A first request without parameters is used only to read the total record
    count from ``data.pagination.total``; that count is then sent as the
    ``size`` parameter of the final request.

    Parameters
    ----------
    endpoint : str
        Key into the notebook-level ``endpoints`` mapping (e.g. 'projects').
        When ``add_fields`` is non-empty, '<endpoint>_mapping' must be a key
        of that mapping as well.
    add_fields : iterable of str, optional
        Field names to request in addition to the endpoint's default fields
        (the ``defaults`` entry of the '_mapping' response). When empty or
        None no ``fields`` parameter is sent.

    Returns
    -------
    requests.Response
        The response of the final request.

    Raises
    ------
    requests.HTTPError
        If any of the requests returns an error status.
    """
    # Probe request: only the pagination total is used.
    probe = requests.get(endpoints[endpoint])
    probe.raise_for_status()
    params = {'size': probe.json()['data']['pagination']['total']}

    if add_fields:
        mapping = requests.get(endpoints[endpoint + "_mapping"])
        mapping.raise_for_status()
        defaults = mapping.json()['defaults']
        # Union of defaults and extras. (A symmetric difference would drop
        # any requested field that is already a default.) dict.fromkeys
        # de-duplicates while keeping first-seen order.
        fields = list(dict.fromkeys(list(defaults) + list(add_fields)))
        params['fields'] = ",".join(fields)

    response = requests.get(endpoints[endpoint], params=params)
    response.raise_for_status()
    return response

In [5]:
def get_dataframe_for(endpoint, add_fields=None):
    """Download all records of an endpoint into a DataFrame.

    Parameters
    ----------
    endpoint : str
        Endpoint name, forwarded to ``get_response_all``.
    add_fields : iterable of str, optional
        Extra fields to request, forwarded to ``get_response_all``.

    Returns
    -------
    pandas.DataFrame
        Built from the ``data.hits`` list of the response JSON.
    """
    # Function-scope import so the cell does not depend on a separate import cell.
    from io import StringIO

    response = get_response_all(endpoint, add_fields=add_fields)
    hits = response.json()['data']['hits']
    # Still parsed with read_json, so pandas' JSON type inference is unchanged.
    # The text is wrapped in StringIO because passing a literal JSON string
    # to read_json is deprecated.
    return pd.read_json(StringIO(json.dumps(hits)))

In [6]:
def fix_program_row(row_data):
    """Flatten the nested ``program`` entry of one row.

    Each key of ``row_data.program`` is copied to a top-level field named
    ``program_<key>``. The row is modified in place and returned; the
    ``program`` entry itself is left untouched.
    """
    nested = row_data.program
    for key in nested:
        row_data['program_%s' % key] = nested[key]
    return row_data

def fix_program(df):
    """Return ``df`` with the nested ``program`` column flattened.

    ``fix_program_row`` is applied to every row through ``progress_apply``,
    after which the original ``program`` column is dropped.
    """
    flattened = df.progress_apply(fix_program_row, axis=1)
    return flattened.drop('program', axis='columns')

In [7]:
# Download the projects table, requesting 'program.name' on top of the default fields.
projects = get_dataframe_for('projects', ['program.name'])
# Replace the nested `program` column with flat `program_*` columns.
projects = fix_program(projects)




In [8]:
# projects.to_csv("../raw_tables/projects.tsv", sep="\t")

In [21]:
# Persist the flattened projects DataFrame as a pickle file.
with open('../raw_tables/projects.P', 'wb') as file:
    pickle.dump(projects, file)

In [9]:
# Bare expression: render the projects DataFrame as this cell's output.
projects

Unnamed: 0,dbgap_accession_number,disease_type,id,name,primary_site,project_id,released,state,program_name
0,,[Rectum Adenocarcinoma],TCGA-READ,Rectum Adenocarcinoma,[Colorectal],TCGA-READ,True,legacy,TCGA
1,,[Thyroid Carcinoma],TCGA-THCA,Thyroid Carcinoma,[Thyroid],TCGA-THCA,True,legacy,TCGA
2,phs000466,[Clear Cell Sarcoma of the Kidney],TARGET-CCSK,Clear Cell Sarcoma of the Kidney,[Kidney],TARGET-CCSK,True,legacy,TARGET
3,,[Mesothelioma],TCGA-MESO,Mesothelioma,[Pleura],TCGA-MESO,True,legacy,TCGA
4,,[Sarcoma],TCGA-SARC,Sarcoma,[Soft Tissue],TCGA-SARC,True,legacy,TCGA
5,phs000465,[Acute Myeloid Leukemia],TARGET-AML,Acute Myeloid Leukemia,[Blood],TARGET-AML,True,legacy,TARGET
6,,[Brain Lower Grade Glioma],TCGA-LGG,Brain Lower Grade Glioma,[Brain],TCGA-LGG,True,legacy,TCGA
7,phs000467,[Neuroblastoma],TARGET-NBL,Neuroblastoma,[Nervous System],TARGET-NBL,True,legacy,TARGET
8,,[Adrenocortical Carcinoma],TCGA-ACC,Adrenocortical Carcinoma,[Adrenal Gland],TCGA-ACC,True,legacy,TCGA
9,,[Ovarian Serous Cystadenocarcinoma],TCGA-OV,Ovarian Serous Cystadenocarcinoma,[Ovary],TCGA-OV,True,legacy,TCGA


In [10]:
def fix_projects_row(row_data):
    """Lift program name and project id out of the nested ``project`` entry.

    Sets ``program_name`` from ``project.program.name`` and ``project_id``
    from ``project.project_id``. The row is modified in place and returned.
    """
    project = row_data['project']
    program_name = project['program']['name']
    row_data['program_name'] = program_name
    row_data['project_id'] = project['project_id']
    # Copying project['name'] into a 'project_name' field is left disabled.
    return row_data

def fix_projects(df):
    """Return ``df`` with the nested ``project`` column flattened.

    ``fix_projects_row`` is applied to every row through ``progress_apply``,
    after which the original ``project`` column is dropped.
    """
    flattened = df.progress_apply(fix_projects_row, axis=1)
    return flattened.drop('project', axis='columns')

In [11]:
# Download the cases table, additionally requesting the owning project's id and program name.
cases = get_dataframe_for("cases",['project.program.name', 'project.project_id'])
# Replace the nested `project` column with flat `program_name` / `project_id` columns.
cases = fix_projects(cases)




In [12]:
# cases.to_csv("../raw_tables/cases.tsv", sep="\t")

In [22]:
# Persist the flattened cases DataFrame as a pickle file.
with open('../raw_tables/cases.P', 'wb') as file:
    pickle.dump(cases, file)

In [13]:
# Bare expression: render the cases DataFrame as this cell's output.
cases

Unnamed: 0,aliquot_ids,analyte_ids,case_id,created_datetime,disease_type,id,portion_ids,primary_site,sample_ids,slide_ids,state,submitter_aliquot_ids,submitter_analyte_ids,submitter_id,submitter_portion_ids,submitter_sample_ids,submitter_slide_ids,updated_datetime,program_name,project_id
0,"[148aece6-97ce-58e4-91d2-9410e5b147a9, 3e17d84...",,c71c69ba-35ea-5024-9118-900f04bb958a,,Neuroblastoma,c71c69ba-35ea-5024-9118-900f04bb958a,,Nervous System,"[b5f665fe-e556-5146-9fb2-41de0eb3f2d0, 4acb947...",,live,"[TARGET-30-PAUDFR-01A-01D, TARGET-30-PAUDFR-10...",,TARGET-30-PAUDFR,,"[TARGET-30-PAUDFR-01A, TARGET-30-PAUDFR-10A]",,2017-12-15T21:12:17.076001-06:00,TARGET,TARGET-NBL
1,"[21d8a70b-021a-45b7-968e-f31a152a030e, a50e60e...","[72efc95c-bfbd-42a5-9129-1bb7e8db5db6, ec96b00...",dce71741-ccbe-40b7-a0b8-2048d07187a4,,Ovarian Serous Cystadenocarcinoma,dce71741-ccbe-40b7-a0b8-2048d07187a4,"[a6d9de12-eaac-4b14-bf81-d00f175c170f, bfaa683...",Ovary,"[af849153-dc08-472a-ab16-26c25fe1751f, 3d0f248...","[a28f0475-589d-4831-9186-6b955153f5bb, 91c79a1...",live,"[TCGA-25-2409-10A-01W-0801-09, TCGA-25-2409-01...","[TCGA-25-2409-01A-01R, TCGA-25-2409-10A-01D, T...",TCGA-25-2409,"[TCGA-25-2409-10A-01, TCGA-25-2409-01A-01, TCG...","[TCGA-25-2409-10A, TCGA-25-2409-01A]","[TCGA-25-2409-01A-01-BS1, TCGA-25-2409-01A-01-...",2017-03-04T16:39:19.244769-06:00,TCGA,TCGA-OV
2,"[e1cf0ad1-a200-4c10-83fd-415eea479532, 633da6d...","[7547a920-71c7-42f3-b369-3dc59d61069f, d43df96...",e8f56d0f-eee4-4def-a43a-dec91f4382a1,,Thyroid Carcinoma,e8f56d0f-eee4-4def-a43a-dec91f4382a1,"[ded0161b-e75f-4a48-90b4-4d521661e123, 5ac1005...",Thyroid,"[e13c3f11-913d-4371-9a7d-5e4df15fb42a, e84f9fa...",[5c562496-ceb3-445f-89d8-607560279cc6],live,"[TCGA-EM-A3AQ-01A-11W-A21F-08, TCGA-EM-A3AQ-10...","[TCGA-EM-A3AQ-01A-11W, TCGA-EM-A3AQ-01A-11R, T...",TCGA-EM-A3AQ,"[TCGA-EM-A3AQ-10A-01, TCGA-EM-A3AQ-01A-11]","[TCGA-EM-A3AQ-10A, TCGA-EM-A3AQ-01A]",[TCGA-EM-A3AQ-01A-01-TS1],2017-03-09T09:59:52.675816-06:00,TCGA,TCGA-THCA
3,"[f785c907-8fff-4c84-8cf0-07ae5254cf34, 2c5b659...",,657566a4-d809-53af-9acd-d44fa6171b48,,Neuroblastoma,657566a4-d809-53af-9acd-d44fa6171b48,,Nervous System,"[b661f1db-b250-5ee9-997c-28796fee12a0, 36e4ad0...",,live,"[TARGET-30-PALETP-01A-01W, TARGET-30-PALETP-01...",,TARGET-30-PALETP,,"[TARGET-30-PALETP-10A, TARGET-30-PALETP-01A]",,2017-12-15T21:12:49.412376-06:00,TARGET,TARGET-NBL
4,[f357af2b-4c7b-5250-8e9a-f63ad8d0ccb5],,7ce2465b-5974-56f3-96b9-14057c306177,,Acute Myeloid Leukemia,7ce2465b-5974-56f3-96b9-14057c306177,,Blood,"[169a096b-b224-55b1-a0fc-74a905fc1b06, ed1261b...",,live,[TARGET-20-PADZKD-09A-02R],,TARGET-20-PADZKD,,"[TARGET-20-PADZKD-09A, TARGET-20-PADZKD-14A]",,2017-12-15T21:28:37.279413-06:00,TARGET,TARGET-AML
5,"[4092f0cb-b02b-4506-9b7f-610b1df3c8c8, a8c8d50...","[d8d04fc8-42e6-4291-8a52-a9d87c8c4682, bfaf86e...",b9e2f4d1-866c-4c62-a9c8-8de7eb364a6e,,Brain Lower Grade Glioma,b9e2f4d1-866c-4c62-a9c8-8de7eb364a6e,"[8fda93c9-9c0c-4fb4-926e-5c1b7c62979c, be83f0f...",Brain,"[96edec23-5d46-44c0-9975-625bd578598f, 3b97787...","[b792888a-361b-4a81-a7ba-f57f29f7b504, 5eb94ed...",live,"[TCGA-HT-7469-01A-11D-2253-08, TCGA-HT-7469-01...","[TCGA-HT-7469-10B-01W, TCGA-HT-7469-01A-11R, T...",TCGA-HT-7469,"[TCGA-HT-7469-10B-01, TCGA-HT-7469-01A-11, TCG...","[TCGA-HT-7469-01A, TCGA-HT-7469-10B]","[TCGA-HT-7469-01A-01-BS1, TCGA-HT-7469-01A-01-...",2017-03-04T16:39:19.244769-06:00,TCGA,TCGA-LGG
6,"[377047a2-2510-4c52-b73d-6766bcf14a18, 95bcced...","[e5119ee8-fd7d-4b68-baaa-824c0c3f00b1, 409cb5f...",6201394f-b70e-4f14-9519-ff9d5f06eb09,,Sarcoma,6201394f-b70e-4f14-9519-ff9d5f06eb09,"[47a065db-24c6-40ce-995a-b4e3ae48d096, c326ac6...",Soft Tissue,"[acea4025-f740-4a60-ad51-eaf61964256b, ccf3805...",[9f54b5db-8e94-4534-b0f3-4de132fe97f8],live,"[TCGA-DX-A3U6-10A-01D-A29M-01, TCGA-DX-A3U6-01...","[TCGA-DX-A3U6-10A-01W, TCGA-DX-A3U6-01A-11R, T...",TCGA-DX-A3U6,"[TCGA-DX-A3U6-10A-01, TCGA-DX-A3U6-01A-21-A455...","[TCGA-DX-A3U6-01A, TCGA-DX-A3U6-10A]",[TCGA-DX-A3U6-01A-01-TS1],2017-03-04T16:39:19.244769-06:00,TCGA,TCGA-SARC
7,"[5ec237ad-6ae4-4947-91ac-9f3f81df1981, 3ac962c...","[94406f32-af4a-494d-a796-f023a56172a4, 313b788...",c0c3caab-9277-4a31-a96c-c607e38d5ccc,,Ovarian Serous Cystadenocarcinoma,c0c3caab-9277-4a31-a96c-c607e38d5ccc,"[e51d32a0-c31c-408c-b920-5d93914fafca, 81c4529...",Ovary,"[424254f5-1bdf-43ed-b867-33a73a3c919f, 0b9e3de...","[e8ba0c6f-94e8-4ebb-86e4-882753640bbb, 453252e...",live,"[TCGA-23-1114-01B-01R-0564-01, TCGA-23-1114-01...","[TCGA-23-1114-01B-01T, TCGA-23-1114-01B-01W, T...",TCGA-23-1114,"[TCGA-23-1114-01B-01, TCGA-23-1114-10A-01]","[TCGA-23-1114-10A, TCGA-23-1114-01B]","[TCGA-23-1114-01B-01-BS1, TCGA-23-1114-01B-01-...",2017-03-04T16:39:19.244769-06:00,TCGA,TCGA-OV
8,"[90e96e37-0f23-40b7-8277-527e8d8f0b76, ddcd65b...","[da16bd5f-2db4-45c5-a693-d7ec7723669f, 943b1c4...",0c52a842-c220-486e-88de-8a4fec5d6a58,,Brain Lower Grade Glioma,0c52a842-c220-486e-88de-8a4fec5d6a58,"[bd6b5df0-11f3-48c1-bc7e-f5a2e3234988, 50041e6...",Brain,"[906f5742-9542-486c-93c3-fc4412764dd8, 73a3916...",[fe4b10ba-0756-40ad-8334-0730d45bdb12],live,"[TCGA-DB-A75O-01A-11R-A32L-13, TCGA-DB-A75O-10...","[TCGA-DB-A75O-10A-01D, TCGA-DB-A75O-01A-11R, T...",TCGA-DB-A75O,"[TCGA-DB-A75O-01A-21-A44D-20, TCGA-DB-A75O-10A...","[TCGA-DB-A75O-10A, TCGA-DB-A75O-01A]",[TCGA-DB-A75O-01A-01-TSA],2017-03-04T16:39:19.244769-06:00,TCGA,TCGA-LGG
9,"[0f0c88fd-30ec-49e1-bc2f-ce4f6c04c32b, 5305e38...","[3d611634-b074-4cb5-acea-b017a828b798, d0cd51a...",c5e23947-253b-40d2-916a-277b2a580be2,,Sarcoma,c5e23947-253b-40d2-916a-277b2a580be2,"[72db45a2-8e83-423d-b46c-2b22c3fb3db9, 2b165e2...",Soft Tissue,"[1fe30a2f-22a6-4f88-9a3a-7a11359cc151, 91f3ef0...",[90c35fae-996a-4f5e-9f45-5d79dfec9b59],live,"[TCGA-IE-A4EI-01A-11D-A24M-01, TCGA-IE-A4EI-01...","[TCGA-IE-A4EI-01A-11W, TCGA-IE-A4EI-01A-11D, T...",TCGA-IE-A4EI,"[TCGA-IE-A4EI-10A-01, TCGA-IE-A4EI-01A-21-A455...","[TCGA-IE-A4EI-01A, TCGA-IE-A4EI-10A]",[TCGA-IE-A4EI-01A-01-TS1],2017-03-04T16:39:19.244769-06:00,TCGA,TCGA-SARC


In [14]:
def fix_cases_row(row_data):
    """Flatten the nested ``cases`` list of one file row.

    Sets three top-level fields and returns the (in-place modified) row:

    * ``program_name`` / ``project_id`` - taken from the project of the
      first case in the list.
    * ``case_id`` - list with the ``case_id`` of every case.

    A row whose ``cases`` entry is missing (not a list/tuple, e.g. NaN) or
    empty gets ``None`` for the two project fields and an empty ``case_id``
    list instead of raising.
    """
    cases = row_data['cases']

    # No associated case: nothing to index into, so fill neutral values.
    if not isinstance(cases, (list, tuple)) or len(cases) == 0:
        row_data['program_name'] = None
        row_data['project_id'] = None
        row_data['case_id'] = []
        return row_data

    # Project information is read from the first case only.
    first_project = cases[0]['project']
    row_data['program_name'] = first_project['program']['name']
    row_data['project_id'] = first_project['project_id']
    row_data['case_id'] = [case['case_id'] for case in cases]
    return row_data

def fix_cases(df):
    """Return ``df`` with the nested ``cases`` column flattened.

    ``fix_cases_row`` is applied to every row through ``progress_apply``,
    after which the original ``cases`` column is dropped.
    """
    flattened = df.progress_apply(fix_cases_row, axis=1)
    return flattened.drop('cases', axis=1)

In [15]:
# Download the files table, additionally requesting each associated case's id
# and its project's id and program name.
files = get_dataframe_for("files", ['cases.project.program.name', 'cases.project.project_id', 'cases.case_id'])
# Replace the nested `cases` column with flat `program_name` / `project_id` / `case_id` columns.
files = fix_cases(files)




In [16]:
# files.to_csv("../raw_tables/files.tsv", sep="\t")

In [23]:
# Persist the flattened files DataFrame as a pickle file.
with open('../raw_tables/files.P', 'wb') as file:
    pickle.dump(files, file)

In [17]:
# Bare expression: render the files DataFrame as this cell's output.
files

Unnamed: 0,access,acl,created_datetime,data_category,data_format,data_type,error_type,experimental_strategy,file_id,file_name,...,md5sum,platform,state,state_comment,submitter_id,type,updated_datetime,program_name,project_id,case_id
0,controlled,[phs000178],2016-05-29T12:44:46.784898-05:00,Raw Sequencing Data,BAM,Aligned Reads,,RNA-Seq,40af20ee-2aa6-4c53-beac-3e166f2dfbe4,1f7d9145-c0d8-4a37-b8f1-81ddc3a0a3f7_gdc_realn...,...,e7df198c1dbee4b6ab7d0980d65ae62e,Illumina,live,,1f7d9145-c0d8-4a37-b8f1-81ddc3a0a3f7,aligned_reads,2017-03-04T16:37:26.081298-06:00,TCGA,TCGA-THCA,[6b529fb3-0b1b-4d3b-9309-bfcb063dbb71]
1,open,[open],2016-05-29T10:51:15.714019-05:00,Transcriptome Profiling,TXT,Gene Expression Quantification,,RNA-Seq,0934a0b3-858d-430c-a5c9-ea459104a392,81cab804-bfb1-44f0-ac2d-923e0766ac61.htseq.cou...,...,5cbfed753ee5073d3a8fcd68818ed0a1,,live,,81cab804-bfb1-44f0-ac2d-923e0766ac61_count,gene_expression,2017-03-04T16:39:14.644494-06:00,TCGA,TCGA-SARC,[8f3f6f7c-4b50-467e-a6e8-d836735913f6]
2,open,[open],2016-05-26T21:19:15.996103-05:00,Transcriptome Profiling,TXT,Gene Expression Quantification,,RNA-Seq,23f30088-b9d0-4657-b7d7-188e85230687,fc3748e5-79b4-4e5f-b0db-1d9cf4ed3cca.FPKM.txt.gz,...,4a0b260c9dd79a0eb4fe6199d7c1f010,,live,,fc3748e5-79b4-4e5f-b0db-1d9cf4ed3cca_fpkm,gene_expression,2017-03-04T16:39:14.644494-06:00,TCGA,TCGA-SARC,[1cbcfbb0-fcad-4ccd-8e83-120faa3e0df3]
3,controlled,[phs000178],2017-06-17T19:34:54.922599-05:00,Simple Nucleotide Variation,VCF,Annotated Somatic Mutation,,WXS,88be35fc-3a1d-47ad-93b4-90d786341be7,88be35fc-3a1d-47ad-93b4-90d786341be7.vep.vcf.gz,...,d07cf83738686ae3053fe72e95c409e1,,live,,TCGA-VM-A8CH-01A-12D-A36O-08_TCGA-VM-A8CH-10A-...,annotated_somatic_mutation,2017-06-18T00:28:03.854179-05:00,TCGA,TCGA-LGG,[484dd45b-a16c-4650-bac9-832fbe4d96eb]
4,controlled,[phs000178],2017-06-17T20:23:41.316128-05:00,Simple Nucleotide Variation,VCF,Annotated Somatic Mutation,,WXS,ad3dd7e9-808a-4e4b-8d43-dc3de68c8c5c,ad3dd7e9-808a-4e4b-8d43-dc3de68c8c5c.vep.vcf.gz,...,35f2de2ef0304fe3ee8bb1229eb4c3b8,,live,,TCGA-13-0886-01A-01W-0420-08_TCGA-13-0886-10A-...,annotated_somatic_mutation,2017-06-18T02:21:31.912059-05:00,TCGA,TCGA-OV,[91de8a74-a1e6-46b6-a06e-70aedf2c3eaf]
5,controlled,[phs000178],2017-06-17T20:18:02.498880-05:00,Simple Nucleotide Variation,VCF,Annotated Somatic Mutation,,WXS,0b1c205c-c227-43dd-a308-1a46b4f733df,0b1c205c-c227-43dd-a308-1a46b4f733df.vep.vcf.gz,...,9af292e502a6419905c3f25763e36b9c,,live,,TCGA-24-1564-01A-01W-0551-08_TCGA-24-1564-10A-...,annotated_somatic_mutation,2017-06-18T02:03:33.889192-05:00,TCGA,TCGA-OV,[d77ef9cf-f8e6-4ee9-8d4f-1106885f6b06]
6,controlled,[phs000178],2016-06-03T17:05:20.942354-05:00,Simple Nucleotide Variation,VCF,Raw Simple Somatic Mutation,,WXS,9c26fefe-1876-41d5-ae27-a665ed72643f,9c26fefe-1876-41d5-ae27-a665ed72643f.vcf.gz,...,fb3358bdea79e85074b9ab99845a790a,,live,,TCGA-OR-A5KU-01A-11D-A29I-10_TCGA-OR-A5KU-10A-...,simple_somatic_mutation,2017-03-04T16:45:40.925270-06:00,TCGA,TCGA-ACC,[424a497a-48b9-4507-b234-c4fd08c8acad]
7,open,[open],2016-05-03T01:59:05.690402-05:00,Copy Number Variation,TXT,Masked Copy Number Segment,,Genotyping Array,007eb487-5845-43ba-8fa1-2161ff76462b,MACON_p_TCGASNP_218_AML_PP_N_GenomeWideSNP_6_D...,...,95ead0f165325594229ee9facb72aa72,Affymetrix SNP 6.0,live,,49286f9e-4f03-4a25-af52-afdb4b571cee_nocnv,copy_number_segment,2017-03-04T16:40:26.360088-06:00,TCGA,TCGA-SARC,[e786de34-4c21-460f-89ab-008de4347049]
8,controlled,[phs000218],2017-06-17T17:50:33.451211-05:00,Simple Nucleotide Variation,VCF,Annotated Somatic Mutation,,WXS,2c9c09c4-4849-4333-b1ad-53699fde6072,2c9c09c4-4849-4333-b1ad-53699fde6072.vep.vcf.gz,...,5c289ee2e699d1b856c31735e1134a43,,live,,TARGET-30-PARMLF-01A-01D_TARGET-30-PARMLF-10A-...,annotated_somatic_mutation,2017-12-11T08:47:45.433110-06:00,TARGET,TARGET-NBL,[86d76ae8-07ff-5f48-af7a-f32f64f2b173]
9,open,[open],2016-10-27T21:58:12.297090-05:00,DNA Methylation,TXT,Methylation Beta Value,,Methylation Array,23fa7b4b-9d68-429b-aece-658b11124bb3,jhu-usc.edu_OV.HumanMethylation27.1.lvl-3.TCGA...,...,9163285d8eadc921d7244f29faca50da,Illumina Human Methylation 27,live,,cde73b7c-0a50-4444-bb33-11e3debd3f79-beta-value,methylation_beta_value,2017-03-24T18:43:16.886826-05:00,TCGA,TCGA-OV,[6746533a-8d0b-4ebc-87ec-49c8738121a8]


In [25]:
# Run a full garbage-collection pass; the cell output is gc.collect()'s return value.
gc.collect()

3730664