In [1]:
import os
import requests
import json
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sn
import pickle
import gc
from tqdm import tqdm, tqdm_notebook, tnrange, trange

In [2]:
# Register tqdm's pandas integration so that pandas objects gain the
# `progress_apply` method used further down in this notebook.
# NOTE(review): `pandas()` is invoked on a freshly created `tqdm_notebook()` instance,
# which presumably also leaves an empty progress bar in this cell's output — confirm
# against the installed tqdm version before changing the call form.
tqdm_notebook().pandas()
# Alternative call form, left disabled by the original author:
# tqdm.pandas(tqdm_notebook)




In [3]:
import urlpath
def generate_endpoints():
    """Build a name -> URL mapping for the GDC API at https://api.gdc.cancer.gov.

    For each of 'files', 'cases', 'projects' and 'annotations' the mapping holds
    two entries: ``<name>`` (the resource URL) and ``<name>_mapping`` (the
    resource URL extended with ``_mapping``). Three further entries, 'data',
    'legacy_data' and 'manifest', point at ``data``, ``legacy/data`` and
    ``manifest`` under the same host.

    Returns:
        dict: keys are the endpoint names described above; values are whatever
        ``urlpath.URL`` produces when joined with ``/``.
    """
    base = urlpath.URL("https://api.gdc.cancer.gov")
    result = {}

    # Searchable resources: the resource itself plus its `_mapping` sub-path.
    for resource in ('files', 'cases', 'projects', 'annotations'):
        resource_url = base / resource
        result[resource] = resource_url
        result["%s_mapping" % resource] = resource_url / "_mapping"

    # Remaining endpoints have no `_mapping` companion.
    for key, relative_path in (
        ('data', "data"),
        ('legacy_data', "legacy/data"),
        ('manifest', "manifest"),
    ):
        result[key] = base / relative_path

    return result
# Build the endpoint lookup once at notebook level and display it for inspection.
endpoints = generate_endpoints()
endpoints

{'annotations': 'https://api.gdc.cancer.gov/annotations/',
 'annotations_mapping': 'https://api.gdc.cancer.gov/annotations/_mapping/',
 'cases': 'https://api.gdc.cancer.gov/cases/',
 'cases_mapping': 'https://api.gdc.cancer.gov/cases/_mapping/',
 'files': 'https://api.gdc.cancer.gov/files/',
 'files_mapping': 'https://api.gdc.cancer.gov/files/_mapping/',
 'projects': 'https://api.gdc.cancer.gov/projects/',
 'projects_mapping': 'https://api.gdc.cancer.gov/projects/_mapping/'}

In [15]:
# Load each raw table pickled under ../raw_tables into `tables`, keyed by table name.
# NOTE(security): pickle.load can execute arbitrary code while unpickling; only run
# this against .P files from a trusted source.
tables = dict()
table_names = ["projects", "cases", "files"]
# Iterate over the names directly; tqdm_notebook still shows one tick per table.
for table_name in tqdm_notebook(table_names):
    with open("../raw_tables/%s.P" % table_name, 'rb') as handle:
        tables[table_name] = pickle.load(handle)




In [17]:
# Force a full garbage-collection pass; the value shown below the cell is
# gc.collect()'s return value.
gc.collect()

7

In [16]:
# Remove project columns that are not kept in the cleaned table.
# The result is assigned back rather than mutated in place, and errors='ignore'
# makes the cell safe to re-run: the in-place version raised KeyError on a second
# execution because the columns were already gone.
tables['projects'] = tables['projects'].drop(
    columns=['id', 'dbgap_accession_number', 'released', 'state'],
    errors='ignore',
)

In [69]:
# Rename the projects table's `name` column to `project_name`.
tables['projects'].rename(columns={'name': 'project_name'}, inplace=True)

In [70]:
# Display the projects table to inspect its current columns and rows.
tables['projects']

Unnamed: 0,disease_type,project_name,primary_site,project_id,program_name
0,[Rectum Adenocarcinoma],Rectum Adenocarcinoma,[Colorectal],TCGA-READ,TCGA
1,[Thyroid Carcinoma],Thyroid Carcinoma,[Thyroid],TCGA-THCA,TCGA
2,[Clear Cell Sarcoma of the Kidney],Clear Cell Sarcoma of the Kidney,[Kidney],TARGET-CCSK,TARGET
3,[Mesothelioma],Mesothelioma,[Pleura],TCGA-MESO,TCGA
4,[Sarcoma],Sarcoma,[Soft Tissue],TCGA-SARC,TCGA
5,[Acute Myeloid Leukemia],Acute Myeloid Leukemia,[Blood],TARGET-AML,TARGET
6,[Brain Lower Grade Glioma],Brain Lower Grade Glioma,[Brain],TCGA-LGG,TCGA
7,[Neuroblastoma],Neuroblastoma,[Nervous System],TARGET-NBL,TARGET
8,[Adrenocortical Carcinoma],Adrenocortical Carcinoma,[Adrenal Gland],TCGA-ACC,TCGA
9,[Ovarian Serous Cystadenocarcinoma],Ovarian Serous Cystadenocarcinoma,[Ovary],TCGA-OV,TCGA


In [24]:
# Remove case columns that are not kept in the cleaned table.
# Assigning the result back (instead of inplace=True) together with
# errors='ignore' keeps the cell re-runnable; the in-place version raised
# KeyError once the columns had already been dropped.
tables['cases'] = tables['cases'].drop(
    columns=['created_datetime', 'id', 'updated_datetime'],
    errors='ignore',
)

In [56]:
# Count the sample ids per case; a `sample_ids` value that is not a list counts as 0.
# Uses `progress_apply`, so a progress bar is shown while the column is computed.
tables['cases']['n_samples'] = tables['cases']['sample_ids'].progress_apply(
    lambda ids: 0 if type(ids) is not list else len(ids)
)

In [57]:
# Display the cases table, including the newly added `n_samples` column.
tables['cases']

Unnamed: 0,aliquot_ids,analyte_ids,case_id,disease_type,portion_ids,primary_site,sample_ids,slide_ids,state,submitter_aliquot_ids,submitter_analyte_ids,submitter_id,submitter_portion_ids,submitter_sample_ids,submitter_slide_ids,program_name,project_id,n_samples
0,"[148aece6-97ce-58e4-91d2-9410e5b147a9, 3e17d84...",,c71c69ba-35ea-5024-9118-900f04bb958a,Neuroblastoma,,Nervous System,"[b5f665fe-e556-5146-9fb2-41de0eb3f2d0, 4acb947...",,live,"[TARGET-30-PAUDFR-01A-01D, TARGET-30-PAUDFR-10...",,TARGET-30-PAUDFR,,"[TARGET-30-PAUDFR-01A, TARGET-30-PAUDFR-10A]",,TARGET,TARGET-NBL,2
1,"[21d8a70b-021a-45b7-968e-f31a152a030e, a50e60e...","[72efc95c-bfbd-42a5-9129-1bb7e8db5db6, ec96b00...",dce71741-ccbe-40b7-a0b8-2048d07187a4,Ovarian Serous Cystadenocarcinoma,"[a6d9de12-eaac-4b14-bf81-d00f175c170f, bfaa683...",Ovary,"[af849153-dc08-472a-ab16-26c25fe1751f, 3d0f248...","[a28f0475-589d-4831-9186-6b955153f5bb, 91c79a1...",live,"[TCGA-25-2409-10A-01W-0801-09, TCGA-25-2409-01...","[TCGA-25-2409-01A-01R, TCGA-25-2409-10A-01D, T...",TCGA-25-2409,"[TCGA-25-2409-10A-01, TCGA-25-2409-01A-01, TCG...","[TCGA-25-2409-10A, TCGA-25-2409-01A]","[TCGA-25-2409-01A-01-BS1, TCGA-25-2409-01A-01-...",TCGA,TCGA-OV,2
2,"[e1cf0ad1-a200-4c10-83fd-415eea479532, 633da6d...","[7547a920-71c7-42f3-b369-3dc59d61069f, d43df96...",e8f56d0f-eee4-4def-a43a-dec91f4382a1,Thyroid Carcinoma,"[ded0161b-e75f-4a48-90b4-4d521661e123, 5ac1005...",Thyroid,"[e13c3f11-913d-4371-9a7d-5e4df15fb42a, e84f9fa...",[5c562496-ceb3-445f-89d8-607560279cc6],live,"[TCGA-EM-A3AQ-01A-11W-A21F-08, TCGA-EM-A3AQ-10...","[TCGA-EM-A3AQ-01A-11W, TCGA-EM-A3AQ-01A-11R, T...",TCGA-EM-A3AQ,"[TCGA-EM-A3AQ-10A-01, TCGA-EM-A3AQ-01A-11]","[TCGA-EM-A3AQ-10A, TCGA-EM-A3AQ-01A]",[TCGA-EM-A3AQ-01A-01-TS1],TCGA,TCGA-THCA,2
3,"[f785c907-8fff-4c84-8cf0-07ae5254cf34, 2c5b659...",,657566a4-d809-53af-9acd-d44fa6171b48,Neuroblastoma,,Nervous System,"[b661f1db-b250-5ee9-997c-28796fee12a0, 36e4ad0...",,live,"[TARGET-30-PALETP-01A-01W, TARGET-30-PALETP-01...",,TARGET-30-PALETP,,"[TARGET-30-PALETP-10A, TARGET-30-PALETP-01A]",,TARGET,TARGET-NBL,2
4,[f357af2b-4c7b-5250-8e9a-f63ad8d0ccb5],,7ce2465b-5974-56f3-96b9-14057c306177,Acute Myeloid Leukemia,,Blood,"[169a096b-b224-55b1-a0fc-74a905fc1b06, ed1261b...",,live,[TARGET-20-PADZKD-09A-02R],,TARGET-20-PADZKD,,"[TARGET-20-PADZKD-09A, TARGET-20-PADZKD-14A]",,TARGET,TARGET-AML,2
5,"[4092f0cb-b02b-4506-9b7f-610b1df3c8c8, a8c8d50...","[d8d04fc8-42e6-4291-8a52-a9d87c8c4682, bfaf86e...",b9e2f4d1-866c-4c62-a9c8-8de7eb364a6e,Brain Lower Grade Glioma,"[8fda93c9-9c0c-4fb4-926e-5c1b7c62979c, be83f0f...",Brain,"[96edec23-5d46-44c0-9975-625bd578598f, 3b97787...","[b792888a-361b-4a81-a7ba-f57f29f7b504, 5eb94ed...",live,"[TCGA-HT-7469-01A-11D-2253-08, TCGA-HT-7469-01...","[TCGA-HT-7469-10B-01W, TCGA-HT-7469-01A-11R, T...",TCGA-HT-7469,"[TCGA-HT-7469-10B-01, TCGA-HT-7469-01A-11, TCG...","[TCGA-HT-7469-01A, TCGA-HT-7469-10B]","[TCGA-HT-7469-01A-01-BS1, TCGA-HT-7469-01A-01-...",TCGA,TCGA-LGG,2
6,"[377047a2-2510-4c52-b73d-6766bcf14a18, 95bcced...","[e5119ee8-fd7d-4b68-baaa-824c0c3f00b1, 409cb5f...",6201394f-b70e-4f14-9519-ff9d5f06eb09,Sarcoma,"[47a065db-24c6-40ce-995a-b4e3ae48d096, c326ac6...",Soft Tissue,"[acea4025-f740-4a60-ad51-eaf61964256b, ccf3805...",[9f54b5db-8e94-4534-b0f3-4de132fe97f8],live,"[TCGA-DX-A3U6-10A-01D-A29M-01, TCGA-DX-A3U6-01...","[TCGA-DX-A3U6-10A-01W, TCGA-DX-A3U6-01A-11R, T...",TCGA-DX-A3U6,"[TCGA-DX-A3U6-10A-01, TCGA-DX-A3U6-01A-21-A455...","[TCGA-DX-A3U6-01A, TCGA-DX-A3U6-10A]",[TCGA-DX-A3U6-01A-01-TS1],TCGA,TCGA-SARC,2
7,"[5ec237ad-6ae4-4947-91ac-9f3f81df1981, 3ac962c...","[94406f32-af4a-494d-a796-f023a56172a4, 313b788...",c0c3caab-9277-4a31-a96c-c607e38d5ccc,Ovarian Serous Cystadenocarcinoma,"[e51d32a0-c31c-408c-b920-5d93914fafca, 81c4529...",Ovary,"[424254f5-1bdf-43ed-b867-33a73a3c919f, 0b9e3de...","[e8ba0c6f-94e8-4ebb-86e4-882753640bbb, 453252e...",live,"[TCGA-23-1114-01B-01R-0564-01, TCGA-23-1114-01...","[TCGA-23-1114-01B-01T, TCGA-23-1114-01B-01W, T...",TCGA-23-1114,"[TCGA-23-1114-01B-01, TCGA-23-1114-10A-01]","[TCGA-23-1114-10A, TCGA-23-1114-01B]","[TCGA-23-1114-01B-01-BS1, TCGA-23-1114-01B-01-...",TCGA,TCGA-OV,2
8,"[90e96e37-0f23-40b7-8277-527e8d8f0b76, ddcd65b...","[da16bd5f-2db4-45c5-a693-d7ec7723669f, 943b1c4...",0c52a842-c220-486e-88de-8a4fec5d6a58,Brain Lower Grade Glioma,"[bd6b5df0-11f3-48c1-bc7e-f5a2e3234988, 50041e6...",Brain,"[906f5742-9542-486c-93c3-fc4412764dd8, 73a3916...",[fe4b10ba-0756-40ad-8334-0730d45bdb12],live,"[TCGA-DB-A75O-01A-11R-A32L-13, TCGA-DB-A75O-10...","[TCGA-DB-A75O-10A-01D, TCGA-DB-A75O-01A-11R, T...",TCGA-DB-A75O,"[TCGA-DB-A75O-01A-21-A44D-20, TCGA-DB-A75O-10A...","[TCGA-DB-A75O-10A, TCGA-DB-A75O-01A]",[TCGA-DB-A75O-01A-01-TSA],TCGA,TCGA-LGG,2
9,"[0f0c88fd-30ec-49e1-bc2f-ce4f6c04c32b, 5305e38...","[3d611634-b074-4cb5-acea-b017a828b798, d0cd51a...",c5e23947-253b-40d2-916a-277b2a580be2,Sarcoma,"[72db45a2-8e83-423d-b46c-2b22c3fb3db9, 2b165e2...",Soft Tissue,"[1fe30a2f-22a6-4f88-9a3a-7a11359cc151, 91f3ef0...",[90c35fae-996a-4f5e-9f45-5d79dfec9b59],live,"[TCGA-IE-A4EI-01A-11D-A24M-01, TCGA-IE-A4EI-01...","[TCGA-IE-A4EI-01A-11W, TCGA-IE-A4EI-01A-11D, T...",TCGA-IE-A4EI,"[TCGA-IE-A4EI-10A-01, TCGA-IE-A4EI-01A-21-A455...","[TCGA-IE-A4EI-01A, TCGA-IE-A4EI-10A]",[TCGA-IE-A4EI-01A-01-TS1],TCGA,TCGA-SARC,2


In [58]:
# Split the cases into those with at least one sample and those with none.
# Rows are selected with a boolean mask so every column keeps its dtype. The
# previous where(...).dropna(how='all') round-trip first blanked non-matching rows
# to NaN, which upcasts integer columns such as `n_samples` to float.
# .copy() makes each subset an independent frame rather than a slice of `cases`.
tables['cases_with_samples'] = tables['cases'].loc[tables['cases']['n_samples'] > 0].copy()
tables['cases_without_samples'] = tables['cases'].loc[tables['cases']['n_samples'] == 0].copy()

In [60]:
# (rows, columns) of the subset of cases whose `n_samples` is 0.
tables['cases_without_samples'].shape

(34, 18)

In [62]:
# (rows, columns) of the subset of cases whose `n_samples` is greater than 0.
tables['cases_with_samples'].shape

(32521, 18)

In [32]:
# Remove file columns that are not kept in the cleaned table.
# Assigning the result back (instead of inplace=True) together with
# errors='ignore' keeps the cell re-runnable; the in-place version raised
# KeyError once the columns had already been dropped.
tables['files'] = tables['files'].drop(
    columns=['acl', 'id', 'created_datetime', 'updated_datetime'],
    errors='ignore',
)

In [52]:
# Number of case ids attached to each file: the length of its `case_id` value.
# Uses `progress_apply`, so a progress bar is shown while the column is computed.
tables['files']['n_cases'] = tables['files']['case_id'].progress_apply(len)




In [53]:
# Display the files table, including the newly added `n_cases` column.
tables['files']

Unnamed: 0,access,data_category,data_format,data_type,error_type,experimental_strategy,file_id,file_name,file_size,file_state,md5sum,platform,state,state_comment,submitter_id,type,program_name,project_id,case_id,n_cases
0,controlled,Raw Sequencing Data,BAM,Aligned Reads,,RNA-Seq,40af20ee-2aa6-4c53-beac-3e166f2dfbe4,1f7d9145-c0d8-4a37-b8f1-81ddc3a0a3f7_gdc_realn...,10316127217,submitted,e7df198c1dbee4b6ab7d0980d65ae62e,Illumina,live,,1f7d9145-c0d8-4a37-b8f1-81ddc3a0a3f7,aligned_reads,TCGA,TCGA-THCA,[6b529fb3-0b1b-4d3b-9309-bfcb063dbb71],1
1,open,Transcriptome Profiling,TXT,Gene Expression Quantification,,RNA-Seq,0934a0b3-858d-430c-a5c9-ea459104a392,81cab804-bfb1-44f0-ac2d-923e0766ac61.htseq.cou...,249298,submitted,5cbfed753ee5073d3a8fcd68818ed0a1,,live,,81cab804-bfb1-44f0-ac2d-923e0766ac61_count,gene_expression,TCGA,TCGA-SARC,[8f3f6f7c-4b50-467e-a6e8-d836735913f6],1
2,open,Transcriptome Profiling,TXT,Gene Expression Quantification,,RNA-Seq,23f30088-b9d0-4657-b7d7-188e85230687,fc3748e5-79b4-4e5f-b0db-1d9cf4ed3cca.FPKM.txt.gz,505610,submitted,4a0b260c9dd79a0eb4fe6199d7c1f010,,live,,fc3748e5-79b4-4e5f-b0db-1d9cf4ed3cca_fpkm,gene_expression,TCGA,TCGA-SARC,[1cbcfbb0-fcad-4ccd-8e83-120faa3e0df3],1
3,controlled,Simple Nucleotide Variation,VCF,Annotated Somatic Mutation,,WXS,88be35fc-3a1d-47ad-93b4-90d786341be7,88be35fc-3a1d-47ad-93b4-90d786341be7.vep.vcf.gz,165208,submitted,d07cf83738686ae3053fe72e95c409e1,,live,,TCGA-VM-A8CH-01A-12D-A36O-08_TCGA-VM-A8CH-10A-...,annotated_somatic_mutation,TCGA,TCGA-LGG,[484dd45b-a16c-4650-bac9-832fbe4d96eb],1
4,controlled,Simple Nucleotide Variation,VCF,Annotated Somatic Mutation,,WXS,ad3dd7e9-808a-4e4b-8d43-dc3de68c8c5c,ad3dd7e9-808a-4e4b-8d43-dc3de68c8c5c.vep.vcf.gz,1069229,submitted,35f2de2ef0304fe3ee8bb1229eb4c3b8,,live,,TCGA-13-0886-01A-01W-0420-08_TCGA-13-0886-10A-...,annotated_somatic_mutation,TCGA,TCGA-OV,[91de8a74-a1e6-46b6-a06e-70aedf2c3eaf],1
5,controlled,Simple Nucleotide Variation,VCF,Annotated Somatic Mutation,,WXS,0b1c205c-c227-43dd-a308-1a46b4f733df,0b1c205c-c227-43dd-a308-1a46b4f733df.vep.vcf.gz,110968,submitted,9af292e502a6419905c3f25763e36b9c,,live,,TCGA-24-1564-01A-01W-0551-08_TCGA-24-1564-10A-...,annotated_somatic_mutation,TCGA,TCGA-OV,[d77ef9cf-f8e6-4ee9-8d4f-1106885f6b06],1
6,controlled,Simple Nucleotide Variation,VCF,Raw Simple Somatic Mutation,,WXS,9c26fefe-1876-41d5-ae27-a665ed72643f,9c26fefe-1876-41d5-ae27-a665ed72643f.vcf.gz,198682,submitted,fb3358bdea79e85074b9ab99845a790a,,live,,TCGA-OR-A5KU-01A-11D-A29I-10_TCGA-OR-A5KU-10A-...,simple_somatic_mutation,TCGA,TCGA-ACC,[424a497a-48b9-4507-b234-c4fd08c8acad],1
7,open,Copy Number Variation,TXT,Masked Copy Number Segment,,Genotyping Array,007eb487-5845-43ba-8fa1-2161ff76462b,MACON_p_TCGASNP_218_AML_PP_N_GenomeWideSNP_6_D...,35995,submitted,95ead0f165325594229ee9facb72aa72,Affymetrix SNP 6.0,live,,49286f9e-4f03-4a25-af52-afdb4b571cee_nocnv,copy_number_segment,TCGA,TCGA-SARC,[e786de34-4c21-460f-89ab-008de4347049],1
8,controlled,Simple Nucleotide Variation,VCF,Annotated Somatic Mutation,,WXS,2c9c09c4-4849-4333-b1ad-53699fde6072,2c9c09c4-4849-4333-b1ad-53699fde6072.vep.vcf.gz,181009,submitted,5c289ee2e699d1b856c31735e1134a43,,live,,TARGET-30-PARMLF-01A-01D_TARGET-30-PARMLF-10A-...,annotated_somatic_mutation,TARGET,TARGET-NBL,[86d76ae8-07ff-5f48-af7a-f32f64f2b173],1
9,open,DNA Methylation,TXT,Methylation Beta Value,,Methylation Array,23fa7b4b-9d68-429b-aece-658b11124bb3,jhu-usc.edu_OV.HumanMethylation27.1.lvl-3.TCGA...,9951504,submitted,9163285d8eadc921d7244f29faca50da,Illumina Human Methylation 27,live,,cde73b7c-0a50-4444-bb33-11e3debd3f79-beta-value,methylation_beta_value,TCGA,TCGA-OV,[6746533a-8d0b-4ebc-87ec-49c8738121a8],1


In [82]:
# Split the files by their `access` value ('controlled' vs 'open').
# Rows are selected with a boolean mask so every column keeps its dtype. The
# previous where(...).dropna(how='all') round-trip first blanked non-matching rows
# to NaN, which upcasts integer columns such as `file_size` and `n_cases` to float.
# .copy() makes each subset an independent frame rather than a slice of `files`.
tables['controlled_files'] = tables['files'].loc[tables['files']['access'] == 'controlled'].copy()
tables['open_files'] = tables['files'].loc[tables['files']['access'] == 'open'].copy()

In [45]:
# Distinct program names that appear among the controlled-access files.
tables['controlled_files']['program_name'].unique()

array(['TCGA', 'TARGET', 'FM'], dtype=object)

In [46]:
# Distinct program names that appear among the open-access files.
tables['open_files']['program_name'].unique()

array(['TCGA', 'TARGET', 'FM'], dtype=object)

In [51]:
# Make sure ../cleaned_tables exists as a directory before anything is written to it.
# A non-directory entry occupying that path is removed first.
if os.path.exists('../cleaned_tables') and not os.path.isdir('../cleaned_tables'):
    os.remove('../cleaned_tables')
if not os.path.isdir('../cleaned_tables'):
    os.makedirs('../cleaned_tables', exist_ok=True)

In [89]:
# Pickle every entry of `tables` to ../cleaned_tables/<name>.P, with a progress bar.
# Iterates over the (name, frame) pairs directly instead of indexing a key list;
# list(...) snapshots the items so the progress bar has a known total.
for table_name, table_frame in tqdm_notebook(list(tables.items())):
    with open('../cleaned_tables/%s.P' % table_name, 'wb') as handle:
        pickle.dump(table_frame, handle)