In [1]:
import os
import requests
import json
import pickle
import gc
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sn
import hashlib
import io
import pathlib
from tqdm import tqdm, tqdm_notebook, tnrange, trange

In [2]:
tqdm_notebook().pandas()
# tqdm.pandas(tqdm_notebook)




In [3]:
# `data_type` values that mark a file as worth keeping; should_keep() tests
# each row's data_type for membership in this list.
desired_categories = [
    'Masked Copy Number Segment',
    'Copy Number Segment',
    'Methylation Beta Value',
    'Clinical Supplement',
    'Biospecimen Supplement',
    'Masked Somatic Mutation',
]

In [4]:
def should_keep(rowdata):
    """Tell whether a row's ``data_type`` is listed in ``desired_categories``.

    Parameters
    ----------
    rowdata : mapping
        Row-like object exposing a ``'data_type'`` key.

    Returns
    -------
    bool
        True when the row's data type is one of the desired categories.
    """
    row_type = rowdata['data_type']
    return row_type in desired_categories

In [5]:
import urlpath
def generate_endpoints():
    """Build the table of GDC API endpoint URLs used by this notebook.

    Returns
    -------
    dict
        Maps each of ``files``, ``cases``, ``projects`` and ``annotations``
        to its URL, ``<name>_mapping`` to that URL's ``_mapping`` child, and
        ``data``, ``legacy_data`` and ``manifest`` to their own URLs.
    """
    base = urlpath.URL("https://api.gdc.cancer.gov")
    table = {}
    for resource in ('files', 'cases', 'projects', 'annotations'):
        resource_url = base / resource
        table[resource] = resource_url
        table["%s_mapping" % resource] = resource_url / "_mapping"
    table.update({
        'data': base / "data",
        'legacy_data': base / "legacy/data",
        'manifest': base / "manifest",
    })
    return table
endpoints = generate_endpoints()
endpoints

{'annotations': URL('https://api.gdc.cancer.gov/annotations'),
 'annotations_mapping': URL('https://api.gdc.cancer.gov/annotations/_mapping'),
 'cases': URL('https://api.gdc.cancer.gov/cases'),
 'cases_mapping': URL('https://api.gdc.cancer.gov/cases/_mapping'),
 'data': URL('https://api.gdc.cancer.gov/data'),
 'files': URL('https://api.gdc.cancer.gov/files'),
 'files_mapping': URL('https://api.gdc.cancer.gov/files/_mapping'),
 'legacy_data': URL('https://api.gdc.cancer.gov/legacy/data'),
 'manifest': URL('https://api.gdc.cancer.gov/manifest'),
 'projects': URL('https://api.gdc.cancer.gov/projects'),
 'projects_mapping': URL('https://api.gdc.cancer.gov/projects/_mapping')}

In [6]:
rootpath = pathlib.Path("..")
cleanpath = rootpath / "cleaned_tables"

In [7]:
# Load the pickled `open_files` table from the cleaned-tables directory.
# NOTE(review): pickle.load can execute arbitrary code stored in the file, so
# only load pickles written by this project — confirm where open_files.P
# comes from before running this on a copy obtained elsewhere.
with open(cleanpath / 'open_files.P', 'rb') as file:
    open_files = pickle.load(file)

In [8]:
open_files

Unnamed: 0,access,data_category,data_format,data_type,error_type,experimental_strategy,file_id,file_name,file_size,file_state,md5sum,platform,state,state_comment,submitter_id,type,program_name,project_id,case_id,n_cases
1,open,Transcriptome Profiling,TXT,Gene Expression Quantification,,RNA-Seq,0934a0b3-858d-430c-a5c9-ea459104a392,81cab804-bfb1-44f0-ac2d-923e0766ac61.htseq.cou...,249298.0,submitted,5cbfed753ee5073d3a8fcd68818ed0a1,,live,,81cab804-bfb1-44f0-ac2d-923e0766ac61_count,gene_expression,TCGA,TCGA-SARC,[8f3f6f7c-4b50-467e-a6e8-d836735913f6],1.0
2,open,Transcriptome Profiling,TXT,Gene Expression Quantification,,RNA-Seq,23f30088-b9d0-4657-b7d7-188e85230687,fc3748e5-79b4-4e5f-b0db-1d9cf4ed3cca.FPKM.txt.gz,505610.0,submitted,4a0b260c9dd79a0eb4fe6199d7c1f010,,live,,fc3748e5-79b4-4e5f-b0db-1d9cf4ed3cca_fpkm,gene_expression,TCGA,TCGA-SARC,[1cbcfbb0-fcad-4ccd-8e83-120faa3e0df3],1.0
7,open,Copy Number Variation,TXT,Masked Copy Number Segment,,Genotyping Array,007eb487-5845-43ba-8fa1-2161ff76462b,MACON_p_TCGASNP_218_AML_PP_N_GenomeWideSNP_6_D...,35995.0,submitted,95ead0f165325594229ee9facb72aa72,Affymetrix SNP 6.0,live,,49286f9e-4f03-4a25-af52-afdb4b571cee_nocnv,copy_number_segment,TCGA,TCGA-SARC,[e786de34-4c21-460f-89ab-008de4347049],1.0
9,open,DNA Methylation,TXT,Methylation Beta Value,,Methylation Array,23fa7b4b-9d68-429b-aece-658b11124bb3,jhu-usc.edu_OV.HumanMethylation27.1.lvl-3.TCGA...,9951504.0,submitted,9163285d8eadc921d7244f29faca50da,Illumina Human Methylation 27,live,,cde73b7c-0a50-4444-bb33-11e3debd3f79-beta-value,methylation_beta_value,TCGA,TCGA-OV,[6746533a-8d0b-4ebc-87ec-49c8738121a8],1.0
11,open,Clinical,BCR XML,Clinical Supplement,,,edab6899-1b4a-457a-9bfe-046ed96abe2e,nationwidechildrens.org_clinical.TCGA-FG-A6IZ.xml,35886.0,submitted,a62b62b4c2e60846f6aed07dc33b34e1,,live,,,clinical_supplement,TCGA,TCGA-LGG,[da13446a-c4dc-4d02-80c8-410273925ac4],1.0
14,open,Transcriptome Profiling,TXT,Gene Expression Quantification,,RNA-Seq,7854b3e7-42c5-4252-98b7-babe0f3e34ae,17b38d44-cf29-495e-b145-68959926b3d9.FPKM.txt.gz,502934.0,submitted,48080cbe24fada50d7f80a5c10fdb381,,live,,17b38d44-cf29-495e-b145-68959926b3d9_fpkm,gene_expression,TCGA,TCGA-MESO,[c5d4d4b4-2c9e-4865-8860-68327356d461],1.0
15,open,DNA Methylation,TXT,Methylation Beta Value,,Methylation Array,2a6eec6e-cce4-418c-90c8-bb60ec5a33cc,jhu-usc.edu_THCA.HumanMethylation450.13.lvl-3....,141290301.0,submitted,0830796e3f0c6a7de862086c56d6c4b0,Illumina Human Methylation 450,live,,4a28321b-f56a-4fa2-8a88-26cbbcafbf85-beta-value,methylation_beta_value,TCGA,TCGA-THCA,[861f94ce-45ff-4588-addb-e1ea48878c2e],1.0
16,open,Transcriptome Profiling,TXT,Gene Expression Quantification,,RNA-Seq,137249ca-e2fb-458e-9255-cf4018f38c48,50bbf24f-b914-4e53-98ee-0cf22b2d9f01.FPKM.txt.gz,506375.0,submitted,310c3bb766a3d4171c63574a918a8597,,live,,50bbf24f-b914-4e53-98ee-0cf22b2d9f01_fpkm,gene_expression,TCGA,TCGA-SARC,[2905cbd1-719b-46d9-b8af-8fe4927bc473],1.0
21,open,DNA Methylation,TXT,Methylation Beta Value,,Methylation Array,a84d3dad-140e-4c9b-9e97-2f0ab98a0f97,jhu-usc.edu_OV.HumanMethylation27.13.lvl-3.TCG...,9953404.0,submitted,c6b4a28bc65ca942c9c8002a262c094c,Illumina Human Methylation 27,live,,b8f77354-a4b5-49fa-a855-fa0e9c15657b-beta-value,methylation_beta_value,TCGA,TCGA-OV,[3e8a51bf-7e1f-4eab-af83-3c60d04db1bf],1.0
22,open,Clinical,BCR XML,Clinical Supplement,,,756dd823-e5e5-4788-8892-6650059a918c,nationwidechildrens.org_clinical.TCGA-H2-A422.xml,64303.0,submitted,39005d40d94acbb40788a5957dba9840,,live,,,clinical_supplement,TCGA,TCGA-THCA,[8ed1b900-a8a8-4164-b96b-f060f15d7075],1.0


In [9]:
# Keep only the rows whose data_type is listed in desired_categories.
# A vectorized isin() mask replaces the row-wise progress_apply followed by
# where()/dropna(how='all'): it avoids one Python call per row and does not
# NaN-fill the rejected rows first (which upcasts integer columns to float).
# .copy() makes the result an independent frame, as the original expression did.
wanted_files = open_files[open_files['data_type'].isin(desired_categories)].copy()




In [10]:
wanted_files.shape

(79825, 20)

In [11]:
params = dict()
# params['pretty'] = 'True'


In [12]:
def raise_oserror(oserror):
    """Raise the exception object it is given.

    Passed as the ``onerror`` callback of ``os.walk`` so that a directory
    listing failure is raised instead of being skipped.

    Parameters
    ----------
    oserror : OSError
        The error instance reported by the directory walk.
    """
    raise oserror

# Only the first step of the walk is needed: it yields the download
# directory itself together with its immediate sub-directories and files.
root, dirs, files = next(
    os.walk("../GDC_Download", followlinks=True, onerror=raise_oserror)
)
dirlist = pd.Series(dirs)


In [13]:
files

[]

In [14]:
# Exclude the "GDC_Initial_DL" entry; every other directory name is kept.
dirlist = dirlist[dirlist != "GDC_Initial_DL"]

In [15]:
dirlist

0        00005051-36c7-4850-9e2c-243be54077ea
1        00006c36-08ae-4f93-a580-1b798d153d7c
2        0000c40e-9d45-4446-9dd9-a4676224d0ce
3        00015d5e-3aad-4f2a-9136-e0a40eacd0b6
4        0004067c-f0fd-4010-95b0-0310c076acb0
5        0004c5df-ba54-4ce5-b76a-7209d4a10db0
6        00052bb8-ca7a-45cd-b625-8f4e2f906448
7        000726ad-c183-4cdb-aea9-8a6af84be527
8        000917ce-772d-4a57-8be3-57d73c18cf2a
9        000977fc-eb56-4201-b247-0514cb1efae9
10       000ad857-288a-481f-9954-10fe0b4324a9
11       000bf387-328d-4119-b92b-d1715106f0b4
12       000c68e6-478f-4754-b753-9270b7ad7c61
13       000d2816-b40c-4323-ae07-faf50b43fdfb
14       000da838-2c48-4854-bcae-534e1aac2193
15       000f00b5-573a-4520-b3de-ba81e9671a71
16       00135244-3e40-44d6-8cf5-bc8e4a8c4ae1
17       00135bd8-170a-4976-b387-0e2d477d97d2
18       0014ba68-0c36-4df0-ad38-b668780142c2
19       00161f54-39bc-44f7-8af4-7d0e7970bed6
20       00164357-a731-40dd-a717-6f468d0d3498
21       00166964-4137-4b93-b61d-f

In [16]:
wanted_files_uuid_set = set(wanted_files['file_id'].values)
download_uuid_set = set(dirlist.values)

In [17]:
keep_these = wanted_files_uuid_set & download_uuid_set
remove_these = download_uuid_set - wanted_files_uuid_set
download_these = wanted_files_uuid_set - download_uuid_set

In [18]:
def to_df(setobj, other_df):
    """Look up the rows of ``other_df`` whose ``file_id`` is in ``setobj``.

    Parameters
    ----------
    setobj : iterable
        Collection of file ids (a set in this notebook).
    other_df : pandas.DataFrame
        Table with a ``file_id`` column to join against.

    Returns
    -------
    pandas.DataFrame
        Default (inner) merge on ``file_id``: ``file_id`` first, followed by
        the remaining columns of ``other_df``.
    """
    id_frame = pd.DataFrame().assign(file_id=list(setobj))
    return pd.merge(id_frame, other_df, on='file_id')
keep_files_df = to_df(keep_these, open_files)
dl_files_df = to_df(download_these, open_files)

In [19]:
print("\n".join(dl_files_df.data_type.unique()))




In [20]:
print("\n".join(wanted_files.data_type.unique()))

Masked Copy Number Segment
Methylation Beta Value
Clinical Supplement
Copy Number Segment
Biospecimen Supplement
Masked Somatic Mutation


In [21]:
len(keep_these)

79825

In [22]:
len(remove_these)

0

In [23]:
len(download_these)

0

In [24]:
len(download_uuid_set)

79825

In [25]:
len(wanted_files_uuid_set)

79825

In [26]:
print("\n".join(open_files.data_type.unique()))

Gene Expression Quantification
Masked Copy Number Segment
Methylation Beta Value
Clinical Supplement
Copy Number Segment
miRNA Expression Quantification
Biospecimen Supplement
Isoform Expression Quantification
Masked Somatic Mutation


In [27]:
open_files.where(open_files.data_type == "Clinical Supplement").program_name.dropna().unique()

array(['TCGA', 'TARGET', 'FM'], dtype=object)

In [28]:
manifest = pd.read_csv("../manifest/gdc_manifest.2017-12-13T20_11_55.178417.txt", sep="\t")

In [29]:
manifest.rename(columns={'id':'file_id'}, inplace=True)

In [30]:
rm_files_df = to_df(remove_these, manifest)

In [31]:
keep_files_df

Unnamed: 0,file_id,access,data_category,data_format,data_type,error_type,experimental_strategy,file_name,file_size,file_state,md5sum,platform,state,state_comment,submitter_id,type,program_name,project_id,case_id,n_cases
0,12a87d54-7d1f-4e42-bf36-c1b3936f67bc,open,DNA Methylation,TXT,Methylation Beta Value,,Methylation Array,jhu-usc.edu_KICH.HumanMethylation450.1.lvl-3.T...,141283346.0,submitted,03aa7d67759854550b53065b3965b227,Illumina Human Methylation 450,live,,f56a75d1-72ab-44ed-a17d-8810ea12ee3c-beta-value,methylation_beta_value,TCGA,TCGA-KICH,[02ec7a3a-2812-4a65-93cd-07bf655cc91b],1.0
1,5ae85640-5f74-4cfe-9205-08b6c07e8f7a,open,Copy Number Variation,TXT,Masked Copy Number Segment,,Genotyping Array,RUNIC_p_2TCGA235_237_250_242_mN_GenomeWideSNP_...,3112.0,submitted,19d6b68175d10033d82995976bb97054,Affymetrix SNP 6.0,live,,859c035f-b6eb-450d-8fbc-bbb4775f930c_nocnv,copy_number_segment,TCGA,TCGA-THCA,[a09b20b5-c187-4632-b996-0d5314a9885c],1.0
2,abd4a6da-0fd8-4f45-9e99-5f9c901e4b37,open,Clinical,BCR XML,Clinical Supplement,,,nationwidechildrens.org_clinical.TCGA-KL-8344.xml,31021.0,submitted,ae38cead1fce18056e0463a77ec7ebbd,,live,,,clinical_supplement,TCGA,TCGA-KICH,[a5af1391-89d3-4f41-930e-7e8272afec98],1.0
3,5349b90d-5979-478b-b464-7bdded5cc4f6,open,Copy Number Variation,TXT,Masked Copy Number Segment,,Genotyping Array,BINGE_p_TCGAb_366_P01_NSP_GenomeWideSNP_6_B05_...,7411.0,submitted,c34e52579acc7d980700e80b4151bf8f,Affymetrix SNP 6.0,live,,720697c8-ee35-435e-936f-1e561e9fc932_nocnv,copy_number_segment,TCGA,TCGA-PCPG,[ee01b7b0-db62-48bf-804f-617735d234e1],1.0
4,87a67342-5b9c-4528-93cc-aca8565e47a4,open,Clinical,BCR XML,Clinical Supplement,,,nationwidechildrens.org_clinical.TCGA-DH-A7UT.xml,49661.0,submitted,878f7be4787178c3196fd032a1fe7c22,,live,,,clinical_supplement,TCGA,TCGA-LGG,[b2b57e93-2722-4546-9a0b-20155b86edef],1.0
5,978eff68-5139-4871-bf64-23dd38350c8a,open,Biospecimen,BCR XML,Biospecimen Supplement,,,nationwidechildrens.org_biospecimen.TCGA-HC-A4...,52303.0,submitted,d5bbc2d4337ea26e3a2420c667f7c71c,,live,,,biospecimen_supplement,TCGA,TCGA-PRAD,[bb8e6886-edc7-4127-a85b-c4f3769b6681],1.0
6,9f588f06-5fc7-4b0a-a927-9dfdd472b51b,open,Copy Number Variation,TXT,Masked Copy Number Segment,,Genotyping Array,PHYLE_p_TCGA_189_193_Rec_SNP_N_GenomeWideSNP_6...,14914.0,submitted,1b9435f4c8e571c868c8da592fa85d11,Affymetrix SNP 6.0,live,,bd500898-73d0-4960-a6db-55c339c28928_nocnv,copy_number_segment,TCGA,TCGA-LGG,[1f54d21e-752f-496d-8453-4b9b8738cbce],1.0
7,ea66c40d-3af4-413c-ba8a-2d117c7428f5,open,Copy Number Variation,TXT,Copy Number Segment,,Genotyping Array,SHUNS_p_TCGAb_388_389_NSP_GenomeWideSNP_6_A08_...,51382.0,submitted,da4929cba7edfa0480f8907312f7e852,Affymetrix SNP 6.0,live,,82017307-47fc-4421-866c-68d601d9890a_allcnv,copy_number_segment,TCGA,TCGA-SKCM,[1e3bc429-50eb-4087-b39a-6d92f03412f7],1.0
8,d27c111f-1b28-4c32-ad29-e0783388e4b9,open,Copy Number Variation,TXT,Copy Number Segment,,Genotyping Array,PEAKY_p_TCGA_b164_SNP_N_GenomeWideSNP_6_D09_86...,30484.0,submitted,ab6336ff8a0addd48cbb678fb29218b3,Affymetrix SNP 6.0,live,,44e8779d-1ae2-40d0-a9da-b86b32bb500e_allcnv,copy_number_segment,TCGA,TCGA-HNSC,[2c985b30-0f8f-4c8f-a924-cc6aad7ebf0d],1.0
9,d455aec1-86b5-4c18-ae99-95ad16a5352a,open,Copy Number Variation,TXT,Masked Copy Number Segment,,Genotyping Array,HERMS_p_TCGAb_391_NSP_GenomeWideSNP_6_D05_1473...,30318.0,submitted,34bbe777544edacbc33ee7b35af5c1a2,Affymetrix SNP 6.0,live,,646d81ef-c3eb-45f1-808d-838c2c567212_nocnv,copy_number_segment,TCGA,TCGA-BLCA,[bc6c516b-591e-4950-b6b0-decafa666f4e],1.0


In [32]:
def get_extension(text):
    """Return the lower-cased extension of a file name.

    When the last extension is ``.gz`` (any case), the extension before it is
    included as well, e.g. ``"x.FPKM.txt.gz"`` gives ``".txt.gz"``.

    Parameters
    ----------
    text : str
        File name or path.

    Returns
    -------
    str
        Lower-cased extension, or ``""`` when there is none.
    """
    stem, last_ext = os.path.splitext(text)
    if last_ext.lower() != '.gz':
        return last_ext.lower()
    # Compressed file: also report the extension hidden behind ".gz".
    inner_ext = os.path.splitext(stem)[1]
    return (inner_ext + last_ext).lower()

def is_TCGA(text):
    """Return True when ``text`` contains "TCGA", ignoring letter case."""
    return text.upper().find('TCGA') >= 0

In [34]:
rm_files_df['file_extension'] = rm_files_df['filename'].progress_apply(get_extension)




In [35]:
rm_files_df.file_extension.unique()

array([], dtype=object)

In [None]:
rm_files_df.where(rm_files_df['filename'].progress_apply(is_TCGA) != True).dropna(how='all')

In [36]:
import shutil
import pathlib

def remove_file(rowdata, download_root='../GDC_Download'):
    """Delete the download entry named after ``rowdata['file_id']``.

    The entry ``<download_root>/<file_id>`` is removed whether it is a plain
    file or a directory tree. Nothing is deleted when it does not exist.

    Parameters
    ----------
    rowdata : mapping
        Row-like object exposing a ``'file_id'`` key (for instance a row
        passed by ``DataFrame.apply(..., axis='columns')``).
    download_root : str or pathlib.Path, optional
        Directory holding one entry per file id. Defaults to the download
        directory used throughout this notebook.

    Returns
    -------
    None
    """
    fid = rowdata['file_id']
    path = pathlib.Path(download_root) / str(fid)
    print("removing %s" % path)
    # The previous code tested the bound method `path.exists`, which is always
    # truthy, instead of calling it.
    if not path.exists():
        return
    if path.is_file():
        os.remove(path)
    elif path.is_dir():
        shutil.rmtree(path)

In [None]:
rm_files_df.apply(remove_file, axis='columns')

In [37]:
def verify_download(content, md5sum):
    """Check downloaded bytes against an expected MD5 hex digest.

    Parameters
    ----------
    content : bytes
        Raw payload to hash.
    md5sum : str
        Expected hexadecimal MD5 digest.

    Returns
    -------
    bool
        True when the digest of ``content`` equals ``md5sum``.
    """
    actual_digest = hashlib.md5(content).hexdigest()
    return actual_digest == md5sum

def ensure_directory_exists(path):
    """Make sure ``path`` is a directory, creating it when necessary.

    An existing directory is left alone. An existing non-directory entry at
    ``path`` is deleted and replaced by a directory. Missing parent
    directories are created as well.

    Parameters
    ----------
    path : str or pathlib.Path
        Location that must end up being a directory.

    Returns
    -------
    None
    """
    target = pathlib.Path(path) if type(path) is str else path
    if target.exists():
        if target.is_dir():
            return
        # Something that is not a directory occupies the spot: clear it.
        os.remove(target)
    os.makedirs(target, exist_ok=True)
        
def download_GDC_file(data_row):
    """Download one file from the GDC ``data`` endpoint and store it on disk.

    The payload is saved as ``../GDC_Download/<file_id>/<file_name>``; the
    parent directory is created when missing.

    Parameters
    ----------
    data_row : mapping
        Row-like object exposing ``'file_id'``, ``'file_name'``,
        ``'file_size'`` and ``'md5sum'``.

    Returns
    -------
    bool
        True when the MD5 digest of the downloaded bytes equals
        ``data_row['md5sum']``. False on a digest mismatch (the file is
        still written) or when the server answers with an error status (in
        which case nothing is written).
    """
    response = requests.get(endpoints['data'] / data_row['file_id'], stream=True)
    if not response.ok:
        # Never store an error payload under the data file's name.
        print("download failed for %s: HTTP %d" % (data_row['file_id'], response.status_code))
        response.close()
        return False
    total_response_size = int(response.headers.get('content-length', 0))
    if total_response_size != int(data_row['file_size']):
        print("mismatched response size: %d, File size should be: %d" % (total_response_size, int(data_row['file_size'])))
    blocksize = 32*1024
    with io.BytesIO() as f:
        # Advance the bar by bytes received: wrapping the chunk iterator in
        # tqdm would count chunks against a total expressed in bytes.
        with tqdm(total=total_response_size or None, unit='B', unit_scale=True) as progress:
            for data in response.iter_content(blocksize):
                f.write(data)
                progress.update(len(data))
        content = f.getvalue()
    parent_path = pathlib.Path('../GDC_Download') / data_row['file_id']
    ensure_directory_exists(parent_path)
    target = parent_path / data_row['file_name']
    # The with-block closes the file; no explicit close() is needed.
    with open(target, 'wb') as f:
        print("writing %s" % target)
        f.write(content)
    return verify_download(content, data_row['md5sum'])

In [38]:
# NOTE(review): the download call is commented out and remove_file is applied
# to dl_files_df instead. dl_files_df is built from download_these, i.e. the
# wanted ids that are absent from the download directory listing — presumably
# a debugging leftover; confirm the intent before re-enabling the download.
# dl_files_df.progress_apply(download_GDC_file, axis='columns')
dl_files_df.progress_apply(remove_file, axis='columns')

  total = df.size // len(df)





Unnamed: 0,access,data_category,data_format,data_type,error_type,experimental_strategy,file_id,file_name,file_size,file_state,md5sum,platform,state,state_comment,submitter_id,type,program_name,project_id,case_id,n_cases


In [39]:
from datetime import datetime
def generate_manifest_file(df):
    """Request a manifest for the files in ``df`` and save it under ``../manifest``.

    The ids in ``df['file_id']`` are posted to the ``manifest`` endpoint and
    the response body is written to ``gdc_manifest_<timestamp>.txt``, where
    the timestamp is the current local time down to the minute.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``file_id`` column.

    Returns
    -------
    pathlib.Path
        Location of the manifest file that was written, so that callers do
        not have to reconstruct the time-stamped name.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with an error status.
    """
    payload = {'ids': df['file_id'].tolist()}
    response = requests.post(endpoints['manifest'], data=payload)
    # Fail loudly rather than saving an error body as if it were a manifest.
    response.raise_for_status()
    manifest_path = pathlib.Path('../manifest') / ("gdc_manifest_%s.txt" % datetime.today().strftime("%Y_%m_%d_%H_%M"))
    # The with-block closes the file; no explicit close() is needed.
    with open(manifest_path, 'wb') as file:
        file.write(response.content)
    return manifest_path
        
        

In [None]:
generate_manifest_file(wanted_files)

In [None]:
import gc
gc.collect()

In [None]:
new_manifest = pd.read_csv(pathlib.Path('../manifest')/"gdc_manifest_2018_02_22_17_43.txt", sep="\t")

In [None]:
new_manifest

In [None]:
def clean_path_for_download(row_data):
    """Check one manifest row's download directory, deleting it if incomplete.

    The directory ``../GDC_Download/<id>`` counts as complete when it holds
    both ``<filename>`` and ``logs/<filename>.parcel`` as regular files.

    Parameters
    ----------
    row_data : mapping
        Row-like object exposing ``'id'`` and ``'filename'``.

    Returns
    -------
    bool
        True when the directory is complete and was left untouched. False
        when the entry was missing, or when it was deleted because it was
        not a directory or lacked one of the two files.
    """
    entry_dir = pathlib.Path("../GDC_Download") / row_data['id']
    payload_file = entry_dir / row_data['filename']
    parcel_marker = entry_dir / "logs" / (row_data['filename'] + ".parcel")

    if not entry_dir.exists():
        return False
    if not entry_dir.is_dir():
        # A non-directory entry occupies the spot where the directory belongs.
        os.remove(entry_dir)
        return False
    if payload_file.is_file() and parcel_marker.is_file():
        return True
    shutil.rmtree(entry_dir)
    return False
            

In [None]:
new_manifest.progress_apply(clean_path_for_download, axis='columns')