In [1]:
import csv
import json
import os
import re
import tarfile

import requests

In [2]:
def query_gdc(
    samples,
    data_type="Methylation Beta Value",
    platform="Illumina Human Methylation 450",
):
    """Look up GDC file IDs for DNA-methylation files of the given cases.

    Sends GET requests to the GDC ``/files`` endpoint, filtering on
    ``cases.submitter_id``, ``data_category == "DNA Methylation"``,
    ``data_type`` and ``platform``. Results are fetched page by page so that
    result sets larger than one page are not silently truncated.

    Args:
        samples: Iterable of values matched against ``cases.submitter_id``.
        data_type: Value matched against the ``data_type`` field.
        platform: Value matched against the ``platform`` field.

    Returns:
        List of ``file_id`` values for every hit (empty if ``samples`` is empty).

    Raises:
        Exception: If any page request returns a non-200 status code.
    """
    samples = list(samples)
    if not samples:
        # Nothing to look up; do not send an empty "in" filter to the API.
        return []

    base_url = "https://api.gdc.cancer.gov/files"
    headers = {"Content-Type": "application/json"}
    page_size = 1000

    # Define filters for the query
    filters = {
        "op": "and",
        "content": [
            {"op": "in", "content": {"field": "cases.submitter_id", "value": samples}},
            {
                "op": "in",
                "content": {"field": "data_category", "value": ["DNA Methylation"]},
            },
            {"op": "in", "content": {"field": "data_type", "value": [data_type]}},
            {"op": "in", "content": {"field": "platform", "value": [platform]}},
        ],
    }

    # Define API query parameters
    params = {
        "filters": json.dumps(filters),
        "fields": "file_id,file_name",
        "format": "JSON",
        "size": str(page_size),
    }

    file_ids = []
    offset = 0
    while True:
        # "from" is the zero-based offset of the first record of this page.
        params["from"] = str(offset)
        response = requests.get(base_url, headers=headers, params=params)
        if response.status_code != 200:
            raise Exception(
                f"API request failed with status code {response.status_code}: {response.text}"
            )

        hits = response.json()["data"]["hits"]
        file_ids.extend(hit["file_id"] for hit in hits)

        # A short (or empty) page means there is nothing left to fetch.
        if len(hits) < page_size:
            return file_ids
        offset += len(hits)

In [3]:
def download_files_batch(file_ids, output_dir="downloads"):
    """Download a batch of GDC files and unpack the archive if one is returned.

    POSTs the IDs to the GDC ``/data`` endpoint, writes the response body to
    ``output_dir`` under the file name announced in the ``Content-Disposition``
    header, and, when that name ends in ``.tar.gz``, extracts the archive into
    ``output_dir``.

    Args:
        file_ids: Iterable of GDC file IDs to request in one call.
        output_dir: Directory the download is written to (created if missing).

    Returns:
        None. Nothing is requested when ``file_ids`` is empty.

    Raises:
        Exception: If the request returns a non-200 status code, or the
            response carries no file name in ``Content-Disposition``.
    """
    ids = list(file_ids)
    if not ids:
        return

    data_endpt = "https://api.gdc.cancer.gov/data"

    response = requests.post(
        data_endpt,
        data=json.dumps({"ids": ids}),
        headers={"Content-Type": "application/json"},
    )
    # Fail with the server's message instead of a KeyError on the missing header.
    if response.status_code != 200:
        raise Exception(
            f"Download request failed with status code {response.status_code}: {response.text}"
        )

    response_head_cd = response.headers.get("Content-Disposition", "")
    matches = re.findall("filename=(.+)", response_head_cd)
    if not matches:
        raise Exception(
            "Download response did not include a file name in Content-Disposition."
        )

    # Keep only the final path component so a server-supplied name cannot
    # point outside output_dir.
    target_dir = output_dir or "."
    file_name = os.path.join(target_dir, os.path.basename(matches[0]))

    os.makedirs(target_dir, exist_ok=True)

    with open(file_name, "wb") as output_file:
        output_file.write(response.content)

    if file_name.endswith(".tar.gz"):
        with tarfile.open(file_name, "r:gz") as tar:
            # Use the "data" extraction filter where the running Python
            # provides it, so archive members cannot escape target_dir.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(path=target_dir, filter="data")
            else:
                tar.extractall(path=target_dir)

In [28]:
def get_metadata(file_ids):
    """Fetch file, case, sample and project metadata for the given GDC files.

    POSTs an ``in`` filter on ``file_id`` to the GDC ``/files`` endpoint and
    asks for exactly as many results as IDs were supplied, so the API's
    default page size cannot truncate the response.

    Args:
        file_ids: Iterable of GDC file IDs.

    Returns:
        The ``data.hits`` list of the JSON response (empty list if
        ``file_ids`` is empty).

    Raises:
        Exception: If the request returns a non-200 status code.
    """
    file_ids = list(file_ids)
    if not file_ids:
        return []

    metadata_endpoint = "https://api.gdc.cancer.gov/files"
    params = {
        "filters": {"op": "in", "content": {"field": "file_id", "value": file_ids}},
        "fields": "file_id,file_name,cases.samples.sample_type,cases.project.project_id,cases.case_id,cases.submitter_id,cases.samples.sample_id,cases.samples.submitter_id",  # Added case details
        "format": "JSON",
        # Without an explicit size the endpoint returns only its default page.
        "size": str(len(file_ids)),
    }

    response = requests.post(
        metadata_endpoint,
        data=json.dumps(params),
        headers={"Content-Type": "application/json"},
    )

    if response.status_code == 200:
        return response.json()["data"]["hits"]
    else:
        raise Exception(
            f"Failed to fetch metadata: {response.status_code} - {response.text}"
        )


def flatten_metadata(metadata):
    """Collapse a nested metadata record into a single-level dict.

    Nested dict keys are joined with ``"__"``. For a list only its first
    element is followed (empty lists contribute nothing). Leaf values are
    stored under the joined key, in depth-first, insertion order.
    """
    result = {}
    # Explicit stack of (key prefix, value) pairs; the most recently pushed
    # pair is handled first.
    pending = [("", metadata)]

    while pending:
        prefix, node = pending.pop()

        if isinstance(node, dict):
            children = [
                (f"{prefix}__{key}" if prefix else key, value)
                for key, value in node.items()
            ]
            # Push in reverse so the first key is popped (and emitted) first.
            pending.extend(reversed(children))
            continue

        if isinstance(node, list):
            if node:
                # Only the first list item is kept.
                pending.append((prefix, node[0]))
            continue

        result[prefix] = node

    return result


def save_metadata_to_csv(metadata, metadata_file="downloads/metadata.csv"):
    """Append flattened metadata records to a CSV file.

    Each record is flattened with ``flatten_metadata``. When the target file
    already has a header row, that header is reused so appended rows line up
    with the existing columns; otherwise a header is written that covers the
    keys of every record in this batch (first-seen order).

    Args:
        metadata: Iterable of nested metadata records.
        metadata_file: Path of the CSV file to create or append to.

    Returns:
        None. Nothing is written when ``metadata`` is empty.

    Raises:
        ValueError: Raised by ``csv.DictWriter`` if a record contains a key
            that is not among the chosen column names.
    """
    flattened_metadata = [flatten_metadata(file_meta) for file_meta in metadata]
    if not flattened_metadata:
        # An empty batch would otherwise fail on flattened_metadata[0].
        return

    # Reuse the header of a non-empty existing file, if there is one.
    existing_headers = None
    if os.path.exists(metadata_file) and os.path.getsize(metadata_file) > 0:
        with open(metadata_file, newline="") as f:
            existing_headers = next(csv.reader(f), None)

    if existing_headers:
        headers = existing_headers
    else:
        # Union of keys over all records, keeping first-seen order.
        headers = list(dict.fromkeys(key for row in flattened_metadata for key in row))

    parent_dir = os.path.dirname(metadata_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(metadata_file, mode="a", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=headers)

        if not existing_headers:
            writer.writeheader()

        writer.writerows(flattened_metadata)

In [5]:
import pandas as pd

In [6]:
# Read the tab-separated methylation table once and bind the same DataFrame
# to both names used in this notebook.
# NOTE(review): this is an absolute, user-specific path; consider a configurable
# data directory so the notebook runs elsewhere.
cancer_methylation = pd.read_csv(
    "/uufs/chpc.utah.edu/common/home/u0914269/clement/projects/20230828_tcga_methylation/side_projects/20232110_TCGA_METHYLATION_CLINICAL_ML/data/methylation/hm27_hm450_merge_meth_data.tsv",
    sep="\t",
)
meth_data = cancer_methylation

In [7]:
# Every column from the fifth one onward is collected as a sample name.
# NOTE(review): assumes the first four columns are non-sample annotation
# columns; confirm against the TSV.
samples = [column_label for column_label in meth_data.columns[4:]]
len(samples)

2583

In [8]:
# Strip the last three characters from each sample name.
# NOTE(review): presumably this removes a sample-type suffix so the result
# matches `cases.submitter_id`; confirm against the column naming.
sample_ids = [sample_name[:-3] for sample_name in samples]

In [9]:
import time
from http.client import RemoteDisconnected

# Collect GDC file IDs for all samples, querying in batches and retrying
# dropped connections with an exponential back-off.
file_ids = []
batch_size = 100
max_retries = 5
retry_wait_time = 1  # initial back-off in seconds

for i in range(0, len(sample_ids), batch_size):
    batch_samples = sample_ids[i : i + batch_size]
    retries = 0
    # Restart the back-off for every batch so that failures in earlier
    # batches do not eat into the retry budget of later ones.
    wait_time = retry_wait_time
    while retries < max_retries:
        try:
            batch_file_ids = query_gdc(batch_samples)
            file_ids += batch_file_ids
            break
        # requests wraps a dropped connection in its own ConnectionError, so
        # catching RemoteDisconnected alone would never trigger a retry.
        except (RemoteDisconnected, requests.exceptions.ConnectionError) as err:
            retries += 1
            if retries >= max_retries:
                raise Exception("Too many retries, failed to query GDC.") from err
            time.sleep(wait_time)
            wait_time = wait_time * 5
            # Give up once the next wait would exceed five minutes.
            if wait_time > 300:
                raise Exception("Too many retries, failed to query GDC.") from err
print(f"Found {len(file_ids)} files to download.")

Found 2555 files to download.


In [10]:
# Skip every file ID that already has a folder of the same name under
# "downloads". A missing "downloads" directory means nothing has been
# downloaded yet.
existing_folders = (
    [
        folder
        for folder in os.listdir("downloads")
        if os.path.isdir(os.path.join("downloads", folder))
    ]
    if os.path.isdir("downloads")
    else []
)
# Set membership keeps the filter linear in the number of file IDs.
already_downloaded = set(existing_folders)
download_file_ids = [file_id for file_id in file_ids if file_id not in already_downloaded]
print(f"Filtered file_ids, {len(download_file_ids)} files remaining to download.")

Filtered file_ids, 0 files remaining to download.


In [11]:
# Download the remaining files in batches, retrying dropped connections with
# an exponential back-off and pausing between batches.
# NOTE(review): `batch_size` is whatever an earlier-run cell last assigned;
# confirm it holds the intended value before running this cell.
retry_wait_time = 1  # initial back-off in seconds
max_retries = 5
for i in range(0, len(download_file_ids), batch_size):
    batch_file_ids = download_file_ids[i : i + batch_size]
    retries = 0
    # Restart the back-off for every batch so earlier failures do not count
    # against later batches.
    wait_time = retry_wait_time
    while retries < max_retries:
        try:
            download_files_batch(batch_file_ids)
            break
        # requests wraps a dropped connection in its own ConnectionError, so
        # catching RemoteDisconnected alone would never trigger a retry.
        except (RemoteDisconnected, requests.exceptions.ConnectionError) as err:
            retries += 1
            if retries >= max_retries:
                raise Exception("Too many retries, failed to download files.") from err
            time.sleep(wait_time)
            wait_time = wait_time * 5
            # Give up once the next wait would exceed five minutes.
            if wait_time > 300:
                raise Exception("Too many retries, failed to download files.") from err
    # Sleep for 5 minutes to avoid rate limiting; pointless after the last batch.
    if i + batch_size < len(download_file_ids):
        time.sleep(300)

In [32]:
# Work out which file IDs still lack a row in the metadata CSV, so that an
# interrupted run can resume without fetching everything again.
metadata_file = "downloads/metadata.csv"

if os.path.exists(metadata_file):
    existing_metadata = pd.read_csv(metadata_file)
    existing_file_ids = existing_metadata["file_id"].tolist()
    # Set membership keeps the filter linear in the number of file IDs.
    known_file_ids = set(existing_file_ids)
    needed_meta_file_ids = [
        file_id for file_id in file_ids if file_id not in known_file_ids
    ]
    print(f"Filtered file_ids, {len(needed_meta_file_ids)} files remaining to download.")
else:
    needed_meta_file_ids = file_ids
    print("Metadata file does not exist.")

Metadata file does not exist.


In [33]:
# Fetch metadata in small batches and append each batch to the CSV, retrying
# dropped connections with an exponential back-off.
# NOTE(review): this reassigns the notebook-wide `batch_size`; re-running an
# earlier cell afterwards will pick up 10 instead of its original value.
retry_wait_time = 1  # initial back-off in seconds
max_retries = 10
batch_size = 10
for i in range(0, len(needed_meta_file_ids), batch_size):
    batch_file_ids = needed_meta_file_ids[i : i + batch_size]
    retries = 0
    # Restart the back-off for every batch so earlier failures do not count
    # against later batches.
    wait_time = retry_wait_time
    while retries < max_retries:
        try:
            meta_data = get_metadata(batch_file_ids)
            save_metadata_to_csv(meta_data)
            break
        # requests wraps a dropped connection in its own ConnectionError, so
        # catching RemoteDisconnected alone would never trigger a retry.
        except (RemoteDisconnected, requests.exceptions.ConnectionError) as err:
            retries += 1
            if retries >= max_retries:
                raise Exception("Too many retries, failed to fetch metadata.") from err
            time.sleep(wait_time)
            wait_time = wait_time * 5
            # Give up once the next wait would exceed five minutes.
            if wait_time > 300:
                raise Exception("Too many retries, failed to fetch metadata.") from err
    time.sleep(5)  # Sleep for 5 sec to avoid rate limiting