In [4]:
from idc_index import index

client = index.IDCClient()
query = f"""
select collection_id
from index
group by collection_id
order by collection_id asc
"""
df = client.sql_query(query)
result = df.to_json(orient="records")
result

'[{"collection_id":"4d_lung"},{"collection_id":"acrin_6698"},{"collection_id":"acrin_contralateral_breast_mr"},{"collection_id":"acrin_flt_breast"},{"collection_id":"acrin_nsclc_fdg_pet"},{"collection_id":"adrenal_acc_ki67_seg"},{"collection_id":"advanced_mri_breast_lesions"},{"collection_id":"anti_pd_1_lung"},{"collection_id":"b_mode_and_ceus_liver"},{"collection_id":"breast_cancer_screening_dbt"},{"collection_id":"breast_diagnosis"},{"collection_id":"breast_mri_nact_pilot"},{"collection_id":"c4kc_kits"},{"collection_id":"cc_tumor_heterogeneity"},{"collection_id":"cmb_aml"},{"collection_id":"cmb_crc"},{"collection_id":"cmb_gec"},{"collection_id":"cmb_lca"},{"collection_id":"cmb_mel"},{"collection_id":"cmb_mml"},{"collection_id":"cmb_pca"},{"collection_id":"cmmd"},{"collection_id":"colorectal_liver_metastases"},{"collection_id":"covid_19_ar"},{"collection_id":"covid_19_ny_sbu"},{"collection_id":"cptac_aml"},{"collection_id":"cptac_brca"},{"collection_id":"cptac_ccrcc"},{"collection_id"

In [7]:
import json
from IPython.display import HTML, display

data = json.loads(result)

# Convert JSON to HTML table
html_table = "<table><tr><th>collection_id</th></tr>"
for record in data:
    html_table += "<tr><td>{}</td></tr>".format(record['collection_id'])
html_table += "</table>"

# Display HTML table in Jupyter Notebook
display(HTML(html_table))

collection_id
4d_lung
acrin_6698
acrin_contralateral_breast_mr
acrin_flt_breast
acrin_nsclc_fdg_pet
adrenal_acc_ki67_seg
advanced_mri_breast_lesions
anti_pd_1_lung
b_mode_and_ceus_liver
breast_cancer_screening_dbt


In [8]:
from idc_index import index

client = index.IDCClient()
query = f"""
select *
from index
"""
#query = f"""
#select StudyInstanceUID, SeriesInstanceUID, series_aws_url, collection_id, PatientID, StudyDate, PatientAge, "Modality", "Manufacturer", "ManufacturerModelName", collection_id, "StudyDate", "StudyDescription", "SeriesDescription", collection_id
#from index
#"""
df = client.sql_query(query)
# fields not found in idc index: ImageType, Patient Weight, SliceThickness, PixelSpacing, tcia_species, tcia_tumorLocation
result = df.columns
result

Index(['collection_id', 'analysis_result_id', 'PatientID', 'SeriesInstanceUID',
       'StudyInstanceUID', 'source_DOI', 'PatientAge', 'PatientSex',
       'StudyDate', 'StudyDescription', 'BodyPartExamined', 'Modality',
       'Manufacturer', 'ManufacturerModelName', 'SeriesDate',
       'SeriesDescription', 'SeriesNumber', 'instanceCount',
       'license_short_name', 'series_aws_url', 'series_size_MB'],
      dtype='object')

In [9]:
import subprocess

def get_collections():
    client = index.IDCClient()
    query = f"""
    select collection_id
    from index
    group by collection_id
    order by collection_id asc
    """
    df = client.sql_query(query)
    #result = df.to_json(orient="records")
    #result = df.values.tolist()
    result_list = [item for sublist in df.values.tolist() for item in sublist]
    #result_string = ','.join(result_list)
    return result_list

def get_patients(collection, filters):
    client = index.IDCClient()
    
    query = f"""
    select PatientID
    from index
    where collection_id = '{collection}'
    """
    
    df = client.sql_query(query)
    df_unique = df.drop_duplicates()
    patient_ids = df_unique['PatientID'].tolist()

    if filters:
        filtered_patient_ids = filter_primary_sites(patient_ids, filters)
        # Update patient_ids to only include filtered IDs
        patient_ids = "\"" + "\",\"".join(patient_ids) + "\""
        #patient_ids = filtered_patient_ids.split(',') if filtered_patient_ids else []

    result = jsonify(patient_ids)
    return result

# $patient_ids = "TCGA-EY-A214,TCGA-02-0003"
# $primary_sites = "Brain"
# $results = "TCGA-02-0003" because TCGA-02-0003 has primary sites Brain but TCGA-EY-A214
# curl -sX GET "http://index:5000/patients/tcga_ov/Brain"
def filter_primary_sites(patient_ids, primary_sites):
    if primary_sites is None or primary_sites == "":
        return patient_ids
    
    include_or_exclude = "in"   
    #include_or_exclude = "ex"

    patient_ids = "\"" + "\",\"".join(patient_ids.split(',')) + "\""
    primary_sites = "\"" + "\",\"".join(primary_sites.split(',')) + "\""

    command = f"curl -s --data 'filters={{\"op\":\"and\",\"content\":[{{\"op\":\"in\",\"content\":{{\"field\":\"cases.submitter_id\",\"value\":[{patient_ids}]}}}},{{\"op\":\"{include_or_exclude}\",\"content\":{{\"field\":\"cases.primary_site\",\"value\":[{primary_sites}]}}}}]}}&fields=case_id,submitter_id,primary_site&size=10' 'https://api.gdc.cancer.gov/cases' | jq -r '.data.hits | map(.submitter_id) | join(\",\")'"
    
    output = subprocess.check_output(command, shell=True, text=True, stderr=subprocess.STDOUT)
    return output.strip()

def filter_experimental_strategies(patient_ids, experimental_strategies):
    if experimental_strategies is None or experimental_strategies == "":
        return patient_ids
        
    patient_ids = "\"" + "\",\"".join(patient_ids.split(',')) + "\""
    experimental_strategies = "\"" + "\",\"".join(experimental_strategies.split(',')) + "\""

    command = f"curl -s --data 'filters={{\"op\":\"and\",\"content\":[{{\"op\":\"in\",\"content\":{{\"field\":\"cases.submitter_id\",\"value\":[{patient_ids}]}}}}]}}&fields=files.experimental_strategy,submitter_id&size=10' https://api.gdc.cancer.gov/cases| jq -r '.data.hits[] | select(.files[].experimental_strategy | IN({experimental_strategies})) | .submitter_id' | sort -u"

    output = subprocess.check_output(command, shell=True, text=True, stderr=subprocess.STDOUT)
    return output.strip()

# if collections is None:
#     collections = get_collections()
collections = get_collections()

In [13]:
collections_str = ','.join(collections)
collections_str

'4d_lung,acrin_6698,acrin_contralateral_breast_mr,acrin_flt_breast,acrin_nsclc_fdg_pet,adrenal_acc_ki67_seg,advanced_mri_breast_lesions,anti_pd_1_lung,b_mode_and_ceus_liver,breast_cancer_screening_dbt,breast_diagnosis,breast_mri_nact_pilot,c4kc_kits,cc_tumor_heterogeneity,cmb_aml,cmb_crc,cmb_gec,cmb_lca,cmb_mel,cmb_mml,cmb_pca,cmmd,colorectal_liver_metastases,covid_19_ar,covid_19_ny_sbu,cptac_aml,cptac_brca,cptac_ccrcc,cptac_cm,cptac_coad,cptac_gbm,cptac_hnscc,cptac_lscc,cptac_luad,cptac_ov,cptac_pda,cptac_sar,cptac_ucec,ct_colonography,ct_lymph_nodes,ct_phantom4radiomics,ct_vs_pet_ventilation_imaging,ctpred_sunitinib_pannet,dro_toolkit,duke_breast_cancer_mri,ea1141,gbm_dsc_mri_dro,hcc_tace_seg,htan_hms,htan_ohsu,htan_vanderbilt,htan_wustl,icdc_glioma,ispy1,ispy2,lctsc,lidc_idri,lung_fused_ct_pathology,lung_pet_ct_dx,lung_phantom,lungct_diagnosis,midrc_ricord_1a,midrc_ricord_1b,midrc_ricord_1c,mouse_astrocytoma,mouse_mammary,naf_prostate,nlm_visible_human_project,nlst,nsclc_radiogenomi

In [24]:
patients = get_patients('TCGA-GBM')
patients_str = ','.join(patients)
# #print(patients_str)
print(len(patients))

# filtered_patients = filter_primary_sites(patients_str, "Breast")
filtered_patients_str = filter_primary_sites(patients_str, "Brain")
filtered_patients = filtered_patients_str.split(',')
print(len(filtered_patients))
print(filtered_patients)

TypeError: get_patients() missing 1 required positional argument: 'filters'

In [20]:
#filtered_patients = filter_primary_sites(patients_str, 'Breast')
filtered_patients = filter_primary_sites("TCGA-EY-A214,TCGA-BH-A0GY,TCGA-02-0003", "Breast")
#filtered_patients = filter_primary_sites("TCGA-EY-A214,TCGA-BH-A0GY,TCGA-02-0003", "")
filtered_patients

'TCGA-BH-A0GY'

In [8]:
#select = 'select PatientID from index '
select = 'select * from index '
key = 'PatientID'
where = 'where 1 = 1'
values = filtered_patients.split(',')
if len(values) > 0:
    where += " and " + key + " in ('" + "','".join(values) + "')"
else:
    where += " and " + key + " = " + value + "'"

client = index.IDCClient()
query = select + where
print(query)
df = client.sql_query(query)
df

NameError: name 'filtered_patients' is not defined

In [16]:
df['series_aws_url']

0     s3://idc-open-data/25d0a388-67fe-4fc3-9b93-b17...
1     s3://idc-open-data/6a37f4a0-aeda-4b1d-9945-e6d...
2     s3://idc-open-data/6ea28924-2058-4933-b72c-aad...
3     s3://idc-open-data/1c6eac41-33fc-4372-a201-036...
4     s3://idc-open-data/df866683-b180-47db-ac04-c5f...
5     s3://idc-open-data/a8a6b4d8-0c0a-4e49-b40e-189...
6     s3://idc-open-data/447342ef-c225-49f4-8dd4-659...
7     s3://idc-open-data/54d23b21-843d-4931-8ea5-300...
8     s3://idc-open-data/7992bbaf-0ba9-4814-96f3-8ff...
9     s3://idc-open-data/61ec2800-b4cd-45d4-a5b8-354...
10    s3://idc-open-data/9c1215aa-ab97-4480-a523-0d6...
11    s3://idc-open-data/f69bb966-fbda-41cc-af4a-f2b...
12    s3://idc-open-data/c2ff8d6f-5783-4bcf-a051-39b...
13    s3://idc-open-data/1e4939ae-492d-46e9-b3e6-1c6...
14    s3://idc-open-data/5d1e2e48-6874-4f5f-b689-473...
15    s3://idc-open-data/01a52e2d-a05d-4c00-9f3e-bec...
16    s3://idc-open-data/03553c0a-b248-4b00-8ba8-566...
Name: series_aws_url, dtype: object

In [17]:
import os
import sys
import shutil

def execute_command(command, output=True):
    try:
        result = subprocess.run(command, shell=True, text=True, capture_output=True, check=True)
        result = result.stdout.strip() if result.stdout else ""
    except subprocess.CalledProcessError as e:
        result = e.stderr.strip()
    if output:
        return result

def download_dicom_dir(aws_s3_path):
    aws_profile = "default"  # Adjust as needed
    aws_config_dir = "/root/.aws/"
    os.environ['AWS_CONFIG_FILE'] = os.path.join(aws_config_dir, 'config')
    os.environ['AWS_SHARED_CREDENTIALS_FILE'] = os.path.join(aws_config_dir, 'credentials')

    dicom_dir = aws_s3_path.split("/")[-2]
    dicom_tar_file = dicom_dir + ".tgz"

    if not os.path.exists(dicom_tar_file):
        execute_command(f"aws s3 s3://idc-open-data/{dicom_dir}/ /app/{dicom_dir} --recursive", output=False)
        execute_command(f"tar -czvf {dicom_tar_file} /app/{dicom_dir}")
        shutil.rmtree(dicom_dir)

    if os.path.exists(dicom_tar_file):
        return dicom_tar_file

# download dicom dir
download_dicom_dir(df['series_aws_url'][0])

'25d0a388-67fe-4fc3-9b93-b1720b4e7129.tgz'

In [1]:
#select = 'select PatientID from index '
select = 'select * from index '
key = 'PatientID'
where = 'where 1 = 1'
values = patients_str.split(',')
if len(values) > 0:
    where += " and " + key + " in ('" + "','".join(values) + "')"
else:
    where += " and " + key + " = " + value + "'"

client = index.IDCClient()
query = select + where
print(query)
df = client.sql_query(query)
df

NameError: name 'patients_str' is not defined

In [21]:
filter_primary_sites("Breast_MRI_001", "Breast")

''

In [22]:
filter_primary_sites("TCGA-EY-A214,TCGA-BH-A0GY,TCGA-02-0003", "Breast")

'TCGA-BH-A0GY'

In [14]:
from idc_index import index
import subprocess

#from flask import jsonify
import json

def filter_primary_sites(patient_ids, primary_sites):
    if primary_sites is None or primary_sites == "":
        return patient_ids
    
    include_or_exclude = "in"
    #include_or_exclude = "ex"

    patient_ids = "\"" + "\",\"".join(patient_ids.split(',')) + "\""
    primary_sites = "\"" + "\",\"".join(primary_sites.split(',')) + "\""

    command = f"curl -s --data 'filters={{\"op\":\"and\",\"content\":[{{\"op\":\"in\",\"content\":{{\"field\":\"cases.submitter_id\",\"value\":[{patient_ids}]}}}},{{\"op\":\"{include_or_exclude}\",\"content\":{{\"field\":\"cases.primary_site\",\"value\":[{primary_sites}]}}}}]}}&fields=case_id,submitter_id,primary_site&size=10' 'https://api.gdc.cancer.gov/cases' | jq -r '.data.hits | map(.submitter_id) | join(\",\")'"
    
    output = subprocess.check_output(command, shell=True, text=True, stderr=subprocess.STDOUT)
    return output.strip()
    
def get_patients(collection, filters):
    client = index.IDCClient()
    
    query = f"""
    select PatientID
    from index
    where collection_id = '{collection}'
    """
    
    df = client.sql_query(query)
    df_unique = df.drop_duplicates()
    patient_ids = df_unique['PatientID'].tolist()

    if filters:
        patient_ids = ",".join(patient_ids)
        filtered_patient_ids = filter_primary_sites(patient_ids, filters)

    #result = jsonify(patient_ids) # for flask only
    result = json.dumps(data)
    return result

get_patients("tcga_ov", "Brain")

RuntimeError: Working outside of application context.

This typically means that you attempted to use functionality that needed
the current application. To solve this, set up an application context
with app.app_context(). See the documentation for more information.

In [3]:
import requests
import json

def get_genomic_data_uuids(patient_id, data_categories=None, data_types=None, experimental_strategies=None):
    """
    Get genomic data file UUIDs for a given PatientID with optional filters.
    
    Args:
    patient_id (str): The patient ID to query.
    data_categories (list): Optional list of data categories to filter by.
    data_types (list): Optional list of data types to filter by.
    experimental_strategies (list): Optional list of experimental strategies to filter by.
    
    Returns:
    A list of dictionaries, each containing 'uuid', 'data_type', 'data_category', and 'experimental_strategy' for available genomic data.
    """
    base_url = "https://api.gdc.cancer.gov/files"
    
    filters = {
        "op": "and",
        "content": [
            {
                "op": "in",
                "content": {
                    "field": "cases.submitter_id",
                    "value": [patient_id]
                }
            }
        ]
    }
    
    if data_categories:
        filters["content"].append({
            "op": "in",
            "content": {
                "field": "data_category",
                "value": data_categories
            }
        })
    
    if data_types:
        filters["content"].append({
            "op": "in",
            "content": {
                "field": "data_type",
                "value": data_types
            }
        })
    
    if experimental_strategies:
        filters["content"].append({
            "op": "in",
            "content": {
                "field": "experimental_strategy",
                "value": experimental_strategies
            }
        })
    
    params = {
        "filters": json.dumps(filters),
        "fields": "file_id,data_type,data_category,experimental_strategy",
        "format": "JSON",
        "size": "100"
    }
    
    response = requests.get(base_url, params=params)
    
    if response.status_code == 200:
        data = response.json()
        return [{
            "uuid": hit["file_id"],
            "data_type": hit["data_type"],
            "data_category": hit["data_category"],
            "experimental_strategy": hit.get("experimental_strategy", "N/A")
        } for hit in data["data"]["hits"]]
    else:
        print(f"Error: {response.status_code}")
        return []

# Test the function with a sample patient ID and various filters
sample_patient_id = "TCGA-AC-A3W5"  # Replace with a valid patient ID from your dataset

print("Test 1: No filters")
results = get_genomic_data_uuids(sample_patient_id)
print(f"Genomic data for patient {sample_patient_id}:")
for item in results:
    print(f"UUID: {item['uuid']}, Data Type: {item['data_type']}, Data Category: {item['data_category']}, Experimental Strategy: {item['experimental_strategy']}")

print("\nTest 2: Filter by data category")
results = get_genomic_data_uuids(sample_patient_id, data_categories=["Transcriptome Profiling"])
print(f"Genomic data for patient {sample_patient_id} (Transcriptome Profiling only):")
for item in results:
    print(f"UUID: {item['uuid']}, Data Type: {item['data_type']}, Data Category: {item['data_category']}, Experimental Strategy: {item['experimental_strategy']}")

print("\nTest 3: Filter by data type")
results = get_genomic_data_uuids(sample_patient_id, data_types=["Gene Expression Quantification"])
print(f"Genomic data for patient {sample_patient_id} (Gene Expression Quantification only):")
for item in results:
    print(f"UUID: {item['uuid']}, Data Type: {item['data_type']}, Data Category: {item['data_category']}, Experimental Strategy: {item['experimental_strategy']}")

print("\nTest 4: Filter by experimental strategy")
results = get_genomic_data_uuids(sample_patient_id, experimental_strategies=["RNA-Seq"])
print(f"Genomic data for patient {sample_patient_id} (RNA-Seq only):")
for item in results:
    print(f"UUID: {item['uuid']}, Data Type: {item['data_type']}, Data Category: {item['data_category']}, Experimental Strategy: {item['experimental_strategy']}")

print("\nTest 5: Combining multiple filters")
results = get_genomic_data_uuids(sample_patient_id, 
                                 data_categories=["Transcriptome Profiling"],
                                 data_types=["Gene Expression Quantification"],
                                 experimental_strategies=["RNA-Seq"])
print(f"Genomic data for patient {sample_patient_id} (Transcriptome Profiling, Gene Expression Quantification, RNA-Seq):")
for item in results:
    print(f"UUID: {item['uuid']}, Data Type: {item['data_type']}, Data Category: {item['data_category']}, Experimental Strategy: {item['experimental_strategy']}")

# Generate download URLs
base_download_url = "https://api.gdc.cancer.gov/data/"
print("\nDownload links for filtered results:")
for item in results:
    download_url = f"{base_download_url}{item['uuid']}"
    print(f"  {item['data_type']} ({item['experimental_strategy']}): {download_url}")

Test 1: No filters
Genomic data for patient TCGA-AC-A3W5:
UUID: a88c168e-4bba-4bd2-9c0c-77934444cc1c, Data Type: Clinical Supplement, Data Category: Clinical, Experimental Strategy: N/A
UUID: 0a40467f-9495-4c5b-b56e-3347a3ee0572, Data Type: Clinical Supplement, Data Category: Clinical, Experimental Strategy: N/A
UUID: 950ee56c-df20-4b39-867e-c73f17d0f4f6, Data Type: Biospecimen Supplement, Data Category: Biospecimen, Experimental Strategy: N/A
UUID: 62d4515f-a30b-4b1a-b2dd-c8bf9476e803, Data Type: Clinical Supplement, Data Category: Clinical, Experimental Strategy: N/A
UUID: c5039e7e-14d8-4e59-ba20-0d3fa38b86df, Data Type: Biospecimen Supplement, Data Category: Biospecimen, Experimental Strategy: N/A
UUID: c10090b8-fa42-4e32-86a0-9dedceb5b605, Data Type: Biospecimen Supplement, Data Category: Biospecimen, Experimental Strategy: N/A
UUID: d0c967b5-78ee-4bc4-9493-3cbe44af1bf5, Data Type: Biospecimen Supplement, Data Category: Biospecimen, Experimental Strategy: N/A
UUID: 8162d394-8b64-4d

In [2]:
get_genomic_data_uuids('TCGA-AC-A3W5')

[{'uuid': 'a88c168e-4bba-4bd2-9c0c-77934444cc1c',
  'data_type': 'Clinical Supplement',
  'data_category': 'Clinical',
  'experimental_strategy': 'N/A'},
 {'uuid': '0a40467f-9495-4c5b-b56e-3347a3ee0572',
  'data_type': 'Clinical Supplement',
  'data_category': 'Clinical',
  'experimental_strategy': 'N/A'},
 {'uuid': '950ee56c-df20-4b39-867e-c73f17d0f4f6',
  'data_type': 'Biospecimen Supplement',
  'data_category': 'Biospecimen',
  'experimental_strategy': 'N/A'},
 {'uuid': '62d4515f-a30b-4b1a-b2dd-c8bf9476e803',
  'data_type': 'Clinical Supplement',
  'data_category': 'Clinical',
  'experimental_strategy': 'N/A'},
 {'uuid': 'c5039e7e-14d8-4e59-ba20-0d3fa38b86df',
  'data_type': 'Biospecimen Supplement',
  'data_category': 'Biospecimen',
  'experimental_strategy': 'N/A'},
 {'uuid': 'c10090b8-fa42-4e32-86a0-9dedceb5b605',
  'data_type': 'Biospecimen Supplement',
  'data_category': 'Biospecimen',
  'experimental_strategy': 'N/A'},
 {'uuid': 'd0c967b5-78ee-4bc4-9493-3cbe44af1bf5',
  'data_