In [1]:
import requests
import json

# Step 1: Fetch the study and protocol data
# One GraphQL request returns both the `study` list and the
# `protocolPerStudy` list; Step 2 joins them on `pdc_study_id`.
study_protocol_url = "https://pdc.cancer.gov/graphql"
study_protocol_query = """
query($acceptDUA: Boolean!) {
  study(acceptDUA: $acceptDUA) {
    study_id
    pdc_study_id
    study_submitter_id
    program_id
    project_id
    study_name
    program_name
    project_name
    disease_type
    primary_site
    analytical_fraction
    experiment_type
    embargo_date
    cases_count
    aliquots_count
  }
  protocolPerStudy {
    protocol_id
    protocol_submitter_id
    study_id
    pdc_study_id
    study_submitter_id
    program_id
    program_submitter_id
    protocol_name
    protocol_date
    document_name
    quantitation_strategy
    experiment_type
    label_free_quantitation
    labeled_quantitation
    isobaric_labeling_reagent
    reporter_ion_ms_level
    starting_amount
    starting_amount_uom
    digestion_reagent
    alkylation_reagent
    enrichment_strategy
    enrichment
    chromatography_dimensions_count
    one_d_chromatography_type
    two_d_chromatography_type
    fractions_analyzed_count
    column_type
    amount_on_column
    amount_on_column_uom
    column_length
    column_length_uom
    column_inner_diameter
    column_inner_diameter_uom
    particle_size
    particle_size_uom
    particle_type
    gradient_length
    gradient_length_uom
    instrument_make
    instrument_model
    dissociation_type
    ms1_resolution
    ms2_resolution
    dda_topn
    normalized_collision_energy
    acquistion_type
    dia_multiplexing
    dia_ims
    analytical_technique
    chromatography_instrument_make
    chromatography_instrument_model
    polarity
    reconstitution_solvent
    reconstitution_volume
    reconstitution_volume_uom
    internal_standards
    extraction_method
    ionization_mode
  }
}
"""
variables = {
    "acceptDUA": True  # Set this to the appropriate value
}
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.post(
    study_protocol_url,
    json={'query': study_protocol_query, 'variables': variables},
    timeout=60,
)
# Surface HTTP-level failures here instead of as a JSON decode error below.
response.raise_for_status()
data = response.json()
# A failed GraphQL query comes back with no "data" and an "errors" list;
# report it directly rather than letting Step 2 fail on data['data'].
# Partial results (data present alongside errors) are still passed through.
if data.get('data') is None:
    raise RuntimeError(
        "Study/protocol query returned no data: {}".format(data.get('errors'))
    )

# Step 2: Merge study and protocol data
# merged_data maps pdc_study_id -> one combined record per study.
merged_data = {}

# Study records go in first; a repeated pdc_study_id overwrites the fields
# of the record already stored under that id.
for study_record in data['data']['study']:
    stored = merged_data.setdefault(study_record['pdc_study_id'], study_record)
    if stored is not study_record:
        stored.update(study_record)

# Protocol records only fill in fields the stored record does not have yet,
# so values already present (e.g. from the study) are never replaced.
for protocol_record in data['data']['protocolPerStudy']:
    stored = merged_data.setdefault(protocol_record['pdc_study_id'], protocol_record)
    if stored is protocol_record:
        # No earlier record for this id: the protocol itself is the entry.
        continue
    for field_name, field_value in protocol_record.items():
        stored.setdefault(field_name, field_value)

# Step 3: Fetch publication data
# The publication query is paginated (offset/limit). Requesting only the
# first page would drop every publication past the first 50, so keep
# requesting pages until one comes back short.
publication_url = "https://pdc.cancer.gov/graphql"
publication_page_size = 50
# %(offset)d and %(limit)d are filled in per page; the query text contains
# no other '%' characters, so %-formatting is safe here.
publication_query_template = """
{
  getPaginatedUIPublication(offset: %(offset)d, limit: %(limit)d) {
    uiPublication {
      publication_id
      pubmed_id
      doi
      author
      title
      journal
      journal_url
      year
      abstract
      citation
      studies {
        pdc_study_id
        submitter_id_name
      }
    }
  }
}
"""
all_publications = []
previous_page = None
publication_offset = 0
while True:
    publication_query = publication_query_template % {
        'offset': publication_offset,
        'limit': publication_page_size,
    }
    # A timeout keeps the script from hanging on a stalled connection.
    pub_response = requests.post(
        publication_url,
        json={'query': publication_query},
        timeout=60,
    )
    pub_response.raise_for_status()
    page_payload = pub_response.json()
    # A failed GraphQL query has no "data"; report its "errors" directly.
    if page_payload.get('data') is None:
        raise RuntimeError(
            "Publication query returned no data: {}".format(page_payload.get('errors'))
        )
    page_items = page_payload['data']['getPaginatedUIPublication']['uiPublication'] or []
    # Guard against an endless loop should the server ignore the offset and
    # keep returning the same page.
    if page_items and page_items == previous_page:
        break
    all_publications.extend(page_items)
    # A short (or empty) page means there is nothing left to fetch.
    if len(page_items) < publication_page_size:
        break
    previous_page = page_items
    publication_offset += publication_page_size

# Present the combined pages in the same shape as a single response so the
# code that reads pub_data['data']['getPaginatedUIPublication'] is unaffected.
pub_data = {'data': {'getPaginatedUIPublication': {'uiPublication': all_publications}}}

# Step 4: Create a mapping of pdc_study_id to publication info
# Fields copied from each publication, in the order they appear in the output.
publication_fields = (
    'publication_id',
    'pubmed_id',
    'doi',
    'author',
    'title',
    'journal',
    'journal_url',
    'year',
    'abstract',
    'citation',
)
publication_mapping = {}

for pub_record in pub_data['data']['getPaginatedUIPublication']['uiPublication']:
    for linked_study in pub_record['studies']:
        study_key = linked_study['pdc_study_id']
        study_publications = publication_mapping.setdefault(study_key, [])
        # Each study gets its own copy of the publication summary.
        study_publications.append(
            {field: pub_record[field] for field in publication_fields}
        )

# Step 5: Add publication info to merged data
# Studies known only through a publication get a record holding just the
# 'publications' list.
for study_key, study_publications in publication_mapping.items():
    merged_data.setdefault(study_key, {})['publications'] = study_publications

# Convert merged_data back to a list if needed
merged_list = [*merged_data.values()]

# Output the merged data
merged_json = json.dumps(merged_list, indent=2)
print(merged_json)


[
  {
    "study_id": "96296e87-89a4-11ea-b1fd-0aad30af8a83",
    "pdc_study_id": "PDC000220",
    "study_submitter_id": "Academia Sinica LUAD100-Phosphoproteome",
    "program_id": "c3408a52-f1e8-11e9-9a07-0a80fada099c",
    "project_id": "05b0ec25-6947-11ea-b1fd-0aad30af8a83",
    "study_name": "Academia Sinica LUAD100-Phosphoproteome",
    "program_name": "International Cancer Proteogenome Consortium",
    "project_name": "Academia Sinica LUAD-100",
    "disease_type": "Lung Adenocarcinoma;Lung Squamous Cell Carcinoma;Other",
    "primary_site": "Lung",
    "analytical_fraction": "Phosphoproteome",
    "experiment_type": "TMT10",
    "embargo_date": null,
    "cases_count": 86,
    "aliquots_count": 170,
    "protocol_id": "dd0a0b4c-8721-11ea-b1fd-0aad30af8a83",
    "protocol_submitter_id": "LUAD Phosphoproteomics Copy - RRT",
    "program_submitter_id": "International Cancer Proteogenome Consortium",
    "protocol_name": "LUAD Phosphoproteomics Copy - RRT",
    "protocol_date": "20