In [None]:
import requests
import json

# Step 1: Fetch the study and protocol data
# One GraphQL document requests two root fields in a single round trip:
#   - `study`: a list of study records (identifiers, names, disease/site,
#     counts); it is the only field that receives the `$acceptDUA` variable.
#   - `protocolPerStudy`: a list of protocol records, each carrying
#     `pdc_study_id` so it can be matched to a study record.
study_protocol_url = "https://pdc.cancer.gov/graphql"  # GraphQL endpoint that receives the study/protocol query
# NOTE(review): `acquistion_type` in the protocol selection looks misspelled;
# presumably it mirrors the field name in the remote schema -- confirm before
# changing it, because the query text is sent verbatim.
study_protocol_query = """
query($acceptDUA: Boolean!) {
  study(acceptDUA: $acceptDUA) {
    study_id
    pdc_study_id
    study_submitter_id
    program_id
    project_id
    study_name
    program_name
    project_name
    disease_type
    primary_site
    analytical_fraction
    experiment_type
    embargo_date
    cases_count
    aliquots_count
  }
  protocolPerStudy {
    protocol_id
    protocol_submitter_id
    study_id
    pdc_study_id
    study_submitter_id
    program_id
    program_submitter_id
    protocol_name
    protocol_date
    document_name
    quantitation_strategy
    experiment_type
    label_free_quantitation
    labeled_quantitation
    isobaric_labeling_reagent
    reporter_ion_ms_level
    starting_amount
    starting_amount_uom
    digestion_reagent
    alkylation_reagent
    enrichment_strategy
    enrichment
    chromatography_dimensions_count
    one_d_chromatography_type
    two_d_chromatography_type
    fractions_analyzed_count
    column_type
    amount_on_column
    amount_on_column_uom
    column_length
    column_length_uom
    column_inner_diameter
    column_inner_diameter_uom
    particle_size
    particle_size_uom
    particle_type
    gradient_length
    gradient_length_uom
    instrument_make
    instrument_model
    dissociation_type
    ms1_resolution
    ms2_resolution
    dda_topn
    normalized_collision_energy
    acquistion_type
    dia_multiplexing
    dia_ims
    analytical_technique
    chromatography_instrument_make
    chromatography_instrument_model
    polarity
    reconstitution_solvent
    reconstitution_volume
    reconstitution_volume_uom
    internal_standards
    extraction_method
    ionization_mode
  }
}
"""
# GraphQL variables sent alongside the query text.
variables = {
    "acceptDUA": True  # Accept DUA (Data Use Agreement) as a variable
}
# An explicit timeout keeps a stalled connection from hanging the whole run;
# without one, requests waits indefinitely.
response = requests.post(
    study_protocol_url,
    json={'query': study_protocol_query, 'variables': variables},
    timeout=60,
)
# Fail fast on an HTTP error instead of trying to parse an error page as JSON
# (same handling as fetch_article_json).
response.raise_for_status()
data = response.json()

# Step 2: Merge study and protocol data
# merged_data maps pdc_study_id -> one combined record per study.
merged_data = {}

# Studies come first: the first record seen for an ID becomes the entry, and
# any later record with the same ID overwrites the fields it provides.
for study in data['data']['study']:
    study_key = study['pdc_study_id']
    entry = merged_data.setdefault(study_key, study)
    if entry is not study:
        entry.update(study)

# Protocols only fill in fields the entry does not have yet, so values that
# are already present (from the study or an earlier protocol) always win.
for protocol in data['data']['protocolPerStudy']:
    study_key = protocol['pdc_study_id']
    entry = merged_data.setdefault(study_key, protocol)
    if entry is protocol:
        # No study with this ID: the protocol record itself is the entry.
        continue
    for field, field_value in protocol.items():
        entry.setdefault(field, field_value)

# Step 3: Fetch publication data
publication_url = "https://pdc.cancer.gov/graphql"  # GraphQL endpoint that receives the publication query
# NOTE(review): only the first page is requested (offset 0, limit 50); if the
# service holds more publications than that, the rest are never fetched --
# confirm whether pagination is needed.
publication_query = """
{
  getPaginatedUIPublication(offset: 0, limit: 50) {
    uiPublication {
      publication_id
      pubmed_id
      doi
      author
      title
      journal
      journal_url
      year
      abstract
      studies {
        pdc_study_id
        submitter_id_name
      }
    }
  }
}
"""
# An explicit timeout keeps a stalled connection from hanging the whole run.
pub_response = requests.post(
    publication_url,
    json={'query': publication_query},
    timeout=60,
)
# Fail fast on an HTTP error instead of trying to parse an error page as JSON
# (same handling as fetch_article_json).
pub_response.raise_for_status()
pub_data = pub_response.json()

# Step 4: Create a mapping of pdc_study_id to publication info
# publication_mapping maps pdc_study_id -> list of publication summaries.
publication_mapping = {}

# Publication fields copied into each summary; the nested `studies` list is
# left out because it is what the mapping is keyed on.
_publication_fields = (
    'publication_id',
    'pubmed_id',
    'doi',
    'author',
    'title',
    'journal',
    'journal_url',
    'year',
    'abstract',
)

ui_publications = pub_data['data']['getPaginatedUIPublication']['uiPublication']
for publication in ui_publications:
    for linked_study in publication['studies']:
        study_key = linked_study['pdc_study_id']
        # A fresh dict per (publication, study) pair, so entries are never
        # shared between studies.
        summary = {field: publication[field] for field in _publication_fields}
        publication_mapping.setdefault(study_key, []).append(summary)

# Step 5: Add publication info to merged data
# Studies known only through a publication get a new entry that holds nothing
# but the 'publications' list.
for study_key, study_publications in publication_mapping.items():
    merged_data.setdefault(study_key, {})['publications'] = study_publications

# Flatten the per-study entries into a list (insertion order is kept).
merged_list = [*merged_data.values()]

# Step 6: Fetch the list of PubMed IDs
# Same paginated publication field as Step 3, reduced to the PubMed ID of each
# publication (plus the reported total).
publication_query_pubmed_ids = """
{
  getPaginatedUIPublication(offset: 0, limit: 50) {
    total
    uiPublication {
      pubmed_id
    }
  }
}
"""
# An explicit timeout keeps a stalled connection from hanging the whole run.
pub_response_pubmed_ids = requests.post(
    publication_url,
    json={'query': publication_query_pubmed_ids},
    timeout=60,
)
# Fail fast on an HTTP error instead of trying to parse an error page as JSON
# (same handling as fetch_article_json).
pub_response_pubmed_ids.raise_for_status()
response_json_pubmed_ids = pub_response_pubmed_ids.json()

# Extract the list of publications
publications = response_json_pubmed_ids['data']['getPaginatedUIPublication']['uiPublication']

# Extract pubmed_id from each publication and create a list
pubmed_ids = [publication['pubmed_id'] for publication in publications]

# Step 7: Fetch article JSON from the NCBI BioC PMC Open Access API and extract text
def fetch_article_json(pubmed_id, timeout=30):
    """Download the BioC JSON representation of one article.

    Args:
        pubmed_id: Article identifier interpolated into the request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.RequestException: On connection problems, timeouts,
            or a non-success HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    # BioC "pmcoa" (PMC Open Access) endpoint, JSON format, unicode encoding.
    url = f"https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{pubmed_id}/unicode"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Check for request errors
    return response.json()

def extract_article_text(article_json):
    """Join the non-reference passages of a BioC JSON article into one string.

    Args:
        article_json: Parsed BioC JSON. Either a list whose first element is a
            collection (a dict with a ``documents`` list) or a single such
            collection dict. Only the first collection of a list is read.

    Returns:
        The ``text`` of every passage whose ``infons['section_type']`` is not
        ``'REF'``, in document order, separated by blank lines. Returns ``""``
        when there is no collection, no document, or no usable passage.
    """
    if isinstance(article_json, dict):
        # Some payloads are a bare collection rather than a one-element list.
        collection = article_json
    elif article_json:
        collection = article_json[0]
    else:
        # Empty list / None: nothing to extract.
        return ""

    article_text = []
    for document in collection.get('documents') or []:
        for passage in document.get('passages') or []:
            # Passages without metadata are treated as ordinary body text.
            infons = passage.get('infons') or {}
            if infons.get('section_type') == 'REF':
                continue  # Skip bibliography entries.
            text = passage.get('text')
            if text is not None:
                article_text.append(text)
    return "\n\n".join(article_text)

# Full text of every article that could be fetched and parsed, one
# {'pubmed_id', 'full_text'} dict per article.
all_articles = []

# dict.fromkeys drops repeated IDs while keeping first-seen order, so each
# article is downloaded at most once.
for pubmed_id in dict.fromkeys(pubmed_ids):
    if not pubmed_id:
        # A publication without a PubMed ID cannot be looked up.
        continue
    try:
        article_json = fetch_article_json(pubmed_id)
        article_text = extract_article_text(article_json)
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch data for PubMed ID {pubmed_id}: {e}")
    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
        # A non-JSON body or an unexpected payload shape must not abort the
        # remaining downloads; report it and move on.
        print(f"Failed to parse article for PubMed ID {pubmed_id}: {e}")
    else:
        all_articles.append({
            'pubmed_id': pubmed_id,
            'full_text': article_text
        })

# Step 8: Merge all_articles with merged_list based on pubmed_id
# Lookup table: pubmed_id -> full article text (a later duplicate overwrites
# an earlier one).
pubmed_to_full_text = {}
for article in all_articles:
    pubmed_to_full_text[article['pubmed_id']] = article['full_text']

# Attach the text to every publication whose PubMed ID was fetched; entries
# without a 'publications' list are left untouched.
for study in merged_list:
    for publication in study.get('publications', ()):
        pmid = publication['pubmed_id']
        if pmid not in pubmed_to_full_text:
            continue
        publication['full_text'] = pubmed_to_full_text[pmid]

# Output the merged data
serialized = json.dumps(merged_list, indent=2)
print(serialized)


[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m
    "cases_count": 6,
    "aliquots_count": 10,
    "protocol_id": "9b89ed0a-7c9d-4d5e-aaeb-3d6f8b3b5bfe",
    "protocol_submitter_id": "PDX core-Phosphoproteome-TMT10",
    "program_submitter_id": "Clinical Proteomic Tumor Analysis Consortium",
    "protocol_name": "PDX core-Phosphoproteome-TMT10",
    "protocol_date": "2021-06-17",
    "document_name": "Methods",
    "quantitation_strategy": "Isobaric label quantitation analysis",
    "label_free_quantitation": "N/A",
    "labeled_quantitation": "TMT",
    "isobaric_labeling_reagent": "TMT10",
    "reporter_ion_ms_level": "MS2",
    "starting_amount": "250.00",
    "starting_amount_uom": "\u00b5g",
    "digestion_reagent": "Lys-C,Trypsin",
    "alkylation_reagent": "Iodoacetamide (IAA)",
    "enrichment_strategy": "Phosphoproteome",
    "enrichment": "Fe3+-IMAC",
    "chromatography_dimensions_count": "2",
    "one_d_chromatography_type": "bRPLC",
    "two_d_chromatography_type": "RPLC",
    "fracti