# Dependencies
Execute the following in the terminal before running any notebooks:
`pip install -r requirements.txt`

# Exercise 1: 1000 Alzheimer's disease and 1000 cancer papers from PubMed 

In [3]:
# Query Entrez API for PubMed IDs given search term and year
import requests

def get_pmids(query, year, retmax):
    """Search PubMed through the Entrez ESearch endpoint.

    Args:
        query: Plain-text search term. It is URL-encoded by ``requests``, so
            spaces and characters such as ``&`` or ``#`` are safe to use.
        year: Publication year applied through the ``[pdat]`` field tag.
        retmax: Maximum number of PMIDs to request.

    Returns:
        A list of ``(pmid, query)`` tuples, one per entry of the response's
        ``esearchresult.idlist``, in the order the API returned them.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
    """
    # ESearch endpoint; the query string is built from `params` so the search
    # term is percent-encoded instead of being spliced raw into the URL.
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": f"{query} AND {year}[pdat]",
        "retmode": "json",
        "retmax": retmax,
    }

    # Query the Entrez API
    r = requests.get(url, params=params)
    r.raise_for_status()  # Raise an exception for HTTP error responses

    # Grab the list of PMIDs
    pmids = r.json()["esearchresult"]["idlist"]

    # Pair every PMID with the query that produced it
    return [(pmid, query) for pmid in pmids]

# Smoke test: ten lupus papers from 2004 (featuring House MD; it is, unfortunately, never lupus)
get_pmids(query="lupus", year="2004", retmax=10)

[('21473028', 'lupus'),
 ('21210563', 'lupus'),
 ('18958642', 'lupus'),
 ('18202459', 'lupus'),
 ('17822285', 'lupus'),
 ('17642789', 'lupus'),
 ('17642773', 'lupus'),
 ('17642626', 'lupus'),
 ('17642623', 'lupus'),
 ('17491665', 'lupus')]

In [4]:
# Query Entrez for metadata of a PubMed paper given its PMID
from lxml import etree

def get_full_abstract(abstract_elements):
    """Merge the text of several abstract elements into a single string.

    Args:
        abstract_elements: Iterable of elements exposing ``xpath``; each one
            is evaluated with the XPath expression ``string()``.

    Returns:
        The per-element strings joined with single spaces, with leading and
        trailing whitespace removed. An empty iterable yields ``""``.
    """
    pieces = []
    for elem in abstract_elements:
        # string() gives the element's text together with the text of any
        # nested tags, so inline markup does not cut the section short.
        pieces.append(elem.xpath("string()"))

    joined = " ".join(pieces)
    return joined.strip()

def get_metadata(pmids_with_queries):
    """Fetch title and abstract for a batch of PubMed papers via Entrez EFetch.

    Args:
        pmids_with_queries: Iterable of ``(pmid, query)`` tuples.

    Returns:
        A dict keyed by PMID. Each value is a dict with the keys
        ``"ArticleTitle"`` (``None`` when the record has no title element),
        ``"AbstractText"`` and ``"query"``. If a PMID occurs more than once in
        the input, the entry written last wins, so its ``"query"`` is the one
        from the last occurrence. PMIDs for which the response contains no
        ``PubmedArticle`` record are left out and reported in one warning.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
    """
    import warnings

    pairs = list(pmids_with_queries)
    if not pairs:
        # Nothing to look up; skip the network round trip entirely.
        return {}

    # Convert list of PMIDs to a string for POST request
    pmids_string = ",".join(pmid for pmid, _ in pairs)

    # Define parameters for payload
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        'db': 'pubmed',
        'id': pmids_string,
        'retmode': 'xml'
    }

    # Query the Entrez API; the parameters travel as form data in the body
    r = requests.post(url, data=params)
    r.raise_for_status()  # Raise an exception for HTTP error responses

    # Parse the entire XML response
    doc = etree.fromstring(r.content)  # Use r.content for binary response

    # Index the articles by PMID in a single pass instead of re-scanning the
    # whole document once per PMID. setdefault keeps the first match.
    articles_by_pmid = {}
    for article in doc.findall(".//PubmedArticle"):
        articles_by_pmid.setdefault(article.findtext("MedlineCitation/PMID"), article)

    papers_dict = {}
    missing = []

    for pmid, query in pairs:
        article = articles_by_pmid.get(pmid)
        if article is None:
            # No PubmedArticle record for this PMID: skip it rather than
            # aborting the whole batch with an IndexError.
            missing.append(pmid)
            continue

        # Read the title with itertext() so text inside nested markup
        # (e.g. <i>, <sub>) is kept; findtext() would stop at the first tag.
        title_elem = article.find(".//ArticleTitle")
        title = "".join(title_elem.itertext()) if title_elem is not None else None

        # Grab all tags named "AbstractText" for this article
        abstract_elements = article.findall(".//AbstractText")

        # Build full abstract from tags
        full_abstract = get_full_abstract(abstract_elements)

        # Populate paper dictionary
        papers_dict[pmid] = {
            "ArticleTitle": title,
            "AbstractText": full_abstract,
            "query": query
        }

    if missing:
        warnings.warn(
            f"No PubmedArticle record returned for {len(missing)} PMID(s): {', '.join(missing)}"
        )

    return papers_dict

In [5]:
# Process all 2000 papers and save metadata to JSON file
import json
 
# Collect the PMIDs for both search terms: Alzheimer's results first, cancer results second
pmids_with_queries = [
    *get_pmids("alzheimers", "2023", 1000),
    *get_pmids("cancer", "2023", 1000),
]

# Look up title and abstract for every collected PMID
papers_dict = get_metadata(pmids_with_queries)

# Serialise the metadata and write it to disk
with open("papers_dict.json", "w") as f:
    f.write(json.dumps(papers_dict))

In [9]:
# Find overlap in PMIDs between two queries
pmids = [pmid for pmid, _ in pmids_with_queries]  # Extract only PMIDs

# Group PMIDs by the query label stored next to each one. Slicing the list in
# half assumed both searches returned the same number of hits, and the two
# halves were assigned to the wrong names (the first half holds the
# Alzheimer's results, not the cancer ones).
alzheimers = [pmid for pmid, query in pmids_with_queries if query == "alzheimers"]
cancer = [pmid for pmid, query in pmids_with_queries if query == "cancer"]

# Index in `pmids` where the Alzheimer's results end and the cancer results begin
split = len(alzheimers)

def intersection(list_1, list_2):
    """Return the values of ``list_1`` that also occur in ``list_2``.

    Args:
        list_1: Iterable whose order (and any repeated values) is preserved
            in the result.
        list_2: Iterable of hashable values to test membership against. It is
            read exactly once, so a one-shot iterator works too.

    Returns:
        A list of the shared values, in ``list_1`` order.
    """
    # Build the lookup once: set membership is O(1), whereas `in` on a list
    # rescans list_2 for every element of list_1.
    lookup = set(list_2)
    overlap = [value for value in list_1 if value in lookup]
    return overlap

# Report the PMIDs that were returned by both searches
print(
    "The following PMIDs appear in both the Alzheimer's and the cancer sets of papers: "
    f"{intersection(alzheimers, cancer)}"
)

The following PMIDs appear in both the Alzheimer's and the cancer sets of papers: ['37943296', '37937963', '37936448']
