# Dependencies
Execute the following in the terminal before running any notebooks:
`pip install -r requirements.txt`

# Exercise 1: 1000 Alzheimer's disease and 1000 cancer papers from PubMed 

In [1]:
# Query Entrez API for PubMed IDs of search term and year and return as a list
import requests

def get_pmids(term, year, retmax):
    """Return the PubMed IDs matching a search term within a given year.

    Sends one request to the Entrez ESearch endpoint and returns the
    ``idlist`` field of the JSON response.

    Args:
        term: Search term, e.g. ``"lupus"``.
        year: Year combined with the term through the ``[pdat]`` filter.
        retmax: Maximum number of IDs to request.

    Returns:
        The list of PMIDs found under ``esearchresult.idlist``.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
    """
    # ESearch (not EFetch) is the endpoint that turns a query into a list of IDs
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

    # Let requests build and percent-encode the query string so that terms
    # containing spaces, '&', '#', etc. cannot corrupt the request
    params = {
        "db": "pubmed",
        "term": f"{term} AND {year}[pdat]",
        "retmode": "json",
        "retmax": retmax,
    }

    # Query the Entrez API and fail loudly on an HTTP error instead of
    # attempting to decode an error page as JSON
    r = requests.get(url, params=params)
    r.raise_for_status()

    # Grab the list of PMIDs
    return r.json()["esearchresult"]["idlist"]

# Example call: request up to 10 PMIDs for the term "lupus" restricted to the year 2004
get_pmids("lupus", "2004", 10)

['21473028',
 '21210563',
 '18958642',
 '18202459',
 '17822285',
 '17642789',
 '17642773',
 '17642626',
 '17642623',
 '17491665']

In [28]:
# Query Entrez for metadata of a PubMed paper given its PMID
from lxml import etree

def get_full_abstract(abstract_elements):
    """Join the text of all <AbstractText> elements belonging to one article.

    Args:
        abstract_elements: Iterable of lxml elements (may be empty).

    Returns:
        The sections' text joined by single spaces; an empty string when
        there are no elements or none of them contains text.
    """
    # itertext() also picks up text nested inside inline markup (<i>, <sub>, ...),
    # which a plain `.text` access would drop
    sections = ("".join(elem.itertext()).strip() for elem in abstract_elements)
    return " ".join(section for section in sections if section)


def get_metadata(pmids, queries=None):
    """Fetch title and abstract for a list of PubMed IDs via Entrez EFetch.

    Each <PubmedArticle> record in the response is parsed on its own, so the
    title and abstract are always taken from the same record as the PMID they
    are stored under, even when some articles have no abstract or have an
    abstract split into several <AbstractText> sections.

    Args:
        pmids: List of PMID strings.
        queries: Optional list of query labels, parallel to ``pmids``.
            When omitted, the labels default to 1000 ``"alzheimers"`` entries
            followed by 1000 ``"cancer"`` entries. If a PMID occurs more than
            once in ``pmids``, the label of its last occurrence is used.

    Returns:
        Dict mapping each PMID found in the response to a dict with the keys
        ``"ArticleTitle"``, ``"AbstractText"`` and ``"query"``; ``None`` when
        the API does not answer with status 200. An empty ``pmids`` list
        yields an empty dict without contacting the API.

    Raises:
        ValueError: If there are fewer query labels than PMIDs.
    """
    if not pmids:
        return {}

    if queries is None:
        # Default layout of this exercise: Alzheimer's IDs first, cancer IDs second
        queries = ["alzheimers"] * 1000 + ["cancer"] * 1000
    if len(queries) < len(pmids):
        raise ValueError(
            f"Got {len(pmids)} PMIDs but only {len(queries)} query labels"
        )
    query_by_pmid = dict(zip(pmids, queries))

    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

    # Define parameters for POST payload (IDs are sent as one comma-separated string)
    params = {
        'db': 'pubmed',
        'id': ",".join(pmids),
        'retmode': 'xml'
    }

    # Query the Entrez API; the parameters travel in the request body
    r = requests.post(url, data=params)

    if r.status_code != 200:
        print(f"Error: {r.status_code}")
        return None

    # Parse the raw bytes: lxml rejects str input that carries an XML
    # encoding declaration, and bytes let the parser honour that declaration
    doc = etree.fromstring(r.content)

    papers_dict = {}
    for article in doc.xpath("//PubmedArticle"):
        # Use the citation's own PMID; other <PMID> elements can occur deeper
        # in the record, so the path is spelled out explicitly
        pmid = article.findtext("MedlineCitation/PMID")
        if pmid is None:
            continue

        title_elem = article.find(".//ArticleTitle")
        title = "".join(title_elem.itertext()) if title_elem is not None else ""

        papers_dict[pmid] = {
            "ArticleTitle": title,
            # Only the main <Abstract>; sections of a structured abstract are joined
            "AbstractText": get_full_abstract(article.findall(".//Abstract/AbstractText")),
            # .get(): a record whose PMID was not in the request gets the label None
            "query": query_by_pmid.get(pmid)
        }
    return papers_dict

In [30]:
# Process all 2000 papers and save metadata to JSON file
import json

# Collect up to 1000 PMIDs per query for 2023: Alzheimer's IDs first, cancer IDs second
pmids = get_pmids("alzheimers", "2023", 1000) + get_pmids("cancer", "2023", 1000)
papers_dict = get_metadata(pmids)

# get_metadata returns None when the API call fails; stop here rather than
# overwrite papers_dict.json with a bare `null`
if papers_dict is None:
    raise RuntimeError("Fetching paper metadata failed; papers_dict.json was not written")

# Explicit encoding keeps the output file independent of the platform default
with open("papers_dict.json", "w", encoding="utf-8") as f:
    json.dump(papers_dict, f)

In [35]:
# Find PMID overlap
# `pmids` was built as Alzheimer's IDs followed by cancer IDs, so the first
# half belongs to Alzheimer's and the second half to cancer.
# NOTE: the midpoint split assumes both searches returned the same number of IDs.
split = len(pmids) // 2
alzheimers, cancer = pmids[:split], pmids[split:]

def intersection(list_1, list_2):
    """Return the values of ``list_1`` that also occur in ``list_2``.

    The result keeps the order of ``list_1`` and repeats a value as often as
    it appears there.

    Args:
        list_1: Values to filter.
        list_2: Values to test membership against.

    Returns:
        A new list with the shared values.
    """
    try:
        # Set lookup avoids rescanning list_2 for every value of list_1
        members = set(list_2)
        return [value for value in list_1 if value in members]
    except TypeError:
        # Unhashable values (e.g. lists) cannot go through a set; fall back
        # to plain list membership, which only needs equality
        return [value for value in list_1 if value in list_2]

# Report the PMIDs that intersection() finds in both lists
print(f"The following PMIDs appear in both the Alzheimer's and the cancer sets of papers: {intersection(alzheimers, cancer)}")

The following PMIDs appear in both the Alzheimer's and the cancer sets of papers: ['37937963', '37936448', '37933846', '37933726']
