Use the requests module (or urllib) to query the Entrez API (see slides 8) and identify the PubMed IDs of 1000 Alzheimer's papers from 2019 and of 1000 cancer papers from 2019. (9 points)

In [1]:
import requests
import xml.dom.minidom as m
def get_id(disease):
    """Return the PubMed IDs of up to 1000 papers matching *disease* from 2019.

    Sends an ESearch request to the NCBI Entrez E-utilities for
    ``<disease> AND 2019[pdat]`` and collects the text of every ``<Id>``
    element found in the XML reply.

    Args:
        disease: Search term, e.g. ``"Cancer"``.

    Returns:
        A list of PubMed ID strings, in the order the response lists them.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If no answer arrives within 30 seconds.
    """
    r = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        # Handing the query to ``params`` lets requests URL-encode it, so
        # terms containing spaces or reserved characters are sent intact.
        params={
            "db": "pubmed",
            "term": f"{disease} AND 2019[pdat]",
            "retmode": "xml",
            "retmax": 1000,
        },
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of feeding an error page to the parser.
    r.raise_for_status()
    doc = m.parseString(r.text)

    # Reference:https://stackoverflow.com/questions/317413/get-element-value-with-minidom-with-python
    return [
        id_node.firstChild.data
        for id_node in doc.getElementsByTagName("Id")
        if id_node.firstChild is not None  # skip an empty <Id/> element
    ]

# Find the PubMed IDs of the Alzheimers papers returned by get_id.
# The guard keeps the request from running when this file is imported.
if __name__ == "__main__":
    pm_id_alzheimer = get_id("Alzheimers")  # list of ID strings, read again by later cells
    print(pm_id_alzheimer)

['33939349', '33841007', '33627920', '33463291', '33323224', '33243028', '33097841', '32954270', '32821704', '32802097', '32641955', '32598641', '32501203', '32489952', '32477473', '32477472', '32399472', '32375155', '32363346', '32341985', '32341979', '32322468', '32322464', '32308848', '32269839', '32269835', '32254802', '32232173', '32222300', '32209796', '32202743', '32202739', '32186116', '32147039', '32140393', '32140387', '32118475', '32116643', '32116624', '32114583', '32099861', '32096053', '32096052', '32096051', '32096050', '32096047', '32096046', '32096045', '32096044', '32096043', '32096042', '32096041', '32096039', '32096038', '32096036', '32096034', '32096033', '32096032', '32096030', '32096029', '32096028', '32096027', '32096025', '32096024', '32089828', '32087948', '32083083', '32082970', '32082407', '32072882', '32065917', '32063857', '32062666', '32057309', '32057308', '32057307', '32049662', '32048434', '32048245', '32045917', '32045913', '32042569', '32037030', '32

In [2]:
# Find the PubMed IDs of the Cancer papers returned by get_id.
# The guard keeps the request from running when this file is imported.
if __name__ == "__main__":
    pm_id_cancer = get_id("Cancer")  # list of ID strings, read again by later cells
    print(pm_id_cancer)

['34643516', '34590506', '34539049', '34539046', '34493369', '34460208', '34460207', '34460206', '34460205', '34460204', '34460203', '34460202', '34460201', '34460200', '34460199', '34460198', '34460197', '34460196', '34460195', '34460194', '34460193', '34460192', '34460191', '34460190', '34460189', '34460188', '34460187', '34426479', '34414848', '34414847', '34306915', '34243914', '34243910', '34239389', '34190022', '34177378', '34175032', '34096432', '34059235', '34027418', '34026405', '34026404', '34024926', '33998520', '33994729', '33994726', '33994724', '33994723', '33979095', '33979094', '33979093', '33979092', '33979091', '33979090', '33979089', '33979088', '33979087', '33979086', '33979085', '33979084', '33979083', '33979082', '33979081', '33979080', '33979079', '33979078', '33979077', '33979076', '33979075', '33979074', '33979073', '33969772', '33939350', '33927493', '33911342', '33907724', '33899924', '33884377', '33870087', '33869785', '33867720', '33867704', '33854408', '33

Use the Entrez API via requests/urllib to pull the metadata for each paper found above (both cancer and Alzheimers), and save a JSON file storing each paper's title, abstract, MeSH terms (DescriptorName inside of MeshHeading), and the query that found it; the file is of the general form built by the code in the next cell. (12 points) (Edited 2021-10-11 to clarify which papers.)

In [3]:
import json
import time
from os.path import exists
def get_infor(pmid_list, query):
    """Fetch title, abstract and MeSH terms for every PubMed ID in *pmid_list*.

    One EFetch request is sent per ID, followed by a one-second pause. No
    file is opened or written here; the caller is responsible for saving
    the returned dictionary.

    Args:
        pmid_list: Iterable of PubMed ID strings.
        query: Label stored with every article under the ``"query"`` key.

    Returns:
        A dict mapping each PubMed ID to a dict with the keys
        ``"ArticleTitle"`` (str), ``"AbstractText"`` (str), ``"query"``
        (the *query* argument) and ``"mesh"`` (list of str). An empty
        *pmid_list* yields an empty dict.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    infor_dict = {}
    for pmid in pmid_list:
        r = requests.post(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id={pmid}")
        # Fail loudly on an HTTP error instead of feeding an error page to the parser.
        r.raise_for_status()
        doc = m.parseString(r.text)

        title = "".join(
            _node_text(elm) for elm in doc.getElementsByTagName("ArticleTitle")
        )
        # A structured abstract arrives as several AbstractText elements; a
        # space between them keeps the last word of one section from fusing
        # with the first word of the next.
        abstract = " ".join(
            _node_text(elm) for elm in doc.getElementsByTagName("AbstractText")
        )
        # One entry per DescriptorName element; elements without text are skipped.
        mesh_terms_list = [
            term
            for term in map(_node_text, doc.getElementsByTagName("DescriptorName"))
            if term
        ]

        # Save all article information in the infor_dict
        infor_dict[pmid] = {
            "ArticleTitle": title,
            "AbstractText": abstract,
            "query": query,
            "mesh": mesh_terms_list,
        }

        # Pause between requests so the service is not flooded.
        time.sleep(1)

    return infor_dict


def _node_text(node):
    """Return all text inside *node*, descending through nested markup."""
    # Text and CDATA nodes carry the characters themselves; every other node
    # type (elements such as <i> or <sub>, comments, ...) contributes only
    # what its children contain.
    if node.nodeType in (m.Node.TEXT_NODE, m.Node.CDATA_SECTION_NODE):
        return node.data
    return "".join(_node_text(child) for child in node.childNodes)

There are of course many more papers of each category, but is there any overlap in the two sets of papers that you identified? (3 points)

In [4]:
# Find the PubMed IDs returned by both searches. The result is sorted so the
# list order is the same on every run; plain set iteration order over strings
# is not guaranteed to be stable between interpreter sessions.
overlap_pmID = sorted(set(pm_id_alzheimer).intersection(pm_id_cancer))
overlap_pmID

['32501203']

In [6]:
# Fetch the metadata for both ID lists and store everything in one JSON file
if __name__ == "__main__":
    all_data = get_infor(pm_id_alzheimer, "Alzheimer")
    cancer_data = get_infor(pm_id_cancer, "Cancer")
    # For an ID present in both lists, update() keeps the cancer entry, so its
    # "query" label is corrected just below.
    all_data.update(cancer_data)

    # Relabel every paper found by both searches; the loop also copes with
    # zero or several shared IDs instead of assuming exactly one.
    for shared_pmid in overlap_pmID:
        all_data[shared_pmid]["query"] = "Alzheimer/Cancer"

    # The with-statement closes the file on exit, so no explicit close() is needed.
    with open("pubmed_articles.json", "w") as f:  # write mode
        json.dump(all_data, f)