# Download MEDLINE topics for all MeSH Topical Descriptors and SCR Diseases

In [1]:
import datetime
import gzip
import pathlib

import tenacity
import jsonlines
import tqdm
import pandas as pd
from pubmedpy.eutilities import esearch_query
from nxontology import NXOntology

In [2]:
@tenacity.retry(wait=tenacity.wait_exponential(min=2, max=2**10))
def query_topic(mesh_term: str, scr: bool = False) -> dict:
    """
    Search PubMed for the articles indexed with a single MeSH term.

    mesh_term is the name/label of a MeSH Term.
    scr: whether the MeSH term is a supplementary concept, in which case the
         "Supplementary Concept" search field is used instead of "MeSH Terms".
         See https://github.com/hetio/medline/issues/4.

    Returns a dict with three keys:
    - "pubmed_search": the exact search string sent to PubMed
    - "timestamp": UTC time at which the query was issued (ISO 8601, seconds)
    - "pubmed_ids": sorted list of the identifiers returned by esearch_query

    The tenacity decorator re-runs the call after a failure, waiting with
    exponential backoff between attempts; no stop condition is configured.
    """
    # https://pubmed.ncbi.nlm.nih.gov/help/#pubmed-format
    # The ":noexp" suffix restricts matches to the term itself rather than
    # also including its narrower terms.
    field = "Supplementary Concept" if scr else "MeSH Terms"
    term_query = f'"{mesh_term}" [{field}:noexp]'
    payload = {'db': 'pubmed', 'term': term_query}
    result = {"pubmed_search": term_query}
    # datetime.utcnow() is deprecated; take an aware UTC time and drop the
    # tzinfo so the stored string keeps its original offset-free format.
    utc_now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    result["timestamp"] = utc_now.isoformat(timespec="seconds")
    result["pubmed_ids"] = sorted(esearch_query(payload, retmax=5000, tqdm=None))
    return result

In [3]:
# example query: look up a supplementary-concept (SCR) term by its label;
# the returned dict holds the search string, the query timestamp, and the PubMed IDs
query_topic("Tabatznik syndrome", scr=True)

{'pubmed_search': '"Tabatznik syndrome" [Supplementary Concept:noexp]',
 'timestamp': '2021-04-12T20:05:16',
 'pubmed_ids': []}

## Load MeSH Ontology

In [4]:
# read the MeSH ontology from its gzipped node-link JSON export
nxo = NXOntology.read_node_link_json("data/mesh-nxo-node-link.json.gz")
# display the number of nodes in the loaded ontology
nxo.n_nodes

300093

In [5]:
# gather the attribute dict of every ontology node, ordered by MeSH class
# and then by MeSH ID, and preview the result as a table
nodes_data = sorted(
    (attrs for _, attrs in nxo.graph.nodes(data=True)),
    key=lambda attrs: (attrs["mesh_class"], attrs["mesh_id"]),
)
term_df = pd.DataFrame(nodes_data)
term_df.head(2)

Unnamed: 0,mesh_id,mesh_class,mesh_uri,mesh_label,tree_numbers
0,D005260,CheckTag,http://id.nlm.nih.gov/mesh/2020/D005260,Female,
1,D008297,CheckTag,http://id.nlm.nih.gov/mesh/2020/D008297,Male,


In [6]:
# count how many nodes fall into each MeSH class
term_df.mesh_class.value_counts()

SCR_Chemical              243740
TopicalDescriptor          29054
SCR_Organism               19019
SCR_Disease                 6479
SCR_Protocol                1215
GeographicalDescriptor       397
PublicationType              187
CheckTag                       2
Name: mesh_class, dtype: int64

In [7]:
# restrict the node list to the MeSH classes whose topics will be queried,
# and collect the IDs of the retained nodes
keep_classes = {"TopicalDescriptor", "SCR_Disease"}
nodes_data = list(
    filter(lambda node: node["mesh_class"] in keep_classes, nodes_data)
)
mesh_ids = [node["mesh_id"] for node in nodes_data]
len(nodes_data)

35533

In [8]:
# inspect the first retained node to see which fields each record carries
nodes_data[0]

{'mesh_id': 'C000591739',
 'mesh_class': 'SCR_Disease',
 'mesh_uri': 'http://id.nlm.nih.gov/mesh/2020/C000591739',
 'mesh_label': 'Familial gynecomastia, due to increased aromatase activity'}

## Perform queries

In [9]:
# read the MeSH IDs that a previous run already queried, so only the
# remaining terms are fetched
path = pathlib.Path('data/mesh-term-topics-noexp.jsonl.gz')
existing = set()
if path.exists():
    # open the gzip stream in a with-block so the handle is closed once the
    # IDs are collected (jsonlines.Reader does not close a file it was given)
    with gzip.open(path, "rt") as read_file:
        existing = {row['mesh_id'] for row in jsonlines.Reader(read_file)}
new = sorted(set(mesh_ids) - existing)
print(f"{len(mesh_ids):,} total mesh_ids: {len(existing):,} already queried, {len(new):,} new")

35,533 total mesh_ids: 0 already queried, 35,533 new


In [10]:
# query every not-yet-fetched MeSH term and append each record to the
# gzipped JSON Lines file (mtime=0 writes a zero timestamp in the gzip header)
write_file = gzip.GzipFile(filename=path, mode="ab", mtime=0)
with write_file, jsonlines.Writer(write_file) as writer:
    for mesh_id in tqdm.tqdm(new):
        node = nxo.graph.nodes[mesh_id]
        # everything other than a topical descriptor is searched as a
        # supplementary concept
        is_scr = node["mesh_class"] != "TopicalDescriptor"
        # node attributes first, then the query fields, in one fresh dict
        result = {**node, **query_topic(node["mesh_label"], is_scr)}
        writer.write(result)

100%|██████████| 35533/35533 [15:30:42<00:00,  1.57s/it]      


In [11]:
# Read the jsonlines file back into memory and count its records.
# The gzip handle gets its own with-item because jsonlines.Reader leaves
# closing a caller-supplied file object to the caller.
with gzip.open(path, "rt") as read_file, jsonlines.Reader(read_file) as reader:
    lines = list(reader)
len(lines)

35533

In [12]:
# Show keys for a single line: the node attributes followed by the
# fields added by query_topic
list(lines[0])

['mesh_id',
 'mesh_class',
 'mesh_uri',
 'mesh_label',
 'pubmed_search',
 'timestamp',
 'pubmed_ids']