# Cooccurrence of a user-selected term against all MeSH terms with citations

In [1]:
! git log -1 --oneline

[33m7744a88[m[33m ([m[1;36mHEAD -> [m[1;32mmain[m[33m, [m[1;31morigin/main[m[33m)[m Query pubmed with quoted MeSH terms and [nm]


In [2]:
import datetime
import gzip
import pathlib
from typing import List, Set

import scipy.stats
import tqdm
import jsonlines
import tqdm
import pandas as pd
from nxontology import NXOntology

from cooccurrence import cooccurrence_metrics

In [3]:
# Read the MeSH ontology from its gzipped node-link JSON export.
nxo = NXOntology.read_node_link_json("data/mesh-nxo-node-link.json.gz")
# NOTE(review): freeze() presumably makes the graph read-only so nxontology can
# cache node lookups -- confirm against the nxontology documentation.
nxo.freeze()
# Display the number of nodes that were loaded as a quick sanity check.
nxo.n_nodes

300093

In [4]:
# Read the jsonlines file into a list of records (one per line of the file).
path = pathlib.Path('data/mesh-term-topics-noexp.jsonl.gz')
# The gzip handle gets its own context manager: a Reader that wraps an already
# open file object leaves closing that object to the caller.
with gzip.open(path, "rt") as gz_file, jsonlines.Reader(gz_file) as reader:
    lines = list(reader)
for line in lines:
    # Convert the ids to a set in place. The previous key was misspelled
    # ("pumbed_ids"), which left "pubmed_ids" unconverted and stored a
    # second copy of the ids under a key nothing else reads.
    line["pubmed_ids"] = set(line["pubmed_ids"])
# Display the number of records read.
len(lines)

35533

In [5]:
# Index the records by MeSH id, dropping records whose "pubmed_ids" is empty:
# cooccurrence cannot be computed for a topic without any citations.
mesh_id_to_line = {
    record["mesh_id"]: record
    for record in lines
    if record["pubmed_ids"]
}

In [6]:
# Union of the PubMed ids of every record; its size is used later as the
# total number of citations when computing cooccurrence metrics.
all_pmids: Set[str] = set().union(*(record["pubmed_ids"] for record in lines))
len(all_pmids)

27698253

In [7]:
def explode_pubmid_ids(nxo: NXOntology, mesh_id_to_line: dict, topic: str):
    """Collect the PubMed ids attached to `topic`'s descendants.

    Args:
        nxo: ontology whose `node_info(topic).descendants` lists the nodes to
            aggregate over (presumably including `topic` itself -- confirm
            against the nxontology documentation).
        mesh_id_to_line: mapping of MeSH id to a record holding a
            "pubmed_ids" iterable.
        topic: MeSH id whose subtree should be aggregated.

    Returns:
        A new set with the union of the "pubmed_ids" of every descendant that
        is present in `mesh_id_to_line`; descendants without a record are
        skipped.
    """
    known_descendants = (
        node
        for node in nxo.node_info(topic).descendants
        if node in mesh_id_to_line
    )
    return set().union(
        *(mesh_id_to_line[node]["pubmed_ids"] for node in known_descendants)
    )

def cooccurrence_result(source_mesh_id: str, target_mesh_id: str, nxo: NXOntology, mesh_id_to_line: dict, total_pmids: int) -> dict:
    """Describe the cooccurrence of two MeSH terms as a flat dictionary.

    Both terms are first expanded with `explode_pubmid_ids`, so each term's
    PubMed ids include those of its descendants.

    Args:
        source_mesh_id: MeSH id of the source term.
        target_mesh_id: MeSH id of the target term.
        nxo: ontology used for descendant lookup and term labels.
        mesh_id_to_line: mapping of MeSH id to its record with "pubmed_ids".
        total_pmids: forwarded to `cooccurrence_metrics` as `total_pmids`.

    Returns:
        A dict with the two ids and their labels, followed by whatever
        `cooccurrence_metrics` returns for the two exploded id sets.
    """
    metrics = cooccurrence_metrics(
        explode_pubmid_ids(nxo, mesh_id_to_line, source_mesh_id),
        explode_pubmid_ids(nxo, mesh_id_to_line, target_mesh_id),
        total_pmids=total_pmids,
    )
    # Identifier and label columns come first so they lead the resulting table.
    summary = dict(
        source_mesh_id=source_mesh_id,
        target_mesh_id=target_mesh_id,
        source_mesh_label=nxo.node_info(source_mesh_id).label,
        target_mesh_label=nxo.node_info(target_mesh_id).label,
    )
    summary.update(metrics)
    return summary

In [8]:
# Spot-check a single source/target pair before scanning every term.
source_mesh_id = "D005357"  # Fibrous Dysplasia of Bone
target_mesh_id = "D009103"  # Multiple Sclerosis
cooccurrence_result(
    source_mesh_id=source_mesh_id,
    target_mesh_id=target_mesh_id,
    nxo=nxo,
    mesh_id_to_line=mesh_id_to_line,
    total_pmids=len(all_pmids),
)

{'source_mesh_id': 'D005357',
 'target_mesh_id': 'D009103',
 'source_mesh_label': 'Fibrous Dysplasia of Bone',
 'target_mesh_label': 'Multiple Sclerosis',
 'cooccurrence': 0,
 'expected': 10.945734736410992,
 'enrichment': 0.0,
 'odds_ratio': 0.0,
 'p_fisher': 1.0,
 'n_source': 4985,
 'n_target': 60818}

In [9]:
source_mesh_id = "D005357"  # Fibrous Dysplasia of Bone

# Score the source term against every MeSH term that has citations.
# A comprehension keeps the loop variable local, so this cell no longer
# overwrites the module-level `target_mesh_id` set in the previous cell.
# For a quick development run, iterate over a truncated list of keys instead,
# e.g. `list(mesh_id_to_line)[:1000]`.
rows = [
    cooccurrence_result(
        source_mesh_id,
        target_mesh_id,
        nxo,
        mesh_id_to_line,
        total_pmids=len(all_pmids),
    )
    for target_mesh_id in tqdm.tqdm(mesh_id_to_line)
]

100%|██████████| 32568/32568 [05:11<00:00, 104.58it/s] 


In [10]:
# One row per target term, ordered by ascending Fisher p-value; ties are
# broken by descending enrichment.
cooccur_df = pd.DataFrame(rows).sort_values(
    by=["p_fisher", "enrichment"],
    ascending=[True, False],
)
cooccur_df.head(15)

Unnamed: 0,source_mesh_id,target_mesh_id,source_mesh_label,target_mesh_label,cooccurrence,expected,enrichment,odds_ratio,p_fisher,n_source,n_target
8235,D005357,D002636,Fibrous Dysplasia of Bone,Cherubism,432,0.077749,5556.319559,inf,0.0,4985,432
10805,D005357,D005357,Fibrous Dysplasia of Bone,Fibrous Dysplasia of Bone,4985,0.897177,5556.319559,inf,0.0,4985,4985
10806,D005357,D005358,Fibrous Dysplasia of Bone,"Fibrous Dysplasia, Monostotic",455,0.081889,5556.319559,inf,0.0,4985,455
10807,D005357,D005359,Fibrous Dysplasia of Bone,"Fibrous Dysplasia, Polyostotic",1446,0.260244,5556.319559,inf,0.0,4985,1446
15098,D005357,D010002,Fibrous Dysplasia of Bone,Osteitis Fibrosa Cystica,570,0.29192,1952.59072,3398.490955,0.0,4985,1622
15105,D005357,D010009,Fibrous Dysplasia of Bone,Osteochondrodysplasias,4985,5.510482,904.639526,inf,0.0,4985,30618
15096,D005357,D010000,Fibrous Dysplasia of Bone,Osteitis,508,0.711802,713.681501,911.497502,0.0,4985,3955
23170,D005357,D019205,Fibrous Dysplasia of Bone,"GTP-Binding Protein alpha Subunits, Gs",218,0.385507,565.489105,658.188528,0.0,4985,2142
7503,D005357,D001848,Fibrous Dysplasia of Bone,"Bone Diseases, Developmental",4985,14.657724,340.093722,inf,0.0,4985,81443
16640,D005357,D011629,Fibrous Dysplasia of Bone,"Puberty, Precocious",190,0.841564,225.770042,244.573598,0.0,4985,4676


In [11]:
# cooccur_df.head(1000).to_excel("data/medline-cooccurrence.xlsx", index=False, freeze_panes=(0, 1))