# Cooccurrence of a user-selected term against all MeSH terms with citations

In [1]:
import datetime
import gzip
import pathlib
from typing import List, Set

import scipy.stats
import tqdm
import jsonlines
import tqdm
import pandas as pd
from nxontology import NXOntology

from cooccurrence import cooccurrence_metrics

In [2]:
# read the MeSH ontology
nxo = NXOntology.read_node_link_json("data/mesh-nxo-node-link.json.gz")
nxo.freeze()
nxo.n_nodes

300093

In [3]:
# Read the jsonlines file
path = pathlib.Path('data/mesh-term-topics-noexp.jsonl.gz')
with jsonlines.Reader(gzip.open(path, "rt")) as reader:
    lines = list(reader)
for line in lines:
    line["pumbed_ids"] = set(line["pubmed_ids"])
len(lines)

35533

In [4]:
# filter topics without mesh_ids since cooccurrence cannot be computed
mesh_id_to_line = {line["mesh_id"]: line for line in lines if line["pubmed_ids"]}

In [5]:
all_pmids: Set[str] = set()
for line in lines:
    all_pmids |= set(line["pubmed_ids"])
len(all_pmids)

27700101

In [6]:
def explode_pubmid_ids(nxo: NXOntology, mesh_id_to_line: dict, topic: str):
    exploded_pubmed_ids = set()
    for descendant in nxo.node_info(topic).descendants:
        if descendant not in mesh_id_to_line:
            continue
        exploded_pubmed_ids |= set(mesh_id_to_line[descendant]["pubmed_ids"])
    return exploded_pubmed_ids

def cooccurrence_result(source_mesh_id: str, target_mesh_id: str, nxo: NXOntology, mesh_id_to_line: dict, total_pmids: int) -> dict:
    source_pmids = explode_pubmid_ids(nxo, mesh_id_to_line, source_mesh_id)
    target_pmids = explode_pubmid_ids(nxo, mesh_id_to_line, target_mesh_id)
    result = {
        "source_mesh_id": source_mesh_id,
        "target_mesh_id": target_mesh_id,
        "source_mesh_label": nxo.node_info(source_mesh_id).label,
        "target_mesh_label": nxo.node_info(target_mesh_id).label,
    }
    result.update(cooccurrence_metrics(source_pmids, target_pmids, total_pmids=total_pmids))
    return result

In [7]:
source_mesh_id = "D005357" # Fibrous Dysplasia of Bone
target_mesh_id = "D009103"
cooccurrence_result(source_mesh_id, target_mesh_id, nxo, mesh_id_to_line, total_pmids=len(all_pmids))

{'source_mesh_id': 'D005357',
 'target_mesh_id': 'D009103',
 'source_mesh_label': 'Fibrous Dysplasia of Bone',
 'target_mesh_label': 'Multiple Sclerosis',
 'cooccurrence': 1,
 'expected': 11.212882581186257,
 'enrichment': 0.08918313312919807,
 'odds_ratio': 0.08898974017319211,
 'p_fisher': 0.9999867425484787,
 'n_source': 5106,
 'n_target': 60830}

In [8]:
source_mesh_id = "D005357" # Fibrous Dysplasia of Bone
rows = list()
for target_mesh_id in tqdm.tqdm(mesh_id_to_line):
    # for development
    if len(rows) > 1000:
        break
    row = cooccurrence_result(source_mesh_id, target_mesh_id, nxo, mesh_id_to_line, total_pmids=len(all_pmids))
    rows.append(row)

  3%|▎         | 1001/31045 [00:00<00:22, 1363.74it/s]


In [9]:
cooccur_df = pd.DataFrame(rows)
cooccur_df.sort_values("p_fisher").head(10)

Unnamed: 0,source_mesh_id,target_mesh_id,source_mesh_label,target_mesh_label,cooccurrence,expected,enrichment,odds_ratio,p_fisher,n_source,n_target
80,D005357,C535285,Fibrous Dysplasia of Bone,Ramon Syndrome,108,0.019908,5425.009988,inf,0.0,5106,108
773,D005357,C537045,Fibrous Dysplasia of Bone,Albright's hereditary osteodystrophy,101,0.27576,366.260701,400.610658,3.0102020000000004e-219,5106,1496
541,D005357,C536444,Fibrous Dysplasia of Bone,Corneodermatoosseous syndrome,282,21.623739,13.041223,13.775837,8.109068e-207,5106,117309
218,D005357,C535654,Fibrous Dysplasia of Bone,Rosenthal-Kloepfer syndrome,282,21.623739,13.041223,13.775837,8.109068e-207,5106,117309
780,D005357,C537058,Fibrous Dysplasia of Bone,Fitzsimmons-McLachlan-Gilbert syndrome,282,21.623739,13.041223,13.775837,8.109068e-207,5106,117309
957,D005357,C537518,Fibrous Dysplasia of Bone,Podder-Tolmie syndrome,282,21.623739,13.041223,13.775837,8.109068e-207,5106,117309
107,D005357,C535335,Fibrous Dysplasia of Bone,Abderhalden-Kaufmann-Lignac syndrome,282,21.623739,13.041223,13.775837,8.109068e-207,5106,117309
297,D005357,C535874,Fibrous Dysplasia of Bone,Rowley-Rosenberg syndrome,282,21.623739,13.041223,13.775837,8.109068e-207,5106,117309
535,D005357,C536428,Fibrous Dysplasia of Bone,Cleidorhizomelic syndrome,282,21.623739,13.041223,13.775837,8.109068e-207,5106,117309
136,D005357,C535458,Fibrous Dysplasia of Bone,Chudley-Rozdilsky syndrome,282,21.623739,13.041223,13.775837,8.109068e-207,5106,117309
