In [1]:
# Distemist -> Easy
# Quaero -> MEDLINE / EMEA (2016)
# MM -> ML and CL
# 
# MantraGSC -> DE, EN, ES, FR, NL * EMEA, MEDLINE, PATENTS

In [2]:
from pathlib import Path
from xmen import load_kb, load_config
from xmen.log import logger
from dataloaders import load_dataset

In [3]:
# Configuration file handed to load_config below; swap which line is active to pick the other one.
config_name = 'benchmark/quaero.yaml'
#config_name = 'medmentions/medmentions_monoling.yaml'

In [4]:
# Load the configuration identified by config_name via xmen.load_config.
config = load_config(config_name)

In [5]:
# Bare expression: shows the loaded configuration as this cell's output.
config

{'defaults': [{'benchmark': '???'}, '_self_'], 'base_dir': '${oc.env:HOME}/runs/xmen/', 'work_dir': '${base_dir}/${name}/', 'linker': {'candidate_generation': {'sapbert': {'k': 1000}, 'ngram': {'k': 100}}}, 'save_intermediate': True, 'data': {'expand_abbreviations': True}, 'hydra_work_dir': '${base_dir}/${benchmark.name}/', 'hydra': {'job': {'chdir': True}, 'run': {'dir': '${hydra_work_dir}/outputs/${now:%Y-%m-%d_%H-%M-%S}'}}, 'base_config': '../benchmark.yaml', 'name': 'quaero_benchmark', 'dataset': 'quaero', 'dict': {'umls': {'lang': ['fr', 'en'], 'meta_path': '${oc.env:UMLS_HOME}/2014AB/META', 'semantic_groups': ['ANAT', 'CHEM', 'DEVI', 'DISO', 'GEOG', 'LIVB', 'OBJC', 'PHEN', 'PHYS', 'PROC']}}}

In [6]:
# Loads the configuration again (same call as in the earlier cell) and uses its
# work_dir entry as the base directory for the paths built in the following cells.
config = load_config(config_name)
base_path = Path(config.work_dir)

In [7]:
# Both artefacts are looked up below the work directory: the dictionary file is
# named after the configuration, the index lives in a fixed 'index' sub-directory.
dict_name = base_path / f"{config.name}.jsonl"
index_base_path = base_path / 'index'

# A missing artefact is only reported through the logger; execution continues,
# and the message names the xmen command to run.
if not dict_name.exists():
    logger.error(f"{dict_name} does not exist, please run: xmen dict path/to/config.yaml")

if not index_base_path.exists():
    logger.error(f"{index_base_path} does not exist, please run: xmen index path/to/config.yaml --all")

In [8]:
%%time
# Load the knowledge base from the dictionary file; %%time reports the duration below the cell.
kb = load_kb(dict_name)

CPU times: user 22.1 s, sys: 1.6 s, total: 23.7 s
Wall time: 23.7 s


In [None]:
%%time
# Build xmen.kb.KnowledgeBase directly from the same dictionary path
# (presumably to compare its load time with load_kb above — confirm).
# NOTE: the import is inside the timed cell, so a first run also measures the import.
from xmen.kb import KnowledgeBase
kb2 = KnowledgeBase(str(dict_name))

In [None]:
import orjson

In [None]:
%%time
raw = [orjson.loads(line) for line in open(dict_name)]

In [None]:
from dataclasses import dataclass, field
from typing import Optional, List

class MyEntity():
    """Plain-Python record for one knowledge-base concept.

    Attributes:
        concept_id: Identifier of the concept (shown as "CUI" in the repr).
        canonical_name: Primary name of the concept.
        aliases: Alternative names of the concept.
        types: Type identifiers of the concept (shown as "TUI(s)" in the repr).
        definition: Optional free-text definition.
    """

    def __init__(self,
        concept_id: str,
        canonical_name: str,
        aliases: List[str],
        types: Optional[List[str]] = None,
        definition: Optional[str] = None
    ):
        """Store the given fields on the instance.

        Args:
            concept_id: Identifier of the concept.
            canonical_name: Primary name of the concept.
            aliases: Alternative names of the concept.
            types: Type identifiers; omitted or ``None`` means no types.
            definition: Optional definition text.
        """
        self.concept_id = concept_id
        self.canonical_name = canonical_name
        self.aliases = aliases
        # A fresh list per instance: a literal [] default would be shared by
        # every instance created without an explicit `types` argument.
        self.types = [] if types is None else types
        self.definition = definition

    def __repr__(self):
        """Return a multi-line summary; at most the first 10 aliases are listed."""
        num_aliases = len(self.aliases)
        if num_aliases > 10:
            alias_line = f"Aliases (abbreviated, total: {num_aliases}): \n\t {', '.join(self.aliases[:10])}"
        else:
            alias_line = f"Aliases: (total: {num_aliases}): \n\t {', '.join(self.aliases)}"
        return (
            f"CUI: {self.concept_id}, Name: {self.canonical_name}\n"
            f"Definition: {self.definition}\n"
            f"TUI(s): {', '.join(self.types)}\n"
            + alias_line
        )

In [None]:
%%timeit
from scispacy.linking_utils import KnowledgeBase, Entity
from collections import defaultdict
import sys

alias_to_cuis = defaultdict(set)
cui_to_entity = {}

for entry in raw:
    if not entry:
        continue
    if type(entry) != list:
        entry = [entry]
    for concept in entry:
        if type(concept["concept_id"]) == int:
            concept["concept_id"] = str(concept["concept_id"])
        unique_aliases = set(concept["aliases"])
        if "canonical_name" in concept:
            unique_aliases.add(concept["canonical_name"])
        for alias in unique_aliases:
            alias_to_cuis[alias].add(concept["concept_id"])
        concept_id = concept["concept_id"]
        if not concept_id in cui_to_entity:
            cui_to_entity[concept_id] = Entity(**concept)
        else:
            cui_to_entity[concept_id] = _merge_entities(
                Entity(**concept), self.cui_to_entity[concept_id]
            )

In [None]:
# Load the dataset named by the configuration's `dataset` entry via dataloaders.load_dataset.
dataset = load_dataset(config.dataset)

In [None]:
# Loads the knowledge base from dict_name again and rebinds `kb`
# (the same call already appears in the earlier timed cell).
kb = load_kb(dict_name)

In [None]:
from xmen.data import CUIReplacer