In [1]:
import sys

from os import path
from pathlib import Path

from quickumls.constants import MEDSPACY_DEFAULT_SPAN_GROUP_NAME
import quickumls.spacy_component

sys.path.insert(0, "..")
from medspacy.util import get_quickumls_demo_dir



# This notebook provides a brief introduction to using QuickUMLS in medspacy, as well as details on how it can be used in any spaCy pipeline.
### Data: A full QuickUMLS resource built from the UMLS is not provided here, as this would violate license agreements.  However, below are some pointers on how to generate such resources.  This medspacy repo includes a small sample of UMLS (RRF files) containing fewer than 100 concepts, which can be found here:
https://www.nlm.nih.gov/research/umls/new_users/online_learning/Meta_006.html
### Usage: The cells below show how to use the QuickUMLS component on its own or in combination with other medspacy components out of the box such as `medspacy.context` for detecting semantic modifiers and attributes of entities, including negation, uncertainty and others.
### NOTE: There are two major result "modes" for using QuickUMLS.  The default is to use entities in spacy documents.  The limitation of this is that matches cannot overlap.  The other "mode" is to use [SpanGroups](https://spacy.io/api/spangroup), which were introduced in spacy 3.  These spans can overlap, and several users have requested such functionality since UMLS has many overlapping concepts which you may want to process after the QuickUMLS component.  More of this is demonstrated in the notebook for Advanced QuickUMLS behavior.
### Generating QuickUMLS resources: Given RRF UMLS files, you can generate your own QuickUMLS resources with parameters such as language, character case and more.  To see more, consult the documentation here from the original QuickUMLS repo:
https://github.com/Georgetown-IR-Lab/QuickUMLS

In [2]:
import sys

import spacy
import nltk
sys.path.insert(0, "..")
import medspacy

from medspacy.util import DEFAULT_PIPE_NAMES
from medspacy.visualization import visualize_ent
from medspacy.section_detection import Sectionizer

In [3]:
# Report the interpreter's platform identifier so readers can compare
# the recorded outputs against their own environment.
platform_name = sys.platform
print('Running on platform: {}'.format(platform_name))

Running on platform: darwin


# Enable the QuickUMLS component by name since it is not enabled by default

In [4]:
# Build the set of component names to enable: medspacy's defaults plus
# QuickUMLS. The union yields a new set, so DEFAULT_PIPE_NAMES itself is
# left untouched, and it is a no-op for a name that is already present.
medspacy_pipes = DEFAULT_PIPE_NAMES | {'medspacy_quickumls'}

print(medspacy_pipes)

# Load a pipeline with exactly the requested components enabled.
nlp = medspacy.load(enable=medspacy_pipes)

{'medspacy_tokenizer', 'medspacy_quickumls', 'medspacy_target_matcher', 'medspacy_pyrush', 'medspacy_context'}


# Check which pipe components have been enabled.  This way we ensure that the QuickUMLS matcher is in the list

In [5]:
# Display the names of the components present in the loaded pipeline.
# NOTE(review): the recorded output for this cell does not list
# 'medspacy_quickumls' even though it was requested when loading the
# pipeline -- re-run on a fresh kernel to confirm the component is loaded.
nlp.pipe_names

['medspacy_pyrush', 'medspacy_target_matcher', 'medspacy_context']

# First, let's see a visualization of one of the concepts in the small sample of UMLS provided.  This concept is "dipalmitoyllecithin", which is Concept Unique Identifier (CUI) C0000039 in UMLS.

In [6]:
# Example sentence mentioning "dipalmitoyllecithin", the concept used for
# the first demonstration.
concept_text = 'Decreased dipalmitoyllecithin content found in lung specimens'

In [7]:
# Run the example sentence through the pipeline; the result is inspected
# in the following cells.
doc = nlp(concept_text)

In [8]:
# Render medspacy's entity visualization for the processed sentence.
visualize_ent(doc)

## However, there is additional metadata about any concept extracted by QuickUMLS.  For example, any CUI like this one can be a member of multiple Semantic Types.  In this case, the concept belongs to more than one Semantic Type.  Additionally, since QuickUMLS performs approximate matching, note that the similarity of the extracted concept from our text to the canonical resources can be observed.  In this case, since there is no lexical difference, we see 100% (1.0) similarity.

In [9]:
# Report each entity in the doc together with the UMLS metadata
# attached to it.
for ent in doc.ents:
    print('Entity text : {}'.format(ent.text))

    # One span can map to several UMLS candidates (distinct CUIs and
    # similarity scores), so report every candidate in turn.
    for umls_match in ent._.umls_matches:
        report_rows = (
            ('Label (UMLS CUI) : {}', ent.label_),
            ('CUI: {}', umls_match.cui),
            ('Similarity : {}', umls_match.similarity),
            ('Semtypes : {}', umls_match.semtypes),
        )
        for row_template, row_value in report_rows:
            print(row_template.format(row_value))

# So this is an example of how to use QuickUMLS on its own.  What if we want to see negation as well?  Remember that medspacy enables the `medspacy.context` component by default.  It's here in this list and we did not need to explicitly enable it.  Further, notice that the QuickUMLS component is ordered just before context.  This ensures that the Entity objects are added to the spacy `Doc` before `context` runs.

In [10]:
# Display the pipeline's component names again to inspect their order.
nlp.pipe_names

['medspacy_pyrush', 'medspacy_target_matcher', 'medspacy_context']

# Now let's come up with a different example with negation on a different lexical variant for the same UMLS concept in the relatively small "sample" resource.

In [11]:
# Example sentence that negates a different lexical variant of the same
# concept used in the first demonstration.
negation_text = 'No findings of Dipalmitoyl Phosphatidylcholine in pulmonary specimen.'

In [12]:
# Run the negated sentence through the same pipeline.
negation_doc = nlp(negation_text)

In [13]:
# Report each entity in the negated example together with the UMLS
# metadata attached to it.
for ent in negation_doc.ents:
    print('Entity text : {}'.format(ent.text))

    # One span can map to several UMLS candidates (distinct CUIs and
    # similarity scores), so report every candidate in turn.
    for umls_match in ent._.umls_matches:
        report_rows = (
            ('Label (UMLS CUI) : {}', ent.label_),
            ('CUI: {}', umls_match.cui),
            ('Similarity : {}', umls_match.similarity),
            ('Semtypes : {}', umls_match.semtypes),
        )
        for row_template, row_value in report_rows:
            print(row_template.format(row_value))

## Note that the `context` component adds the attributes about the entity such as negation and others into members of the "underscore" (`_`) which can also be examined like this: 

In [14]:
# Show only the entities for which at least one context attribute is set,
# along with the modifiers and the sentence they appear in.
for ent in negation_doc.ents:
    context_flags = (
        ent._.is_negated,
        ent._.is_uncertain,
        ent._.is_historical,
        ent._.is_family,
        ent._.is_hypothetical,
    )
    # Entities with no context attribute set are not reported.
    if not any(context_flags):
        continue
    print("'{0}' modified by {1} in: '{2}'".format(ent, ent._.modifiers, ent.sent))
    print()