In [1]:
import spacy
import scispacy
from scispacy.linking import EntityLinker
# Load the spaCy pipeline named 'en_core_sci_sm'; every later call in this
# cell goes through this `nlp` object.
nlp = spacy.load('en_core_sci_sm') 
# Alternative model names, left disabled (presumably the medium / large
# variants of the same model family -- TODO confirm before switching):
# nlp = spacy.load('en_core_sci_md') 
# nlp = spacy.load('en_core_sci_lg') 

# Append the "scispacy_linker" component to the pipeline, configured to use
# the "umls" linker with abbreviation resolution switched on.
nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})


def extract_information(text, model):
    """Run ``model`` over ``text`` and collect the entities it recognises.

    Args:
        text: Raw input string to analyse.
        model: Callable pipeline. Calling it with ``text`` must return an
            object exposing an ``ents`` iterable whose items carry ``text``
            and ``label_`` attributes.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, in the order in
        which ``ents`` yields them.
    """
    pairs = []
    for span in model(text).ents:
        pairs.append((span.text, span.label_))
    return pairs


# Example sentence used as a quick check of the pipeline.
original_text = "The 60-year-old female patient reports persistent abdominal pain and intermittent fever."
# Run the extractor with the `nlp` pipeline built earlier in this cell and
# print the resulting list of (text, label) tuples.
entities = extract_information(original_text, nlp)
print("Extracted Entities:", entities)


  from .autonotebook import tqdm as notebook_tqdm
Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib
  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Extracted Entities: [('female', 'ENTITY'), ('patient reports', 'ENTITY'), ('persistent', 'ENTITY'), ('abdominal pain', 'ENTITY'), ('intermittent fever', 'ENTITY')]


In [2]:
import spacy
import scispacy
import re
from scispacy.linking import EntityLinker

# Load the model and add the linker.
# NOTE(review): this repeats the load + add_pipe setup performed in the first
# cell, rebinding `nlp` to a freshly built pipeline with the same
# configuration ("umls" linker, abbreviation resolution enabled).
nlp = spacy.load('en_core_sci_sm')
nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

def extract_entities_with_age(text):
    """
    Extract entities using UMLS linking and add custom age pattern matching.

    Every entity produced by the module-level ``nlp`` pipeline is reported.
    When the linker attached a candidate, the first candidate's concept
    supplies the UMLS fields and its first semantic type becomes the entity
    ``type``. Numeric age mentions such as "60-year-old" or "45 yrs old" are
    tagged with the custom type ``'AGE_VALUE'``, whether or not the pipeline
    recognised them as entities.

    Args:
        text: The raw text to analyse.

    Returns:
        A list of dicts, pipeline entities first (in document order) followed
        by any regex-only age mentions. Every dict has the same keys:

        - ``'text'``: the matched surface string
        - ``'type'``: first UMLS semantic type, ``'AGE_VALUE'``, or
          ``'UNKNOWN'`` when neither applies
        - ``'umls_id'``, ``'score'``, ``'semantic_types'``, ``'definition'``:
          taken from the first linker candidate, or ``None`` when the entity
          has no candidate or was found by the age regex only
    """
    # Pattern for numeric age mentions
    age_pattern = r'\b\d+[-\s]?(?:year|yr)s?[-\s]?old\b'

    doc = nlp(text)
    # Loop-invariant lookup, so fetch the linker component once.
    linker = nlp.get_pipe("scispacy_linker")

    entities = []
    # Character offsets of each pipeline entity, used below to decide whether
    # a regex age match is already covered by an entity.
    entity_spans = []

    # Process each entity found by spaCy/UMLS
    for ent in doc.ents:
        # Start from the full schema so unlinked entities expose the same keys
        # as linked ones and consumers can index any field safely.
        entity_info = {
            'text': ent.text,
            'type': 'UNKNOWN',
            'umls_id': None,
            'score': None,
            'semantic_types': None,
            'definition': None,
        }

        # Check if there's a UMLS link; only the first candidate is used.
        kb_ents = ent._.kb_ents
        if kb_ents:
            umls_id, score = kb_ents[0]
            concept = linker.kb.cui_to_entity[umls_id]

            entity_info.update({
                'umls_id': umls_id,
                'score': score,
                'semantic_types': concept.types,
                'definition': concept.definition
            })

            # Get the first semantic type as the main type
            if concept.types:
                entity_info['type'] = concept.types[0]

        # An age mention inside the entity text overrides the UMLS type.
        if re.search(age_pattern, ent.text, re.IGNORECASE):
            entity_info['type'] = 'AGE_VALUE'

        entities.append(entity_info)
        entity_spans.append((ent.start_char, ent.end_char))

    # Add any age patterns that weren't caught as entities. Coverage is judged
    # by character position rather than by substring comparison, so a repeated
    # age mention is not mistaken for one that was already reported.
    for match in re.finditer(age_pattern, text, re.IGNORECASE):
        start, end = match.span()
        covered = any(
            span_start <= start and end <= span_end
            for span_start, span_end in entity_spans
        )
        if covered:
            continue
        entities.append({
            'text': match.group(),
            'type': 'AGE_VALUE',
            'umls_id': None,
            'score': None,
            'semantic_types': None,
            'definition': None
        })

    return entities

def print_entity_analysis(text):
    """
    Print detailed analysis of entities found in the text.

    Runs ``extract_entities_with_age`` on ``text`` and writes one section per
    entity to stdout: its surface text and type, followed by the UMLS ID,
    confidence score, semantic types and definition when the entity has a
    UMLS ID.

    Args:
        text: The raw text to analyse.

    Returns:
        None. All results are printed.
    """
    print(f"\nAnalyzing text: {text}")
    print("\nEntities found:")

    entities = extract_entities_with_age(text)

    for entity in entities:
        print(f"\nEntity: {entity['text']}")
        print(f"Type: {entity['type']}")

        # The UMLS fields may be missing or None for an entity that has no
        # linked concept, so read them with .get() instead of indexing, which
        # would raise KeyError on a missing key.
        umls_id = entity.get('umls_id')
        if umls_id:
            print(f"UMLS ID: {umls_id}")
            score = entity.get('score')
            # Only apply the float format when a score is actually present.
            if score is not None:
                print(f"Confidence Score: {score:.2f}")
            print(f"Semantic Types: {entity.get('semantic_types')}")
            print(f"Definition: {entity.get('definition')}")

# Main: demo driver for this cell. The guard restricts the demo to runs where
# __name__ is "__main__".
if __name__ == "__main__":
    # Sample sentences to analyse; append more strings to try other inputs.
    test_texts = [
        "The 60-year-old female patient reports persistent abdominal pain."
    ]
    
    # Print a full entity report for each sample sentence.
    for text in test_texts:
        print_entity_analysis(text)


Analyzing text: The 60-year-old female patient reports persistent abdominal pain.

Entities found:

Entity: female
Type: T098
UMLS ID: C0043210
Confidence Score: 0.99
Semantic Types: ['T098']
Definition: Human females as cultural, psychological, sociological, political, and economic entities.

Entity: patient reports
Type: T170
UMLS ID: C0747307
Confidence Score: 0.97
Semantic Types: ['T170']
Definition: An indication that the patient provides information about their vital signs, status, or subjective feelings. Some systems also consider information reported by the patient's family or guardian as patient-reported.

Entity: persistent
Type: T079
UMLS ID: C0205322
Confidence Score: 0.98
Semantic Types: ['T079']
Definition: Retained; never-ceasing.

Entity: abdominal pain
Type: T184
UMLS ID: C0000737
Confidence Score: 0.97
Semantic Types: ['T184']
Definition: Sensation of discomfort, distress, or agony in the abdominal region.

Entity: 60-year-old
Type: AGE_VALUE
