In [None]:
"""
Build a spaCy EntityRuler pipeline component from term lists (virus terms, drug terms, etc.) and save it to a JSONL file, so it can later be used to identify those entities in text passages.

"""

In [1]:
import json
import spacy
import en_core_sci_lg
from spacy.pipeline import EntityRuler
import re

In [2]:
def generate_entityruler_pattern_list(pattern_phrases, label):
    """
    Generate a list of dicts defining patterns for an EntityRuler object.

    Duplicate phrases are collapsed, and the result is sorted so that the
    same input always produces the same list in the same order.

    Args:
        pattern_phrases: iterable of phrase strings. Items must be hashable
            and mutually comparable, which plain strings are.
        label: entity label stored under the "label" key of every pattern.

    Returns:
        A list of {"label": label, "pattern": phrase} dicts, one per unique
        phrase, ordered by phrase.
    """

    # Iterating a bare set of strings yields a different order on every
    # interpreter run (hash randomisation); sorting makes the output stable.
    unique_phrases = sorted(set(pattern_phrases))

    return [
        {"label": label, "pattern": phrase}
        for phrase in unique_phrases
    ]

def extract_patterns_from_text_file(file_path, encoding="utf-8"):
    """
    Extract a set of patterns from a text file downloaded from the VT Task Drug Analysis notebook.

    Each line is treated as one term: surrounding whitespace is stripped,
    the term is lower-cased, and blank lines are skipped.

    Args:
        file_path: path of the text file to read, one term per line.
        encoding: text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale; pass a
            different value if the term file uses another encoding.

    Returns:
        A set of the unique, lower-cased, non-empty terms in the file.
    """

    with open(file_path, encoding=encoding) as f:
        # Iterate the file object directly rather than readlines(), so the
        # whole file is never held in memory as a list.
        normalized_lines = (line.strip().lower() for line in f)
        return {term for term in normalized_lines if term}

In [3]:
"""
Config files


"""

# Date stamp embedded in both the config file name and the output file name.
date = '200501'

# Where the writer configuration itself is stored as JSON.
ent_ruler_config_path = (
    'resources/spacy_entity_rulers/input/vt_task_entity_ruler_writer_config_%s.json' % date
)

# Writer configuration:
#   'outpath'          - destination path for the serialised entity ruler
#   'label_term_paths' - maps each entity label to the text file listing its terms
config_dict = {}
config_dict['outpath'] = 'resources/spacy_entity_rulers/vt_task_entity_ruler_%s.jsonl' % date
config_dict['label_term_paths'] = {
    'VIRUS': 'resources/spacy_entity_rulers/input/virus_words.txt',
    'DRUG': 'resources/spacy_entity_rulers/input/DrugNames.txt',
}

In [4]:
# Persist the configuration as JSON, then read it back so the rest of the
# notebook works from exactly what is stored on disk.
with open(ent_ruler_config_path, 'w') as config_file:
    json.dump(config_dict, config_file)

with open(ent_ruler_config_path) as config_file:
    input_dict = json.load(config_file)

# Collect the patterns for every configured label into one flat list.
all_patterns = []
label_term_paths = input_dict['label_term_paths']
for label in label_term_paths:
    term_file_path = label_term_paths[label]
    label_terms = extract_patterns_from_text_file(term_file_path)
    all_patterns.extend(generate_entityruler_pattern_list(label_terms, label))

In [5]:
"""
Add pattern phrase list to EntityRuler, and write ruler to jsonl file.
"""
# Load the en_core_sci_lg pipeline and create an EntityRuler bound to it.
nlp = en_core_sci_lg.load()
ruler = EntityRuler(nlp)

# Destination comes from the configuration that was read back from disk.
output_path = input_dict['outpath']

# Register every collected pattern with the ruler.
print("Adding patterns to ruler...")
ruler.add_patterns(all_patterns)

# Serialise the ruler to the configured output path.
print("Saving patterns to %s" % output_path)
ruler.to_disk(output_path)

Adding patterns to ruler...
Saving patterns to resources/spacy_entity_rulers/vt_task_entity_ruler_200501.jsonl
