In [48]:
# How to add a custom NER Ruler to an existing Model to extend NER
import spacy
import json
import os
from spacy.pipeline import EntityRuler

text= "Martha, a senior, moved to Spain where she will be playing basketball until June 2022 or until she can't play any longer. She previously played soccer and made the national under 15 team."
nlp = spacy.load("en_core_web_sm")
ruler = nlp.add_pipe("entity_ruler")
patterns = [{"label": "SPORT", "pattern" : "basketball"},{"label": "SPORT", "pattern" : "soccer"}]
ruler.add_patterns(patterns)
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)
    
    
nlp.to_disk("./NER_Models/sports_ner")



            
            

Martha PERSON
Spain GPE
basketball SPORT
June 2022 DATE
soccer SPORT
15 CARDINAL


In [53]:
# How to add a custom Syslog NER Ruler to an existing Model to extend NER
import spacy
import json
import os
from spacy.pipeline import EntityRuler

text= "Jul 11 16:38:47 snuc-sdkvm app.py: Workspace_Management: Terminating workspace 5 /workspace - new workspace"

nlp = spacy.load("en_core_web_sm")
ruler = nlp.add_pipe("entity_ruler")
patterns = [{"label": "COMPONENT", "pattern" : "workspace"},{"label": "COMPONENT", "pattern" : "Workspace_Management"}]
ruler.add_patterns(patterns)
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)
    
    
nlp.to_disk("./NER_Models/syslog_a_ner")

Jul 11 LAW
16:38:47 TIME
Workspace_Management COMPONENT
workspace COMPONENT
5 CARDINAL
workspace COMPONENT


In [54]:
# An example of using the previously extended model
import spacy
import json
import os
from spacy.pipeline import EntityRuler

text= "Martha, a senior, moved to Spain where she will be playing basketball until June 2022 or until she can't play any longer. She previously played soccer and made the national under 15 team."
nlp=spacy.load("./NER_Models/sports_ner")
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)
print(spacy.__version__)

Martha PERSON
Spain GPE
basketball SPORT
June 2022 DATE
soccer SPORT
15 CARDINAL
3.5.2


In [11]:
# How to add a custom NER Ruler (based on patterns stored in a json file) to an existing Model to extend NER

import spacy
import json
import os
from spacy.pipeline import EntityRuler

 

current_dir = os.getcwd()
print("Current Working Directory:", current_dir)

# format of Spacy yTraining Data
# TRAIN_DATA = [(text, {"entities": [(start,end,label)]})]

def save_data(file,data):
    with open(file, "w", encoding="utf-8") as f:
        json.dump(data,f, indent=4)

def load_data(file):
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
    return (data)  

data = load_data("ReferenceFiles/syslog_entities.json")
print(data)



def test_model(model, text):
    doc = nlp(text)
    results = []
    entities = []
    for ent in doc.ents:
        #results.append((ent.start_char,ent.end_char, ent.label))
        something = ((ent.start_char,ent.end_char, ent.label))
        print(something)
        
    return (results)  


def create_training_data(file,type):
    data = load_data(file)
    patterns = []
    for item in data:
        pattern = {"label" : type , "pattern" : item} # this is what spacy expects
        patterns.append(pattern)
    return patterns    
        
        
def generate_rules(nlp,patterns):
    #nlp = spacy.load("en_core_web_sm")
    ruler = nlp.add_pipe("entity_ruler")
    #patterns = [{"label": "COMPONENT", "pattern" : "workspace"},{"label": "COMPONENT", "pattern" : "Workspace_Management"}]
    ruler.add_patterns(patterns)
    doc = nlp(text)
    nlp.to_disk("./NER_Models/syslog_b_ner")

nlp = spacy.load("en_core_web_sm", disable=["tagger","parser","lemmatizer","tok2vec", "tagger"])

#nlp = spacy.load("en_core_web_sm")            
patterns = create_training_data("ReferenceFiles/syslog_entities.json","COMPONENT") 
print(patterns)
generate_rules(nlp,patterns)  


print(nlp.pipe_names)  

    

Current Working Directory: /home/johnos/PythonForDigitalHumanities
['Status', 'Workspace_Management', 'client', 'OSD', 'workspace', 'app.py:']
[{'label': 'COMPONENT', 'pattern': 'Status'}, {'label': 'COMPONENT', 'pattern': 'Workspace_Management'}, {'label': 'COMPONENT', 'pattern': 'client'}, {'label': 'COMPONENT', 'pattern': 'OSD'}, {'label': 'COMPONENT', 'pattern': 'workspace'}, {'label': 'COMPONENT', 'pattern': 'app.py:'}]
['attribute_ruler', 'ner', 'entity_ruler']


In [1]:
# How to use the previously extended model

import spacy
import json
import os
from spacy.pipeline import EntityRuler

def test_model(nlp, text):
    doc = nlp(text)
    results = []
    entities = []
    for doc in nlp.pipe(text, disable=["tok2vec","tagger", "parser", "attribute_ruler", "lemmatizer"]):
        # Do something with the doc here
        print([(ent.text, ent.label_) for ent in doc.ents])
    # for ent in doc.ents:
    #     #results.append((ent.start_char,ent.end_char, ent.label))
    #     something = ((ent.start_char,ent.end_char, ent.label))
    #     print(something)        
    return (results)  

nlp = spacy.load("./NER_Models/syslog_b_ner")
with open("ReferenceFiles/syslog", "r") as f:
    text = f.read()
#text= "Jul 11 16:38:47 snuc-sdkvm app.py: Workspace_Management: Terminating workspace 5 /workspace - new workspace"
doc = nlp(text)    
# for ent in doc.ents:
#     print(ent.text, ent.label_)
# print(spacy.__version__)    
test_model(nlp, text)


[]
[]
[]
[]
[]
[('1', 'CARDINAL')]
[('1', 'CARDINAL')]
[]
[('1', 'CARDINAL')]
[('6', 'CARDINAL')]
[]
[('3', 'CARDINAL')]
[('8', 'CARDINAL')]
[]
[('4', 'CARDINAL')]
[('7', 'CARDINAL')]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[('1', 'CARDINAL')]
[('1', 'CARDINAL')]
[]
[('1', 'CARDINAL')]
[('6', 'CARDINAL')]
[]
[('3', 'CARDINAL')]
[('8', 'CARDINAL')]
[]
[('4', 'CARDINAL')]
[('7', 'CARDINAL')]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[('1', 'CARDINAL')]
[('5', 'CARDINAL')]
[('4', 'CARDINAL')]
[('0', 'CARDINAL')]
[('2', 'CARDINAL')]
[('9', 'CARDINAL')]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]

[]

In [1]:
# Another example of extending an existing model by adding new NER rules

import spacy
from spacy.pipeline import EntityRuler

nlp = spacy.load("en_core_web_sm")
entity_ruler = EntityRuler(nlp)
nlp.add_pipe("entity_ruler", before="ner")
ruler = nlp.get_pipe('entity_ruler')

skills = [
    {'label': 'SKILL', 'pattern': [{"lower": "python"}]},
    {'label': 'SKILL', 'pattern': [{"lower": "sql"}]},
    {'label': 'SKILL', 'pattern': [{"lower": "mysql"}]},
    {'label': 'SKILL', 'pattern': [{"lower": "pandas"}]},
    {'label': 'SKILL', 'pattern': [{"lower": "spacy"}]},
    {'label': 'SKILL', 'pattern': [{"lower": "scikit-learn"}]},
    {'label': 'SKILL', 'pattern': [{"lower": "scikit"}, {"lower": "learn"}]},
    {'label': 'SKILL', 'pattern': [{"lower": "sklearn"}]},
    {'label': 'SKILL', 'pattern': [{"lower": "tensor"}, {"lower": "flow"}]},
]

ruler.add_patterns(skills)

text = """
We are looking for a data scientist with knowledge of Python and MySQL. 
The role will involve working with Pandas, scikit-learn, and Spacy.
Knowledge of Tensor Flow would be advantageous.
"""

doc = nlp(text)

entities = [(ent.text, ent.label_) for ent in doc.ents]
print(entities)
print(nlp.pipe_names)



[('Python', 'SKILL'), ('MySQL', 'SKILL'), ('Pandas', 'SKILL'), ('Spacy', 'SKILL'), ('Tensor Flow', 'SKILL')]
['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'entity_ruler', 'ner']


In [5]:
# How to add just the NER pipline for a model
import spacy

# You can generate a base config here: https://spacy.io/usage/training#config
nlp = spacy.blank("en") # a blank model
print(nlp.pipe_names)
nlp.add_pipe("ner") # just the ner model
print(nlp.pipe_names)
nlp = spacy.load("en_core_web_sm") #the standard small english model
print(nlp.pipe_names)


[]
['ner']
['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [1]:
import spacy
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("ner", name="syslog_ner", before="lemmatizer") # we can position our pipe before or after other components
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'syslog_ner', 'lemmatizer', 'ner']


In [82]:
# https://www.youtube.com/watch?v=sp4B-JbEu7M
# How to Delete a Specific Entity from a spaCy Doc Object with a Custom Factory (spaCy 3x Tutorials)
# In this example we remove the PERSON Entity

import spacy
from spacy.language import Language

text = "John enjoys playing basketball in Berlin in June."
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
for ent in doc.ents:
    print(ent,ent.label_)

@Language.component("person_removal")
def person_removal(doc):
    ents = [ent for ent in list(doc.ents) if ent.label_ != "PERSON"]
    doc.ents = ents
    ents = tuple(ents)
    doc.ents = ents
    return (doc)    
Language.component("person_removal",func=person_removal)     

nlp = spacy.load("en_core_web_sm")

print(nlp.pipe_names)
nlp.add_pipe("person_removal")
print(nlp.pipe_names)

doc2 = nlp(text)
for ent in doc2.ents:
    print(ent,ent.label_)
    


John PERSON
Berlin GPE
June DATE
['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner', 'person_removal']
Berlin GPE
June DATE


In [9]:
# How to Add Custom Factories with Language component in spaCy 3x (SpaCy 3x Tutorials)
# https://www.youtube.com/watch?v=rAtlntEhJsg

import spacy
from spacy.language import Language

@Language.component("cap_maker")
def cap_maker(doc): # we make a custom language component that we can add to a pipe line. This pipeline will perform a specified action on the doc
    print(doc.text.capitalize())
    return (doc)

Language.component("cap_maker",func=cap_maker)
nlp = spacy.blank("en")
nlp.add_pipe("cap_maker")
print(nlp.pipe_names)

text = "this is a test string"
doc = nlp(text)
nlp.to_disk("./NER_Models/test_model")


['cap_maker']
This is a test string
This is another test string


In [2]:
# An example of trying to use a previous model that referenced a custom factory. The model fail because in that instance the factory was not packaged with the model
import spacy
from spacy.language import Language

nlp_new = spacy.load("./NER_Models/test_model")
text = "this is another test string"
doc = nlp_new(text)

ValueError: [E002] Can't find factory for 'cap_maker' for language English (en). This usually happens when spaCy calls `nlp.create_pipe` with a custom component name that's not registered on the current language class. If you're using a Transformer, make sure to install 'spacy-transformers'. If you're using a custom component, make sure you've added the decorator `@Language.component` (for function components) or `@Language.factory` (for class components).

Available factories: attribute_ruler, tok2vec, merge_noun_chunks, merge_entities, merge_subtokens, token_splitter, doc_cleaner, parser, beam_parser, lemmatizer, trainable_lemmatizer, entity_linker, ner, beam_ner, entity_ruler, tagger, morphologizer, senter, sentencizer, textcat, spancat, spancat_singlelabel, future_entity_ruler, span_ruler, textcat_multilabel, en.lemmatizer

In [12]:
# How to remove unwanted Spacy pipeline components
import spacy
import json
import os
from spacy.pipeline import EntityRuler

def test_model(nlp, text):
    doc = nlp(text)
    results = []
    entities = []

    for ent in doc.ents:
        if(ent.label_ != "CARDINAL"):
            print(f"text: {ent.text}, label {ent.label_}")
        # Do something with the doc here
        # print([(ent.text, ent.label_) for ent in doc.ents])
    # for ent in doc.ents:
    #     #results.append((ent.start_char,ent.end_char, ent.label))
    #     something = ((ent.start_char,ent.end_char, ent.label))
    #     print(something)        
    return (results)  

nlp = spacy.load("./NER_Models/syslog_b_ner", disable=["tagger","parser","lemmatizer","tok2vec", "tagger"])
with open("ReferenceFiles/syslog_truncated", "r") as f:
    text = f.read()
text= "Jul 11 16:38:47 snuc-sdkvm app.py: Workspace_Management: Terminating workspace 5 /workspace - new workspace"
doc = nlp(text)    
# for ent in doc.ents:
#     print(ent.text, ent.label_)
# print(spacy.__version__)    
test_model(nlp, text)

text: Jul 11, label LAW
text: 16:38:47, label TIME
text: app.py:, label COMPONENT
text: Workspace_Management, label COMPONENT
text: workspace, label COMPONENT
text: workspace, label COMPONENT


[]

In [None]:
# How to Package spaCy Models (without Custom Factories) 
# https://www.youtube.com/watch?v=AGM7lXfQECM

# 1. Edit the meta.json of the source library you wish to package and change its name to something that suitably describes the model
# 2. Run the command: python -m spacy <location of source library> <location of target or output library>
# 3. The previous command will create a dist folder in the location specified for the target library, change to that folder and run: python setup.py sdist
# 4. This will create an installable library in the dist folder: en_syslog_sm-3.5.0.tar.gz
# 5. This can be installed with pip install.

In [None]:
# How to Package spaCy Models (Even with Custom Factories) 
# https://www.youtube.com/watch?v=AGM7lXfQECM
