### Readme:

This notebook documents what I learnt from https://course.spacy.io/en/

Special thanks to the content creators and the presenter Ines

If you want to learn more about spaCy, please visit https://spacy.io/ or https://course.spacy.io/en/

Thank you!

### Chapter 3: Processing Pipelines

tagger -> pos tagger

parser -> dependency parser

ner -> named entity recognizer

textcat -> text classifier

The tokenizer turns a string of text into a Doc object. spaCy then applies every component in the pipeline on document, in order.

In [4]:
import spacy

# Load the en_core_web_sm model
nlp = spacy.load('en_core_web_sm')

# Print the names of the pipeline components
print(nlp.pipe_names)

# Print the full pipeline of (name, component) tuples
print(nlp.pipeline)

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7fcff205c0e0>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x7fcff2d2ff90>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7fcff2d04dc0>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x7fcff2e2d160>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7fcff2689380>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7fcff27086c0>)]


nlp.add_pipe(component, last/first/before/after)

In [7]:
from spacy.language import Language
import spacy

# Define the custom component
@Language.component("length_component")
def length_component(doc):
    # Get the doc's length
    doc_length = len(doc)
    print(f"This document is {doc_length} tokens long.")
    # Return the doc
    return doc


# Load the small English model
nlp = spacy.load("en_core_web_sm")

# Add the component first in the pipeline and print the pipe names
nlp.add_pipe('length_component', first=True)
print(nlp.pipe_names)

# Process a text
doc = nlp('This is a sentence.')

['length_component', 'tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
This document is 5 tokens long.


In [8]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
animal_patterns = list(nlp.pipe(animals))
print("animal_patterns:", animal_patterns)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)


animal_patterns: [Golden Retriever, cat, turtle, Rattus norvegicus]


In [9]:
# Define the custom component
@Language.component("animal_component")
def animal_component(doc):
    # Apply the matcher to the doc
    matches = matcher(doc)
    # Create a Span for each match and assign the label "ANIMAL"
    spans = [Span(doc, start, end, label="ANIMAL") for match_id, start, end in matches]
    # Overwrite the doc.ents with the matched spans, only matches animals patterns will they get to be printed out from the doc.ents
    doc.ents = spans
    return doc


# Add the component to the pipeline after the "ner" component
nlp.add_pipe('animal_component', after="ner")
print(nlp.pipe_names)

# Process the text and print the text and label for the doc.ents
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])

['tok2vec', 'tagger', 'parser', 'ner', 'animal_component', 'attribute_ruler', 'lemmatizer']
[('cat', 'ANIMAL'), ('Golden Retriever', 'ANIMAL')]


### Setting Custom attributes
### Attributes (._.)

    * attribute extensions ( set a default value that can be overwritten)
    * property extensions (getter)
    * method extensions (function/arguments)

In [13]:
from spacy.tokens import Token
Token.set_extension("is_color", default = False, force = True) # force = True overwrite existing attr
doc = nlp("The sky is blue.")
doc[3]._.is_color = True

In [1]:
# attribute extensions
from spacy.lang.en import English
from spacy.tokens import Token

nlp = English()

# Register the Token extension attribute "is_country" with the default value False
Token.set_extension("is_country", default=False)

# Process the text and set the is_country attribute to True for the token "Spain"
doc = nlp("I live in Spain.")
doc[3]._.is_country = True

# Print the token text and the is_country attribute for all tokens
print([(token.text, token._.is_country) for token in doc])

[('I', False), ('live', False), ('in', False), ('Spain', True), ('.', False)]


In [2]:
# property extensions
from spacy.lang.en import English
from spacy.tokens import Token

nlp = English()

# Define the getter function that takes a token and returns its reversed text
def get_reversed(token):
    return token.text[::-1]


# Register the Token property extension "reversed" with the getter get_reversed
Token.set_extension("reversed", getter=get_reversed)

# Process the text and print the reversed attribute for each token
doc = nlp("All generalizations are false, including this one.")
for token in doc:
    print("reversed:", token._.reversed)

reversed: llA
reversed: snoitazilareneg
reversed: era
reversed: eslaf
reversed: ,
reversed: gnidulcni
reversed: siht
reversed: eno
reversed: .


In [5]:
# method extensions
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()

# Define the getter function
def get_has_number(doc):
    # Return if any of the tokens in the doc return True for token.like_num
    return any(token.like_num for token in doc)  # true if condition matches


# Register the Doc property extension "has_number" with the getter get_has_number
Doc.set_extension("has_number_", getter=get_has_number)

# Process the text and check the custom has_number attribute
doc = nlp("The museum closed for five years in 2012.")
print("has_number_:", doc._.has_number)

has_number_: True


In [6]:
from spacy.lang.en import English
from spacy.tokens import Span

nlp = English()

# Define the method
def to_html(span, tag):
    # Wrap the span text in a HTML tag and return it
    return f"<{tag}>{span.text}</{tag}>"


# Register the Span method extension "to_html" with the method to_html
Span.set_extension("to_html", method=to_html)

# Process the text and call the to_html method on the span with the tag name "strong"
doc = nlp("Hello world, this is a sentence.")
span = doc[0:2]
print(span._.to_html("strong"))

<strong>Hello world</strong>


In [7]:
import spacy
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")


def get_wikipedia_url(span):
    # Get a Wikipedia URL if the span has one of the labels
    if span.label_ in ("PERSON", "ORG", "GPE", "LOCATION"):
        entity_text = span.text.replace(" ", "_")
        return "https://en.wikipedia.org/w/index.php?search=" + entity_text


# Set the Span extension wikipedia_url using the getter get_wikipedia_url
Span.set_extension("wikipedia_url", getter=get_wikipedia_url)

doc = nlp(
    "In over fifty years from his very first recordings right through to his "
    "last album, David Bowie was at the vanguard of contemporary culture."
)
for ent in doc.ents:
    # Print the text and Wikipedia URL of the entity
    print(ent.text, ent._.wikipedia_url)

over fifty years None
first None
David Bowie https://en.wikipedia.org/w/index.php?search=David_Bowie


In [9]:
# import json
# from spacy.lang.en import English
# from spacy.tokens import Span
# from spacy.matcher import PhraseMatcher

# with open("exercises/en/countries.json", encoding="utf8") as f:
#     COUNTRIES = json.loads(f.read())
# with open("exercises/en/capitals.json", encoding="utf8") as f:
#     CAPITALS = json.loads(f.read())

# nlp = English()
# matcher = PhraseMatcher(nlp.vocab)
# matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))

# def countries_component(doc):
#     # Create an entity Span with the label "GPE" for all matches
#     matches = matcher(doc)
#     doc.ents = [Span(doc, start, end, label="GPE") for match_id, start, end in matches]
#     return doc

# # Add the component to the pipeline
# nlp.add_pipe(countries_component)
# print(nlp.pipe_names)

# # Getter that looks up the span text in the dictionary of country capitals
# get_capital = lambda span: CAPITALS.get(span.text)

# # Register the Span extension attribute "capital" with the getter get_capital
# Span.set_extension("capital", getter=get_capital)

# # Process the text and print the entity text, label and capital attributes
# doc = nlp("Czech Republic may help Slovakia protect its airspace")
# print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])

# solution
# ['countries_component']
# [('Czech Republic', 'GPE', 'Prague'), ('Slovakia', 'GPE', 'Bratislava')]

### Scaling and performance
    * use nlp.pipe method
    * processes texts as a stream, yields on doc object
    * much faster than calling nlp on each text
       ` * Example: list(nlp.pipe(TEXT)) is good
                    [nlp[text] for text in TEXT]
    * nlp.make_doc 
         * Example : nlp("Hello word") is nad
                     nlp.make_doc("Hello Word") is good
    * you can temporarily disable one ore more pipes
         * with will close it, restore back

In [None]:
# this is bad
import json
import spacy

nlp = spacy.load("en_core_web_sm")

with open("exercises/en/tweets.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

# Process the texts and print the adjectives
for text in TEXTS:
    doc = nlp(text)
    print([token.text for token in doc if token.pos_ == "ADJ"])

In [None]:
# update, this is good
import json
import spacy

nlp = spacy.load("en_core_web_sm")

with open("exercises/en/tweets.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

# Process the texts and print the adjectives
for doc in nlp.pipe(TEXTS):
    print([token.text for token in doc if token.pos_ == "ADJ"])
    
#['favorite']
# ['sick']
# []
# ['happy']
# ['delicious', 'fast']
# ['BAD']
# ['terrible', 'payin']
# ✔ Nice!

In [None]:
# this is bad
import json
import spacy

nlp = spacy.load("en_core_web_sm")

with open("exercises/en/tweets.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

# Process the texts and print the entities
docs = [nlp(text) for text in TEXTS]
entities = [doc.ents for doc in docs]
print(*entities)

In [None]:
# update
import json
import spacy

nlp = spacy.load("en_core_web_sm")

with open("exercises/en/tweets.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

# Process the texts and print the entities
docs = list(nlp.pipe(TEXTS))
entities = [doc.ents for doc in docs]
print(*entities)

# (McDonalds,) (@McDonalds,) (McDonalds,) (McDonalds, Spain) (The Arch Deluxe,) () ()

In [10]:
# this is bad
from spacy.lang.en import English

nlp = English()

people = ["David Bowie", "Angela Merkel", "Lady Gaga"]

# Create a list of patterns for the PhraseMatcher
patterns = [nlp(person) for person in people]

In [11]:
# update
from spacy.lang.en import English

nlp = English()

people = ["David Bowie", "Angela Merkel", "Lady Gaga"]

# Create a list of patterns for the PhraseMatcher
patterns = list(nlp.pipe(people))

In [None]:
import json
from spacy.lang.en import English
from spacy.tokens import Doc

with open("exercises/en/bookquotes.json", encoding="utf8") as f:
    DATA = json.loads(f.read())

nlp = English()

# Register the Doc extension "author" (default None)
Doc.set_extension("author", default=None)

# Register the Doc extension "book" (default None)
Doc.set_extension("book", default=None)

for doc, context in nlp.pipe(DATA, as_tuples=True):
    # Set the doc._.book and doc._.author attributes from the context
    doc._.book = context["book"]
    doc._.author = context["author"]

    # Print the text and custom attribute data
    print(f"{doc.text}\n — '{doc._.book}' by {doc._.author}\n")
    
# One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin.
#  — 'Metamorphosis' by Franz Kafka

# doc._.book and doc._.author are overwritten with the respective info passed in as the context (instead of none)

In [12]:
# Selective Processing

In [13]:
# this is bad
import spacy

nlp = spacy.load("en_core_web_sm")
text = (
    "Chick-fil-A is an American fast food restaurant chain headquartered in "
    "the city of College Park, Georgia, specializing in chicken sandwiches."
)

# Only tokenize the text
doc = nlp(text)
print([token.text for token in doc])

['Chick', '-', 'fil', '-', 'A', 'is', 'an', 'American', 'fast', 'food', 'restaurant', 'chain', 'headquartered', 'in', 'the', 'city', 'of', 'College', 'Park', ',', 'Georgia', ',', 'specializing', 'in', 'chicken', 'sandwiches', '.']


In [15]:
# update
import spacy

nlp = spacy.load("en_core_web_sm")
text = (
    "Chick-fil-A is an American fast food restaurant chain headquartered in "
    "the city of College Park, Georgia, specializing in chicken sandwiches."
)

# Only tokenize the text
doc = nlp.make_doc(text)
print([token.text for token in doc])

['Chick', '-', 'fil', '-', 'A', 'is', 'an', 'American', 'fast', 'food', 'restaurant', 'chain', 'headquartered', 'in', 'the', 'city', 'of', 'College', 'Park', ',', 'Georgia', ',', 'specializing', 'in', 'chicken', 'sandwiches', '.']


In [16]:
import spacy

nlp = spacy.load("en_core_web_sm")
text = (
    "Chick-fil-A is an American fast food restaurant chain headquartered in "
    "the city of College Park, Georgia, specializing in chicken sandwiches."
)

# Disable the tagger and parser
with nlp.disable_pipes("tagger", "parser"):
    # Process the text
    doc = nlp(text)
    # Print the entities in the doc
    print(doc.ents)
    
# did not recognize that Chick-fil-A is a named entity, need to add it

[W108] The rule-based lemmatizer did not find POS annotation for the token 'Chick'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.
[W108] The rule-based lemmatizer did not find POS annotation for the token '-'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.
[W108] The rule-based lemmatizer did not find POS annotation for the token 'fil'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.
[W108] The rule-based lemmatizer did not find POS annotation for the token '-'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.
[W108] The rule-based lemmatizer did not find POS annotation for the token 'A'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attrib

(American, College Park, Georgia)


In [22]:
import spacy
from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Span


# Create the match patterns, follow the orders
pattern1 = [{"TEXT": "Chick"}, {"TEXT": "-"},{"TEXT": "fil"},{"TEXT": "-"},{"TEXT": "A"}]

# Initialize the Matcher and add the patterns
matcher = Matcher(nlp.vocab)
matcher.add("Chick-Fil-a", [pattern1])


nlp = spacy.load("en_core_web_sm")
text = (
    "Chick-fil-A is an American fast food restaurant chain headquartered in "
    "the city of College Park, Georgia, specializing in chicken sandwiches."
)


# Disable the tagger and parser
with nlp.disable_pipes("tagger", "parser"):
    for match_id, start, end in matcher(doc):
    # Process the text
        doc = nlp(text)
        span = Span(doc, start, end)
    # Print the entities in the doc
        print(list(doc.ents) + [span])
    

[W108] The rule-based lemmatizer did not find POS annotation for the token 'Chick'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.
[W108] The rule-based lemmatizer did not find POS annotation for the token '-'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.
[W108] The rule-based lemmatizer did not find POS annotation for the token 'fil'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.
[W108] The rule-based lemmatizer did not find POS annotation for the token '-'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.
[W108] The rule-based lemmatizer did not find POS annotation for the token 'A'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attrib

[American, College Park, Georgia, Chick-fil-A]
