### Properties
1. Lexical Attributes: They don't depend on the context in which a word appears. They refer to the word's entry in the shared vocab (backed by the string store).

In [1]:
import spacy

# Load the medium English pipeline and run it over a short example sentence.
nlp = spacy.load("en_core_web_md")
doc = nlp("It costs $5")

# One printed row per attribute: the token index, its text, and three lexical flags.
for label, attr_name in (
    ("Index:", "i"),
    ("Text:", "text"),
    ("is_alpha:", "is_alpha"),
    ("is_punct:", "is_punct"),
    ("like_num:", "like_num"),
):
    print(label, [getattr(tok, attr_name) for tok in doc])

# Names of the components in the loaded pipeline.
print(nlp.pipe_names)

Index: [0, 1, 2, 3]
Text: ['It', 'costs', '$', '5']
is_alpha: [True, True, False, False]
is_punct: [False, False, False, False]
like_num: [False, False, False, True]
['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']


In [11]:
# For every token show: text, coarse part-of-speech tag, dependency label, head token text.
for tok in doc:
    row = (tok.text, tok.pos_, tok.dep_, tok.head.text)
    print(*row)

It PRON nsubj costs
costs VERB ROOT costs
$ SYM nmod 5
5 NUM dobj costs


In [12]:
# Show each predicted entity span together with its label string.
for entity in doc.ents:
    row = (entity.text, entity.label_)
    print(*row)

$5 MONEY


The labelled data that the model was trained on is __NOT__ included in spacy.

In [24]:
from spacy.matcher import Matcher

# Token pattern: the exact text "iPhone" immediately followed by the exact text "X".
pattern = [{"TEXT":"iPhone"}, {"TEXT":"X"}]

# The matcher shares the pipeline's vocab and holds the pattern under one rule name.
matcher = Matcher(nlp.vocab)
matcher.add("IPHONE_PATTERN",[pattern])

doc  = nlp("the phone I found was iPhone X yesterday")
# Calling the matcher returns a list of (match_id, start, end) tuples; shown as the cell output.
matcher(doc)

[(9528407286733565721, 5, 7)]

In [33]:
# Sample text mentioning "python" followed by different version numbers.
string = """
python 3.5 and python 3.7 onwards have the difference that the former is not async.
python 3 and python 2 are very different in itself.
"""

# Pattern: the exact token "python" directly followed by a number-like token.
pattern = [{"TEXT":"python"}, {"LIKE_NUM":True}]
matcher2 = Matcher(nlp.vocab)
matcher2.add("pyt_pattern",[pattern])

doc3 = nlp(string)

# Print the matched slice of the doc for every hit; the match id is not needed here.
for _match_id, start, end in matcher2(doc3):
    print(doc3[start:end])
    

python 3.5
python 3.7
python 3
python 2


|OP	|Description|
|---|-----------|
|!	|Negate the pattern, by requiring it to match exactly 0 times.|
|?	|Make the pattern optional, by allowing it to match 0 or 1 times.|
|+	|Require the pattern to match 1 or more times.|
|*	|Allow the pattern to match 0 or more times.|

In [45]:
# creating a doc manually
from spacy.tokens import Doc,Span

# Build a Doc by hand from the words plus a per-word "is followed by a space" flag.
words = ["This", "is", "it", "Carlos"]
# Every word except the last one is followed by a space.
spaces = [True] * (len(words) - 1) + [False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)
# Span created directly from token offsets 0 and 2 of the hand-built doc.
span = Span(doc, 0, 2)

### Tokenizer cases

The tokenizer doesn’t create tokens for single spaces, so there’s no token with the value " " in between.


Sometimes it’s more efficient to match exact strings instead of writing patterns describing the individual tokens. This is especially true for finite categories of things – like all countries of the world. We already have a list of countries, so let’s use this as the basis of our information extraction script. A list of string names is available as the variable COUNTRIES.

    Import the PhraseMatcher and initialize it with the shared vocab as the variable matcher.
    Add the phrase patterns and call the matcher on the doc

In [60]:
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_md")

# pattern1 is a counter-example and is deliberately NOT added to the matcher:
# its {"TEXT": " "} entry can never match, because the tokenizer does not create
# a token for the single space between two words.
# LOWER is compared against the lowercased token text, so its value must itself
# be lowercase ("valley"); a capitalised value could never match.
pattern1 = [{"LOWER": "silicon"}, {"TEXT": " "}, {"LOWER": "valley"}]

# pattern2 is the working version: two adjacent tokens, compared case-insensitively.
pattern2 = [{"LOWER": "silicon"}, {"LOWER": "valley"}]

doc111 = nlp("People in the silicon Valley etc")
matcher11 = Matcher(nlp.vocab)
matcher11.add("S_V_PAT", [pattern2])

for match_id, start, end in matcher11(doc111):
    # Turn the hash back into the rule name via the string store, then show the span.
    print(doc111.vocab.strings[match_id])
    print(doc111[start:end])



S_V_PAT
silicon Valley


In [74]:
import spacy
from spacy.matcher import Matcher

# Small English pipeline; the patterns below rely on its part-of-speech tags.
nlp = spacy.load("en_core_web_sm")

# PATTERN1: "amazon" (any casing) followed by a title-cased proper noun.
pattern1 = [{"LOWER": "amazon"}, {"IS_TITLE": True, "POS": "PROPN"}]
# PATTERN2: "ad", a punctuation token, "free", then a noun.
pattern2 = [{"LOWER": "ad"},{"IS_PUNCT":True},{"LOWER":"free"}, {"POS": "NOUN"}]

# Register both patterns, each under its own rule name.
matcher = Matcher(nlp.vocab)
for rule_name, rule in (("PATTERN1", pattern1), ("PATTERN2", pattern2)):
    matcher.add(rule_name, [rule])

doc = nlp(
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)

# For every match, print the rule name (looked up from its hash) and the matched text.
for match_id, start, end in matcher(doc):
    rule_label = doc.vocab.strings[match_id]
    print(rule_label, doc[start:end].text)

PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing


In [4]:
# List the names of the components in the current `nlp` pipeline.
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']


## Custom NLP Pipelines

1. Custom components can be added to the pipeline.
2. A component is any function that takes a `doc` and returns a `doc`.
3. They are useful to:

    a. Calculate custom values based on the tokens and their attributes.

    b. Add named entities based on a dictionary.
4. Use `nlp.add_pipe` to add a component and control its position:
    ```python
    nlp.add_pipe("component_name", first=True)    # or last=True
    nlp.add_pipe("component_name", before="ner")  # or after="ner"
    ```


In [2]:
## e.g. of a custom entities in pipeline, rule based
from spacy.matcher import PhraseMatcher
from spacy.language import Language
from spacy.tokens import Span
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
# Phrase patterns are Doc objects. The PhraseMatcher compares token texts by default,
# so tokenizing with make_doc is enough and avoids running the tagger/parser/ner
# (and any custom component already added to `nlp`) over the pattern strings.
animal_patterns = [nlp.make_doc(name) for name in animals]
print("animal_patterns:", animal_patterns)
matcher = PhraseMatcher(nlp.vocab)
# spaCy v3 signature: add(key, docs). The older add(key, None, *docs) form is the
# v2 calling convention and is only kept by spaCy for backward compatibility.
matcher.add("ANIMAL", animal_patterns)

# Define the custom component
@Language.component("animal_detect")
def animal_component(doc):
    """Replace ``doc.ents`` with the spans found by the notebook-level ``matcher``.

    Every match returned by ``matcher(doc)`` becomes a ``Span`` whose label is the
    match id. The resulting spans overwrite whatever entities the doc had before.

    Args:
        doc: the document being processed by the pipeline.

    Returns:
        The same ``doc`` object, with ``doc.ents`` reassigned.
    """
    found = []
    for match_id, start, end in matcher(doc):
        found.append(Span(doc, start, end, label=match_id))
    # Assigning to doc.ents discards the previously set entities.
    doc.ents = found
    return doc

# Instead of the decorator, a component can also be registered with a plain call, e.g.
# Language.component("my_component2", func=my_component)
# Add the component by its registered name, directly after the "ner" component.
# NOTE(review): the earlier "depricated" marker here presumably referred to the spaCy v2
# style of passing the function object itself to add_pipe — confirm.
nlp.add_pipe("animal_detect",after="ner")
print(nlp.pipe_names)

# Process the text and print the text and label of each entity in doc.ents
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])

animal_patterns: [Golden Retriever, cat, turtle, Rattus norvegicus]
['tok2vec', 'tagger', 'parser', 'ner', 'animal_detect', 'attribute_ruler', 'lemmatizer']
[('cat', 'ANIMAL'), ('Golden Retriever', 'ANIMAL')]


In [None]:
import spacy

# Load the medium English pipeline again and rebind `nlp` to it for the cells below.
nlp = spacy.load("en_core_web_md")



Custom extension attributes are accessible through the `._` property (for example `token._.is_color`).

### Types of extension attributes:
1. Attribute extensions: set a default value that can be overwritten.
2. Property extensions: behave like properties in Python; they define a getter and an optional setter.
3. Method extensions: make the extension a callable method, so its value can be calculated dynamically (and can take arguments).

In [17]:
from spacy.tokens import Token
# Attribute extension: registers `_.is_color` on Token with a default of False.
# force=True re-registers the extension if it already exists, so the cell can be re-run.
Token.set_extension("is_color", default=False, force=True)
doc = nlp("The sky is blue.")
# Override the default on one token (index 3, presumably "blue" — verify the tokenization).
doc[3]._.is_color=True

In [31]:
# property extension
from spacy.tokens import Span

def get_has_color(token):
    """Tell whether the token's text is exactly one of the known colour words.

    Args:
        token: any object exposing a ``text`` attribute.

    Returns:
        ``True`` if ``token.text`` equals "red", "yellow" or "blue" (case-sensitive),
        otherwise ``False``.
    """
    return any(token.text == shade for shade in ("red", "yellow", "blue"))

# Property extension: `token._.has_color` is backed by the get_has_color getter.
# force=True re-registers the extension if it already exists, so the cell can be re-run.
Token.set_extension("has_color", getter=get_has_color, force=True)

In [35]:
doc = nlp("this is a blue dog")

# Read the getter-backed extension on the token at index 3; the value is the cell output.
doc[3]._.has_color

True

In [44]:
# property extension for span

def has_color(span):
    """Tell whether any token in the span is exactly one of the known colour words.

    Args:
        span: an iterable of objects exposing a ``text`` attribute.

    Returns:
        ``True`` if at least one token text equals "red", "yellow" or "blue"
        (case-sensitive); ``False`` otherwise, including for an empty span.
    """
    palette = ("red", "yellow", "blue")
    for tok in span:
        if tok.text in palette:
            return True
    return False
# Register the getter on Span under the same name that is read below ("has_color").
# Previously it was registered as "has_al_colors", so `span._.has_color` only worked
# when an earlier kernel session had left a Span extension of that name behind.
# force=True re-registers the extension if it already exists, so the cell can be re-run.
Span.set_extension("has_color", getter=has_color, force=True)

doc = nlp("the colur of our sky is not blue")
# doc[1:8] includes the final token "blue"; doc[1:5] stops before it.
print("_.has_color", doc[1:8]._.has_color, doc[1:5]._.has_color)

_.has_color True False


In [52]:
from spacy.tokens import Doc

def has_token(doc:Doc, token_text:str)-> bool:
    """Tell whether some token in ``doc`` has exactly the given text.

    Args:
        doc: the document (iterated token by token).
        token_text: the text to look for; the comparison is case-sensitive.

    Returns:
        ``True`` if any token's ``text`` equals ``token_text``, else ``False``.
    """
    return any(tok.text == token_text for tok in doc)

# Method extension: `doc._.has_token(...)` calls has_token with the doc as first argument.
# force=True re-registers the extension if it already exists, so the cell can be re-run.
Doc.set_extension("has_token", method=has_token, force=True)

doc= nlp("My dog did go crazy when he saw the color blue")

# Call the method extension; the returned bool is the cell output.
doc._.has_token("did")

True

In [None]:
# creating custom components and entities
import spacy
from spacy.tokens import Span

# Rebind `nlp` to the small English pipeline; its entity labels are used by the code below.
nlp = spacy.load("en_core_web_sm")


def get_wikipedia_url(span):
    """Build a Wikipedia search URL for a span labelled as a person, organisation or place.

    Args:
        span: any object exposing ``label_`` (the label string) and ``text``.

    Returns:
        The search URL as a ``str`` when ``span.label_`` is one of ``PERSON``,
        ``ORG``, ``GPE``, ``LOC`` or ``LOCATION``; ``None`` for any other label.
    """
    # Imported locally so the function does not depend on another notebook cell.
    from urllib.parse import quote

    # "LOC" is the label spaCy's English pipelines use for non-GPE locations;
    # "LOCATION" is kept so previously accepted labels still work.
    if span.label_ not in ("PERSON", "ORG", "GPE", "LOC", "LOCATION"):
        return None

    entity_text = span.text.replace(" ", "_")
    # Percent-encode the search term so characters such as "&" or "#" in an entity
    # name (e.g. "AT&T") cannot break the query string. Underscores are left as-is.
    return "https://en.wikipedia.org/w/index.php?search=" + quote(entity_text, safe="")


# Set the Span extension wikipedia_url using the getter get_wikipedia_url.
# force=True matches the other set_extension calls in this notebook and lets the
# cell be re-run without an error about the extension already being registered.
Span.set_extension("wikipedia_url", getter=get_wikipedia_url, force=True)

doc = nlp(
    "In over fifty years from his very first recordings right through to his "
    "last album, David Bowie was at the vanguard of contemporary culture."
)
for ent in doc.ents:
    # Print the text and Wikipedia URL of the entity
    print(ent.text, ent._.wikipedia_url)