# 1. Processing pipelines

In [4]:
import spacy

In [3]:
from spacy.lang.en import English
nlp = spacy.load("en_core_web_sm")

In [4]:
doc = nlp('Hi my name is ttt')

In [5]:
doc.cats

{}

In [6]:
print(nlp.pipe_names)

['tagger', 'parser', 'ner']


In [7]:
print(nlp.pipeline)

[('tagger', <spacy.pipeline.pipes.Tagger object at 0x108e2f390>), ('parser', <spacy.pipeline.pipes.DependencyParser object at 0x154588108>), ('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x154588168>)]


# 4. Custom pipeline components

In [8]:
# Define a custom component
def custom_component(doc):
    """Pipeline component that reports the length of ``doc`` and passes it on.

    Prints ``Doc length: <n>`` where ``n`` is ``len(doc)``, then returns the
    very same object so the next pipeline component can keep working on it.
    """
    n_tokens = len(doc)
    print('Doc length:', n_tokens)
    return doc

# Add the component first in the pipeline, i.e. ahead of the components already loaded
nlp.add_pipe(custom_component, first=True)

# Print the pipeline component names to confirm where the new component ended up
print('Pipeline:',nlp.pipe_names)

Pipeline: ['custom_component', 'tagger', 'parser', 'ner']


In [9]:
doc = nlp('Hi my name is ttt')

Doc length: 5


In [34]:
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

# Phrases to recognise; each one is converted to a Doc and used as a match pattern
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
# nlp.pipe processes the texts as a stream; list() collects the resulting Docs
animal_patterns = list(nlp.pipe(animals))



print("animal_patterns:", animal_patterns)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns) # the patterns are passed as Doc objects, unpacked into positional arguments

Doc length: 2
Doc length: 1
Doc length: 1
Doc length: 2
animal_patterns: [Golden Retriever, cat, turtle, Rattus norvegicus]


In [16]:
type(animal_patterns[0])

spacy.tokens.doc.Doc

In [19]:
tt = ['a','b','c']
for t in tt: print(type(t))

<class 'str'>
<class 'str'>
<class 'str'>


In [26]:
print([type(doc) for doc in animal_patterns])

[<class 'spacy.tokens.doc.Doc'>, <class 'spacy.tokens.doc.Doc'>, <class 'spacy.tokens.doc.Doc'>, <class 'spacy.tokens.doc.Doc'>]


In [29]:
print([type(animal_patterns[i]) for i in range(len(animal_patterns))])

[<class 'spacy.tokens.doc.Doc'>, <class 'spacy.tokens.doc.Doc'>, <class 'spacy.tokens.doc.Doc'>, <class 'spacy.tokens.doc.Doc'>]


In [27]:
type(animals), type(animal_patterns)

(list, list)

In [31]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

# Load the model again so the pipeline no longer contains the custom_component added earlier
nlp = spacy.load("en_core_web_sm")
# Phrases to recognise as animals
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
# One Doc per phrase; these Docs are the patterns handed to the PhraseMatcher
animal_patterns = list(nlp.pipe(animals))
print("animal_patterns:", animal_patterns)
# The matcher shares the pipeline's vocab; all patterns are registered under the key "ANIMAL"
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)

# Define the custom component
def animal_component(doc):
    """Pipeline component that labels every phrase-matcher hit as 'ANIMAL'.

    Runs the module-level ``matcher`` over ``doc``, wraps each match in a
    ``Span`` labelled 'ANIMAL', and overwrites ``doc.ents`` with those spans.
    Returns the same doc.
    """
    animal_spans = []
    for _match_id, start, end in matcher(doc):
        animal_spans.append(Span(doc, start, end, label='ANIMAL'))
    # This replaces whatever entities were set on the doc before this component ran.
    doc.ents = animal_spans
    return doc


# Add the component to the pipeline after the 'ner' component
nlp.add_pipe(animal_component, after='ner')
print(nlp.pipe_names)

# Process the text and print, for each entity in doc.ents, its text,
# its label as an integer ID (ent.label) and its label as a string (ent.label_)
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text,ent.label, ent.label_) for ent in doc.ents])

animal_patterns: [Golden Retriever, cat, turtle, Rattus norvegicus]
['tagger', 'parser', 'ner', 'animal_component']
[('cat', 6303828839600189595, 'ANIMAL'), ('Golden Retriever', 6303828839600189595, 'ANIMAL')]


# 8. Extension attributes

* Add custom metadata to documents, tokens and spans
* Accessible via the `._` property

In [36]:
doc.title

AttributeError: 'spacy.tokens.doc.Doc' object has no attribute 'title'

In [39]:
doc._.title = 'My document' # fails here: the extension has to be registered beforehand

AttributeError: [E047] Can't assign a value to unregistered extension attribute 'title'. Did you forget to call the `set_extension` method?

In [40]:
# Import global classes
from spacy.tokens import Doc, Token, Span

In [41]:
# Set extensions on the Doc, Token and Span
Doc.set_extension('title', default=None)
Token.set_extension('is_color', default=False)
Span.set_extension('has_color', default=False)

In [45]:
doc._.title = 'My document'
print(doc._.title)

My document


In [47]:
token = doc[0]
span = doc[0:3]
token._.is_color = True
span._.has_color = False

In [49]:
token._.is_color

True

In [51]:
doc = nlp('The sky is blue.')

Doc length: 5


In [52]:
# Overwrite extension attribute value
doc[3]._.is_color = True

### Property extensions

* Define a getter and an optional setter function
* Getter only called when you retrieve the attribute value

In [53]:
# Define getter function
def get_is_color(token):
    """Getter for the ``is_color`` token extension.

    Returns True when the token's text is exactly 'red', 'yellow' or 'blue'
    (the comparison is case-sensitive), otherwise False.
    """
    known_colors = ('red', 'yellow', 'blue')
    token_text = token.text
    return token_text in known_colors

In [55]:
Token.remove_extension('is_color')

(False, None, None, None)

In [56]:
# Set extension on the Token with getter
Token.set_extension('is_color', getter=get_is_color)

In [57]:
print(doc[3]._.is_color, '-', doc[3].text)

True - blue


In [59]:
def get_has_color(span):
    """Getter for the ``has_color`` span extension.

    Returns True as soon as one token in ``span`` has the exact text 'red',
    'yellow' or 'blue'; returns False when no token does (including an empty span).
    """
    known_colors = ('red', 'yellow', 'blue')
    for tok in span:
        if tok.text in known_colors:
            return True
    return False

# Set extension on the Span with getter
Span.remove_extension('has_color')
Span.set_extension('has_color', getter=get_has_color)

In [60]:
print(doc[1:4]._.has_color, '-', doc[1:4].text)
print(doc[0:2]._.has_color, '-', doc[0:2].text)

True - sky is blue
False - The sky


### Method extensions

* Assign a <b>function</b> that becomes available as an object method
* Lets you pass <b>arguments</b> to the extension function

In [63]:
# Define method with arguments
def has_token(doc, token_text):
    """Method extension: tell whether any token in ``doc`` has the text ``token_text``.

    When registered as a method extension, the first parameter receives the
    object the method is called on; the remaining parameters are the arguments
    supplied by the caller (e.g. ``doc._.has_token('blue')``).
    """
    return any(token_text == tok.text for tok in doc)

In [65]:
# Set extension on the Doc with method
Doc.set_extension('has_token', method=has_token)

In [66]:
print(doc._.has_token('blue'), ' - blue')
print(doc._.has_token('cloud'), ' - cloud')

True  - blue
False  - cloud


In [67]:
doc

The sky is blue.

### Exercises : Setting extension attributes 

In [74]:
tt = 'text'
tt[::-1]

'txet'

In [75]:
def get_reversed(token):
    """Getter for the ``reversed`` token extension: the token's text, back to front."""
    characters = reversed(token.text)
    return ''.join(characters)

Token.set_extension('reversed',getter=get_reversed)

doc = nlp("All generalizations are false, including this one.")
for token in doc:
    print('reversed', token.text, token._.reversed)

Doc length: 9
reversed All llA
reversed generalizations snoitazilareneg
reversed are era
reversed false eslaf
reversed , ,
reversed including gnidulcni
reversed this siht
reversed one eno
reversed . .


In [79]:
def get_has_number(doc):
    """Getter for the ``has_number`` doc extension.

    Returns True if at least one token in ``doc`` has a truthy ``like_num``
    attribute, otherwise False.
    """
    for tok in doc:
        if tok.like_num:
            return True
    return False

Doc.set_extension('has_number', getter=get_has_number)

doc = nlp('The museum closed for five years in 2012.')
print('has_number:', doc._.has_number)

Doc length: 9
has_number: True


In [80]:
def to_html(span, tag):
    """Method extension: return the span's text wrapped in an HTML element.

    ``tag`` is the element name, so ``to_html(span, 'strong')`` produces
    ``<strong>...</strong>`` around ``span.text``. The text is inserted as-is.
    """
    template = "<{tag}>{text}</{tag}>"
    inner_text = span.text
    return template.format(text=inner_text, tag=tag)

Span.set_extension('to_html',method=to_html)

doc = nlp("Hello world, this is a sentence.")
span = doc[0:2]
print(span.text, span._.to_html('strong'))

Doc length: 8
Hello world <strong>Hello world</strong>


### Exercises : Entities and extensions

In this exercise, you'll combine custom extension attributes with the model's predictions and create an attribute getter that returns a Wikipedia search URL if the span is a person, organization, or location

* Complete the `get_wikipedia_url` getter so it only returns the URL if the span's label is in the list of labels.
* Set the `Span` extension `'wikipedia_url'` using the getter `get_wikipedia_url`.
* Iterate over the entities in the `doc` and output their Wikipedia URL.

In [87]:
def get_wikipedia_url(span):
    """Getter: build a Wikipedia search URL for person, organization and location entities.

    Args:
        span: an object exposing ``label_`` (entity label string) and ``text``;
            in this notebook, a spaCy ``Span``.

    Returns:
        The search URL as a string when ``span.label_`` is one of the linkable
        labels, otherwise ``None``.
    """
    from urllib.parse import quote

    # 'LOC' is the label the English models use for non-GPE locations;
    # 'LOCATION' is kept so spans labelled that way keep getting a URL.
    linkable_labels = ("PERSON", "ORG", "GPE", "LOC", "LOCATION")
    if span.label_ not in linkable_labels:
        return None
    entity_text = span.text.replace(" ", "_")
    # Percent-encode the query value so characters such as '&' or '#'
    # cannot break out of the ``search`` parameter.
    return "https://en.wikipedia.org/w/index.php?search=" + quote(entity_text, safe="")
    
# Register the getter as the Span extension 'wikipedia_url'.
# force=True replaces an earlier registration, so this cell can be re-run and
# also works on a fresh kernel where the extension has not been registered yet.
Span.set_extension('wikipedia_url', getter=get_wikipedia_url, force=True)

doc = nlp(
    " in over fifty years from his very first recordings right through to his "
    "last album, David Bowie was at the vanguard of contemporary culture."
)

for ent in doc.ents:
    # Print the text, label and Wikipedia URL of the entity
    print(ent.text, ent.label_, ent._.wikipedia_url)

Doc length: 27
over fifty years DATE None
first ORDINAL None
David Bowie PERSON https://en.wikipedia.org/w/index.php?search=David_Bowie


### Exercises : Components with extensions

In this exercise, you'll write a pipeline component that finds country names and a custom extension attribute that returns a country's capital, if available.

A phrase matcher with all countries is available as the variable `matcher`. A dictionary of countries mapped to their capital cities is available as the variable `CAPITALS`.

In [90]:
import json
from spacy.matcher import PhraseMatcher

# The data contains non-ASCII names (e.g. 'Åland Islands'), so decode the files
# explicitly as UTF-8 instead of relying on the platform's default encoding.
# NOTE(review): this cell reads from 'spaCy/exercises/' while other cells read
# from 'spacy/exercises/' -- confirm the directory casing on case-sensitive filesystems.
with open('spaCy/exercises/countries.json', encoding='utf-8') as f:
    COUNTRIES = json.load(f)

with open('spaCy/exercises/capitals.json', encoding='utf-8') as f:
    CAPITALS = json.load(f)

# Start from a fresh English pipeline and build a phrase matcher on its vocab
nlp = English()
matcher = PhraseMatcher(nlp.vocab)
# One Doc pattern per country name, all registered under the key "COUNTRY"
matcher.add("COUNTRY", None, *nlp.pipe(COUNTRIES))

def countries_component(doc):
    """Pipeline component: mark every matched country name in ``doc`` as a 'GPE' entity.

    Runs the module-level ``matcher`` over the doc and overwrites ``doc.ents``
    with one 'GPE' ``Span`` per match. When matches overlap (for example
    'Congo' inside 'Congo (Democratic Republic of the)'), only the longest one
    is kept, because overlapping entity spans cannot be assigned to ``doc.ents``.

    Args:
        doc: the Doc being processed.

    Returns:
        The same doc, with ``doc.ents`` replaced.
    """
    matches = matcher(doc)
    # Longest match first; ties are broken by the earliest start position.
    by_length = sorted(
        ((start, end) for _match_id, start, end in matches),
        key=lambda bounds: (bounds[0] - bounds[1], bounds[0]),
    )
    taken = set()  # token indices already covered by a kept match
    kept = []
    for start, end in by_length:
        if taken.isdisjoint(range(start, end)):
            taken.update(range(start, end))
            kept.append((start, end))
    # Create an entity Span with the label 'GPE' for every kept match, in document order
    doc.ents = [Span(doc, start, end, label='GPE') for start, end in sorted(kept)]
    return doc

In [93]:
COUNTRIES

['Afghanistan',
 'Åland Islands',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia (Plurinational State of)',
 'Bonaire, Sint Eustatius and Saba',
 'Bosnia and Herzegovina',
 'Botswana',
 'Bouvet Island',
 'Brazil',
 'British Indian Ocean Territory',
 'United States Minor Outlying Islands',
 'Virgin Islands (British)',
 'Virgin Islands (U.S.)',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cabo Verde',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Christmas Island',
 'Cocos (Keeling) Islands',
 'Colombia',
 'Comoros',
 'Congo',
 'Congo (Democratic Republic of the)',
 'Cook Islands',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Curaça

In [94]:
CAPITALS

{'Afghanistan': 'Kabul',
 'Åland Islands': 'Mariehamn',
 'Albania': 'Tirana',
 'Algeria': 'Algiers',
 'American Samoa': 'Pago Pago',
 'Andorra': 'Andorra la Vella',
 'Angola': 'Luanda',
 'Anguilla': 'The Valley',
 'Antarctica': '',
 'Antigua and Barbuda': "Saint John's",
 'Argentina': 'Buenos Aires',
 'Armenia': 'Yerevan',
 'Aruba': 'Oranjestad',
 'Australia': 'Canberra',
 'Austria': 'Vienna',
 'Azerbaijan': 'Baku',
 'Bahamas': 'Nassau',
 'Bahrain': 'Manama',
 'Bangladesh': 'Dhaka',
 'Barbados': 'Bridgetown',
 'Belarus': 'Minsk',
 'Belgium': 'Brussels',
 'Belize': 'Belmopan',
 'Benin': 'Porto-Novo',
 'Bermuda': 'Hamilton',
 'Bhutan': 'Thimphu',
 'Bolivia (Plurinational State of)': 'Sucre',
 'Bonaire, Sint Eustatius and Saba': 'Kralendijk',
 'Bosnia and Herzegovina': 'Sarajevo',
 'Botswana': 'Gaborone',
 'Bouvet Island': '',
 'Brazil': 'Brasília',
 'British Indian Ocean Territory': 'Diego Garcia',
 'United States Minor Outlying Islands': '',
 'Virgin Islands (British)': 'Road Town',
 'V

In [91]:
nlp.add_pipe(countries_component)
print(nlp.pipe_names)

['countries_component']


In [97]:
# Getter that looks up the span text in the dictionary of country capitals
def get_capital(span):
    """Getter for the 'capital' span extension.

    Looks up the span's text in the CAPITALS dictionary (country name -> capital)
    and returns the stored value, or None when the text is not a key.
    """
    return CAPITALS.get(span.text)

In [98]:
# Register the Span extension attribute 'capital' with the getter get_capital
Span.set_extension('capital', getter=get_capital)

In [99]:
# Process the text and print the entity text, label and capital attributes
doc = nlp("Czech Republic may help Slovakia protect its airspace.")
print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])

[('Czech Republic', 'GPE', 'Prague'), ('Slovakia', 'GPE', 'Bratislava')]


# 13. Scaling and performance

대량의 텍스트를 처리할 때는 `nlp.pipe` 메소드를 사용하자. <br>단, `nlp.pipe`는 제너레이터(generator)를 반환하므로, Doc 리스트가 필요하면 `list(nlp.pipe(LOTS_OF_TEXTS))` 형태로 사용한다.

In [128]:
# Each example is a (text, context) pair; the context dict carries metadata for that text
data = [
    ('This is a text', {'id':1, 'page_number':15}),
    ('And another text', {'id':2, 'page_number':16})
]

# With as_tuples=True, nlp.pipe takes (text, context) pairs and yields (doc, context) pairs
for doc, context in nlp.pipe(data, as_tuples=True):
    print(doc.text, context['page_number'])

This is a text 15
And another text 16


In [130]:
# Register the Doc extensions that will hold the context values.
# NOTE(review): force=True is presumably there so the cell can be re-run
# after the extensions already exist -- confirm.
Doc.set_extension('id', default=None, force=True)
Doc.set_extension('page_number', default=None,  force=True)

# Copy each context value onto its doc's custom attributes.
# The docs are not collected here, so only the last one remains bound to `doc` afterwards.
for doc, context in nlp.pipe(data, as_tuples=True):
    doc._.id = context['id']
    doc._.page_number = context['page_number']

### Using only the tokenizer

Use `nlp.make_doc` to just turn a text into a `Doc` object (just for tokenizing)

### Disabling pipeline components

In [139]:
text= """ "Please help me!!!", He said. "Please!" """
text

' "Please help me!!!", He said. "Please!" '

In [141]:
nlp = spacy.load("en_core_web_sm")

In [143]:
# Disable the tagger and parser inside this with-block
with nlp.disable_pipes('tagger', 'parser'):
    # Process the text with the remaining components and print the doc's text
    # (note: this prints doc.text, not doc.ents)
    doc = nlp(text)
    print(doc.text)

 "Please help me!!!", He said. "Please!" 


### Exercises : Processing streams

In [144]:
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding, and let json parse straight from the file object.
with open("spacy/exercises/tweets.json", encoding="utf-8") as f:
    TEXT = json.load(f)

In [145]:
TEXT

['McDonalds is my favorite restaurant.',
 'Here I thought @McDonalds only had precooked burgers but it seems they only have not cooked ones?? I have no time to get sick..',
 'People really still eat McDonalds :(',
 'The McDonalds in Spain has chicken wings. My heart is so happy ',
 '@McDonalds Please bring back the most delicious fast food sandwich of all times!!....The Arch Deluxe :P',
 'please hurry and open. I WANT A #McRib SANDWICH SO BAD! :D',
 'This morning i made a terrible decision by gettin mcdonalds and now my stomach is payin for it']

In [148]:
for doc in nlp.pipe(TEXT):
    print([token.text for token in doc if token.pos_ == 'ADJ'])

['favorite']
['sick']
[]
['happy']
['delicious', 'fast']
[]
['terrible', 'gettin', 'payin']


In [153]:
docs = list(nlp.pipe(TEXT))
entities = [doc.ents for doc in docs]
print(*entities)

(McDonalds,) (@McDonalds,) (McDonalds,) (McDonalds, Spain) (The Arch Deluxe,) (WANT, McRib) (This morning,)


In [155]:
nlp = English()
people = ["David Bowie", "Angela Merkel", "Lady Gaga"]

#patterns = [nlp(person) for person in people]
patterns = list(nlp.pipe(people))
patterns

[David Bowie, Angela Merkel, Lady Gaga]

### Exercises : Processing data with context

In this exercise, you'll be using custom attributes to add author and book meta information to quotes.

A list of `[text, context]` examples is available as the variable `DATA`. The texts are quotes from famous books, and the contexts are dictionaries with the keys `'author'` and `'book'`.

In [156]:
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding, and let json parse straight from the file object.
with open('spacy/exercises/bookquotes.json', encoding='utf-8') as f:
    DATA = json.load(f)

# Fresh English pipeline for the quotes
nlp = English()

In [159]:
DATA[0]

['One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin.',
 {'author': 'Franz Kafka', 'book': 'Metamorphosis'}]

In [157]:
# Register the Doc extensions that hold the quote's metadata.
# force=True keeps the cell re-runnable: without it, registering an extension
# that already exists raises an error on the second run.
Doc.set_extension('author', default=None, force=True)
Doc.set_extension('book', default=None, force=True)

In [160]:
for doc, context in nlp.pipe(DATA, as_tuples=True):
    doc._.book = context['book']
    doc._.author = context['author']
    
    print(doc.text, "\n", "- '{}' by {}".format(doc._.book, doc._.author), "\n")

One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin. 
 - 'Metamorphosis' by Franz Kafka 

I know not all that may be coming, but be it what it will, I'll go to it laughing. 
 - 'Moby-Dick or, The Whale' by Herman Melville 

It was the best of times, it was the worst of times. 
 - 'A Tale of Two Cities' by Charles Dickens 

The only people for me are the mad ones, the ones who are mad to live, mad to talk, mad to be saved, desirous of everything at the same time, the ones who never yawn or say a commonplace thing, but burn, burn, burn like fabulous yellow roman candles exploding like spiders across the stars. 
 - 'On the Road' by Jack Kerouac 

It was a bright cold day in April, and the clocks were striking thirteen. 
 - '1984' by George Orwell 

Nowadays people know the price of everything and the value of nothing. 
 - 'The Picture Of Dorian Gray' by Oscar Wilde 



### Exercises : Selective processing

In this exercise, you'll use the `nlp.make_doc` and `nlp.disable_pipes` methods to only run selected components when processing a text.

In [161]:
nlp = spacy.load('en_core_web_sm')
text = (
    "Chick-fil-A is an American fast food restaurant chain headquartered in "
    "the city of College Park, Georgia, specializing in chicken sandwiches."
)

doc = nlp.make_doc(text)
print([token.text for token in doc])

['Chick', '-', 'fil', '-', 'A', 'is', 'an', 'American', 'fast', 'food', 'restaurant', 'chain', 'headquartered', 'in', 'the', 'city', 'of', 'College', 'Park', ',', 'Georgia', ',', 'specializing', 'in', 'chicken', 'sandwiches', '.']


In [163]:
with nlp.disable_pipes('tagger','parser'):
    doc = nlp(text)
    print(doc.ents)

(American, College Park, Georgia)
