In [1]:
import json

import spacy
from spacy.lang.en import English
from spacy.lang.zh import Chinese

# Lexical attributes

In [2]:
# Run a short passage through a blank English pipeline.
poverty_text = (
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)
nlp_en = English()
doc = nlp_en(poverty_text)

In [5]:
# Print every token; when a number-like token is directly followed by "%",
# report it as a percentage.
for token in doc:
    print(token.text, end=' ')
    # Guard the look-ahead: a number-like token at the very end of the doc has
    # no successor, and indexing one past the end would raise IndexError.
    if token.like_num and token.i + 1 < len(doc):
        next_token = doc[token.i + 1]
        if next_token.text == '%':
            print(f"Percentage found: {token.text}")

In 1990 , more than 60 Percentage found: 60
% of people in East Asia were in extreme poverty . Now less than 4 Percentage found: 4
% are . 

In [11]:
# Tokenize a Chinese sentence. Note that nlp_zh ends up holding the processed
# document returned by the pipeline call, not the pipeline object itself.
zh_pipeline = Chinese()
nlp_zh = zh_pipeline('我是中国人,我爱中国.')

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.844 seconds.
Prefix dict has been built succesfully.


In [16]:
# Per token: text, index, lemma (hash and string form), digit flag, entity id.
for token in nlp_zh:
    fields = (token.text, token.i, token.lemma, token.lemma_, token.is_digit, token.ent_id)
    print(*fields)

我 0 15874136271370048803 我 False 0
是 1 9707108951148699008 是 False 0
中国 2 14253433776819982803 中国 False 0
人 3 10509053707201126865 人 False 0
, 4 2593208677638477497 , False 0
我 5 15874136271370048803 我 False 0
爱 6 1028791419835934650 爱 False 0
中国 7 14253433776819982803 中国 False 0
. 8 12646065887601541794 . False 0


# Statistical Model

In [132]:
# Rebind nlp_en (previously the blank English() pipeline) to the
# 'en_core_web_sm' model package.
nlp_en = spacy.load('en_core_web_sm')

In [134]:
doc = nlp_en('She ate the pizza')
# One row per token: text, POS (id, name), dependency (id, name), tag id, head text.
for token in doc:
    row = [token.text, token.pos, token.pos_, token.dep, token.dep_, token.tag, token.head.text]
    print(*row)

She 95 PRON 429 nsubj 13656873538139661788 ate
ate 100 VERB 8206900633647566924 ROOT 17109001835818727656 ate
the 90 DET 415 det 15267657372422890137 pizza
pizza 92 NOUN 416 dobj 15308085513773655218 ate


In [135]:
# Compare the hash and string forms of the fine-grained tag of the last token.
# Index the doc explicitly rather than relying on the `token` variable left
# over from an earlier loop, so the cell does not depend on hidden loop state.
last_token = doc[-1]
last_token.tag, last_token.tag_

(15308085513773655218, 'NN')

In [26]:
doc = nlp_en('Apple is looking at buyting U.K. startup for $1 billion')
# List every predicted entity span together with its label string.
for ent in doc.ents:
    label = ent.label_
    print(ent.text, label)

Apple ORG
U.K. GPE
$1 billion MONEY


In [23]:
# Look up the human-readable description of the 'GPE' entity label.
spacy.explain('GPE')

'Countries, cities, states'

In [24]:
# Look up the human-readable description of the 'ORG' entity label.
spacy.explain('ORG')

'Companies, agencies, institutions, etc.'

In [28]:
# Look up the human-readable description of the 'dobj' dependency label.
spacy.explain('dobj')

'direct object'

In [41]:
# Headline to be processed by the English pipeline.
text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

In [42]:
# Run the English pipeline bound to nlp_en over the headline.
doc = nlp_en(text)

In [43]:
# Fixed-width table: token text, then the integer ids of its POS and dependency.
for token in doc:
    line = f"{token.text:<12}{token.pos:<10}{token.dep:<10}"
    print(line)

It          95        429       
’s          100       445       
official    92        408       
:           97        445       
Apple       96        429       
is          87        8206900633647566924
the         90        415       
first       84        402       
U.S.        96        426       
public      84        402       
company     92        404       
to          94        405       
reach       100       447       
a           90        415       
$           99        446       
1           93        7037928807040764755
trillion    93        12837356684637874264
market      92        7037928807040764755
value       92        416       


In [44]:
# Report each entity span in the current doc as "<text> <label>".
for ent in doc.ents:
    pair = (ent.text, ent.label_)
    print(*pair)

Apple ORG
first ORDINAL
U.S. GPE
$1 trillion MONEY


In [45]:
# Look up the human-readable description of the 'ORDINAL' entity label.
spacy.explain('ORDINAL')

'"first", "second", etc.'

In [48]:
text = "New iPhone X release date leaked as Apple reveals pre-orders by mistake"
doc = nlp_en(text)

# Show the (text, label) pairs of the predicted entities.
predicted_ents = [(ent.text, ent.label_) for ent in doc.ents]
print(predicted_ents)

# Slice tokens 1-2 out of the doc by hand to get the "iPhone X" span.
iphone_x = doc[1:3]
iphone_x

[('New iPhone', 'EVENT'), ('Apple', 'ORG')]


iPhone X

# Rule-based matching

In [49]:
from spacy.matcher import Matcher

In [50]:
# Load the 'en_core_web_sm' model package under the name `nlp`.
nlp = spacy.load('en_core_web_sm')

In [51]:
# Create a rule-based Matcher that shares the pipeline's vocab.
matcher = Matcher(nlp.vocab)

In [57]:
# Two-token pattern: exact text "iPhone" followed by exact text "X".
pattern = [{'TEXT': 'iPhone'}, {'TEXT': 'X'}]

In [58]:
# Register the pattern under the key 'IPHONE_PATTERN'.
# NOTE(review): the None in second position is presumably the spaCy v2
# on_match callback slot — confirm against the installed spaCy version.
matcher.add('IPHONE_PATTERN', None, pattern)

In [59]:
# Process the sentence the matcher will be applied to.
doc = nlp('New iPhone X release date leacked')

In [60]:
# Apply the matcher; the result is iterated below as (match_id, start, end) triples.
matches = matcher(doc)

In [62]:
# Slice the doc with each match's token offsets and print the span text and id.
for match_id, start, end in matches:
    print(doc[start:end].text, match_id)

iPhone X 9528407286733565721


In [74]:
# Match "<digits> fifa world cup" (case-insensitive on the words) followed by
# a punctuation token, e.g. "2018 FIFA World Cup:".
pattern = [
    {'IS_DIGIT': True},
    {'LOWER': 'fifa'},
    {'LOWER': 'world'},
    {'LOWER': 'cup'},
    # The trailing token must be punctuation: the intended match ends in ":",
    # which {'IS_PUNCT': False} would reject.
    {'IS_PUNCT': True},
]

In [75]:
doc = nlp('2018 FIFA World Cup: France won!')
matcher.add('fifa pattern', None, pattern)
matches = matcher(doc)
# Keep only the matched text; the match id is not needed here.
matched_texts = [doc[start:end].text for _, start, end in matches]
matched_texts

['2018 FIFA World Cup:']

In [76]:
ios_text = (
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)
doc = nlp(ios_text)

# Full iOS versions: the exact token "iOS" followed by an all-digit token
# ("iOS 7", "iOS 11", "iOS 10")
pattern = [{"TEXT": "iOS"}, {"IS_DIGIT": True}]

# Register the pattern, run the matcher and report how many spans were found
matcher.add("IOS_VERSION_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Show the text of every matched span
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print("Match found:", matched_span.text)

Total matches found: 3
Match found: iOS 7
Match found: iOS 11
Match found: iOS 10


In [77]:
download_text = (
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)
doc = nlp(download_text)

# Any token whose lemma is "download", followed by a proper noun
pattern = [{"LEMMA": "download"}, {"POS": "PROPN"}]

# Register the pattern, run the matcher and report how many spans were found
matcher.add("DOWNLOAD_THINGS_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Show the text of every matched span
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print("Match found:", matched_span.text)

Total matches found: 3
Match found: downloaded Fortnite
Match found: downloading Minecraft
Match found: download Winzip


In [78]:
# Look up the human-readable description of the 'PROPN' part-of-speech tag.
spacy.explain('PROPN')

'proper noun'

In [79]:
features_text = (
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)
doc = nlp(features_text)

# An adjective, a noun, and optionally a second noun ("OP": "?")
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]

# Register the pattern, run the matcher and report how many spans were found
matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Show the text of every matched span
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print("Match found:", matched_span.text)

Total matches found: 5
Match found: beautiful design
Match found: smart search
Match found: automatic labels
Match found: optional voice
Match found: optional voice responses


# Data Structure

## Vocab, Lexemes and StringStore

In [97]:
# Process a sentence, then look up the hash stored for the string "coffee"
# in the vocab's string store.
doc = nlp('I have a cat, I like coffee')
coffee_hash = nlp.vocab.strings['coffee']

In [98]:
# String -> hash lookup for "cat"; the bare name displays the value.
cat_hash = nlp.vocab.strings['cat']
cat_hash

5439657043933447811

In [99]:
# Hash -> string lookup: the same store resolves the hash back to its text.
cat_string = nlp.vocab.strings[cat_hash]
cat_string

'cat'

In [102]:
# Resolve the previously computed coffee hash back to its string.
coffee_string = nlp.vocab.strings[coffee_hash]
coffee_string

'coffee'

## Doc, Span and Token

In [110]:
from spacy.tokens import Doc, Span

In [108]:
# Build a Doc by hand from tokens; spaces[i] says whether words[i] is followed
# by a space, so the text below reads 'spaCy is cool!'.
words = ['spaCy', 'is', 'cool', '!']
spaces = [True, True, False, False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)
doc.text

'spaCy is cool!'

In [127]:
words = ["I", "like", "David", "Bowie"]
spaces = [True, True, True, False]

# Assemble the doc manually from its tokens and trailing-space flags
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

# Tokens 2-3 ("David Bowie") become a span labelled "PERSON"
span = Span(doc, 2, 4, label="PERSON")
print(span.text, span.label_)

# Replace the doc's entities with that single span
doc.ents = [span]
print(span, span.label_)

# Collect (text, label) pairs for all entities and show them
entity_pairs = [(ent.text, ent.label_) for ent in doc.ents]
print(entity_pairs)

I like David Bowie
David Bowie PERSON
David Bowie PERSON
[('David Bowie', 'PERSON')]


In [113]:
doc = nlp("Berlin is a nice city")

# Report proper nouns, and flag those that are immediately followed by a verb.
for token in doc:
    # Skip everything that is not a proper noun
    if token.pos_ != "PROPN":
        continue
    print(token.text)
    # Only look ahead when a next token exists; a proper noun in final position
    # would otherwise raise IndexError on doc[token.i + 1].
    if token.i + 1 < len(doc) and doc[token.i + 1].pos_ == "VERB":
        print("Found proper noun before a verb:", token.text)

Berlin


# Word vectors and semantic similarities

In [114]:
# Process a text with the loaded pipeline
doc = nlp("Two bananas in pyjamas")

# Take the vector of the token at index 1 ("bananas") and print it
bananas_vector = doc[1].vector
print(bananas_vector)

[ 1.5189985   1.076918   -2.4386053  -1.0038377  -0.0905692   3.2566323
  1.1464078  -1.7798252   0.9480093   3.1546564   1.5740933   1.4181484
 -0.63161474  0.64892656 -2.9598489  -3.2275558  -2.589254    0.47790492
 -0.90102494 -0.08130413  1.0560234  -2.3348074  -2.1118956  -0.06434625
  2.2199328   3.2613754  -0.10951489 -2.4045506  -1.9903512   0.31546235
  0.34890574 -0.9951826  -3.042477   -3.2609134   1.6715299  -1.5878646
  4.6034184  -1.9911556  -0.45828786 -1.1500677   9.190529    3.1573846
  1.0649832  -1.8014958  -1.0308598  -0.7978955   0.17627858 -0.33392054
  0.972975    2.304922   -3.8109355  -3.8562963   0.13938749 -0.53733057
 -3.0516758   0.18260929  2.7131486  -0.23883325 -2.848932    0.93734527
 -1.8697596   4.676008   -0.4455688   0.33904022  3.8083303  -0.60235286
  1.409047   -2.2682061   1.1596808   0.8128679   1.7838112   0.44814107
 -0.6278328   2.7147403   0.9225056  -0.02261716  0.06356072 -4.513331
  0.0575515  -3.0546     -2.3484716   3.1645222  -3.77102

In [115]:
# Number of dimensions in the token vector.
len(bananas_vector)

96

In [116]:
doc1, doc2 = nlp("It's a warm summer day"), nlp("It's sunny outside")

# Score how similar the two documents are and show the result
similarity = doc1.similarity(doc2)
print(similarity)

0.5361259402585243


  "__main__", mod_spec)


In [117]:
doc = nlp("TV and books")
# Pick out "TV" (index 0) and "books" (index 2)
token1 = doc[0]
token2 = doc[2]

# Compare the two tokens and show the score
similarity = token1.similarity(token2)
print(similarity)

0.28746358


  "__main__", mod_spec)


In [118]:
doc = nlp("This was a great restaurant. Afterwards, we went to a really nice bar.")

# "great restaurant" is tokens 3-4; "really nice bar" is tokens 12-14
span1, span2 = doc[3:5], doc[12:15]

# Compare the two spans and show the score
similarity = span1.similarity(span2)
print(similarity)

0.61632603


  "__main__", mod_spec)


# Combining models and rules

In [119]:
twitch_text = (
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)
doc = nlp(twitch_text)

# Two token patterns:
#   pattern1 - "amazon" (any case) followed by a title-cased proper noun
#   pattern2 - "ad", "-", "free" followed by a noun
pattern1 = [{"LOWER": "amazon"}, {"IS_TITLE": True, "POS": "PROPN"}]
pattern2 = [{"LOWER": "ad"}, {"TEXT": "-"}, {"LOWER": "free"}, {"POS": "NOUN"}]

# Fresh matcher holding both patterns
matcher = Matcher(nlp.vocab)
for pattern_key, token_pattern in (("PATTERN1", pattern1), ("PATTERN2", pattern2)):
    matcher.add(pattern_key, None, token_pattern)

# Resolve each match id back to its pattern name and print it with the span text
for match_id, start, end in matcher(doc):
    pattern_name = doc.vocab.strings[match_id]
    print(pattern_name, doc[start:end].text)

PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing


In [123]:
 # Feed several texts through nlp.pipe and collect the resulting docs in a list.
 list(nlp.pipe(['hello', 'world', 'yes', 'no']))

[hello, world, yes, no]

In [124]:
# Load the list of country names. An explicit encoding keeps any non-ASCII
# names intact regardless of the platform's default codec.
# NOTE(review): the path is relative to the kernel's working directory and the
# recorded run failed with FileNotFoundError — confirm where the file lives.
with open("exercises/countries.json", encoding="utf8") as f:
    COUNTRIES = json.load(f)

# Use a blank English pipeline under its own name so the loaded model bound to
# `nlp` is not replaced for the cells that follow.
nlp_blank = English()
doc = nlp_blank("Czech Republic may help Slovakia protect its airspace")

# Import the PhraseMatcher and initialize it
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp_blank.vocab)

# Create pattern Doc objects and add them to the matcher
# This is the faster version of: [nlp_blank(country) for country in COUNTRIES]
patterns = list(nlp_blank.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)

# Call the matcher on the test document and print the matched spans
matches = matcher(doc)
print([doc[start:end] for match_id, start, end in matches])

FileNotFoundError: [Errno 2] No such file or directory: 'exercises/countries.json'

In [125]:
# Look up the description of the 'GPE' entity label.
spacy.explain('GPE')

'Countries, cities, states'

# Processing pipelines
- what happens when you call nlp

In [128]:
# Names of the components in the current pipeline, in order.
nlp.pipe_names

['tagger', 'parser', 'ner']

In [129]:
# The pipeline as (name, component) pairs.
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x1e5a793c2e8>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x1e5b3b53ca8>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1e5b3b53d08>)]

In [131]:
def length_component(doc):
    """Report the doc's token count and hand the doc on unchanged.

    Prints "This document is N tokens long." where N is len(doc), then
    returns the same doc object it was given.
    """
    message = "This document is {} tokens long.".format(len(doc))
    print(message)
    return doc


# Load the small English model
nlp = spacy.load("en_core_web_sm")

# Add the component first in the pipeline and print the pipe names
# NOTE(review): passing the function object straight to add_pipe looks like the
# spaCy v2 calling convention — confirm against the installed spaCy version.
nlp.add_pipe(length_component, first=True)
print(nlp.pipe_names)

# Process a text; the custom component runs as part of this call and prints
# the token count
doc = nlp("This is a sentence.")

['length_component', 'tagger', 'parser', 'ner']
This document is 5 tokens long.
