# 1. Data Structures (1)

In [1]:
import spacy
from spacy.lang.en import English
nlp = spacy.load("en_core_web_sm")

In [4]:
coffee_hash = nlp.vocab.strings['coffee']
print('string value:',nlp.vocab.strings[coffee_hash])

KeyError: "[E018] Can't retrieve string for hash '3197928453018144401'."

In [7]:
#coffee_string = nlp.vocab.strings[coffee_hash]
print('string value:',nlp.vocab.strings[coffee_hash])

KeyError: "[E018] Can't retrieve string for hash '3197928453018144401'."

In [4]:
nlp.vocab.strings[3197928453018144401]

KeyError: "[E018] Can't retrieve string for hash '3197928453018144401'."

In [5]:
print(nlp.vocab.strings[3197928453018144401])

KeyError: "[E018] Can't retrieve string for hash '3197928453018144401'."

In [6]:
doc = nlp("I love coffee")
print('hash value:', nlp.vocab.strings['coffee'])
print('string value:', nlp.vocab.strings[3197928453018144401])

hash value: 3197928453018144401
string value: coffee


In [7]:
print('hash value:', doc.vocab.strings['coffee'])

hash value: 3197928453018144401


In [8]:
print('hash value:', nlp.vocab.strings['like'])

hash value: 18194338103975822726


In [9]:
print('hash value:', doc.vocab.strings['like'])

hash value: 18194338103975822726


In [10]:
lexeme = nlp.vocab['coffee']

# Print the lexial attributes
print(lexeme.text, lexeme.orth, lexeme.is_alpha)

coffee 3197928453018144401 True


# 4. Data Structures (2)

### The Doc object

In [11]:
from spacy.lang.en import English
nlp = English()

In [12]:
# Import the Doc class
from spacy.tokens import Doc

In [13]:
# The words and spaces to create the doc from
words = ['Hello', 'world', '!']
spaces = [True, False, False]

In [14]:
# Create a doc manually
doc = Doc(nlp.vocab, words=words, spaces=spaces)

In [15]:
doc

Hello world!

### The Span object

In [16]:
# Import the Doc and Span classes
from spacy.tokens import Doc, Span

In [17]:
# The words and spaces to create the doc from
words = ['Hello', 'world', '!']
spaces = [True, False, False]

In [18]:
# Create a doc manually
doc = Doc(nlp.vocab, words=words, spaces=spaces)

In [19]:
# Create a span manually
span = Span(doc, 0, 2)

In [20]:
# Create a span with a label
span_with_label  = Span(doc, 0, 2, label='GREETING')

In [21]:
# Add span to the doc.ents
doc.ents = [span_with_label]

In [22]:
span, span_with_label

(Hello world, Hello world)

In [23]:
doc.ents

(Hello world,)

In [24]:
doc

Hello world!

In [25]:
words = ["a"," ", "te"]
spaces = [False,False,False]
doc = Doc(nlp.vocab, words, spaces)
doc

a te

In [30]:
for token in doc:
    print(token.i)

0
1
2
3
4


In [1]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Berlin is a nice city")

for token in doc:
    # Check if the current token is a proper noun
    if token.pos_ == "PROPN":
        # Check if the next token is a verb
        if doc[token.i + 1].pos_ == "VERB":
            result = token.text
            print("Found proper noun before a verb:", result)

Found proper noun before a verb: Berlin


# 8. Word vectors and semantic similarities

### Comparing semantic similarity

In [8]:
# Load a larger model with vectors
nlp = spacy.load('en_core_web_md')

In [9]:
# Compare two documents
doc1 = nlp("I like fast food")
doc2 = nlp("I like pizza")
print(doc1.similarity(doc2))

0.8627203210548107


In [11]:
# Compare two tokens
doc = nlp("I like pizza and pasta")
token1 = doc[2]
token2 = doc[4]
print(token1.similarity(token2))

0.73695457


In [12]:
# Compare a document with a token
doc = nlp("I like pizza")
token = nlp("soap")

print(doc.similarity(token))

0.3253198629111809


In [13]:
# Compare a span with a document
span = nlp("I like pizza and pasta")[2:5]
doc = nlp("McDonalds sells burgers")

print(span.similarity(doc))

0.6199091710787739


### Word vectors in spaCy

In [16]:
doc = nlp("I have a banana")
# Access the vector via the token.vector attribute
print(doc, len(doc.vector), doc[3].vector)

I have a banana 300 [ 2.0228e-01 -7.6618e-02  3.7032e-01  3.2845e-02 -4.1957e-01  7.2069e-02
 -3.7476e-01  5.7460e-02 -1.2401e-02  5.2949e-01 -5.2380e-01 -1.9771e-01
 -3.4147e-01  5.3317e-01 -2.5331e-02  1.7380e-01  1.6772e-01  8.3984e-01
  5.5107e-02  1.0547e-01  3.7872e-01  2.4275e-01  1.4745e-02  5.5951e-01
  1.2521e-01 -6.7596e-01  3.5842e-01 -4.0028e-02  9.5949e-02 -5.0690e-01
 -8.5318e-02  1.7980e-01  3.3867e-01  1.3230e-01  3.1021e-01  2.1878e-01
  1.6853e-01  1.9874e-01 -5.7385e-01 -1.0649e-01  2.6669e-01  1.2838e-01
 -1.2803e-01 -1.3284e-01  1.2657e-01  8.6723e-01  9.6721e-02  4.8306e-01
  2.1271e-01 -5.4990e-02 -8.2425e-02  2.2408e-01  2.3975e-01 -6.2260e-02
  6.2194e-01 -5.9900e-01  4.3201e-01  2.8143e-01  3.3842e-02 -4.8815e-01
 -2.1359e-01  2.7401e-01  2.4095e-01  4.5950e-01 -1.8605e-01 -1.0497e+00
 -9.7305e-02 -1.8908e-01 -7.0929e-01  4.0195e-01 -1.8768e-01  5.1687e-01
  1.2520e-01  8.4150e-01  1.2097e-01  8.8239e-02 -2.9196e-02  1.2151e-03
  5.6825e-02 -2.7421e-01  2.556

Q1 : document의 벡터는 어떻게 구성되는가?

In [17]:
import numpy as np

In [19]:
vec = np.mean([tok.vector for tok in doc],axis=0)
sum((doc.vector - vec)**2)

0.0

A1 : document의 각 token의 word vector의 평균으로 정의됨

Q2 : document 사이의 similiarity는 어떻게 정의되는가?

In [20]:
doc1 = nlp("I like pizza and pasta")
doc2 = nlp("McDonalds sells burgers")

vec1, vec2 = doc1.vector, doc2.vector

$n$차원 벡터 $a$, $b$ 에 대하여 

$
\cos(a,b) = \cos(\theta) = \frac{a\cdot b}{\|a\| \cdot \|b\|}
$

In [39]:
vec1_dot_vec2 = (vec1*vec2).sum()
vec1_abs = np.sqrt((vec1**2).sum())
vec2_abs = np.sqrt((vec2**2).sum())

print('doc1.similarity(doc2)={:0.6f}, cos(vec1, vec2)={:0.6f}'.format(doc1.similarity(doc2), vec1_dot_vec2/(vec1_abs * vec2_abs)))

doc1.similarity(doc2)=0.608069, cos(vec1, vec2)=0.608069


A2 : 각 document의 word vector 사이의 cosine similarity로 정의된다!

### Similarity depends on the application context

In [12]:
doc1 = nlp("I like cats")
doc2 = nlp("I hate cats")
print(doc1.similarity(doc2))

0.9501447503553421


In [13]:
# Process a text
doc = nlp("Two bananas in pyjamas")

# Get the vector for the token "bananas"
bananas_vector = doc[1].vector
print(bananas_vector)

[-2.2009e-01 -3.0322e-02 -7.9859e-02 -4.6279e-01 -3.8600e-01  3.6962e-01
 -7.7178e-01 -1.1529e-01  3.3601e-02  5.6573e-01 -2.4001e-01  4.1833e-01
  1.5049e-01  3.5621e-01 -2.1508e-01 -4.2743e-01  8.1400e-02  3.3916e-01
  2.1637e-01  1.4792e-01  4.5811e-01  2.0966e-01 -3.5706e-01  2.3800e-01
  2.7971e-02 -8.4538e-01  4.1917e-01 -3.9181e-01  4.0434e-04 -1.0662e+00
  1.4591e-01  1.4643e-03  5.1277e-01  2.6072e-01  8.3785e-02  3.0340e-01
  1.8579e-01  5.9999e-02 -4.0270e-01  5.0888e-01 -1.1358e-01 -2.8854e-01
 -2.7068e-01  1.1017e-02 -2.2217e-01  6.9076e-01  3.6459e-02  3.0394e-01
  5.6989e-02  2.2733e-01 -9.9473e-02  1.5165e-01  1.3540e-01 -2.4965e-01
  9.8078e-01 -8.0492e-01  1.9326e-01  3.1128e-01  5.5390e-02 -4.2423e-01
 -1.4082e-02  1.2708e-01  1.8868e-01  5.9777e-02 -2.2215e-01 -8.3950e-01
  9.1987e-02  1.0180e-01 -3.1299e-01  5.5083e-01 -3.0717e-01  4.4201e-01
  1.2666e-01  3.7643e-01  3.2333e-01  9.5673e-02  2.5083e-01 -6.4049e-02
  4.2143e-01 -1.9375e-01  3.8026e-01  7.0883e-03 -2

In [7]:
doc = nlp("This was a great restaurant. Afterwards, we went to a really nice bar.")

# Create spans for "great restaurant" and "really nice bar"
span1 = doc[3:5]
span2 = doc[-4:-1]

In [8]:
span1, span2

(great restaurant, really nice bar)

In [9]:
span1.similarity(span2)

0.75173926

# 11. Combining models and rules

### Recap: Rule-based Matching

In [11]:
# Initialize with the shared vocab
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [12]:
# Patterns are lists of dictionaries descrining the tokens
pattern = [{'LEMMA': 'love', 'POS': 'VERB'}, {'LOWER': 'cats'}]
matcher.add('LOVE_CATS', None, pattern)

In [13]:
# Operators can specify how often a token should be matched
pattern = [{'TEXT': 'very', 'OP': '+'}, {'TEXT': 'happy'}]

In [14]:
# Calling matcher on doc returns list of (match_id, start, end) tuples
doc = nlp("I love cats and I'm very very happy")
matches = matcher(doc)

In [15]:
matches

[(9137535031263442622, 1, 3)]

### Adding statistical predictions

Here's an example of a matcher rule for "golden retriever".

If we iterate over the matches returned by the matcher, we can get the match ID and the start and end index of the matched span. We can then find out more about it. Span objects give us access to the original document and all other token attributes and linguistic features predicted by the model.

For example, we can get the span's root token. If the span consists of more than one token, this will be the token that decides the category of the phrase. For example, the root of "Golden Retriever" is "Retriever". We can also find the head token of the root. This is the syntactic "parent" that governs the phrase – in this case, the verb "have".

Finally, we can look at the previous token and its attributes. In this case, it's a determiner, the article "a".

In [16]:
matcher = Matcher(nlp.vocab)
matcher.add('DOG', None, [{'LOWER': 'golden'}, {'LOWER': 'retriever'}])
doc = nlp("I have a Golden Retriever")

In [17]:
for match_id, start, end in matcher(doc):
    span = doc[start:end]
    print('Matched span:', span.text)
    # Get the span's root token and root head token
    print('Root token:', span.root.text)
    print('Root head token:', span.root.head.text)
    # Get the previous token and its POS tag
    print('Previous token:', doc[start-1].text, doc[start-1].pos_)

Matched span: Golden Retriever
Root token: Retriever
Root head token: have
Previous token: a DET


### Efficient phrase matching

The phrase matcher is another helpful tool to find sequences of words in your data.

It performs a keyword search on the document, but instead of only finding strings, it gives you direct access to the tokens in context.

It takes Doc objects as patterns.

It's also really fast.

This makes it very useful for matching large dictionaries and word lists on large volumes of text.

In [18]:
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)

In [19]:
pattern = nlp("Golden Retriever")
matcher.add('DOG', None, pattern)
doc = nlp("I have a Golden Retriever")

In [30]:
type(pattern)

spacy.tokens.doc.Doc

In [21]:
# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Get the matched span
    span = doc[start:end]
    print('Matched span:', span.text)

Matched span: Golden Retriever


In [27]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp(
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)

# Create the match patterns
pattern1 = [{"LOWER": "amazon"}, {"IS_TITLE": True, "POS": "PROPN"}]
pattern2 = [{"LOWER": "ad"},{"TEXT": "-"},{"LOWER": "free"}, {"POS": "NOUN"}]

# Initialize the Matcher and add the patterns
matcher = Matcher(nlp.vocab)
matcher.add("PATTERN1", None, pattern1)
matcher.add("PATTERN2", None, pattern2)

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Print pattern string name and text of matched span
    print(doc.vocab.strings[match_id], doc[start:end].text)

PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing


In [24]:
doc

Twitch Prime, the perks program for Amazon Prime members offering free loot, games and other benefits, is ditching one of its best features: ad-free viewing. According to an email sent out to Amazon Prime members today, ad-free viewing will no longer be included as a part of Twitch Prime for new members, beginning on September 14. However, members with existing annual subscriptions will be able to continue to enjoy ad-free viewing until their subscription comes up for renewal. Those with monthly subscriptions will have access to ad-free viewing until October 15.

In [26]:
for i in range(len(doc)):
    print(i, doc[i])

0 Twitch
1 Prime
2 ,
3 the
4 perks
5 program
6 for
7 Amazon
8 Prime
9 members
10 offering
11 free
12 loot
13 ,
14 games
15 and
16 other
17 benefits
18 ,
19 is
20 ditching
21 one
22 of
23 its
24 best
25 features
26 :
27 ad
28 -
29 free
30 viewing
31 .
32 According
33 to
34 an
35 email
36 sent
37 out
38 to
39 Amazon
40 Prime
41 members
42 today
43 ,
44 ad
45 -
46 free
47 viewing
48 will
49 no
50 longer
51 be
52 included
53 as
54 a
55 part
56 of
57 Twitch
58 Prime
59 for
60 new
61 members
62 ,
63 beginning
64 on
65 September
66 14
67 .
68 However
69 ,
70 members
71 with
72 existing
73 annual
74 subscriptions
75 will
76 be
77 able
78 to
79 continue
80 to
81 enjoy
82 ad
83 -
84 free
85 viewing
86 until
87 their
88 subscription
89 comes
90 up
91 for
92 renewal
93 .
94 Those
95 with
96 monthly
97 subscriptions
98 will
99 have
100 access
101 to
102 ad
103 -
104 free
105 viewing
106 until
107 October
108 15
109 .


In [48]:
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
import json

with open("spaCy/exercises/countries.json") as f:
    COUNTRIES = json.loads(f.read(), encoding='UTF8')
with open("spaCy/exercises/country_text.txt",encoding='UTF8') as f:
    TEXT = f.read()

nlp = English()
matcher = PhraseMatcher(nlp.vocab)
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)

# Create a doc and find matches in it
doc = nlp(TEXT)

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Create a Span with the label for "GPE"
    span = Span(doc, start, end, label="GPE")

    # Overwrite the doc.ents and add the span
    doc.ents = list(doc.ents) + [span]

    # Get the span's root head token
    span_root_head = span.root.head
    # Print the text of the span root's head token and the span text
    print(span_root_head.text, "-->", span.text)

# Print the entities in the document
print([(ent.text, ent.label_) for ent in doc.ents if ent.label_ == "GPE"])

Namibia --> Namibia
South --> South Africa
Cambodia --> Cambodia
Kuwait --> Kuwait
Somalia --> Somalia
Haiti --> Haiti
Mozambique --> Mozambique
Somalia --> Somalia
Rwanda --> Rwanda
Singapore --> Singapore
Sierra --> Sierra Leone
Afghanistan --> Afghanistan
Iraq --> Iraq
Sudan --> Sudan
Congo --> Congo
Haiti --> Haiti
[('Namibia', 'GPE'), ('South Africa', 'GPE'), ('Cambodia', 'GPE'), ('Kuwait', 'GPE'), ('Somalia', 'GPE'), ('Haiti', 'GPE'), ('Mozambique', 'GPE'), ('Somalia', 'GPE'), ('Rwanda', 'GPE'), ('Singapore', 'GPE'), ('Sierra Leone', 'GPE'), ('Afghanistan', 'GPE'), ('Iraq', 'GPE'), ('Sudan', 'GPE'), ('Congo', 'GPE'), ('Haiti', 'GPE')]


In [51]:
COUNTRIES

['Afghanistan',
 '횇land Islands',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia (Plurinational State of)',
 'Bonaire, Sint Eustatius and Saba',
 'Bosnia and Herzegovina',
 'Botswana',
 'Bouvet Island',
 'Brazil',
 'British Indian Ocean Territory',
 'United States Minor Outlying Islands',
 'Virgin Islands (British)',
 'Virgin Islands (U.S.)',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cabo Verde',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Christmas Island',
 'Cocos (Keeling) Islands',
 'Colombia',
 'Comoros',
 'Congo',
 'Congo (Democratic Republic of the)',
 'Cook Islands',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Cura챌a

In [52]:
nlp.pipe(COUNTRIES)

<generator object Language.pipe at 0x0000029911B2C930>