In [4]:
import spacy
print(spacy.__version__)

3.0.3


In [5]:
nlp = spacy.load("en_core_web_lg")

In [6]:
### Tokenization

doc = nlp(u'I am flying to Manila.')
print([w.text for w in doc])

['I', 'am', 'flying', 'to', 'Manila', '.']


In [7]:
### Lemmatization

doc = nlp(u'this product integrates both libraries for downloading and applying patches')
for token in doc:
  print(token.text, token.lemma_)

this this
product product
integrates integrate
both both
libraries library
for for
downloading download
and and
applying apply
patches patch


In [12]:
### Part of Speech (POS) Tagging

doc = nlp(u'I have flown to Cebu. Now I am flying to Manila.')
for token in doc:
  print(token.text, "|", token.pos_, "|", token.tag_)

I | PRON | PRP
have | AUX | VBP
flown | VERB | VBN
to | ADP | IN
Cebu | PROPN | NNP
. | PUNCT | .
Now | ADV | RB
I | PRON | PRP
am | AUX | VBP
flying | VERB | VBG
to | ADP | IN
Manila | PROPN | NNP
. | PUNCT | .


In [13]:
spacy.explain("PRP")

'pronoun, personal'

In [16]:
doc = nlp(u'I have flown to Cebu. Now I am flying to Manila.')
for sent in doc.sents:
  print([sent[i] for i in range(len(sent))])

[I, have, flown, to, Cebu, .]
[Now, I, am, flying, to, Manila, .]


In [18]:
doc = nlp(u'The Golden Gate Bridge is an iconice landmark in San Francisco.')
[doc[i] for i in range(len(doc))]

[The, Golden, Gate, Bridge, is, an, iconice, landmark, in, San, Francisco, .]

In [19]:
with doc.retokenize() as retokenizer:
  retokenizer.merge(doc[1:4])
with doc.retokenize() as retokenizer:
  retokenizer.merge(doc[7:9])
  for token in doc:
    print(token.text, token.lemma_, token.pos_)

The the DET
Golden Gate Bridge Golden Gate Bridge PROPN
is be AUX
an an DET
iconice iconice NOUN
landmark landmark NOUN
in in ADP
San San PROPN
Francisco Francisco PROPN
. . PUNCT


In [6]:
### Syntactic Parsing

doc = nlp(u'I want a green apple.')
for token in doc:
  print(token.text, token.pos_, token.dep_, spacy.explain(token.dep_))

I PRON nsubj nominal subject
want VERB ROOT None
a DET det determiner
green ADJ amod adjectival modifier
apple NOUN dobj direct object
. PUNCT punct punctuation


In [7]:
from spacy import displacy
displacy.serve(doc, style = "dep")




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [14]:
import spacy
nlp = spacy.load("en_core_web_lg")
doc = nlp(u'The firm earned $1.5 million in 2017. In comparison with $1.2 million in 2016')
phrase = ""
for token in doc:
  if token.tag_ == "$":
    phrase = token.text
    i = token.i + 1
    while doc[i].tag_ == "CD":
      phrase += doc[i].text + ' '
      i += 1
    phrase = phrase[:-1]
    print(phrase)

$1.5 million
$1.2 million


In [8]:
spacy.explain("CD")

'cardinal number'

In [15]:
import re

pattern = "\$.*million"
test_string = "The firm earned $1.5 million in 2017."
result = re.findall(pattern, test_string)
print(result)

['$1.5 million']


In [17]:
pattern = "\$.+?million"
test_string = "The firm earned $1.5 million in 2017. In comparison with $1.2 million in 2016."
result = re.findall(pattern, test_string)
print(result)

['$1.5 million', '$1.2 million']


In [18]:
from IPython.core.display import display, HTML

doc = nlp(u'I want a Greek pizza.')

from spacy import displacy

html = displacy.render(doc, style = 'ent',page = True)

displacy(HTML(html))

TypeError: 'module' object is not callable

In [19]:
spacy.explain("NORP")

'Nationalities or religious or political groups'

In [20]:

doc = nlp(u'I want to fly to Manila.')

html = displacy.render(doc, style = 'ent',page = True)

displacy(HTML(html))

TypeError: 'module' object is not callable

In [21]:
spacy.explain("GPE")

'Countries, cities, states'

In [22]:
### Similarity 
doc = nlp('I want a green apple.')
doc.similarity(doc[2:5])

0.8776482403927138

In [23]:
doc.similarity(doc)

1.0

In [24]:
nlp('apple').similarity(nlp('banana'))

0.5831844168885263

In [25]:
nlp('king').similarity(nlp('queen'))

0.7252610345406867

In [26]:
nlp('banana').vector

array([ 2.0228e-01, -7.6618e-02,  3.7032e-01,  3.2845e-02, -4.1957e-01,
        7.2069e-02, -3.7476e-01,  5.7460e-02, -1.2401e-02,  5.2949e-01,
       -5.2380e-01, -1.9771e-01, -3.4147e-01,  5.3317e-01, -2.5331e-02,
        1.7380e-01,  1.6772e-01,  8.3984e-01,  5.5107e-02,  1.0547e-01,
        3.7872e-01,  2.4275e-01,  1.4745e-02,  5.5951e-01,  1.2521e-01,
       -6.7596e-01,  3.5842e-01, -4.0028e-02,  9.5949e-02, -5.0690e-01,
       -8.5318e-02,  1.7980e-01,  3.3867e-01,  1.3230e-01,  3.1021e-01,
        2.1878e-01,  1.6853e-01,  1.9874e-01, -5.7385e-01, -1.0649e-01,
        2.6669e-01,  1.2838e-01, -1.2803e-01, -1.3284e-01,  1.2657e-01,
        8.6723e-01,  9.6721e-02,  4.8306e-01,  2.1271e-01, -5.4990e-02,
       -8.2425e-02,  2.2408e-01,  2.3975e-01, -6.2260e-02,  6.2194e-01,
       -5.9900e-01,  4.3201e-01,  2.8143e-01,  3.3842e-02, -4.8815e-01,
       -2.1359e-01,  2.7401e-01,  2.4095e-01,  4.5950e-01, -1.8605e-01,
       -1.0497e+00, -9.7305e-02, -1.8908e-01, -7.0929e-01,  4.01