In [20]:
import spacy

In [21]:
print(spacy.__version__)

3.0.3


In [22]:
nlp = spacy.load("en_core_web_lg")

In [23]:
### tokenization
doc = nlp(u"I am flying to Manila.")
print([word.text for word in doc])

['I', 'am', 'flying', 'to', 'Manila', '.']


In [24]:
### lemmatization
doc = nlp(u"The quicker brown foxes jump over the lazier dog.")
for token in doc:
    print(token.text, token.lemma_)

The the
quicker quick
brown brown
foxes fox
jump jump
over over
the the
lazier lazy
dog dog
. .


In [25]:
### POS tagging
doc = nlp(u"I have flown to Cebu. Now I am flying to Manila.")
for token in doc:
    print(token.text, token.pos_)

I PRON
have AUX
flown VERB
to ADP
Cebu PROPN
. PUNCT
Now ADV
I PRON
am AUX
flying VERB
to ADP
Manila PROPN
. PUNCT


In [26]:
spacy.explain("PRP")

'pronoun, personal'

In [27]:
for sent in doc.sents:
    print([sent[i] for i in range(len(sent))])

[I, have, flown, to, Cebu, .]
[Now, I, am, flying, to, Manila, .]


In [28]:
doc = nlp(u"The Golden Gate Bridge is an iconic landmark in San Francisco.")
[doc[i] for i in range(len(doc))]

[The, Golden, Gate, Bridge, is, an, iconic, landmark, in, San, Francisco, .]

In [29]:
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[1:4])
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[7:9])
for token in doc:
    print(token.text, token.lemma_, token.pos_)

The the DET
Golden Gate Bridge Golden Gate Bridge PROPN
is be AUX
an an DET
iconic iconic ADJ
landmark landmark NOUN
in in ADP
San Francisco San Francisco PROPN
. . PUNCT


In [30]:
### Syntactic Parsing
doc = nlp(u"I want a green apple.")
for token in doc:
    print(token.text, token.pos_, token.dep_, spacy.explain(token.dep_))

I PRON nsubj nominal subject
want VERB ROOT None
a DET det determiner
green ADJ amod adjectival modifier
apple NOUN dobj direct object
. PUNCT punct punctuation


In [31]:
from spacy import displacy

In [32]:
# displacy.serve(doc, style="dep")

In [33]:
doc = nlp(u"The firm earned $1.5 million in 2017.")
phrase = ""
for token in doc:
    if token.tag_ == "$":
        i = token.i + 1
        while doc[i].tag == "CD":
            phrase += doc[i].text + " "
            i += 1
        break

phrase = phrase[:-1]
print(phrase)




In [34]:
import spacy

nlp = spacy.load("en_core_web_lg")
doc = nlp(u"The firm earned $1.5 million in 2017, in comparison with $1.2 million in 2016.")
phrase = ""
for token in doc:
    if token.tag_ == "$":
        phrase = token.text
        i = token.i + 1
        while doc[i].tag_ == "CD":
            phrase += doc[i].text + " "
            i += 1
        phrase = phrase[:-1]
        print(phrase)

$1.5 million
$1.2 million


In [36]:
import regex as re
pattern = "\$.+?million"
test_string = "The firm earned $1.5 million in 2017, in comparison with $1.2 million in 2016."
result = re.findall(pattern, test_string)
print(result)

['$1.5 million', '$1.2 million']


In [37]:
from IPython.core.display import display, HTML

In [38]:
spacy.explain("GPE")

'Countries, cities, states'

In [39]:
### similarity
doc = nlp(u"I want a green apple.")
doc.similarity(doc[2:5])

0.8776482403927138

In [40]:
doc.similarity(doc)

1.0

In [41]:
nlp("apple").similarity(nlp("banana"))

0.5831844168885263

In [42]:
nlp("king").similarity(nlp("queen"))

0.7252610345406867