In [1]:
!pip install spacy
#!python -m spacy download en_core_web_sm 
!python -m spacy download en_core_web_lg 

[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


2021-02-24 11:22:56.607154: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2021-02-24 11:22:56.607434: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Imports and pipeline setup.
# Reference for everything demonstrated below:
# spaCy linguistic features = https://spacy.io/usage/linguistic-features

import spacy
# Alternative: the small model (smaller download, e.g. on a slow connection):
#nlp = spacy.load('en_core_web_sm')
# Large model (bigger download); must already be installed via `spacy download`.
nlp = spacy.load('en_core_web_lg')

In [3]:
# tokenization

doc = nlp('I am flying to Manila.')
# Build a list, not a generator expression: printing a generator only shows
# its repr ("<generator object ...>") instead of the token texts.
print([w.text for w in doc])

['I', 'am', 'flying', 'to', 'Manila', '.']


In [4]:
# lemmatization: show each token next to its base (dictionary) form

doc = nlp('this product integrates both libraries for downloading and applying patches')
for word in doc:
    surface, lemma = word.text, word.lemma_
    print(surface, lemma)

this this
product product
integrates integrate
both both
libraries library
for for
downloading download
and and
applying apply
patches patch


In [5]:
# part of speech tagging

doc = nlp('I have flown to Singapore. I am flying to Manila.')
for word in doc:
    # One row per token: text, coarse tag (pos_), fine-grained tag (tag_).
    columns = (word.text, word.pos_, word.tag_)
    print(*columns)

I PRON PRP
have AUX VBP
flown VERB VBN
to ADP IN
Singapore PROPN NNP
. PUNCT .
I PRON PRP
am AUX VBP
flying VERB VBG
to ADP IN
Manila PROPN NNP
. PUNCT .


In [6]:
# Checking property

# A cell only echoes its last expression, so two separate calls would silently
# drop the first result. Return both glosses together as a tuple.
spacy.explain('AUX'), spacy.explain('VBG')

('auxiliary', 'verb, gerund or present participle')

In [7]:
# Echo the current Doc; its notebook representation is the original text.
doc

I have flown to Singapore. I am flying to Manila.

In [8]:
# Lemmas for the current doc: inflected verb forms map back to their base verb.
for word in doc:
    print(f'{word.text} {word.lemma_}')

I I
have have
flown fly
to to
Singapore Singapore
. .
I I
am be
flying fly
to to
Manila Manila
. .


In [9]:
# Keep only tokens whose fine-grained tag is VBG or VB.
wanted_tags = ('VBG', 'VB')
print([w.text for w in doc if w.tag_ in wanted_tags])

['flying']


In [10]:
# Sentence segmentation: print the tokens of each detected sentence.
for sentence in doc.sents:
    print(list(sentence))

[I, have, flown, to, Singapore, .]
[I, am, flying, to, Manila, .]


In [11]:
# Multi-word names are split into separate tokens by default.
doc = nlp('The Golden Gate Bridge is an iconic landmark in San Francisco.')
print([token.text for token in doc])

['The', 'Golden', 'Gate', 'Bridge', 'is', 'an', 'iconic', 'landmark', 'in', 'San', 'Francisco', '.']


In [12]:
# Retokenization: merge multi-word names into single tokens.
# NOTE: this modifies `doc` in place, so re-running the cell without first
# re-creating `doc` would merge different tokens.

# Each merge runs in its own context, so the second slice uses token indices
# as they are *after* the first merge has been applied.
with doc.retokenize() as merger:
    merger.merge(doc[1:4])  # 'Golden', 'Gate', 'Bridge'
with doc.retokenize() as merger:
    merger.merge(doc[7:9])  # 'San', 'Francisco' (post-merge positions)
for tok in doc:
    print(tok.text, tok.lemma_, tok.pos_)

The the DET
Golden Gate Bridge Golden Gate Bridge PROPN
is be AUX
an an DET
iconic iconic ADJ
landmark landmark NOUN
in in ADP
San Francisco San Francisco PROPN
. . PUNCT


In [13]:
# dependency parsing (with spacy.explain)

doc = nlp('I want a green apple.')
for word in doc:
    relation = word.dep_
    # spacy.explain turns the label into a readable description
    # (the recorded output shows None for 'ROOT').
    print(word.text, word.pos_, relation, spacy.explain(relation))

I PRON nsubj nominal subject
want VERB ROOT None
a DET det determiner
green ADJ amod adjectival modifier
apple NOUN dobj direct object
. PUNCT punct punctuation


In [14]:
# Visualize (Has to shutdown manually)

from spacy import displacy
displacy.serve(doc, style='dep')




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [15]:
# entity recognition (hand-rolled from tags): a currency symbol followed by
# the run of cardinal numbers (tag 'CD') that comes right after it

doc = nlp('The firm earned $1.5 million in 2017.')
phrase = ''
for token in doc:
    if token.tag_ != '$':
        continue
    # Iterate over the remainder of the doc instead of indexing doc[i]:
    # this cannot run past the last token when the amount ends the text.
    amount_words = []
    for following in doc[token.i + 1:]:
        if following.tag_ != 'CD':
            break
        amount_words.append(following.text)
    # A lone currency symbol with no number after it is not a money phrase.
    if not amount_words:
        continue
    # join() avoids the trailing-space trimming that could eat the symbol.
    phrase = token.text + ' '.join(amount_words)
    print(phrase)

$1.5 million


In [16]:
# Checking Property: look up the description of the fine-grained tag 'CD',
# which the money-phrase extraction uses to recognise numbers.

spacy.explain('CD')

'cardinal number'

In [22]:
# entity recognition (hand-rolled from tags), now with several amounts in one
# sentence: a currency symbol followed by the run of 'CD' tokens after it

doc = nlp('The firm earned $1.5 million in 2017, in comparison with $1.2 million in 2016.')
phrase = ''
for token in doc:
    if token.tag_ != '$':
        continue
    # Iterate over the remainder of the doc instead of indexing doc[i]:
    # this cannot run past the last token when the amount ends the text.
    amount_words = []
    for following in doc[token.i + 1:]:
        if following.tag_ != 'CD':
            break
        amount_words.append(following.text)
    # A lone currency symbol with no number after it is not a money phrase.
    if not amount_words:
        continue
    # join() avoids the trailing-space trimming that could eat the symbol.
    phrase = token.text + ' '.join(amount_words)
    print(phrase)

$1.5 million
$1.2 million


In [23]:
# Visualize named entities in HTML

# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML

from spacy import displacy

doc = nlp('I want to buy an Apple computer.')
# jupyter=False makes render() return the markup as a string. With the default
# auto-detection it may display the result itself and return None, leaving
# nothing for HTML() below to wrap.
html = displacy.render(doc, style='ent', page=True, jupyter=False)

display(HTML(html))

<IPython.core.display.HTML object>

In [27]:
# Checking Property: descriptions of two entity labels

for label in ('GPE', 'ORG'):
    print(spacy.explain(label))

Countries, cities, states
Companies, agencies, institutions, etc.


In [21]:
# Word Similarity

# Compare single-word docs pairwise.
apple_banana = nlp('apple').similarity(nlp('banana'))
king_queen = nlp('king').similarity(nlp('queen'))
print('apple vs banana: ' + str(apple_banana))
print('king vs queen: ' + str(king_queen))

# Compare a whole doc against one of its own spans ('a green apple').
doc = nlp('I want a green apple')
noun_phrase = doc[2:5]
doc.similarity(noun_phrase)


apple vs banana: 0.5831844168885263
king vs queen: 0.7252610345406867


0.9053610872821355

In [29]:
# Sanity check: similarity of a doc with itself.
doc.similarity(doc)

1.0