In [1]:
import spacy

In [6]:
nlp = spacy.load('en_core_web_sm')

In [3]:
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9MB)
[K     |████████████████████████████████| 827.9MB 1.2MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-cp37-none-any.whl size=829180944 sha256=050f0b63a298a779d4f7b31f8925bec04e1b571134a94b6db2eb1890ccaa2437
  Stored in directory: /tmp/pip-ephem-wheel-cache-0lyg2qy4/wheels/2a/c1/a6/fc7a877b1efca9bc6a089d6f506f16d3868408f9ff89f8dbfc
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [7]:
### tokenization
# u->unicode
doc = nlp(u'I am flying to Manila')
print([w.text for w in doc])

['I', 'am', 'flying', 'to', 'Manila']


In [8]:
### lemmatization

doc = nlp(u'this product integrates both libraries for downloading and applying patches')
for token in doc:
    print(token.text, token.lemma_)

this this
product product
integrates integrate
both both
libraries library
for for
downloading download
and and
applying apply
patches patch


In [9]:
### Part of speech tagging

doc = nlp(u'I have flown to Cebu. Now I am flying to Manila.')
for token in doc:
    print(token.text, token.pos_, token.tag_)
    
# token.tag - more detailed pos

I PRON PRP
have AUX VBP
flown VERB VBN
to ADP IN
Cebu PROPN NNP
. PUNCT .
Now ADV RB
I PRON PRP
am AUX VBP
flying VERB VBG
to ADP IN
Manila PROPN NNP
. PUNCT .


In [23]:
spacy.explain('PRP')

'pronoun, personal'

In [10]:
doc = nlp(u'I have flown to Cebu. Now I am flying to Manila.')
for sent in doc.sents:
    print([sent[i] for i in range(len(sent))])

[I, have, flown, to, Cebu, .]
[Now, I, am, flying, to, Manila, .]


In [11]:
doc = nlp(u'The Golden Gate Bridge is an iconic landmark in San Francisco.')
[doc[i] for i in range(len(doc))]

## retokenize

# Golden Gate Bridge
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[1:4])
    
# SF
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[7:9])
    
for token in doc:
    print(token.text, token.lemma_, token.pos_)    

The the DET
Golden Gate Bridge Golden Gate Bridge PROPN
is be AUX
an an DET
iconic iconic ADJ
landmark landmark NOUN
in in ADP
San Francisco San Francisco PROPN
. . PUNCT


In [12]:
### syntactic parsing
# dep - dependency

doc = nlp(u'I want a green apple.')
for token in doc:
    print(token.text, token.pos_, token.dep_, spacy.explain(token.dep_))
    

I PRON nsubj nominal subject
want VERB ROOT None
a DET det determiner
green ADJ amod adjectival modifier
apple NOUN dobj direct object
. PUNCT punct punctuation


In [13]:
from spacy import displacy
displacy.serve(doc, style = 'dep')


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [15]:
##Get $ amount using spacy

doc = nlp(u'The firm earned $1.5 million in 2017, in comparison with $1.2 million in 2016.')
phrase=''

for token in doc:
    if token.tag_ = '$':
        phrase = token.text
        i = token.i + 1
        while doc[i].tag_ = 'CD':
            phrase += doc[i].text + ' '
            i += 1
        phrase = phrase[:-1]
        print(phrase)

SyntaxError: ignored

In [16]:
# Get $amount using regex
import re

pattern = '\$.*million'
test_string = 'The firm earned $1.5 million in 2017.'
result = re.findall(pattern, test_string)
print(result)

['$1.5 million']


In [17]:
import re

pattern = '\$.+?million'
test_string = 'The firm earned $1.5 million in 2017, in comparison with $1.2 million in 2016.'
result = re.findall(pattern, test_string)
print(result)

['$1.5 million', '$1.2 million']


In [18]:
### entity recognition
#NORP - nationalities or religiouis or political group

from IPython.core.display import display, HTML

doc = nlp(u'I want a Greek pizza.')

from spacy import displacy
html = displacy.render(doc, style = 'ent', page = True)

display(HTML(html))

In [19]:
from IPython.core.display import display, HTML

doc = nlp(u'I want to fly to Manila.')

from spacy import displacy
html = displacy.render(doc, style = 'ent', page = True)

display(HTML(html))

In [20]:
##similarity

doc = nlp(u'I want a green apple.')
doc.similarity(doc[2:5])

doc.similarity(doc)

  "__main__", mod_spec)


1.0

In [21]:
nlp('apple').similarity(nlp('banana'))
nlp('king').similarity(nlp('queen'))

  "__main__", mod_spec)
  "__main__", mod_spec)


0.6853425177985294

In [22]:
#embedding
nlp('banana').vector

array([ 1.5882427 , -4.622345  ,  0.835494  , -1.5251803 ,  2.5824933 ,
        2.242254  ,  1.1259234 , -0.21041167,  1.8154271 ,  2.7201712 ,
        3.8814943 ,  0.16758633,  0.26242155, -1.7026334 ,  1.5412636 ,
       -0.393943  , -2.876206  ,  3.2735553 ,  0.23344952,  0.18415219,
        0.43752107,  2.1485398 ,  0.4193222 , -2.0388541 ,  0.65415883,
       -1.294862  ,  2.4144628 , -2.745386  ,  2.1168573 , -1.8451903 ,
        2.027801  , -1.6202624 ,  0.5726354 ,  0.34060296,  0.8692036 ,
       -3.8980675 ,  4.6901174 ,  2.1622126 , -1.4661814 ,  0.23460823,
        4.3306155 ,  1.6257911 ,  0.12003034, -5.4100738 ,  0.7476239 ,
        1.5680416 , -0.84663755,  0.17939603,  0.13341138,  2.232483  ,
       -2.099672  , -3.020019  , -0.95659536, -0.01012713, -2.1324272 ,
       -0.92933816,  1.2745494 ,  1.927857  , -0.4788074 ,  1.7574201 ,
        0.5604429 , -1.4527423 , -2.4652877 ,  2.0573397 , -0.4817862 ,
       -1.7056906 ,  2.9862657 , -3.3478055 , -0.13413234,  1.85