In [1]:
pip install spacy



In [2]:
# Bring in spaCy and build the processing pipeline from the
# `en_core_web_sm` package (spaCy's smallest English language model).
import spacy

nlp = spacy.load("en_core_web_sm")

In [6]:
# List the components of the loaded pipeline as (name, component) pairs
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7f0ec2a38490>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f0ebe9f09f0>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7f0ebe9f0980>)]

In [5]:
# Run the pipeline on a news headline to obtain a Doc object
doc = nlp("India records 41,649 new Covid cases; active cases rise for 4th day.")

# One line per token: text, part of speech, dependency relation, lemma
for token in doc:
    print(f"{token.text} {token.pos_} {token.dep_} {token.lemma_}")

India PROPN nsubj India
records VERB ccomp record
41,649 NUM nummod 41,649
new ADJ amod new
Covid PROPN compound Covid
cases NOUN dobj case
; PUNCT punct ;
active ADJ amod active
cases NOUN nsubj case
rise VERB ROOT rise
for ADP prep for
4th ADJ amod 4th
day NOUN pobj day
. PUNCT punct .


In [7]:
# Slice out the first five tokens of the Doc
doc[0:5]

India records 41,649 new Covid

In [12]:
type(doc[0:5])            # slicing a Doc object yields a Span

spacy.tokens.span.Span

In [11]:
# Collect several attributes of the first token into a single tuple:
# coarse POS tag, fine-grained tag, word shape, and two boolean flags.
tuple(getattr(doc[0], attr) for attr in ('pos_', 'tag_', 'shape_', 'is_alpha', 'is_stop'))

('PROPN', 'NNP', 'Xxxxx', True, False)

|Attribute|Description|Value for `doc[0]`|
|:------|:------:|:------|
|`.text`|The original word text<!-- .element: style="text-align:left;" -->|`India`|
|`.lemma_`|The base form of the word|`India`|
|`.pos_`|The simple part-of-speech tag|`PROPN`/`proper noun`|
|`.tag_`|The detailed part-of-speech tag|`NNP`/`noun, proper singular`|
|`.shape_`|The word shape – capitalization, punctuation, digits|`Xxxxx`|
|`.is_alpha`|Does the token consist of alphabetic characters?|`True`|
|`.is_stop`|Is the token part of a stop list, i.e. the most common words of the language?|`False`|

In [8]:
spacy.explain('ccomp')        # look up a human-readable description of a tag or label

'clausal complement'

In [13]:
# Sentence segmentation: process a short multi-sentence text ...
doc1 = nlp("Hi how are you? I am fine. Thanks for asking!")

# ... and print the text of each detected sentence on its own line
for sent in doc1.sents:
    print(sent.text)

Hi how are you?
I am fine.
Thanks for asking!


In [15]:
# Re-create the Doc for the Covid headline
doc = nlp("India records 41,649 new Covid cases; active cases rise for 4th day.")

# Show the tokenisation on a single line, each token followed by ' | '
for token in doc:
    print(token.text, end=' | ')

# Finish the token line and leave one blank line before the entity list
print(end='\n\n')

# Named Entity Recognition (NER): entity text, its label, and the label's explanation
for ent in doc.ents:
    print(f'{ent.text} - {ent.label_} - {spacy.explain(ent.label_)}')

India | records | 41,649 | new | Covid | cases | ; | active | cases | rise | for | 4th | day | . | 

India - GPE - Countries, cities, states
41,649 - CARDINAL - Numerals that do not fall under another type
Covid - PRODUCT - Objects, vehicles, foods, etc. (not services)
4th day - DATE - Absolute or relative dates or periods


In [16]:
# Noun chunks: print the text of every noun-phrase span exposed by the Doc

for chunk in doc.noun_chunks:
    print(chunk.text)

India
41,649 new Covid cases
active cases
4th day


In [3]:
# Render the named entities of a sample sentence inline with displaCy
from spacy import displacy

doc = nlp("Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.")
displacy.render(doc, jupyter=True, style="ent")

In [6]:
!python -m spacy download en_core_web_md      # this model is larger library with built-in word vectors

Collecting en_core_web_md==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4 MB)
[K     |████████████████████████████████| 96.4 MB 1.2 MB/s 
Building wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.2.5-py3-none-any.whl size=98051301 sha256=3bf4371f4b50aab5b709eb2ccab6bf9fb4bb173b0769df158b74639fd62352e4
  Stored in directory: /tmp/pip-ephem-wheel-cache-l0fmdwsy/wheels/69/c5/b8/4f1c029d89238734311b3269762ab2ee325a42da2ce8edb997
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [1]:
# Restart the runtime before running this cell, then load the
# `en_core_web_md` model that was downloaded above.
import spacy

nlp = spacy.load("en_core_web_md")

In [4]:
# Process a short two-word text and display the Doc's `.vector`
# (a NumPy array, as the output below shows).
txt = "hi hello"

nlp(txt).vector

array([ 0.140563  ,  0.25741   , -0.570875  ,  0.0664975 ,  0.40275002,
        0.147045  ,  0.479875  , -0.86797   , -0.013234  ,  0.70441496,
       -0.18727851, -0.55326   ,  0.031088  , -0.302795  , -0.218425  ,
        0.1714465 , -0.1402215 ,  1.15686   , -0.064224  ,  0.023647  ,
       -0.038963  ,  0.10557149, -0.0825845 ,  0.13992   ,  0.00646   ,
       -0.224475  ,  0.006263  , -0.10045465,  0.07977   ,  0.2405525 ,
        0.0824495 ,  0.199375  , -0.26063502, -0.127125  , -0.312355  ,
       -0.21125549, -0.19742501,  0.0931035 , -0.1841445 , -0.19441   ,
        0.19955501, -0.39757   , -0.17167701, -0.20956999, -0.11297535,
        0.25898   ,  0.126335  ,  0.051426  ,  0.35930002,  0.07032099,
       -0.1369565 ,  0.63065   , -0.122751  , -0.1530238 , -0.21321501,
       -0.09383149, -0.120671  ,  0.124517  , -0.336155  ,  0.0193    ,
       -0.650625  , -0.103172  , -0.09079599, -0.496495  ,  0.1688545 ,
       -0.232255  ,  0.062425  ,  0.35109502,  0.1771375 , -0.09

In [5]:
# Word similarity: compare every token with every token (itself included)
tokens = nlp("apple orange cat")

# Walk all ordered (first, second) pairs in the same order as two nested loops
for t1, t2 in ((left, right) for left in tokens for right in tokens):
    print(t1.text, t2.text, t1.similarity(t2))

apple apple 1.0
apple orange 0.56189173
apple cat 0.28213844
orange apple 0.56189173
orange orange 1.0
orange cat 0.3288469
cat apple 0.28213844
cat orange 0.3288469
cat cat 1.0
