In [1]:
import spacy
from spacy import displacy

## 1 work with english

### 1.1 basic usage

In [7]:
# !python -m spacy download en_core_web_sm

In [6]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. starting up for $1 billion")
for token in doc:
    print(token.text, token.pos_, token.dep_)

Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN dobj
starting VERB advcl
up ADP prep
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


In [11]:
displacy.serve(doc, style='dep')


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [15]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 48 58 MONEY


In [13]:
displacy.serve(doc, style="ent")


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


### 1.2 word vectors and simlarity

> better to work with 'en_core_web_lg'.

In [19]:
# %time !python -m spacy download en_core_web_md

In [2]:
nlp = spacy.load("en_core_web_md")
tokens = nlp("dog cat apple play asdfjhf")

In [3]:
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

dog True 7.0336733 False
cat True 6.6808186 False
apple True 7.1346846 False
play True 6.4752803 False
asdfjhf False 0.0 True


In [22]:
# similarity comparison
doc1 = nlp("i like salad.")
doc2 = nlp("i am a vegetarian.")

print(doc1, "<->", doc2, doc1.similarity(doc2))

i like salad. <-> i am a vegetarian. 0.8431266191108036


## 2 work with chinese

In [32]:
# %time !python -m spacy download zh_core_web_md

In [33]:
nlp = spacy.load("zh_core_web_md")

In [34]:
nlp = spacy.load("zh_core_web_md")
doc = nlp("我今年要买一台新的苹果电脑。")
for token in doc:
    print(token.text, token.pos_, token.dep_)

我 PRON nsubj
今年 NOUN nmod:tmod
要 VERB xcomp
买 VERB ROOT
一 NUM nummod
台 NUM mark:clf
新 ADJ amod
的 PART case
苹果 NOUN compound:nn
电脑 NOUN dobj
。 PUNCT punct


In [35]:
displacy.serve(doc, style='dep')


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [36]:
displacy.serve(doc, style="ent")


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [38]:
print(nlp.vocab['广州'].vector[:10])

[ 2.1091   0.17774 -0.51242 -0.58129 -2.856   -1.3484   1.2208   1.5764
  0.88014 -1.0571 ]


In [39]:
# similarity comparison
doc1 = nlp("我喜欢吃沙拉。")
doc2 = nlp("我是一个素食主义者。")
doc3 = nlp("我是肉食动物。")

print(doc1, "<->", doc2, doc1.similarity(doc2))
print(doc1, "<->", doc3, doc1.similarity(doc3))

我喜欢吃沙拉。 <-> 我是一个素食主义者。 0.6783609682783022
我喜欢吃沙拉。 <-> 我是肉食动物。 0.6805734517111528
