### SpaCy 101

Source: https://spacy.io/usage/spacy-101

### Import Preliminaries

In [1]:
import spacy

# Name of the pipeline package to load; the cells below call `nlp` to parse text.
MODEL_NAME = 'en_core_web_sm'
nlp = spacy.load(MODEL_NAME)

In [2]:
# Parse one sentence, then print each token's text, coarse part-of-speech
# tag and dependency label on its own line.
sentence = u'Apple is looking at buying U.K. startup for $1 billion'
doc = nlp(sentence)
for tok in doc:
    print(tok.text, tok.pos_, tok.dep_)

Apple PROPN nsubj
is VERB aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


### Tokenization

In [3]:
# Show how the sentence is split into tokens: one token text per line.
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
token_texts = [tok.text for tok in doc]
for text in token_texts:
    print(text)

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


### Part-of-Speech Tags and Dependencies

In [4]:
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

# One row per token: text, lemma, coarse POS, fine-grained tag, dependency
# label, word shape, and the is_alpha / is_stop flags.
for tok in doc:
    row = (tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_,
           tok.shape_, tok.is_alpha, tok.is_stop)
    print(*row)

Apple apple PROPN NNP nsubj Xxxxx True False
is be VERB VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. u.k. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


### Named Entities

In [5]:
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

# Each entity span is printed with its text, its start/end character
# offsets in the sentence, and its entity label.
entities = doc.ents
for span in entities:
    print(span.text, span.start_char, span.end_char, span.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


### Word Vectors and Similarity

In [11]:
import en_core_web_sm

# Reload the pipeline through its package and compare every token with
# every token (including itself).
# NOTE(review): the similarities printed below are ~0 for distinct words,
# which suggests this `sm` package carries no real word vectors. spaCy's
# documentation uses a larger package (e.g. en_core_web_md) for this
# example -- confirm, and switch models if meaningful scores are needed.
nlp = en_core_web_sm.load()
tokens = nlp(u'dog cat banana')

token_pairs = [(left, right) for left in tokens for right in tokens]
for left, right in token_pairs:
    print(left.text, right.text, left.similarity(right))

dog dog 1.0
dog cat -1.843209e-22
dog banana 0.0
cat dog -1.843209e-22
cat cat 1.0
cat banana -1.8395509e-22
banana dog 0.0
banana cat -1.8395509e-22
banana banana 1.0


In [13]:
# For each token print: text, has_vector, vector_norm and is_oov.
# NOTE(review): every token below -- even "dog" -- prints is_oov=True,
# which looks like a symptom of the loaded model lacking a vector table;
# verify against a model that ships word vectors.
tokens = nlp(u'dog cat banana afskfsd')

for tok in tokens:
    info = (tok.text, tok.has_vector, tok.vector_norm, tok.is_oov)
    print(*info)

dog True 23.858421 True
cat True 24.328733 True
banana True 25.863724 True
afskfsd True 26.598297 True


### Vocab, Hashes, and Lexemes

In [14]:
# The vocab's string store works in both directions:
# string -> hash and hash -> string.
doc = nlp(u'I love coffee')
string_store = doc.vocab.strings
print(string_store[u'coffee'])             # 3197928453018144401
print(string_store[3197928453018144401])   # 'coffee'

3197928453018144401
coffee


If you return a hash value in the Python 2 interpreter, it'll show up as 3197928453018144401L. The L just means "long integer" – it's not actually a part of the hash value.

In [15]:
doc = nlp(u'I love coffee')
vocab = doc.vocab

# Look up each word's vocab entry by its text and print the entry's
# attributes: text, orth hash, shape, prefix, suffix, flags and language.
for tok in doc:
    lex = vocab[tok.text]
    row = (lex.text, lex.orth, lex.shape_, lex.prefix_, lex.suffix_,
           lex.is_alpha, lex.is_digit, lex.is_title, lex.lang_)
    print(*row)

I 4690420944186131903 X I I True False True en
love 3702023516439754181 xxxx l ove True False False en
coffee 3197928453018144401 xxxx c fee True False False en


In [18]:
import spacy
from spacy.tokens import Doc
from spacy.vocab import Vocab

# The original Doc: its vocab's string store resolves the word to its hash
# and the hash back to the word.
doc = nlp(u'I love coffee')
original_strings = doc.vocab.strings
print(original_strings[u'coffee'])            # 3197928453018144401
print(original_strings[3197928453018144401])  # coffee

3197928453018144401
coffee


In [20]:
# Build a Doc on top of a brand-new Vocab rather than the one used by `doc`.
fresh_vocab = Vocab()
empty_doc = Doc(fresh_vocab)

In [23]:
# Add the string to the new vocab's string store; the call is left as the
# cell's final expression so its return value is displayed.
empty_strings = empty_doc.vocab.strings
empty_strings.add(u'coffee')

3197928453018144401

In [24]:
# Resolve the hash back to text through the empty Doc's own string store.
coffee_text = empty_doc.vocab.strings[3197928453018144401]
print(coffee_text)

coffee


In [25]:
# Create a second Doc that reuses the vocab object of the original `doc`.
shared_vocab = doc.vocab
new_doc = Doc(shared_vocab)

In [26]:
# Resolve the same hash through the new Doc's vocab.
resolved_text = new_doc.vocab.strings[3197928453018144401]
print(resolved_text)

coffee


### Serialization