In [4]:
# Import spacy and English models
import spacy

nlp = spacy.load('en')

## Process text

In [5]:
doc = nlp(u'')

In [6]:
# Get first token of the processed document
token = doc[0]
print(token)

# Print sentences (one sentence per line)
for sent in doc.sents:
    print(sent)

What
What's up Vienna?
Let's teach computers to understand us.


In [8]:
# For each token, print corresponding part of speech tag
for token in doc:
    print('{} - {}'.format(token, token.pos_))

What - NOUN
's - VERB
up - ADP
Vienna - PROPN
? - PUNCT
Let - VERB
's - PRON
teach - VERB
computers - NOUN
to - PART
understand - VERB
us - PRON
. - PUNCT


## Visual part of speech tagging ([displaCy](https://displacy.spacy.io))

In [9]:
# Write a function that walks up the syntactic tree of the given token and collects all tokens to the root token (including root token).

def tokens_to_root(token):
    """
    Walk up the syntactic tree, collecting tokens to the root of the given `token`.
    :param token: Spacy token
    :return: list of Spacy tokens
    """
    tokens_to_r = []
    while token.head is not token:
        tokens_to_r.append(token)
        token = token.head
        tokens_to_r.append(token)

    return tokens_to_r

# For every token in document, print it's tokens to the root
for token in doc:
    print('{} --> {}'.format(token, tokens_to_root(token)))

# Print dependency labels of the tokens
for token in doc:
    print('-> '.join(['{}-{}'.format(dependent_token, dependent_token.dep_) for dependent_token in tokens_to_root(token)]))


What --> [What, 's]
's --> []
up --> [up, 's]
Vienna --> [Vienna, 's]
? --> [?, 's]
Let --> []
's --> ['s, teach, teach, Let]
teach --> [teach, Let]
computers --> [computers, teach, teach, Let]
to --> [to, understand, understand, teach, teach, Let]
understand --> [understand, teach, teach, Let]
us --> [us, understand, understand, teach, teach, Let]
. --> [., Let]
What-nsubj-> 's-ROOT

up-prep-> 's-ROOT
Vienna-nsubj-> 's-ROOT
?-punct-> 's-ROOT

's-nsubj-> teach-ccomp-> teach-ccomp-> Let-ROOT
teach-ccomp-> Let-ROOT
computers-dobj-> teach-ccomp-> teach-ccomp-> Let-ROOT
to-aux-> understand-xcomp-> understand-xcomp-> teach-ccomp-> teach-ccomp-> Let-ROOT
understand-xcomp-> teach-ccomp-> teach-ccomp-> Let-ROOT
us-dobj-> understand-xcomp-> understand-xcomp-> teach-ccomp-> teach-ccomp-> Let-ROOT
.-punct-> Let-ROOT


In [15]:
# Print all named entities with named entity types

doc_2 = nlp(u"I went to Vienna to meet some really awesome JavaScript friends.")
for ent in doc_2.ents:
    print('{} - {}'.format(ent, ent.label_))

Vienna - GPE
JavaScript - ORG


In [18]:
# Print noun chunks for doc_2
print([chunk for chunk in doc_2.noun_chunks])

[I, Vienna, some really awesome JavaScript friends]


## Unigram probabilities

In [19]:
# For every token in doc_2, print log-probability of the word, estimated from counts from a large corpus 
for token in doc_2:
    print(token, ',', token.prob)

I , -4.064180850982666
went , -8.474893569946289
to , -3.83851957321167
Vienna , -19.579313278198242
to , -3.83851957321167
meet , -9.823533058166504
some , -6.4027814865112305
really , -6.664026737213135
awesome , -8.797789573669434
JavaScript , -19.579313278198242
friends , -8.6137056350708
. , -3.0729479789733887


In [38]:
# For a given document, calculate similarity between 'apples' and 'oranges' and 'boots' and 'hippos'
doc = nlp(u"I went to Vienna on an airplane. I met some awesome JavaScript and Python friends.")
vienna = doc[3]
airplane = doc[6]
javascript = doc[10]
python = doc[12]
print(vienna.similarity(airplane))
print(javascript.similarity(python))

print()
# Print similarity between sentence and word 'fruit'
vienna, friends = doc.sents
city = doc.vocab[u'city']
coding = doc.vocab[u'coding']
print(vienna.similarity(city))
print(friends.similarity(coding))

0.0
0.0

0.445694088466
0.235097658138
