# Load resources and process text

In [1]:
from __future__ import unicode_literals, print_function
from spacy.en import English
# Build the English pipeline object once, then call it on raw text to get
# the processed document that the following cells index into.
nlp = English()
doc = nlp('Hello, world. here are two sentences.')

# Get tokens and sentences

In [2]:
# First token of the document.
token = doc[0]
# Take the first item from doc.sents with the next() builtin: it works on
# both Python 2 and Python 3, whereas the .next() method only exists on
# Python 2 iterators.
sentence = next(doc.sents)
# Indexing the sentence and indexing the document should hand back the very
# same token object.
assert token is sentence[0]

# Use integer IDs for any string

In [6]:
# Index the vocabulary's string table with a string to obtain hello_id, then
# index the same table with that value.
# NOTE(review): presumably the second lookup maps the ID back to 'Hello' --
# confirm against the string-store documentation.
hello_id = nlp.vocab.strings['Hello']
hello_str = nlp.vocab.strings[hello_id]

# assert token.orth == hello_id == 469755

# Word vectors

In [12]:
doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")

# Token positions, one per word or punctuation mark:
#   Apples(0) and(1) oranges(2) are(3) similar(4) .(5) Boots(6) and(7) hippos(8)
# Index 1 is the conjunction "and", so "oranges" must be taken from index 2.
apples = doc[0]
oranges = doc[2]
boots = doc[6]
hippos = doc[8]

# assert apples.similarity(oranges) > boots.similarity(hippos)

# Syntactic dependencies

In [13]:
def dependency_labels_to_root(token):
    '''Collect the ``dep`` value of every token on the path to the root.

    Starting at ``token``, follow ``head`` links until reaching a token that
    is its own head. The ``dep`` of each token visited before that point is
    recorded in order; the self-headed token itself contributes nothing.

    Returns the list of collected ``dep`` values (empty when ``token`` is
    already its own head).
    '''
    collected = []
    current = token
    while True:
        parent = current.head
        # A token whose head is itself marks the top of the tree.
        if parent is current:
            return collected
        collected.append(current.dep)
        current = parent

# Named entities

In [15]:
def iter_products(docs):
    '''Lazily yield every entity labelled 'PRODUCT' across ``docs``.

    Each element of ``docs`` must expose an ``ents`` iterable whose items
    carry a ``label_`` string. Entities are yielded in document order.
    '''
    matches = (
        entity
        for document in docs
        for entity in document.ents
        if entity.label_ == 'PRODUCT'
    )
    for product in matches:
        yield product
 
def word_is_in_entity(word):
    '''Report whether ``word.ent_type`` is anything other than 0.'''
    entity_type = word.ent_type
    return entity_type != 0
 
def count_parent_verb_by_person(docs):
    '''Count, per PERSON entity text, the lemmas of the verbs governing it.

    For every entity in every document whose ``label_`` is 'PERSON' and
    whose root token's head is tagged as a verb, increment
    ``counts[entity text][head lemma]``.

    Returns a two-level ``defaultdict``: entity text -> verb lemma -> count.
    Entities that are not persons, or whose head is not a verb, are ignored.
    '''
    # Imported locally so the function is self-contained.
    from collections import defaultdict

    # The outer factory must be a callable that builds a fresh inner
    # counter; passing a defaultdict *instance* raises TypeError.
    counts = defaultdict(lambda: defaultdict(int))
    for doc in docs:
        for ent in doc.ents:
            if ent.label_ != 'PERSON':
                continue
            head = ent.root.head
            # Compare the string form of the coarse part-of-speech tag so no
            # separately imported VERB constant is required.
            if head.pos_ == 'VERB':
                counts[ent.orth_][head.lemma_] += 1
    return counts

# Calculate inline mark-up on original string

In [16]:
def put_spans_around_tokens(doc, get_classes):
    '''Render each token of ``doc`` as an HTML ``<span>`` element.

    ``get_classes`` is called with each non-whitespace token and must return
    an iterable of class-name strings; they are joined with single spaces
    into the span's ``class`` attribute. Token text and class names are
    HTML-escaped so characters such as ``<`` or ``&`` in the input cannot
    break the generated markup.

    Whitespace tokens are emitted verbatim, outside of any span, and each
    token's trailing whitespace follows its closing tag, so you can still
    compute what you need from it (e.g. <br /> tags, <p> tags, etc.).
    After assembly, newlines are removed and each tab is replaced by four
    spaces.

    Returns the markup as a single string ('' for an empty ``doc``).
    '''
    # Imported locally so the function is self-contained; ``cgi.escape`` is
    # the Python 2 counterpart of ``html.escape``.
    try:
        from html import escape
    except ImportError:
        from cgi import escape

    # ``class`` (not ``classes``) is the HTML attribute that carries the
    # element's class names.
    template = '<span class="{classes}">{word}</span>{space}'
    output = []
    for token in doc:
        if token.is_space:
            output.append(token.orth_)
        else:
            output.append(
                template.format(
                    # Attribute value: quotes must be escaped as well.
                    classes=escape(' '.join(get_classes(token)), True),
                    # Element content: only &, < and > need escaping.
                    word=escape(token.orth_, False),
                    space=token.whitespace_))
    string = ''.join(output)
    string = string.replace('\n', '')
    string = string.replace('\t', '    ')
    return string

# Efficient binary serialization

In [None]:
# Doc is needed below both to read the serialized records and to rebuild
# documents; only English is imported at the top of the file.
from spacy.tokens.doc import Doc

# Serialize the current document and write the bytes to disk. The ``with``
# block closes the file even if the write fails.
byte_string = doc.to_bytes()
with open('/tmp/moby_dick.bin', 'wb') as file_:
    file_.write(byte_string)

# Reload with a fresh pipeline. English is already imported by name, whereas
# the bare ``spacy`` package name is never bound in this file.
nlp = English()
# Keep the file open for the whole loop: the serialized records are read
# from it as the loop advances.
with open('/tmp/moby_dick.bin', 'rb') as file_:
    for byte_string in Doc.read_bytes(file_):
        doc = Doc(nlp.vocab)
        doc.from_bytes(byte_string)