In [None]:
import spacy
# One-time setup, run in a shell (not in this cell): python -m spacy download en_core_web_sm
# Load the en_core_web_sm pipeline; the cells below call it through the name `nlp`.
nlp = spacy.load('en_core_web_sm')

## Tokenization

In [None]:
# Run the pipeline on a short passage; the literal is split across lines for
# readability only (adjacent string literals are concatenated unchanged).
doc = nlp(
    "If you’re twenty-two, physically fit, hungry to learn and be better, "
    "I urge you to travel – as far and as widely as possible. "
    "Sleep on floors if you have to. "
    "Find out how other people live and eat and cook. "
    "Learn from them – wherever you go."
)
# One line per token: surface text, part-of-speech tag, dependency label.
for tok in doc:
    token_summary = (tok.text, tok.pos_, tok.dep_)
    print(*token_summary)

## Span

In [None]:
from spacy.tokens import Span
# Build a labelled span over `doc` from token index 3 up to (not including) 6.
# NOTE(review): this presumably covers "twenty-two" — confirm against the
# tokenizer output printed in the Tokenization cell.
span = Span(doc, 3, 6, label = 'NUM')
# Show the covered text together with the label assigned above.
print(span.text, span.label_)

## Part-of-speech tags and dependencies

In [None]:
# Tab-separated table of per-token linguistic attributes for `doc`.
print('text \tlemma \tpos \ttag \t\tdep \tshape \tis_alpha \tis_stop')
for token in doc:
    # pos_ is the coarse-grained (universal) part-of-speech tag;
    # tag_ is the fine-grained, treebank-specific tag.
    columns = [
        token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
        token.shape_, str(token.is_alpha), str(token.is_stop),
    ]
    # Booleans are stringified above so every column can be joined with tabs.
    print('\t'.join(columns))

## Named Entities

In [None]:
# Longer news-style passage used for entity recognition (and reused further
# down for noun chunks and sentence splitting).
doc_2 = nlp(
    "When Sebastian Thrun started working on self-driving cars at Google "
    "in 2007, few people outside of the company took him seriously. "
    "“I can tell you very senior CEOs of major American car companies "
    "would shake my hand and turn away because I wasn’t worth talking to,” "
    "said Thrun, now the co-founder and CEO of online higher education "
    "startup Udacity, in an interview with Recode earlier this week."
)
# One tab-separated row per entity: text, start/end character offsets, label.
for entity in doc_2.ents:
    print(entity.text, entity.start_char, entity.end_char, entity.label_, sep='\t')

## Word vectors and similarity

In [None]:
# For each token report: text, whether it has a vector, the vector's norm,
# and whether the token is flagged as out-of-vocabulary.
for tok in doc:
    vector_info = (tok.text, tok.has_vector, tok.vector_norm, tok.is_oov)
    print(*vector_info)

In [None]:
# Dump the raw vectors of the first three tokens
# (the original author noted a vector size of 96 here).
leading_tokens = doc[:3]
for tok in leading_tokens:
    print(tok.vector)

In [None]:
# One-time setup, run in a shell (not in this cell): python -m spacy download en_core_web_md
# NOTE: this rebinds `nlp`, replacing the en_core_web_sm pipeline loaded at the
# top of the notebook. Every cell executed after this one (and any earlier cell
# that is re-run out of order) will use en_core_web_md instead.
nlp = spacy.load('en_core_web_md')

In [None]:
doc_3 = nlp('The world is a book, and those who do not travel read only a page.')
# Full pairwise comparison: every token against every token (itself included),
# printed row by row in document order.
for left in doc_3:
    for right in doc_3:
        score = left.similarity(right)
        print(left.text, right.text, score)

## Pipelines
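Default processing pipeline of a pretrained model:
Text > tokenizer > tagger > parser > ner > Doc

Optional components such as textcat are not included in the pretrained English models by default; they have to be added to the pipeline explicitly.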

## Vocab, hashes and lexemes

In [None]:
doc_4 = nlp('You\'re basic, and he is unfunny!')
# The StringStore (vocab.strings) can be indexed in both directions:
# by string to get its hash, and by hash to get the string back.
basic_hash = doc_4.vocab.strings['basic']
print(basic_hash)
# Reverse lookup using the hash obtained above instead of a hard-coded literal,
# so the round trip cannot fail or show an unrelated string if the literal
# does not match this string.
print(doc_4.vocab.strings[basic_hash])

In [None]:
# A vocab entry is a lexeme: a word type with no sentence context attached.
# Look each token's text up in the vocab and print its lexical attributes.
for tok in doc_4:
    lex = doc_4.vocab[tok.text]
    lexical_attrs = (
        lex.text, lex.orth, lex.shape_, lex.prefix_, lex.suffix_,
        lex.is_alpha, lex.is_digit, lex.is_title, lex.lang_,
    )
    print(*lexical_attrs)

## Noun chunks and sentences

In [None]:
# Collect the noun chunks of doc_2 into a list so they can be reused, then show them.
noun_chunks = [chunk for chunk in doc_2.noun_chunks]
print(noun_chunks)

In [None]:
# Collect the sentence spans of doc_2, then print each sentence on its own line.
sentences = [sent for sent in doc_2.sents]
for sentence_text in (sent.text for sent in sentences):
    print(sentence_text)

## Visualize a dependency parse and named entities

In [None]:
from spacy import displacy

# Parse a short sentence and draw its dependency parse with displaCy.
doc_dep = nlp('Germany\'s chancellor is Angela Merkel.')
# Disabled alternative kept from the original: displacy.serve(doc_dep, style="dep")
displacy.render(doc_dep, style="dep")

In [None]:
# Highlight the named entities of a single sentence with displaCy.
# (The original also kept displacy.serve(doc_ent, style="ent") as a disabled alternative.)
ent_text = (
    "When Sebastian Thrun started working on self-driving cars at Google "
    "in 2007, few people outside of the company took him seriously."
)
doc_ent = nlp(ent_text)
displacy.render(doc_ent, style="ent")