In [1]:
import spacy

## Parse and tokenize

In [2]:
# Load spaCy's small English pipeline once; `nlp` is then called on text to produce a Doc.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Calling `nlp` on a string runs the loaded pipeline and returns a parsed, tokenized Doc.
# No `u` prefix is needed: in Python 3 every str literal is already Unicode.
doc = nlp("Tesla is looking at buying U.S. startup for $6 million")

In [11]:
# One aligned row per token: text, integer POS id, POS name, dependency label.
for token in doc:
    row = f"{token.text:{10}} {token.pos:{10}}    {token.pos_:{10}} {token.dep_:{10}}"
    print(row)

# Reading the columns
#   token.pos  : integer id of the coarse part-of-speech tag
#   token.pos_ : the same tag as a readable string
#   token.dep_ : syntactic dependency label
#
# Part-of-speech tags that appear in this output
#   PROPN : proper noun
#   AUX   : auxiliary verb
#   VERB  : verb
#   ADP   : adposition (preposition)
#   NOUN  : noun
#   SYM   : symbol
#   NUM   : number
#
# Dependency labels
#   nsubj : nominal subject
#   ROOT  : root of the dependency parse

Tesla              96    PROPN      nsubj     
is                 87    AUX        aux       
looking           100    VERB       ROOT      
at                 85    ADP        prep      
buying            100    VERB       pcomp     
U.S.               96    PROPN      compound  
startup            92    NOUN       dobj      
for                85    ADP        prep      
$                  99    SYM        quantmod  
6                  93    NUM        compound  
million            93    NUM        pobj      


## Pipeline object

In [12]:
# Show the loaded pipeline as a list of (name, component) pairs.
nlp.pipeline

# ner : named entity recognizer

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x1d4ac34e720>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x1d4ac74e6d0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x1d4ac742a00>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x1d4ac742f40>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x1d4ac5f5540>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x1d4ac5ebe40>)]

In [13]:
nlp.pipe_names # component names only, in the same order as nlp.pipeline

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

In [23]:
# The run of extra spaces before "anymore" is deliberate: it comes out as its own SPACE token.
# No `u` prefix is needed: in Python 3 every str literal is already Unicode.
doc2 = nlp("Tesla isn't looking into startup     anymore.")

In [27]:
# One aligned row per token of doc2: text, integer POS id, POS name, dependency label.
for token in doc2:
    row = f"{token.text:{10}} {token.pos:{10}}    {token.pos_:{10}} {token.dep_:{10}}"
    print(row)

# Tags that are new in this output
#   PART  : particle (here the contraction "n't")
#   SPACE : whitespace token (the run of extra spaces in the input)
#   ADV   : adverb
#   PUNCT : punctuation

Tesla              96    PROPN      nsubj     
is                 87    AUX        aux       
n't                94    PART       neg       
looking           100    VERB       ROOT      
into               85    ADP        prep      
startup            92    NOUN       pobj      
                  103    SPACE      dobj      
anymore            86    ADV        advmod    
.                  97    PUNCT      punct     


In [28]:
# A Doc can be indexed by position; doc2[0] is its first token ("Tesla").
print(f"{doc2[0].text:{10}} {doc2[0].pos:{10}}    {doc2[0].pos_:{10}} {doc2[0].dep_:{10}}")

Tesla              96    PROPN      nsubj     


## Span

In [29]:
# A longer sentence used to demonstrate spans. The text is assembled from adjacent
# string literals inside the parentheses, which yields exactly the same single string
# as one long literal, with no backslash line continuations or escaped apostrophe.
# Token positions matter here: the quoted sentence occupies tokens 16-29.
doc3 = nlp(
    'Although commmonly attributed to John Lennon from his song "Beautiful Boy", '
    'the phrase "Life is what happens to us while we are making other plans" was written by '
    "cartoonist Allen Saunders and published in Reader's Digest in 1957, when Lennon was 17."
)

In [30]:
life_quote = doc3[16:30] # tokens 16-29: the quoted sentence, both quotation marks included

In [31]:
print(life_quote) # printing the slice shows the text it covers

"Life is what happens to us while we are making other plans"


In [32]:
type(life_quote)
# Slicing a Doc yields a spaCy Span

spacy.tokens.span.Span

In [33]:
type(doc3)
# The object returned by nlp(...) is a spaCy Doc

spacy.tokens.doc.Doc

In [34]:
# Three short sentences, used to demonstrate sentence segmentation.
# No `u` prefix is needed: in Python 3 every str literal is already Unicode.
doc4 = nlp('This is the first sentence. This is another sentence. This is the last sentence.')

In [35]:
# doc4.sents iterates over the sentences detected in the Doc; print each on its own line.
for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [37]:
doc4[6] # the second "This" in doc4, i.e. the first token of the second sentence

This

In [36]:
doc4[6].is_sent_start # does this token start a sentence? True here

True

In [38]:
doc4[7] # the token right after the sentence-initial "This"

is

In [42]:
doc4[7].is_sent_start # "is" sits mid-sentence, so this is False

False