#### Tokenizer

In [1]:
import spacy
# spacy.load returns a Language instance, bound here to `nlp`; that object is the text-processing pipeline.
nlp = spacy.load('en_core_web_md')

In [2]:
# Calling the pipeline on a string yields a Doc instance.
doc = nlp('I own a dell computer.')

In [3]:
# Show the text of every token in the Doc.
print([tok.text for tok in doc])

['I', 'own', 'a', 'dell', 'computer', '.']


In [4]:
# Input containing a contraction and repeated punctuation, to see how the tokenizer splits them.
doc = nlp("It's been a crazy week!!!")

In [5]:
# Display the text of each token as the cell's result.
[tok.text for tok in doc]

['It', "'s", 'been', 'a', 'crazy', 'week', '!', '!', '!']

#### Customizing the tokenizer

In [21]:
# Baseline: tokenize the phrase before any special case is registered.
doc = nlp("gimme that book")

In [22]:
# Display the token texts of the baseline Doc.
[token.text for token in doc]

['gimme', 'that', 'book']

In [23]:
from spacy.symbols import ORTH

In [26]:
# Token attributes for the custom split: two pieces whose ORTH values, "gim" + "me", spell "gimme".
special_case = [{ORTH: "gim"}, {ORTH: "me"}]

In [27]:
# Register the rule on the pipeline's tokenizer for the exact string "gimme".
# A Doc created before this call keeps its old tokens; call nlp(...) again to see the effect.
nlp.tokenizer.add_special_case("gimme", special_case)

In [29]:
# Re-tokenize first: the existing `doc` was built before the special case was
# registered, so it still holds 'gimme' as a single token.
doc = nlp("gimme that book")
[w.text for w in doc]

['gim', 'me', 'that', 'book']

#### Debugging the tokenizer

In [12]:
# Sample input for inspecting the tokenizer's decisions.
text = "let's go!"

In [13]:
# NOTE(review): the explain() cells that follow operate on `text` directly, and `doc` is
# reassigned in the sentence-segmentation cell before it is read again.
doc = nlp(text)

In [14]:
# Ask the tokenizer to explain how it splits `text`; the result is iterated in the next cell.
tok_exp = nlp.tokenizer.explain(text)

In [15]:
# Each explain() entry pairs a rule label with the matched substring;
# print the substring first, then the label, separated by a tab.
for rule, piece in tok_exp:
    print(piece, '\t', rule)

let 	 SPECIAL-1
's 	 SPECIAL-2
go 	 TOKEN
! 	 SUFFIX


#### Sentence segmentation

In [20]:
text = "I flew to Melbourne yesterday. I was at my hotel at 5 pm."
doc = nlp(text)

# Walk the sentence spans exposed by doc.sents, printing a separator line before each one.
for sentence in doc.sents:
    print("***", sentence.text, sep="\n")

***
I flew to Melbourne yesterday.
***
I was at my hotel at 5 pm.
