# Chapter 2 - Core operations with spaCy

* Overview of spaCy conventions
* Introducing tokenization
* Understanding lemmatization
* spaCy container objects
* More spaCy features

In [2]:
import spacy

# Conventional names: `nlp` is the loaded pipeline, `doc` is what you get
# back from calling that pipeline on a piece of text.
sentence = "I went there"
nlp = spacy.load("en_core_web_md")
doc = nlp(sentence)

## Introducing tokenization

In [1]:
import spacy

# Run the pipeline on one sentence and list the text of each token;
# the trailing period comes out as a token of its own.
nlp = spacy.load("en_core_web_md")
doc = nlp("I own a ginger cat.")
token_texts = [tok.text for tok in doc]
print(token_texts)

['I', 'own', 'a', 'ginger', 'cat', '.']


In [2]:
import spacy

# The contraction "It's" is split into "It" + "'s", and each "!" becomes
# a separate token.
nlp = spacy.load("en_core_web_md")
doc = nlp("It's been a crazy week!!!")
token_texts = [tok.text for tok in doc]
print(token_texts)

['It', "'s", 'been', 'a', 'crazy', 'week', '!', '!', '!']


## Customizing the tokenizer

In [3]:
import spacy
from spacy.symbols import ORTH  # unused in this cell; the special-case cells below rely on it

# Baseline before any customization: "lemme" is kept as a single token.
nlp = spacy.load("en_core_web_md")
doc = nlp("lemme that")
baseline_tokens = [token.text for token in doc]
print(baseline_tokens)


['lemme', 'that']


In [4]:
# Register a tokenizer special case that splits "lemme" into "lem" + "me",
# then re-tokenize the same text to see the effect.
special_case = [{ORTH: piece} for piece in ("lem", "me")]
nlp.tokenizer.add_special_case("lemme", special_case)
print([token.text for token in nlp("lemme that")])

['lem', 'me', 'that']


In [5]:
# The "lemme" special case still applies when punctuation is attached.
tokens_with_punct = [token.text for token in nlp("lemme!")]
print(tokens_with_punct)

['lem', 'me', '!']


In [6]:
# Special-case the whole string, punctuation included, so it is emitted
# as one single token.
literal = "...lemme...?"
nlp.tokenizer.add_special_case(literal, [{ORTH: literal}])
print([token.text for token in nlp(literal)])

['...lemme...?']


## Debugging the tokenizer

In [7]:
import spacy

nlp = spacy.load("en_core_web_md")
text = "Let's go!"
doc = nlp(text)

# Each entry from explain() is a pair: the rule label first, the matched
# substring second. Print the substring, then the rule that produced it.
tok_exp = nlp.tokenizer.explain(text)
for rule, piece in tok_exp:
    print(piece, "\t", rule)

Let 	 SPECIAL-1
's 	 SPECIAL-2
go 	 TOKEN
! 	 SUFFIX


## Sentence segmentation

In [None]:
import spacy
