# Chapter 1

In [9]:
# Compact English pipeline
from spacy.lang.en import English

# Preprocessing English pipeline instantiation
nlp = English()

# Preprocessed nlp
doc = nlp('Hello world!')

# Iterating over token texts if Doc
print([token.text for token in doc])

# Iterate over spesific token in Doc
print('Token at position 1:', doc[1].text)

# Iterate over a span in DOc
print('Span 1-3:', doc[1:3].text)

['Hello', 'world', '!']
Token at position 1: world
Span 1-3: world!


In [12]:
from spacy.lang.en import English

nlp = English()

doc = nlp('It costs $5.')

for token in doc:
    print(token.text, token.is_alpha, token.is_punct, token.is_stop, sep = '\t')

It	True	False	True
costs	True	False	False
$	False	False	False
5	False	False	False
.	False	True	False


In [17]:
import spacy
from spacy import displacy

nlp = spacy.load('en_core_web_sm')

doc = nlp('She ate the pizza')

for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text, sep = '\t')
    
displacy.render(doc, style = 'dep')

She	PRON	nsubj	ate
ate	VERB	ROOT	ate
the	DET	det	pizza
pizza	NOUN	dobj	ate


In [25]:
doc = nlp('Apple is looking at buying U.K. startup for $1 billion')

for ent in doc.ents:
    print(ent.text, ent.label, sep = '\t')
    
print('NNP:', spacy.explain('NNP'))

Apple	383
U.K.	384
$1 billion	394
NNP: noun, proper singular


In [39]:
from spacy.matcher import Matcher
import spacy

nlp = spacy.load('en_core_web_sm')

matcher = Matcher(nlp.vocab)

pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
matcher.add("IPHONE_PATTERN", [pattern])

# Process some text
doc = nlp("Upcoming iPhone X release date leaked")

# Call the matcher on the doc
matches = matcher(doc)

for match_id, start, end in matches:
    print(doc[start: end])

iPhone X


In [44]:
pattern = [
    {"IS_DIGIT": True},
    {"LOWER": "fifa"},
    {"LOWER": "world"},
    {"LOWER": "cup"},
    {"IS_PUNCT": True}
]

matcher = Matcher(nlp.vocab)

doc = nlp("2018 FIFA World Cup: France won!")

matcher.add("IPHONE_PATTERN", [pattern])

# Call the matcher on the doc
matches = matcher(doc)

for match_id, start, end in matches:
    print(doc[start: end])

2018 FIFA World Cup:


In [45]:
pattern = [
    {"LEMMA": "buy"},
    {"POS": "DET", "OP": "?"},  # optional: match 0 or 1 times
    {"POS": "NOUN"}
]

matcher = Matcher(nlp.vocab)

doc = nlp("I bought a smartphone. Now I'm buying apps.")

matcher.add("IPHONE_PATTERN", [pattern])

# Call the matcher on the doc
matches = matcher(doc)

for match_id, start, end in matches:
    print(doc[start: end])

bought a smartphone
buying apps
