In [2]:
#import the English language class 
from spacy.lang.en import English

In [3]:
# Instantiate the English language class imported above; the resulting `nlp`
# object is what the following cells call on raw text to obtain a processed doc.
nlp = English()

In [4]:
# Bare expression on the last line: the notebook displays the object's repr.
nlp

<spacy.lang.en.English at 0x7f4cceee4280>

In [6]:
# Calling the nlp object on a string returns the processed document; the bare
# `doc` on the last line makes the notebook display it.
doc = nlp('Hello World!')
doc

Hello World!

In [8]:
# A doc is iterable: unpacking it yields its tokens, printed here one per line.
print(*doc, sep='\n')

Hello
World
!


In [9]:
# Tokens can also be fetched by position: index the doc like a sequence.
token1, token2, token3 = doc[0], doc[1], doc[2]

# Report each token next to its ordinal label.
for label, tok in (("First token - ", token1),
                   ('Second token - ', token2),
                   ('Third token - ', token3)):
    print(label, tok)

First token -  Hello
Second token -  World
Third token -  !


In [10]:
# Indexing a doc yields a spaCy Token object rather than a plain string;
# displaying its type confirms this.
type(token1)

spacy.tokens.token.Token

In [11]:
# .text, by contrast, is an ordinary Python str holding the token's characters.
first_text = token1.text
print(first_text, type(first_text), sep='\n')

Hello
<class 'str'>


In [13]:
# Slicing a doc (here tokens 1 and 2) produces a Span, not a list of tokens.
span = doc[1:3]
print(span, type(span), sep='\n')

World!
<class 'spacy.tokens.span.Span'>


In [14]:
# As with a single token, a span's .text is a plain str.
span_text = span.text
print(span_text, type(span_text), sep='\n')

World!
<class 'str'>


In [25]:
# Per-token attributes, shown for a sentence that contains a digit-based number
doc = nlp('My name is Hardik and am 21 years old.')

# Each entry pairs the printed label with the per-token value to collect.
token_views = (
    ('Tokens - ', lambda tok: tok),           # the Token objects themselves
    ('Index - ', lambda tok: tok.i),          # position of the token within the doc
    ('Text - ', lambda tok: tok.text),        # token text as a str
    ('Aplha - ', lambda tok: tok.is_alpha),   # alphabetic only: '21' and '.' come out False
    ('punct - ', lambda tok: tok.is_punct),   # whether the token is punctuation
    ('Is num - ', lambda tok: tok.like_num),  # whether the token looks like a number
)

# One output line per attribute, listing its value for every token in order.
for label, view in token_views:
    print(label, [view(tok) for tok in doc])

Tokens -  [My, name, is, Hardik, and, am, 21, years, old, .]
Index -  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Text -  ['My', 'name', 'is', 'Hardik', 'and', 'am', '21', 'years', 'old', '.']
Aplha -  [True, True, True, True, True, True, False, True, True, False]
punct -  [False, False, False, False, False, False, False, False, False, True]
Is num -  [False, False, False, False, False, False, True, False, False, False]


In [26]:
# Same per-token attributes, this time with the number written out in words
doc = nlp('I am twenty one years of age.')

# Each entry pairs the printed label with the per-token value to collect.
token_views = (
    ('Tokens - ', lambda tok: tok),           # the Token objects themselves
    ('Index - ', lambda tok: tok.i),          # position of the token within the doc
    ('Text - ', lambda tok: tok.text),        # token text as a str
    ('Aplha - ', lambda tok: tok.is_alpha),   # alphabetic only: the final '.' comes out False
    ('punct - ', lambda tok: tok.is_punct),   # whether the token is punctuation
    ('Is num - ', lambda tok: tok.like_num),  # True for 'twenty' and 'one' as well as digits
)

# One output line per attribute, listing its value for every token in order.
for label, view in token_views:
    print(label, [view(tok) for tok in doc])

Tokens -  [I, am, twenty, one, years, of, age, .]
Index -  [0, 1, 2, 3, 4, 5, 6, 7]
Text -  ['I', 'am', 'twenty', 'one', 'years', 'of', 'age', '.']
Aplha -  [True, True, True, True, True, True, True, False]
punct -  [False, False, False, False, False, False, False, True]
Is num -  [False, False, True, True, False, False, False, False]


Note the `Is num` row in this case: `like_num` returns True even when the number is written out as a word, such as 'twenty' and 'one', not only when it is written in digits.

# Statistical models

In [27]:
import spacy

# Load the small English model package. This rebinds `nlp`, replacing the
# English() object created earlier, so later cells run through the loaded model.
nlp = spacy.load('en_core_web_sm')

In [29]:
# Part-of-speech tags predicted by the loaded model
doc = nlp('She ate the pizza')

# One line per token: its text followed by its part-of-speech tag.
print(*(f'{tok.text} {tok.pos_}' for tok in doc), sep='\n')

She PRON
ate VERB
the DET
pizza NOUN


In [31]:
# Syntactic dependencies predicted by the loaded model
doc = nlp('She ate the pizza')

# Columns per token: text, POS tag, dependency label, and the text of its head token.
for tok in doc:
    row = (tok.text, tok.pos_, tok.dep_, tok.head.text)
    print(*row)

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


In [35]:
# Named entities predicted by the model
doc = nlp('Apple is looking to buy Facebook for $10 million')

# doc.ents holds the predicted entities; collect each one's text and label,
# then print one entity per line.
entity_rows = [(ent.text, ent.label_) for ent in doc.ents]
for text, label in entity_rows:
    print(text, label)

Apple ORG
Facebook ORG
$10 million MONEY


In [36]:
# Entity prediction on a sentence that mentions both the fruit and the company;
# in the recorded output only the capitalised 'Apple' is returned, as ORG.
doc = nlp('I am eating apples sitting in Apple office')

# Collect each predicted entity's text and label, then print one per line.
entity_rows = [(ent.text, ent.label_) for ent in doc.ents]
for text, label in entity_rows:
    print(text, label)

Apple ORG


In [38]:
# Look up the human-readable description of the 'ORG' entity label
spacy.explain('ORG')

'Companies, agencies, institutions, etc.'

# Matching patterns

In [3]:
import spacy

# Matcher applies token-level patterns to a processed doc
from spacy.matcher import Matcher

# Load the model
nlp = spacy.load('en_core_web_sm')

# Initialize the matcher with the pipeline's vocabulary
matcher = Matcher(nlp.vocab)

# Pattern: one dict per token, so this matches the exact token text 'iPhone'
# immediately followed by the exact token text 'X'
pattern = [{'TEXT': 'iPhone'}, {'TEXT': 'X'}]

# Register the pattern under the key 'iphone_x'. The patterns are passed as a
# list in the second argument: spaCy v3 only accepts this form, and
# spaCy >= 2.2.2 accepts it as well. The older add(key, None, pattern) call
# raises an error on v3.
matcher.add('iphone_x', [pattern])

# Process some text
doc = nlp('Upcoming iPhone X release data leaked')

# Find the matches: a list of (match_id, start, end) tuples, where
# doc[start:end] is the matched span
matches = matcher(doc)

In [4]:
# Display the result: each tuple is (match_id, start token index, end token index)
matches

[(1738708750870670527, 1, 3)]