In [1]:
import spacy

In [3]:
nlp= spacy.load('en_core_web_sm')

In [7]:
doc = nlp(u'I am looking to invest $6 million into a U.K. private company names Barclays so that my money will inflate.')

In [12]:
for token in doc:
    print(token, token.pos_, token.dep_)

I PRON nsubj
am AUX aux
looking VERB ROOT
to PART aux
invest VERB xcomp
$ SYM quantmod
6 NUM compound
million NUM dobj
into ADP prep
a DET det
U.K. PROPN nmod
private ADJ amod
company NOUN compound
names NOUN pobj
Barclays PROPN npadvmod
so SCONJ mark
that SCONJ mark
my PRON poss
money NOUN nsubj
will AUX aux
inflate VERB advcl
. PUNCT punct


In [13]:
doc4=nlp('what are you doin? do you think this is a game? I know you are an idiot.')

In [14]:
#accessing sentences 
for sentence in doc4.sents:
    print(sentence)

what are you doin?
do you think this is a game?
I know you are an idiot.


In [15]:
#count number of tokens
len(doc4)

20

In [19]:
doc=nlp(u'Apple and Microsoft who have offices in San Francisco hired Robert who lives in Nepal to work on Oracle for $40000')

In [24]:
for token in doc:
    print(token, end=' | ')

Apple | and | Microsoft | who | have | offices | in | San | Francisco | hired | Robert | who | lives | in | Nepal | to | work | on | Oracle | for | $ | 40000 | 

In [27]:
# to find information about the named entites
for entity in doc.ents:
    print(entity, entity.label_)

Apple ORG
Microsoft ORG
San Francisco GPE
Robert PERSON
Nepal GPE
40000 MONEY


In [28]:
from spacy import displacy

In [151]:
doc10=nlp(u'That chicken cost me $10 and it is tough!'
         u'Also I bought it from Walmart in Janesville along with an Iphone and Cheetos.')

In [152]:
# to display syntactic dependancy
displacy.render(doc10, style='dep', jupyter=True, )

In [153]:
# to display entity relation
displacy.render(doc10, style='ent', jupyter=True, )

In [158]:
# to display the entittes of only a certain type and as a certain color, pass that as a dictionary as options
color={'GPE':'red'}
options={'ents':['GPE', 'Person'], 'colors':color}
displacy.render(doc10, style='ent', jupyter=True, options=options )

In [155]:
# to print the lemma of each individual token
for token in doc10:
    print(token, token.pos_, token.lemma_)

That DET that
chicken NOUN chicken
cost VERB cost
me PRON I
$ SYM $
10 NUM 10
and CCONJ and
it PRON it
is AUX be
tough!Also ADV tough!also
I PRON I
bought VERB buy
it PRON it
from ADP from
Walmart PROPN Walmart
in ADP in
Janesville PROPN Janesville
along ADP along
with ADP with
an DET an
Iphone PROPN Iphone
and CCONJ and
Cheetos PROPN Cheetos
. PUNCT .


In [39]:
nlp.Defaults.stop_words

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [40]:
len(nlp.Defaults.stop_words)

326

In [61]:
# to add or remove a word that you think should be considered a stop word to the default list above
nlp.Defaults.stop_words.add('funky')
nlp.vocab['funky'].is_stop=True
nlp.Defaults.stop_words.remove('yourself')

# POS tagging

In [80]:
textt=nlp(u'The dog ran and jumped the man who was reading a book.')

In [81]:
# to find the fine grain part of speech tag we use the .tag_
for token in textt:
    print(token, token.pos_,token.tag_, spacy.explain(token.tag_))

The DET DT determiner
dog NOUN NN noun, singular or mass
ran VERB VBD verb, past tense
and CCONJ CC conjunction, coordinating
jumped VERB VBD verb, past tense
the DET DT determiner
man NOUN NN noun, singular or mass
who PRON WP wh-pronoun, personal
was AUX VBD verb, past tense
reading VERB VBG verb, gerund or present participle
a DET DT determiner
book NOUN NN noun, singular or mass
. PUNCT . punctuation mark, sentence closer


In [89]:
POS_counts= textt.count_by(spacy.attrs.TAG)

In [90]:
POS_counts

{15267657372422890137: 3,
 15308085513773655218: 3,
 17109001835818727656: 3,
 17571114184892886314: 1,
 4808651922106831370: 1,
 1534113631682161808: 1,
 12646065887601541794: 1}

In [88]:
textt.vocab[90].text

'DET'

In [92]:
DEP_counts=textt.count_by(spacy.attrs.DEP)

In [94]:
DEP_counts.items()

dict_items([(415, 3), (429, 2), (8206900633647566924, 1), (407, 1), (410, 1), (416, 2), (405, 1), (447, 1), (445, 1)])

In [95]:
for k,v in sorted(DEP_counts.items()):
    print (textt.vocab[k].text, v)

aux 1
cc 1
conj 1
det 3
dobj 2
nsubj 2
punct 1
relcl 1
ROOT 1


In [143]:
ent_text1=nlp(u'Microsoft is a new organization and SpaceX is a branch of that company and is focused on building new technology to go to space from Nepal')

In [138]:
# here we will see that Tesla and Spacex are not recognized as an entity
def show_ents(ent_text):
    for ent in ent_text.ents:
        print(ent, ent.label_, ent.start)

In [144]:
#if we want to add new words as an entity of a certain type
from spacy.tokens import Span
ORG= doc.vocab.strings[u'ORG']
#new_ent=Span(ent_text1, 1,2, label=ORG)
new_ent=Span(ent_text, 7,8, label=ORG)
ent_text.ents=list(ent_text.ents)+ [new_ent]

ValueError: [E1010] Unable to set entity information for token 7 which is included in more than one span in entities, blocked, missing or outside.

In [133]:
# if we had more than a single entity that we needed to add then we will need to use span in a different manner
doc = nlp(u'Our company plans to introduce a new vacuum cleaner. '
          u'If successful, the vacuum cleaner will be our first product.')

show_ents(doc)

first ORDINAL 19


In [136]:
from spacy.matcher import PhraseMatcher

In [137]:
matcher=PhraseMatcher(nlp.vocab)

In [145]:
# Create the desired phrase patterns:
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = [nlp(text) for text in phrase_list]

In [146]:
# Apply the patterns to our matcher object:
matcher.add('newproduct', None, *phrase_patterns)

# Apply the matcher to our Doc object:
matches = matcher(doc)

# See what matches occur:
matches

[(2689272359382549672, 7, 9), (2689272359382549672, 14, 16)]

In [147]:
# Here we create Spans from each match, and create named entities from them:
PROD = doc.vocab.strings[u'PRODUCT']

new_ents = [Span(doc, match[1],match[2],label=PROD) for match in matches]

doc.ents = list(doc.ents) + new_ents

In [148]:
show_ents(doc)

vacuum cleaner PRODUCT 7
vacuum cleaner PRODUCT 14
first ORDINAL 19


In [150]:
#to find the number of entities 
len([ent for ent in doc.ents])
#to find the number of entities of a certian type for example money
len([ent for ent in doc.ents if ent.label_=='MONEY'])

0

In [None]:
#adding a sentence segmentation rule
