In [3]:
import spacy

In [4]:
spacy.__version__

'2.0.12'

In [9]:
nlp = spacy.load('en')

In [10]:
doc = nlp(u'I am learning how to build chatbots')

In [11]:
for token in doc:
    print(token.text,token.pos_)

I PRON
am VERB
learning VERB
how ADV
to PART
build VERB
chatbots NOUN


In [15]:
doc2 = nlp(u'I am going to London next weekend for a meeting') 

In [16]:
for token in doc2:
    print(token.text,token.pos_)

I PRON
am VERB
going VERB
to ADP
London PROPN
next ADJ
weekend NOUN
for ADP
a DET
meeting NOUN


# doc is a container object for accessing annotations

In [17]:
doc3 = nlp(u'Google release "Move mirror" AI experiment that matches your pose from 80,000 images')

In [18]:
for token in doc3:
    print(token.text,token.lemma_,token.pos_,token.tag_,token.dep_,token.shape_,token.is_alpha,token.is_stop)

Google google PROPN NNP compound Xxxxx True False
release release NOUN NN nmod xxxx True False
" " PUNCT `` punct " False False
Move move NOUN NN nmod Xxxx True False
mirror mirror NOUN NN nmod xxxx True False
" " PUNCT '' punct " False False
AI ai PROPN NNP compound XX True False
experiment experiment NOUN NN ROOT xxxx True False
that that ADJ WDT nsubj xxxx True True
matches match VERB VBZ relcl xxxx True False
your -PRON- ADJ PRP$ poss xxxx True True
pose pose NOUN NN dobj xxxx True False
from from ADP IN prep xxxx True True
80,000 80,000 NUM CD nummod dd,ddd False False
images image NOUN NNS pobj xxxx True False


In [19]:
for token in doc:

SyntaxError: unexpected EOF while parsing (<ipython-input-19-f1a96bc06c43>, line 1)

In [20]:
for token in doc:
    print(token.text,token.lemma_,token.pos_,token.tag_,token.dep_,token.shape_,token.is_alpha,token.is_stop)

I -PRON- PRON PRP nsubj X True False
am be VERB VBP aux xx True True
going go VERB VBG ROOT xxxx True False
to to ADP IN prep xx True True
London london PROPN NNP pobj Xxxxx True False
next next ADJ JJ amod xxxx True True
week week NOUN NN npadvmod xxxx True False
for for ADP IN prep xxx True True
a a DET DT det x True True
meeting meeting NOUN NN pobj xxxx True False


# Words Explained

# LEMMA Root form of the word being processed
POS Part-of-speech of the word
TAG They express the part-of-speech (e.g., VERB) and some amount of morphological
information (e.g., that the verb is past tense).
DEP Syntactic dependency (i.e., the relation between tokens)
SHAPE Shape of the word (e.g., the capitalization, punctuation, digits format)
ALPHA Is the token an alpha character?
Stop Is the word a stop word or part of a stop list?

# NEED FOR POS Tagging

Answer: to reduce the complexity of understanding a text that can’t be trained or
is trained with less confidence. By use of POS tagging, we can identify parts of the text
input and do string matching only for those parts. For example, if you were to find if a
location exists in a sentence, then POS tagging would tag the location word as NOUN, so
you can take all the NOUNs from the tagged list and see if it’s one of the locations from
your preset list or not.

# STEMMING

Stemming is the process of reducing inflected words to their word stem, base form.
A stemming algorithm reduces the words “saying” to the root word “say,” whereas
“presumably” becomes presum. As you can see, this may or may not always be 100%
correct.

# Lemmatization

• Lemmatization tries to do the job more elegantly with the use of a
vocabulary and morphological analysis of words. It tries its best to
remove inflectional endings only and return the dictionary form of a
word, known as the lemma.

• Stemming does the job in a crude, heuristic way that chops off the
ends of words, assuming that the remaining word is what we are
actually looking for, but it often includes the removal of derivational
affixes.


In [25]:
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX ,LEMMA_EXC, LEMMA_RULES
lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC,LEMMA_RULES)
lemmatizer('chuckles','NOUN')

['chuckle']

In [26]:
lemmatizer('blazing','verb')

['blaze']

In [27]:
lemmatizer('fastest','adj')

['fast']

# USE OF STEMMERS

using PorterStemmer And SnowballStemmer

In [28]:
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer
porter_stemmer = PorterStemmer()
snowball_stemmer = SnowballStemmer('english')
print(porter_stemmer.stem('fastest'))
print(snowball_stemmer.stem('fastest'))

fastest
fastest


# USE OF LEMMATIZATION

You must have wondered how Google gives you
the articles in search results that you meant to get even when the search text was not
properly formulated.
This is where one makes use of lemmatization.

# Named-Entity-Recognition

In [29]:
my_string= u"Google has it's headquarters in Mountain View,California having revenue amounted to 109.65 billion US dollars"

In [30]:
doc=nlp(my_string)

In [31]:
for ent in doc.ents:
    print(ent.text,ent.label_)

Google ORG
Mountain View GPE
California GPE
109.65 billion US dollars MONEY


In [32]:
my_string2 = u"Mark Zuckerberg born May 14,1984 in New York is an American  technology entrpreneur and philanthropist best known for co-founding and leading Facebook as it's chairman and CEO"

In [33]:
doc1= nlp(my_string2)

In [34]:
for ent in doc1.ents:
    print(ent.text,ent.label_)

Mark Zuckerberg PERSON
May 14,1984 DATE
New York GPE
American NORP
Facebook ORG


In [38]:
doc3=nlp(u"I usually wakke up at 9:00 AM . 90% of my daytime goes in learning new things.")

In [39]:
for ent in doc3.ents:
    print(ent.text,ent.label_)

9:00 AM TIME
90% PERCENT


As you can see, the entity extractor can easily extract the time information from the
given string. Also as you can see entity extractor not just tries to identify the number but
also exact PERCENTAGE value.

In [40]:
doc4 =  nlp(u"Imagine Dragons are the best band")
doc5 =  nlp(u"Imagine Dragons come and take over the city")

In [41]:
for ent in doc4.ents:
    print(ent.text,ent.label_)

Imagine Dragons ORG


In [43]:
for ent in doc5.ents:
    print(ent.text,ent.label_)

Now, imagine you were to extract the context of the above two strings in a live
environment. What would you do? With help of Entity Extractor, one can easily figure out
the context of the statement and intelligently take the conversation further.

# STOPWORDS

Stop words are high-frequency words like a, an, the, to and also that we sometimes
want to filter out of a document before further processing. Stop words usually have little
lexical content and do not hold much of a meaning

In [44]:
#all the words which are identified as stopwords in spacy

In [45]:
from spacy.lang.en.stop_words import STOP_WORDS
print(STOP_WORDS)

{'yours', 'call', 'make', 'ten', 'at', 'any', 'even', 'after', 'indeed', 'most', 'something', 'sometimes', 'much', 'somewhere', 'me', 'whole', 'will', 'yourselves', 'already', 'again', 'with', 'afterwards', 'give', 'her', 'besides', 'name', 'become', 'becoming', 'down', 'first', 'least', 'only', 'is', 'thus', 'must', 'none', 'third', 'whence', 'beyond', 'herein', 'fifteen', 'there', 'be', 'enough', 'move', 'unless', 'every', 'along', 'five', 'own', 'hereafter', 'he', 'hereupon', 'just', 'thence', 'ca', 'whereas', 'made', 'within', 'before', 'formerly', 'them', 'one', 'these', 'whither', 'however', 'when', 'they', 'a', 'various', 'bottom', 'wherein', 'alone', 'was', 'that', 'serious', 'well', 'almost', 'go', 'nowhere', 'once', 'mostly', 'should', 'whenever', 'nothing', 'does', 'whereafter', 'ourselves', 'beforehand', 'several', 'upon', 'such', 'others', 'keep', 'had', 'over', 'out', 'either', 'each', 'may', 'themselves', 'i', 'twelve', 'nevertheless', 'has', 'namely', 'amount', 'else', 

There are about 305 stop words defined in spaCy’s stop words list. You can always
define your own stop words if needed and override the existing list.
To see if a word is a stop word or not, you can use the nlp object of spaCy. We can use
the nlp object’s is_stop attribute.

In [46]:
nlp.vocab[u'is'].is_stop

True

In [47]:
nlp.vocab[u"hello"].is_stop

False

In [48]:
nlp.vocab[u'with'].is_stop

True

Stop words are a very important part of text clean up. It helps removal of meaningless
data before we try to do actual processing to make sense of the text.

# Dependency Parsing

In [49]:
doc6 = nlp(u'Book me a flight from Bangalore to Goa')

In [56]:
blr,goa = doc6[5],doc6[7]

In [54]:
doc6[5]

Bangalore

In [55]:
doc6[7]

Goa

In [57]:
list(blr.ancestors)

[from, flight, Book]

In [58]:
list(goa.ancestors)

[to, flight, Book]

# What Are Ancestors in Dependency Parsing?

Ancestors are the rightmost token of this token’s syntactic descendants. Like in the above
example for the object blr the ancestors were from, flight, and Book

In [60]:
#to check whether a particular element is ancestor or not
doc6[3].is_ancestor(doc6[5])

True

In [61]:
docf = nlp('I want to book a cab to the hotel and a table at a restaurant')

In [67]:
as = docf[8]

SyntaxError: invalid syntax (<ipython-input-67-f17af37d47c4>, line 1)

In [68]:
list(docf[8].ancestors)

[to, book, want]

# Example1

In [69]:
doc = nlp(u'Book a Table at the restaurant and the taxi to the hotel')
tasks = doc[2],doc[8]  #(table,taxi)
tasks_target = doc[5],doc[11]  #(restaurant,hotel)

for task in tasks_target:
    for tok in task.ancestors:
        if tok in tasks:
            print('Booking of {} belongs to {}'.format(tok,task))
            
    break
    

Booking of Table belongs to restaurant


# Interactive Visualization for Dependency Parsing

In [70]:
#run the following code
from spacy import displacy
doc = nlp(u'Book a table at the restaurant and the taxi to the hotel')
displacy.serve(doc, style='dep')
#go to the link in your browser
# http://localhost:5000

TypeError: __init__() got an unexpected keyword argument 'encoding'

# Example2

In [71]:
doc = nlp(u'What are some places to visit in Berlin and stay in Lubeck')
places = [doc[7],doc[11]]
actions = [doc[5],doc[9]]

In [72]:
for place in places:
    for tok in place.ancestors:
        if tok in actions:
            print('User is referring {} to {}'.format(place,tok))
            break

User is referring Berlin to visit
User is referring Lubeck to stay


As we see in these examples, dependency parsing makes it quite easy to understand
what the user is referring to. We saw that in the case of two different tasks as well, we can
pretty figure out the expectation and, based on that, formulate the next response.

# Dependency Parsing Helps in

• It helps in finding relationships between words of grammatically
correct sentences.

• It can be used for sentence boundary detection.

• It is quite useful to find out if the user is talking about more than one
context simultaneously

In [73]:
doc = nlp(u'Boston Dynamics is gearing up to produce thousands of robot dogs')

In [74]:
list(doc.noun_chunks)

[Boston Dynamics, thousands, robot dogs]

Though having noun chunks from a given sentence helps a lot, spaCy provides other
attributes that can be helpful too. Let’s try to explore some of those

In [75]:
doc = nlp(u"Deep learning cracks the code of messsenger RNAs and protein coding potential")

In [76]:
for chunk in doc.noun_chunks:
    print(chunk.text,chunk.root.text,chunk.root.dep_,chunk.root.head.text)

Deep learning learning nsubj cracks
the code code dobj cracks
messsenger RNAs RNAs pobj of
protein protein conj RNAs
potential potential dobj coding


# Glove Algorithm

GloVe is an unsupervised learning algorithm for obtaining vector representations for
words. GloVe algorithm uses aggregated global word-word co-occurrence statistics from
a corpus to train the model.

In [78]:
doc = nlp(u'How are you doing today?')

In [80]:
for token in doc:
    print(token.text,token.vector[:5])

How [-0.29742572  0.73939663 -0.04001489  0.44033998  2.8967493 ]
are [-0.23435052 -1.6145046   1.0197451   0.99281657  0.28227103]
you [ 0.10252154 -3.5647116   2.482279    4.2825      3.5902457 ]
doing [-0.6240918 -2.0210214 -0.9101492  2.7051926  4.189254 ]
today [ 3.5409102  -0.62185943  2.6274269   2.050488    0.20191962]
? [ 2.8915     -0.25079128  3.3764174   1.6942688   1.9849055 ]


This is a 5D Vector used for the representation of a particular word in the vector domain for cosine similarity or to find similarity between two different words under consideration

# Similarity Example-1

In [82]:
hello1 = nlp(u'Hello')
hi_doc = nlp(u'hi')

In [83]:
hella_doc = nlp(u'hella')
print(hello1.similarity(hi_doc))
print(hello1.similarity(hella_doc))

0.7522951892083747
0.38901935595436765


In [84]:
str1 = nlp(u'When will next season of Game of Thrones be releasing?')
str2 = nlp(u'Game of thrones next season release date?')
sim_str = str1.similarity(str2)
print('The similarity between two sentences is {}'.format(sim_str))

The similarity between two sentences is 0.7063789688496691


In [85]:
example_doc = nlp(u'car truck google')


In [87]:
for t1 in example_doc:
    for t2 in example_doc:
        similarity_perc = int(t1.similarity(t2)*100)
        print("Word {} is {}% similar to word {}".format(t1.text,similarity_perc, t2.text))

Word car is 100% similar to word car
Word car is 71% similar to word truck
Word car is 24% similar to word google
Word truck is 71% similar to word car
Word truck is 100% similar to word truck
Word truck is 36% similar to word google
Word google is 24% similar to word car
Word google is 36% similar to word truck
Word google is 100% similar to word google


Finding similarity between words or sentences becomes quite important when we
intend to build any application that is hugely dependent on the implementations of
NLP.

# Tokenization

Tokenization is one of the simple yet basic concepts of NLP where we split a text into
meaningful segments. spaCy first tokenizes the text (i.e., segments it into words and
then punctuation and other things).

In [88]:
doc = nlp(u'Brexit is the impending withdrawal of U.K. from the European Union. ')
for token in doc:
    print(token.text)

Brexit
is
the
impending
withdrawal
of
U.K.
from
the
European
Union
.


If you see in the above output, U.K. comes as a single word after the tokenization
process, which makes sense, as U.K. is a country name and splitting it would be wrong.
Even after this if you not happy with spaCy’s tokenization, then you can use its
add_special_case case method to add your own rule before relying completely on
spaCy’s tokenization method.

In [89]:
import re
sent1 = "Book me a metro from Airport Station to Hong Kong Station"
sent2 = "Book me a cab to Honk Kong Airport from AsiaWorld-Expo."
