In [None]:
!pip install spacy==3.0.6

In [None]:
# Download the small English pipeline; spaCy installs it as a regular Python
# package, which is what lets spacy.load('en_core_web_sm') find it later on.
!python -m spacy download en_core_web_sm

In [None]:
import spacy
# Show which spaCy version the kernel actually imported (the install cell pins 3.0.6).
spacy.__version__

## POS Tagging

In [None]:
# Load the small English pipeline once; later cells reuse this `nlp` object.
nlp = spacy.load('en_core_web_sm')

# Run the pipeline over a sentence to obtain a Doc of tokens.
doc = nlp('I am learning how to build chatbots')

# One line per token: the token text padded to 9 columns, then its part-of-speech tag.
for tok in doc:
    print(f'{tok.text:9}  {tok.pos_}')

In [None]:
# Tag a second sentence and print every token with its part-of-speech tag.
doc = nlp('I am going to London next week for a meeting.')
print('\n'.join(f'{tok.text:9}  {tok.pos_}' for tok in doc))

In [None]:
doc = nlp('Google release "Move Mirror" AI experiment that matches your pose from 80,000 images')

# Header row first, then one row per token using the same column widths.
print(f"{'Text':12} {'Lemma':12} {'POS':9} {'Tag':9} {'Dep':9} {'Shape':9} {'Alpha':9} {'Stop':9}")
for tok in doc:
    # is_alpha / is_stop are booleans, so they are converted with !s before padding.
    print(
        f'{tok.text:12} {tok.lemma_:12} {tok.pos_:9} {tok.tag_:9} '
        f'{tok.dep_:9} {tok.shape_:9} {tok.is_alpha!s:9} {tok.is_stop!s:9}'
    )

In [None]:
doc = nlp('I am learning how to build chatbots')

# Column titles, padded to the same widths as the per-token rows below.
print('{:12} {:12} {:9} {:9} {:9} {:9} {:9} {:9}'.format(
    'Text', 'Lemma', 'POS', 'Tag', 'Dep', 'Shape', 'Alpha', 'Stop'))
for tok in doc:
    # Each field reads an attribute straight off the token; the two boolean
    # flags go through !s so the string width spec can be applied to them.
    print(
        '{0.text:12} {0.lemma_:12} {0.pos_:9} {0.tag_:9} {0.dep_:9} '
        '{0.shape_:9} {0.is_alpha!s:9} {0.is_stop!s:9}'.format(tok)
    )

## Named-Entity Recognition

In [None]:
my_string = (
    "Google has its headquarters in Mountain View, California "
    "having revenue amounted to 109.65 billion US dollars"
)
doc = nlp(my_string)

# doc.ents holds the entity spans found in the text; print each with its label.
for entity in doc.ents:
    print(f'{entity.text} {entity.label_}')

## Stop words

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS
# Dump the whole STOP_WORDS collection from spaCy's English language data.
print(STOP_WORDS)

In [None]:
 # Look up the vocabulary entry for 'is' and read its stop-word flag.
 # NOTE(review): this cell starts with a stray leading space; IPython appears to
 # tolerate it, but it would likely be an IndentationError in a plain .py export — confirm and remove.
 nlp.vocab[u'is'].is_stop

## Dependency Parsing

In [None]:
doc = nlp('Book me a flight from Bangalore to Goa')
# Pick out the two city tokens by position: index 5 is Bangalore, index 7 is Goa.
blr = doc[5]
goa = doc[7]

In [None]:
# Collect the ancestors of the Bangalore token into a list so the cell displays them.
list(blr.ancestors)

In [None]:
# Collect the ancestors of the Goa token into a list so the cell displays them.
list(goa.ancestors)

In [None]:
# Check that `flight` (doc[3]) is an ancestor of `Bangalore` (doc[5]) in the parse.
doc[3].is_ancestor(doc[5])

In [None]:
from spacy import displacy

doc = nlp('Book a table at the restaurant and the taxi to the hotel')

# Draw the dependency parse inline in the notebook output.
# displacy.serve() was used here before; it starts a web server (port 5000) and
# blocks the kernel until interrupted, so "Run All" never reached the cells below.
displacy.render(doc, style='dep', jupyter=True)

In [None]:
doc = nlp("What are some places to visit in Berlin and stay in Lubeck")
places = [doc[7], doc[11]]   # [Berlin, Lubeck]
actions = [doc[5], doc[9]]   # [visit, stay]

# For every place, walk through its ancestors and report the first one
# encountered that is one of the action tokens.
for place in places:
    for tok in place.ancestors:
        if tok in actions:
            # .format() has to be applied to the string itself: print() returns
            # None, so chaining .format() after print(...) raises AttributeError.
            print("User is referring {} to {}".format(place, tok))
            break

## Noun Chunks

In [None]:
# Parse a sentence and list the noun chunks the pipeline finds in it.
doc = nlp(u"Boston Dynamics is gearing up to produce thousands of robot dogs") 

list(doc.noun_chunks)

In [None]:
doc = nlp("Deep learning cracks the code of messenger RNAs and protein-coding potential")

# Header and rows share the same column widths (30, 15, 15, 15).
print(f"{'TEXT':30} {'ROOT.TEXT':15} {'ROOT.DEP_':15} {'ROOT.HEAD.TEXT':15}")
for np_chunk in doc.noun_chunks:
    # For each chunk: its text, its root token, the root's dependency label,
    # and the text of the root's head token.
    print(
        f'{np_chunk.text:30} {np_chunk.root.text:15} '
        f'{np_chunk.root.dep_:15} {np_chunk.root.head.text:15}'
    )


The following list explains each column of the output above:

- Text: text of the original noun chunk
- Root text: text of the word that connects the noun chunk to the rest of the parse
- Root dep: dependency relation that connects the root to its head
- Root head text: text of the root token’s head

## Finding Similarity

In [None]:
# Parse three single-word texts so that their similarity can be compared in the next cells.
# NOTE(review): `nlp` was loaded from 'en_core_web_sm'; spaCy's documentation says the
# small models ship without static word vectors, so these similarity scores may not be
# reliable — consider confirming the results with en_core_web_md.
hello_doc = nlp(u"hello") 
hi_doc = nlp(u"hi") 
hella_doc = nlp(u"hella")

Intuitively, the word hello is more closely related to the word hi than to the word hella, even though hello and hella differ by only a single character.

In [None]:
# Similarity score between the "hello" and "hi" docs.
hello_doc.similarity(hi_doc)

In [None]:
# Similarity score between the "hello" and "hella" docs, for comparison with the cell above.
hello_doc.similarity(hella_doc)

In [None]:
# Two differently worded questions about the same topic, compared as whole docs.
GoT_str1 = nlp(u"When will next season of Game of Thrones be releasing?")
GoT_str2 = nlp(u"Game of Thrones next season release date?")
GoT_str1.similarity(GoT_str2)

In [None]:
example_doc = nlp('car truck google')

# Compare every token with every token (itself included) and report the score
# as a whole-number percentage; int() drops the fractional part.
for left in example_doc:
    for right in example_doc:
        percent = int(left.similarity(right) * 100)
        print(f'Word {left.text} is {percent}% similar to word {right.text}')

## Tokenization

In [None]:
doc = nlp('Brexit is the impending withdrawal of the U.K. from the European Union.')
# One token per line, to show how the sentence was split.
print('\n'.join(tok.text for tok in doc))