Credits/Course URL - https://www.udemy.com/course/the-ultimate-beginners-guide-to-natural-language-processing/

In [1]:
import nltk
import spacy
import en_core_web_sm

In [2]:
# Load the 'en_core_web_sm' English pipeline once; every later cell parses text through `nlp`.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Echo the pipeline object to confirm it loaded (shown as a spacy.lang.en.English instance).
nlp

<spacy.lang.en.English at 0x2160d67cc70>

### Tokenization

In [103]:
# Parse a two-sentence sample; `document` is reused by the tokenization, lemma and stop-word cells below.
document = nlp('I am learning natural language processing. This course is in India')

In [107]:
# Baseline for comparison: a plain whitespace split of the raw text leaves the
# full stop attached to its word ('processing.'), unlike spaCy's tokenizer below.
print(document.text.split())

['I', 'am', 'learning', 'natural', 'language', 'processing.', 'This', 'course', 'is', 'in', 'India']


In [104]:
# spaCy's tokenizer yields punctuation as its own token ('.' is printed separately).
for word in document:
    print(word.text)

I
am
learning
natural
language
processing
.
This
course
is
in
India


In [5]:
# One row per token: text, coarse POS, lemma, fine-grained tag, dependency label,
# orthographic shape, is-alphabetic flag, is-stop-word flag.
for tok in document:
    columns = (
        tok,
        tok.pos_,
        tok.lemma_,
        tok.tag_,
        tok.dep_,
        tok.shape_,
        tok.is_alpha,
        tok.is_stop,
    )
    print(*columns)

I PRON I PRP nsubj X True True
am AUX be VBP aux xx True True
learning VERB learn VBG ROOT xxxx True False
natural ADJ natural JJ amod xxxx True False
language NOUN language NN compound xxxx True False
processing NOUN processing NN dobj xxxx True False
. PUNCT . . punct . False False
This DET this DT det Xxxx True True
course NOUN course NN nsubj xxxx True False
is AUX be VBZ ROOT xx True True
in ADP in IN prep xx True True
India PROPN India NNP pobj Xxxxx True False


In [6]:
# Keep only tokens whose coarse part-of-speech tag is VERB and print their text.
for verb in (tok for tok in document if tok.pos_ == 'VERB'):
    print(verb.text)

learning


Lemmatization

In [7]:
# Show each surface form next to its lemma (e.g. 'am' -> 'be', 'learning' -> 'learn').
for word in document:
    print(word.text, word.lemma_)

I I
am be
learning learn
natural natural
language language
processing processing
. .
This this
course course
is be
in in
India India


In [8]:
# Inflected forms of three verbs, including the irregular 'went'/'gone';
# `doc` is reused by the stemming cell further down.
doc = nlp('learn learning watch watching watched go went gone ')
[word.lemma_ for word in doc]

['learn', 'learn', 'watch', 'watch', 'watch', 'go', 'go', 'go']

Stemming

In [9]:
# NLTK's Porter stemmer, applied below to the same words that were lemmatized above.
stemmer = nltk.stem.PorterStemmer()

In [10]:
# Print "<word>: <stem>" for every token; in the recorded output the irregular
# forms 'went' and 'gone' come back unchanged, whereas the lemmatizer mapped them to 'go'.
for word in doc:
    surface = word.text
    print(f'{surface}: {stemmer.stem(surface)}')

learn: learn
learning: learn
watch: watch
watching: watch
watched: watch
go: go
went: went
gone: gone


Named Entity Recognition

In [16]:
# Sample paragraph for named-entity recognition. Adjacent string literals are
# concatenated by Python, so the resulting value is a single unbroken string.
text = (
    "Cognizant Techology Solutions is an American multinational information "
    "technology services and consulting company. "
    "It is headquartered in Teaneck, New Jersey, United States. "
    "It is part of the NASDAQ-100 and trades under CTSH and expects "
    "fourth-quarter 2022 revenues to be $4.8 billion"
)

In [17]:
# Rebinds `doc` (previously the verb list) to the parsed company description.
doc = nlp(text)

In [18]:
# List every named entity the pipeline found together with its label.
for ent in doc.ents:
    print(ent.text, ent.label_)

Cognizant Techology Solutions ORG
American NORP
Teaneck GPE
New Jersey GPE
United States GPE
CTSH PRODUCT
fourth-quarter 2022 DATE
$4.8 billion MONEY


In [19]:
# Render the entities inline in the notebook (style 'ent', jupyter=True).
spacy.displacy.render(doc, style = 'ent', jupyter=True)

In [20]:
# Second NER example; rebinds `doc`, which the PERSON filter in the next cell reads.
doc = nlp("Bill Gates was born in Seattle on 1995-10-28 and is the founder of Microsoft")
spacy.displacy.render(doc, style = 'ent', jupyter=True)

In [21]:
# Restrict the entity list to people only.
for person in (ent for ent in doc.ents if ent.label_ == 'PERSON'):
    print(person.text, person.label_)

Bill Gates PERSON


Stopwords

In [24]:
# spaCy's built-in English stop-word collection (326 entries, see len() below).
# Printed as a set, so the element order in the output is arbitrary.
en_stopwords = spacy.lang.en.stop_words.STOP_WORDS
print(en_stopwords)

{'become', 'under', 'against', 'everywhere', 'elsewhere', 'somewhere', 'since', "'m", 'no', 'eight', 'on', '‘ve', 'before', 'really', 'per', 'upon', '’m', 'but', 'us', 'enough', 'who', '’ll', 'do', 'two', 'within', '‘s', 'whereby', 'therefore', '’ve', 'if', 'towards', 'least', 'moreover', 'amount', 'becomes', 'it', 'through', 'among', 'themselves', 'something', 'been', 'put', 'did', 'off', 'much', 'therein', 'another', 'why', 'became', 'nowhere', 'alone', 're', 'will', 'even', 'seem', 'same', 'regarding', 'as', 'quite', 'those', 'to', 'should', 'i', 'am', 'about', 'can', 'done', 'keep', 'whereupon', 'than', 'nevertheless', 'further', 'was', 'although', '’d', 'bottom', 'during', 'otherwise', 'from', 'him', 'very', 'itself', 'behind', 'our', 'whatever', 'fifty', 'once', 'with', 'amongst', 'in', "'ll", 'them', 'beforehand', 'anything', 'few', 'wherever', 'neither', 'third', 'front', '‘ll', 'while', 'without', 'move', 'always', 'throughout', 'ca', 'being', "'d", 'seeming', 'thence', 'only'

In [25]:
# Membership test against the stop-word collection.
'it' in en_stopwords

True

In [26]:
# Number of English stop words shipped with this spaCy version.
len(en_stopwords)

326

In [27]:
# The same check through the pipeline vocabulary: look up the entry for 'it' and read its is_stop flag.
nlp.vocab['it'].is_stop

True

In [32]:
# Print the stop words found in `document`.
# Use the token's own is_stop flag (as the attribute table earlier in this notebook
# does) instead of re-looking each token up by string in nlp.vocab; the flagged
# tokens are the ones that table reports as is_stop=True: I, am, This, is, in.
for token in document:
    if token.is_stop:
        print(token.text)

I
am
This
is
in


In [33]:
# Print the tokens of `document` that are NOT stop words.
# Use the token's own is_stop flag (as the attribute table earlier in this notebook
# does) instead of re-looking each token up by string in nlp.vocab. Punctuation is
# not a stop word, so '.' is kept, matching the recorded output.
for token in document:
    if not token.is_stop:
        print(token.text)

learning
natural
language
processing
.
course
India


Dependency Parsing

In [34]:
# New example for dependency parsing. Rebinds both `text` and `document`,
# so the tokenization sample from the top of the notebook is no longer available after this cell.
text = 'book a ticket from India to Germany'
document = nlp(text)

In [35]:
# Token positions are hard-coded for this exact sentence:
# index 4 is 'India' (departure) and index 6 is 'Germany' (destination).
origin = document[4]
destiny = document[6]

In [38]:
# Walk up the dependency tree from 'India'; the output lists the nearest ancestor first.
list(origin.ancestors)

[from, ticket, book]

In [40]:
# Same walk starting from 'Germany'; both paths end at 'ticket' and then 'book'.
list(destiny.ancestors)

[to, ticket, book]

In [41]:
# Is 'book' (token 0) an ancestor of 'ticket' (token 2)? True for this parse.
document[0].is_ancestor(document[2])

True

In [42]:
# Sentence with two requests (a table and a taxi), each tied to its own place.
document = nlp('book a table for the resturant and a taxi to the hotel')

In [44]:
# Hard-coded token positions for this sentence:
# 2 -> 'table', 8 -> 'taxi' (things to book); 5 -> 'resturant', 11 -> 'hotel' (places).
tasks = (document[2], document[8])
locations = (document[5], document[11])
print(tasks, locations)

(table, taxi) (resturant, hotel)


In [47]:
# For each place, climb its dependency ancestors and report the first one that
# is a known task, i.e. the booking that place belongs to.
for place in locations:
    print("******* ", place, " *******")
    matched_task = next((node for node in place.ancestors if node in tasks), None)
    if matched_task is not None:
        print(f'Reservation of {matched_task} to {place}')
        
    

*******  resturant  *******
Reservation of table to resturant
*******  hotel  *******
Reservation of taxi to hotel


In [49]:
# Direct dependents of 'resturant' (token 5) in the parse tree.
list(document[5].children)

[the, and, taxi]

In [53]:
# Draw the dependency tree inline in the notebook, passing a custom
# 'distance' value through the renderer options.
spacy.displacy.render(
    document,
    style='dep',
    jupyter=True,
    options={'distance': 70},
)

In [54]:
# Question containing two verbs and two city names; the cells below pair each city with its verb.
document = nlp('What places can we visit in London and stay in Paris?')

In [59]:
# Inspect the coarse part-of-speech tag assigned to every token.
for word in document:
    print(word.text, word.pos_)

What DET
places NOUN
can AUX
we PRON
visit VERB
in ADP
London PROPN
and CCONJ
stay VERB
in ADP
Paris PROPN
? PUNCT


In [55]:
# Start from two independent empty lists (both are reassigned in the next cell).
locations = []
actions = []

In [62]:
# Proper nouns stand in for places and verbs for actions in this example sentence.
locations = [tok for tok in document if tok.pos_ == 'PROPN']
actions = [tok for tok in document if tok.pos_ == 'VERB']

print(f"Location: {locations}\nActions: {actions}")

Location: [London, Paris]
Actions: [visit, stay]


In [70]:
# Pair each place with the first verb found among its dependency ancestors.
for place in locations:
    governing_action = next((head for head in place.ancestors if head in actions), None)
    if governing_action is not None:
        print(f'{place} to {governing_action}')

London to visit
Paris to stay


In [71]:
# Draw the dependency tree of the question inline in the notebook, passing a
# custom 'distance' value through the renderer options.
spacy.displacy.render(
    document,
    style='dep',
    jupyter=True,
    options={'distance': 70},
)

### Similarity between words and sentences

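spaCy computes similarity from word vectors such as GloVe (Global Vectors for Word Representation). Note: the small model loaded above (`en_core_web_sm`) does not ship with word vectors, so the similarity scores below are only rough approximations; a model that includes vectors (e.g. `en_core_web_md`) gives more meaningful results.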

In [77]:
# Parse four single-word texts: two greetings and two question words.
word1, word2, word3, word4 = (
    nlp(term) for term in ('hello', 'hi', 'where', 'why')
)

In [81]:
# Similarity of the two greetings ('hello' vs 'hi').
# NOTE(review): the echoed source line in the recorded output looks like the tail of a
# warning, presumably spaCy's "no word vectors" warning for this model; confirm.
word1.similarity(word2)

  word1.similarity(word2)


0.7161104850664421

In [82]:
# Similarity of the two question words ('where' vs 'why').
word3.similarity(word4)

  word3.similarity(word4)


0.8559772299351288

In [83]:
# Cross-group comparison ('hello' vs 'where'); the score is much lower than within either group.
word1.similarity(word3)

  word1.similarity(word3)


0.20077019416999942

In [89]:
# Three sentences: text1 and text2 are both about a trip, text3 is about graduating.
text1 = nlp('When are we going for a trip?')
text2 = nlp('We should plan for a trip on October?')
text3 = nlp('I will be graduating on March 2023')

In [90]:
# Similarity of the two trip-related sentences.
text1.similarity(text2)

  text1.similarity(text2)


0.5008525262323833

In [91]:
# Similarity of a trip sentence and the graduation sentence (lower than the pair above).
text1.similarity(text3)

  text1.similarity(text3)


0.40245600458480424

In [94]:
# NOTE: `text` was a plain string in earlier cells; from here on it is a parsed spaCy object.
text = nlp('cat dog horse human elephant shark whale')

In [95]:
# Slicing the parsed text drops the first token and keeps the rest.
text[1:]

dog horse human elephant shark whale

In [102]:
# Compare every unordered pair of words exactly once: each word is paired only
# with the words that come after it, and the score is shown as a percentage.
for left_idx, left in enumerate(text[:-1]):
    for right in text[left_idx + 1:]:
        pct = round(left.similarity(right) * 100, 2)
        print(f"{left} is {pct}% similar to {right}")

cat is 54.12% similar to dog
cat is 43.94% similar to horse
cat is 60.49% similar to human
cat is 52.16% similar to elephant
cat is 56.03% similar to shark
cat is 23.91% similar to whale
dog is 71.42% similar to horse
dog is 41.71% similar to human
dog is 62.72% similar to elephant
dog is 69.77% similar to shark
dog is 46.02% similar to whale
horse is 35.27% similar to human
horse is 56.27% similar to elephant
horse is 57.06% similar to shark
horse is 37.13% similar to whale
human is 65.05% similar to elephant
human is 41.31% similar to shark
human is 21.38% similar to whale
elephant is 62.92% similar to shark
elephant is 30.89% similar to whale
shark is 46.53% similar to whale


  print(f"{obj1} is {round(obj1.similarity(obj2)*100,2)}% similar to {obj2}")
