# Basics Of Natural Language Processing

In [3]:
import spacy

In [4]:
# Load the small English pipeline by its full package name. The "en"
# shortcut link only exists in spaCy v2 and cannot be loaded in spaCy v3+.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Run the pipeline on a sample sentence; the result is iterated token by
# token in the cells that follow.
Sentence = nlp(
    "Hi There, Welcome To Natural Language Processing Basics, "
    "performed by Jinish Kanpara"
)

In [6]:
# For every token show: its text, its part-of-speech label, the numeric ID of
# that label, and the numeric ID of its dependency relation.
for token in Sentence:
    fields = (token.text, token.pos_, token.pos, token.dep)
    print(*fields)

Hi INTJ 91 8206900633647566924
There ADV 86 400
, PUNCT 97 445
Welcome VERB 100 428
To ADP 85 443
Natural PROPN 96 7037928807040764755
Language PROPN 96 7037928807040764755
Processing PROPN 96 7037928807040764755
Basics PROPN 96 439
, PUNCT 97 445
performed VERB 100 8206900633647566924
by ADP 85 401
Jinish PROPN 96 7037928807040764755
Kanpara PROPN 96 439


In [7]:
# Here `pos` means part of speech, i.e. the grammatical category of each token.
# To see what a tag stands for, we can look up its explanation.

In [8]:
# Index into the parsed sentence to pick out a single token (position 8).
Sentence[8]

Basics

In [9]:
# Part-of-speech tag of that token, as a readable string.
Sentence[8].pos_

'PROPN'

In [10]:
# spacy.explain expands the tag abbreviation into a plain-English description.
spacy.explain(Sentence[8].pos_)

'proper noun'

In [11]:
# A longer passage containing a quoted phrase; a later cell slices the quote
# out by token position.
doc3 = nlp(
    'Although commmonly attributed to John Lennon from his song "Beautiful Boy", '
    'the phrase "Life is what happens to us while we are making other plans" was written by '
    "cartoonist Allen Saunders and published in Reader's Digest in 1957, when Lennon was 17."
)

In [12]:
# Slice by token position: tokens 16 through 29 cover the quoted phrase,
# including its surrounding quotation marks.
life_quote = doc3[16:30]
print(life_quote)

"Life is what happens to us while we are making other plans"


In [13]:
type(life_quote) # the slice taken from doc3 is a Span object

spacy.tokens.span.Span

In [14]:
# Three short sentences, used to demonstrate sentence segmentation.
doc4 = nlp(
    "This is the first sentence. "
    "This is another sentence. "
    "This is the last sentence."
)

In [15]:
# doc4.sents yields the detected sentences; print one per line.
print(*doc4.sents, sep="\n")

This is the first sentence.
This is another sentence.
This is the last sentence.


In [16]:
# Token 6 is the "This" that opens the second sentence, so it is flagged
# as a sentence start.
doc4[6].is_sent_start

True

# Tokenization

In [17]:
# Sample text for the tokenization demo.
Statement = nlp(
    "This is the statement to learn tokenization in Tableau"
)

In [18]:
# Emit every token followed by " | " on a single line (no trailing newline).
print("".join(f"{tok.text} | " for tok in Statement), end="")

This | is | the | statement | to | learn | tokenization | in | Tableau | 

# Entities

In [19]:
# Sample text containing organisations, a monetary amount and a country,
# used for the named-entity demo.
Statement1 = nlp(
    "Reliance Industries seems to be earning 200 million dollars "
    "more than TATA industries in India"
)

In [20]:
# List each named entity with its label and spaCy's description of that label.
for ent in Statement1.ents:
    label = ent.label_
    print(f"{ent.text} - {label}", spacy.explain(label))

Reliance Industries - ORG Companies, agencies, institutions, etc.
200 million dollars - MONEY Monetary values, including unit
TATA - ORG Companies, agencies, institutions, etc.
India - GPE Countries, cities, states


In [21]:
# Noun chunks: print the text of each noun phrase spaCy finds in the sentence.
doc9 = nlp("Autonomous cars shift insurance liability toward manufacturers.")

for noun_phrase in doc9.noun_chunks:
    print(noun_phrase.text)

Autonomous cars
insurance liability
manufacturers


# Displacy

In [22]:
from spacy import displacy

In [23]:
# Sample text for the displaCy entity visualisation.
Statement2 = nlp(
    "Hey, remember the time we went for cycling in Stanley park "
    "on 24th October 2018"
)

In [24]:
# Render the entities of Statement2 inline in the notebook.
displacy.render(
    Statement2,
    style="ent",
    jupyter=True,
)

# Day 10

# Stemming

In [25]:
import nltk

In [26]:
from nltk.stem.porter import *

In [27]:
# Porter stemmer instance used for the first stemming comparison.
P_Stemmer = PorterStemmer()

In [28]:
# Words to run through the stemmer.
sample_words = [
    'gathering',
    'cycling',
    'beautiful',
    'fairly',
    'astonishingly',
    'promtly',
    'goodness',
    'running',
]

In [29]:
# Show each word next to its Porter stem.
for word in sample_words:
    stem = P_Stemmer.stem(word)
    print(f"{word}--->{stem}")

gathering--->gather
cycling--->cycl
beautiful--->beauti
fairly--->fairli
astonishingly--->astonishingli
promtly--->promtli
goodness--->good
running--->run


In [30]:
# The results above show that the Porter stemmer leaves some words
# with awkward stems, for example:
# 1. promtly       ---> promtli
# 2. beautiful     ---> beauti
# 3. astonishingly ---> astonishingli
# These are not reduced to the root forms we would want.

In [31]:
from nltk.stem.snowball import *


In [32]:
# Snowball stemmer for English, to compare against the Porter results.
S_Stemmer = SnowballStemmer(language="english")

In [33]:
# Word list for the Snowball stemmer run.
sample_words = [
    'gathering', 'cycling', 'beautiful', 'fairly',
    'astonishingly', 'promtly', 'goodness', 'running',
]

In [34]:
# Show each word next to its Snowball stem.
for word in sample_words:
    stem = S_Stemmer.stem(word)
    print(f"{word}--->{stem}")

gathering--->gather
cycling--->cycl
beautiful--->beauti
fairly--->fair
astonishingly--->astonish
promtly--->promt
goodness--->good
running--->run


In [35]:
# Results improve here:
# 1. astonishingly--->astonish
# 2. fairly--->fair
# 3. promtly--->promt

# Lemmatization

In [36]:
# Sentence with several inflections of "run", used for the lemmatization demo.
doc1 = nlp(
    "I am a runner running in a race because I love to run since I ran today"
)

In [44]:
# One row per token: text, part-of-speech label, numeric lemma ID, lemma text.
# Columns are separated by a tab padded with a space on each side.
for tok in doc1:
    print(tok.text, tok.pos_, tok.lemma, tok.lemma_, sep=" \t ")

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 VERB 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 ADP 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 ADP 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [54]:
# Wrap the formatting in a function so the same print loop is not repeated every time.
def primary_lemma_function(text, text_width=12, pos_width=6, lemma_id_width=22):
    """Print one aligned row per token: text, POS tag, lemma ID and lemma.

    Args:
        text: Iterable of tokens (for example a parsed spaCy document). Each
            item must expose ``text``, ``pos_``, ``lemma`` and ``lemma_``.
        text_width: Minimum column width for the token text.
        pos_width: Minimum column width for the part-of-speech tag.
        lemma_id_width: Minimum column width for the numeric lemma ID.

    Returns:
        None. Rows are written to standard output, one per token.
    """
    for token in text:
        # The lemma ID is an integer, which would right-align by default;
        # "<" forces it to be left-aligned like the string columns.
        print(
            f'{token.text:{text_width}} {token.pos_:{pos_width}} '
            f'{token.lemma:<{lemma_id_width}} {token.lemma_}'
        )

In [55]:
# Print the aligned token / lemma table for doc1.
primary_lemma_function(doc1)

I            PRON   561228191312463089     -PRON-
am           VERB   10382539506755952630   be
a            DET    11901859001352538922   a
runner       NOUN   12640964157389618806   runner
running      VERB   12767647472892411841   run
in           ADP    3002984154512732771    in
a            DET    11901859001352538922   a
race         NOUN   8048469955494714898    race
because      ADP    16950148841647037698   because
I            PRON   561228191312463089     -PRON-
love         VERB   3702023516439754181    love
to           PART   3791531372978436496    to
run          VERB   12767647472892411841   run
since        ADP    10066841407251338481   since
I            PRON   561228191312463089     -PRON-
ran          VERB   12767647472892411841   run
today        NOUN   11042482332948150395   today
