Credits/Course URL - https://www.udemy.com/course/the-ultimate-beginners-guide-to-natural-language-processing/

In [1]:
import nltk
import spacy
import en_core_web_sm

In [2]:
# Load the spaCy pipeline packaged as 'en_core_web_sm'; every later cell parses
# its text by calling `nlp(...)`.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Echo the pipeline object to confirm which Language class was loaded.
nlp

<spacy.lang.en.English at 0x2160d67cc70>

In [4]:
# Parse a short two-sentence sample. `document` is reused by the tagging,
# lemma and stop-word cells further down.
document = nlp('I am learning natural language processing. This course is in India')

In [5]:
# One line per token: text, coarse POS (pos_), lemma, fine-grained tag (tag_),
# dependency label, orthographic shape, and the is_alpha / is_stop flags.
for token in document:
    print(' '.join(str(field) for field in (
        token,
        token.pos_,
        token.lemma_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    )))

I PRON I PRP nsubj X True True
am AUX be VBP aux xx True True
learning VERB learn VBG ROOT xxxx True False
natural ADJ natural JJ amod xxxx True False
language NOUN language NN compound xxxx True False
processing NOUN processing NN dobj xxxx True False
. PUNCT . . punct . False False
This DET this DT det Xxxx True True
course NOUN course NN nsubj xxxx True False
is AUX be VBZ ROOT xx True True
in ADP in IN prep xx True True
India PROPN India NNP pobj Xxxxx True False


In [6]:
# Keep only the tokens whose coarse part-of-speech tag is VERB.
for token in document:
    if token.pos_ != 'VERB':
        continue
    print(token.text)

learning


Lemmatization

In [7]:
# Show each token's surface form next to its lemma.
for token in document:
    print(f'{token.text} {token.lemma_}')

I I
am be
learning learn
natural natural
language language
processing processing
. .
This this
course course
is be
in in
India India


In [8]:
# Several inflections of the same verbs, to see which base form each one maps to.
doc = nlp('learn learning watch watching watched go went gone ')
[word.lemma_ for word in doc]

['learn', 'learn', 'watch', 'watch', 'watch', 'go', 'go', 'go']

Stemming

In [9]:
# NLTK's Porter stemmer; the next cell applies it to the same words that were
# lemmatized above.
stemmer = nltk.stem.PorterStemmer()

In [10]:
# Print "<word>: <stem>" for every token of the verb-form sample held in `doc`.
for token in doc:
    print(token.text, stemmer.stem(token.text), sep=': ')

learn: learn
learning: learn
watch: watch
watching: watch
watched: watch
go: go
went: went
gone: gone


Named Entity Recognition

In [16]:
# Sample paragraph for named-entity recognition. Adjacent literals are joined by
# the parser, so the value is one single-line string.
# NOTE: "Techology" is misspelled in the sample; it is left untouched because the
# recorded entity output echoes the text verbatim.
text = (
    "Cognizant Techology Solutions is an American multinational "
    "information technology services and consulting company. "
    "It is headquartered in Teaneck, New Jersey, United States. "
    "It is part of the NASDAQ-100 and trades under CTSH "
    "and expects fourth-quarter 2022 revenues to be $4.8 billion"
)

In [17]:
# Rebinds `doc` (until now the verb-form sample) to the parsed company
# description; its `ents` are read in the following cells.
doc = nlp(text)

In [18]:
# List every recognised entity span together with its label.
for entity in doc.ents:
    print(f'{entity.text} {entity.label_}')

Cognizant Techology Solutions ORG
American NORP
Teaneck GPE
New Jersey GPE
United States GPE
CTSH PRODUCT
fourth-quarter 2022 DATE
$4.8 billion MONEY


In [19]:
# Render the entity annotations of `doc` inline in the notebook.
spacy.displacy.render(doc, jupyter=True, style='ent')

In [20]:
# Second NER sample: a person, a place, an ISO-formatted date and an organisation.
# The birth date is corrected to 1955-10-28; the previous literal read 1995-10-28.
# Rebinds `doc`, which the PERSON filter in the next cell reads.
doc = nlp("Bill Gates was born in Seattle on 1955-10-28 and is the founder of Microsoft")
spacy.displacy.render(doc, style='ent', jupyter=True)

In [21]:
# Report only the entities labelled PERSON.
for entity in doc.ents:
    if entity.label_ != 'PERSON':
        continue
    print(f'{entity.text} {entity.label_}')

Bill Gates PERSON


Stopwords

In [24]:
# spaCy's built-in English stop-word set; the cells below test membership in it
# and count its entries.
en_stopwords = spacy.lang.en.stop_words.STOP_WORDS
# Dumps the entire set. Being a set, the order of the printed entries carries no
# meaning and may differ between runs.
print(en_stopwords)

{'become', 'under', 'against', 'everywhere', 'elsewhere', 'somewhere', 'since', "'m", 'no', 'eight', 'on', '‘ve', 'before', 'really', 'per', 'upon', '’m', 'but', 'us', 'enough', 'who', '’ll', 'do', 'two', 'within', '‘s', 'whereby', 'therefore', '’ve', 'if', 'towards', 'least', 'moreover', 'amount', 'becomes', 'it', 'through', 'among', 'themselves', 'something', 'been', 'put', 'did', 'off', 'much', 'therein', 'another', 'why', 'became', 'nowhere', 'alone', 're', 'will', 'even', 'seem', 'same', 'regarding', 'as', 'quite', 'those', 'to', 'should', 'i', 'am', 'about', 'can', 'done', 'keep', 'whereupon', 'than', 'nevertheless', 'further', 'was', 'although', '’d', 'bottom', 'during', 'otherwise', 'from', 'him', 'very', 'itself', 'behind', 'our', 'whatever', 'fifty', 'once', 'with', 'amongst', 'in', "'ll", 'them', 'beforehand', 'anything', 'few', 'wherever', 'neither', 'third', 'front', '‘ll', 'while', 'without', 'move', 'always', 'throughout', 'ca', 'being', "'d", 'seeming', 'thence', 'only'

In [25]:
# Plain string membership test against the stop-word set.
'it' in en_stopwords

True

In [26]:
# Number of entries in the stop-word set.
len(en_stopwords)

326

In [27]:
# The stop-word flag is also exposed on vocabulary entries: look up 'it' in the
# pipeline's vocab and read its is_stop attribute.
nlp.vocab['it'].is_stop

True

In [32]:
# Print the stop words found in the sample document.
# Each token already carries the stop-word flag (the attribute table earlier in
# this notebook prints `token.is_stop`), so there is no need to look the text up
# in `nlp.vocab` a second time.
for token in document:
    if token.is_stop:
        print(token.text)

I
am
This
is
in


In [33]:
# Print the tokens that are NOT stop words.
# Reads the flag straight from the token rather than re-looking the text up in
# `nlp.vocab`. Punctuation is not a stop word, so tokens such as "." still pass
# this filter; add `and not token.is_punct` if only words are wanted.
for token in document:
    if not token.is_stop:
        print(token.text)

learning
natural
language
processing
.
course
India
