In [1]:
# spaCy and NLTK are two of the most popular NLP libraries.
# NLTK offers a wide variety of algorithms for a single problem,
# whereas spaCy provides only one per problem, chosen as the best available.
# Both spaCy and the en_core_web_sm model package must be installed in the
# kernel's environment, otherwise these imports raise ModuleNotFoundError.
import spacy
import en_core_web_sm

# Load the English pipeline once; later cells parse text by calling sp(...).
sp = en_core_web_sm.load()

ModuleNotFoundError: No module named 'spacy'

In [5]:
# Parse one sentence with the loaded pipeline and list its tokens, one per line.
sentence = sp('Manchester United is looking to sign a forward for $90 million')
for token in sentence:
    print(token.text)

Manchester
United
is
looking
to
sign
a
forward
for
$
90
million


In [6]:
# Every token carries its part-of-speech tag in the .pos_ attribute;
# print each token's text next to its tag.
for token in sentence:
    print(f"{token.text} {token.pos_}")

Manchester PROPN
United PROPN
is AUX
looking VERB
to PART
sign VERB
a DET
forward NOUN
for ADP
$ SYM
90 NUM
million NUM


In [7]:
# A parsed document can also be iterated sentence by sentence via .sents.
document = sp(u'Hello from Stackabuse. The site with the best Python Tutorials. What are you looking for?')
# Use a dedicated loop name here: looping with `sentence` would overwrite the
# parsed `sentence` created in an earlier cell, so re-running the
# part-of-speech cell afterwards would only tag the last sentence of `document`.
for sent in document.sents:
    print(sent)

Hello from Stackabuse.
The site with the best Python Tutorials.
What are you looking for?


In [8]:
# spaCy tokenization: show the parsed text first, then one token per line.
# In the recorded output the quotes and the contraction 're are split off,
# while the abbreviations U.K. and U.S.A. stay whole.
sentence3 = sp('"They\'re leaving U.K. for U.S.A."')
print(sentence3)
print("\n".join(token.text for token in sentence3))

"They're leaving U.K. for U.S.A."
"
They
're
leaving
U.K.
for
U.S.A.
"


In [9]:
# Tokenize a sentence containing a hyphenated word and an email address,
# printing the full text followed by each token on its own line.
sentence4 = sp("Hello, I am non-vegetarian, email me the menu at abc-xyz@gmai.com")
print(sentence4)
token_texts = [token.text for token in sentence4]
print("\n".join(token_texts))

Hello, I am non-vegetarian, email me the menu at abc-xyz@gmai.com
Hello
,
I
am
non
-
vegetarian
,
email
me
the
menu
at
abc-xyz@gmai.com


In [10]:
#The output shows that spaCy recognized the email address
#and kept it as a single token even though it contains a "-".
#The word "non-vegetarian", on the other hand, was split into three tokens: "non", "-" and "vegetarian".

In [11]:
# Noun phrases ("noun chunks") can be extracted too, through the parsed
# document's noun_chunks attribute; print the text of each one.
sentence5 = sp('Latest Rumours: Manchester United is looking to sign Harry Kane for $90 million')
for chunk in sentence5.noun_chunks:
    print(chunk.text)

Manchester United
Harry Kane


In [12]:
# Stemming reduces a word to its root form.
# Two commonly used stemmers in NLTK are the Porter stemmer (used here) and the Snowball stemmer.
import nltk
# Import the one class that is needed instead of `import *`, so it is clear
# where PorterStemmer comes from and no unrelated names leak into the notebook.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
tokens = ['compute', 'computer', 'computed', 'computing']
# Show each word next to the stem the Porter algorithm produces for it.
for token in tokens:
    print(token + ' --> ' + stemmer.stem(token))

compute --> comput
computer --> comput
computed --> comput
computing --> comput


In [13]:
from nltk.stem.snowball import SnowballStemmer

# Stem the same four words again, this time with the English Snowball stemmer.
stemmer = SnowballStemmer(language='english')
tokens = ['compute', 'computer', 'computed', 'computing']
for token in tokens:
    print(f"{token} --> {stemmer.stem(token)}")

compute --> comput
computer --> comput
computed --> comput
computing --> comput


In [14]:
#You can see that all four words have been reduced to "comput", which isn't actually a word at all.
#This is where lemmatization comes in handy.
#Lemmatization reduces a word to its lemma, i.e. its base form as it appears in the dictionary.

In [15]:
# Lemmatize the four words used in the stemming examples: print each token
# followed by the lemma spaCy assigns to it.
sentence6 = sp('compute computer computed computing')
for token in sentence6:
    print(f"{token.text} {token.lemma_}")

compute compute
computer computer
computed compute
computing computing


In [16]:
# Lemmatization maps words in their second or third forms (e.g. "has",
# "written", "released") back to their first-form variants.
sentence7 = sp('A letter has been written, asking him to be released')
for token in sentence7:
    print(f"{token.text}  ===> {token.lemma_}")

A  ===> a
letter  ===> letter
has  ===> have
been  ===> be
written  ===> write
,  ===> ,
asking  ===> ask
him  ===> -PRON-
to  ===> to
be  ===> be
released  ===> release


In [17]:
# Three forms of the verb "do": print each one next to its lemma.
# NOTE(review): this rebinds `sentence6`, which an earlier cell assigned to the
# 'compute ...' example; consider a distinct name if both are needed later.
sentence6 = sp('did done do')
for token in sentence6:
    print(f"{token.text} {token.lemma_}")

did do
done do
do do
