# In this notebook we see how stemming is done and what its drawbacks are

In [1]:
# PorterStemmer provides the .stem() method used throughout this notebook;
# word_tokenize splits a text into tokens before stemming.
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# One stemmer instance, reused by the stemming cells that follow.
ps = PorterStemmer()

In [3]:
# Stem several inflected forms of one verb and print one stem per line

example_words = ["eat","eats","eating","eated"]
stems = [ps.stem(word) for word in example_words]
print("\n".join(stems))
    

eat
eat
eat
eat


In [4]:
# Repeat the experiment with the forms of "play", printing one stem per line
example_words = ["play","playing","played","plays"]
for stem in map(ps.stem, example_words):
    print(stem)

play
play
play
play


In [7]:
# Sample passage used to demonstrate stemming on running text.
# It is tokenized and each token is stemmed further down in this cell.

text = """When I study the letters, messages and mails that I have received and also the personal interactions with the people,
I can clearly see abundant opportunities 
in which every citizen can contribute. I thought of sharing this with you:
My topic of this address will be — “What can I give to my nation?"

In Indian history, our nation has come across a situation, all at a time, 
an ascending economic trajectory, continuously rising foreign exchange reserves,
increasing domestic investment with investors’ confidence rising steadily, global 
successes of Indian managerial and entrepreneurial talents, global recognition of technological competence,
energy of 540 million youth, umbilical connectivities of more than 25 million people of Indian origin in
various parts of the planet and the interest shown by many developed countries to invest in our engineers 
and scientists through setting up of new Research and Development Centres in India.

The distinction between the public and the private sectors and the illusory primacy of one over the other
is vanishing. Also, there is a trend that many young people are opting for creating new enterprises instead
of being mere employees."""

# Break the passage into tokens, then print the Porter stem of each token on its own line.
word_tokens = word_tokenize(text)
print("\n".join(ps.stem(token) for token in word_tokens))

when
i
studi
the
letter
,
messag
and
mail
that
i
have
receiv
and
also
the
person
interact
with
the
peopl
,
i
can
clearli
see
abund
opportun
in
which
everi
citizen
can
contribut
.
i
thought
of
share
thi
with
you
:
my
topic
of
thi
address
will
be
—
“
what
can
i
give
to
my
nation
?
''
in
indian
histori
,
our
nation
ha
come
across
a
situat
,
all
at
a
time
,
an
ascend
econom
trajectori
,
continu
rise
foreign
exchang
reserv
,
increas
domest
invest
with
investor
’
confid
rise
steadili
,
global
success
of
indian
manageri
and
entrepreneuri
talent
,
global
recognit
of
technolog
compet
,
energi
of
540
million
youth
,
umbil
connect
of
more
than
25
million
peopl
of
indian
origin
in
variou
part
of
the
planet
and
the
interest
shown
by
mani
develop
countri
to
invest
in
our
engin
and
scientist
through
set
up
of
new
research
and
develop
centr
in
india
.
the
distinct
between
the
public
and
the
privat
sector
and
the
illusori
primaci
of
one
over
the
other
is
vanish
.
also
,
there
is
a
trend
that
mani
young

# Stemming

In [13]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
# The sample passage as one paragraph string; it is split into sentences below.
para = """When I study the letters, messages and mails that I have received and also the personal interactions 
with the people,I can clearly see abundant opportunities in which every citizen can contribute. 
I thought of sharing this with you:My topic of this address will be — “What can I give to my nation?"
In Indian history, our nation has come across a situation, all at a time, 
an ascending economic trajectory, continuously rising foreign exchange reserves,
increasing domestic investment with investors’ confidence rising steadily, global 
successes of Indian managerial and entrepreneurial talents, global recognition of technological competence,
energy of 540 million youth, umbilical connectivities of more than 25 million people of Indian origin in
various parts of the planet and the interest shown by many developed countries to invest in our engineers 
and scientists through setting up of new Research and Development Centres in India.

The distinction between the public and the private sectors and the illusory primacy of one over the other
is vanishing. Also, there is a trend that many young people are opting for creating new enterprises instead
of being mere employees."""


# Split the paragraph into sentences; the stemming cell further down indexes into this sequence.
sentences = nltk.sent_tokenize(para)

# Stemmer instance for the sentence-level stemming done later in the notebook.
ps = PorterStemmer()


## What are stopwords? Stopwords are very common words in a corpus (e.g. the, of, them, we) that carry little meaning on their own and are usually removed before further processing

In [9]:
# Look at NLTK's built-in English stopword list
english_stopwords = stopwords.words('english')
english_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [19]:
# stopwords.words('german')   # uncomment to check the stopwords in German (NLTK language names are lowercase)

In [31]:
# Stemming with stopword removal
ps = PorterStemmer()

# Build the stopword set once; the original rebuilt it for every single token.
english_stopwords = set(stopwords.words('english'))

# Start again from `para` rather than from `sentences`, so that re-running this
# cell never stems text that an earlier run already stemmed.
stemmed_sentences = []
for sentence in nltk.sent_tokenize(para):
    # The stopword list is all lower case, so compare in lower case; otherwise
    # capitalised stopwords such as "When" or "The" are not filtered out.
    words = [
        ps.stem(word)
        for word in nltk.word_tokenize(sentence)
        if word.lower() not in english_stopwords
    ]
    stemmed_sentences.append(' '.join(words))

# As before, `sentences` ends up holding the stemmed text, one string per sentence.
sentences = stemmed_sentences

# Show the result for the last sentence: its stemmed tokens, then the re-joined string.
print(words)
print(sentences[-1])

['also', ',', 'trend', 'mani', 'young', 'peopl', 'opt', 'creat', 'new', 'enterpri', 'instead', 'mere', 'employ', '.']
also , trend mani young peopl opt creat new enterpri instead mere employ .


In [None]:
## The problem with stemming is that many stemmed words are not real words, for example:
# "people" becomes "peopl" and "create" becomes "creat".
# Many other words are likewise reduced to meaningless stems; lemmatization solves this problem.