### This ipynb contains the usage of 'Porter Stemmer', 'Regexp Stemmer' and 'Snowball Stemmer' for Stemming tasks
### This ipynb contains the usage of 'WordNetLemmatizer' for Lemmatization tasks

In [1]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

  from pandas.core import (
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hardiksharma/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hardiksharma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 1. Stemming 

#### Cuts off word endings to get the root. Rule-based chopping (heuristic). Might produce non-words. Faster (no lookup needed). Less accurate

In [2]:
words = [
    "congratulations","running", "eat", "jumping", "talk", "flying", "build", "sleeping", "drive", "dancing", "write",
    "reading", "sing", "working", "think", "going", "bring", "buying", "catch", "climbing", "cry",
    "drawing", "fall", "feeling", "fighting", "find", "giving", "grow", "helping", "hitting", "hold",
    "hoping", "kicking", "knowing", "learning", "leaving", "listen", "losing", "making", "meeting", "open",
    "painting", "playing", "pull", "pushing", "riding", "running", "saying", "seeing", "sitting", "starting"
]

### 1.1 PorterStemmer

In [3]:
from nltk.stem import PorterStemmer
stemming = PorterStemmer()

In [4]:
for word in words:
    print(word + "------>" + stemming.stem(word))

congratulations------>congratul
running------>run
eat------>eat
jumping------>jump
talk------>talk
flying------>fli
build------>build
sleeping------>sleep
drive------>drive
dancing------>danc
write------>write
reading------>read
sing------>sing
working------>work
think------>think
going------>go
bring------>bring
buying------>buy
catch------>catch
climbing------>climb
cry------>cri
drawing------>draw
fall------>fall
feeling------>feel
fighting------>fight
find------>find
giving------>give
grow------>grow
helping------>help
hitting------>hit
hold------>hold
hoping------>hope
kicking------>kick
knowing------>know
learning------>learn
leaving------>leav
listen------>listen
losing------>lose
making------>make
meeting------>meet
open------>open
painting------>paint
playing------>play
pull------>pull
pushing------>push
riding------>ride
running------>run
saying------>say
seeing------>see
sitting------>sit
starting------>start


### 1.2 RegexpStemmer Class

In [5]:
from nltk.stem import RegexpStemmer
reg_stemmer = RegexpStemmer('ing$|s$|e$|able$', min=4)

In [6]:
for word in words:
    print(word + "------>" + reg_stemmer.stem(word))

congratulations------>congratulation
running------>runn
eat------>eat
jumping------>jump
talk------>talk
flying------>fly
build------>build
sleeping------>sleep
drive------>driv
dancing------>danc
write------>writ
reading------>read
sing------>s
working------>work
think------>think
going------>go
bring------>br
buying------>buy
catch------>catch
climbing------>climb
cry------>cry
drawing------>draw
fall------>fall
feeling------>feel
fighting------>fight
find------>find
giving------>giv
grow------>grow
helping------>help
hitting------>hitt
hold------>hold
hoping------>hop
kicking------>kick
knowing------>know
learning------>learn
leaving------>leav
listen------>listen
losing------>los
making------>mak
meeting------>meet
open------>open
painting------>paint
playing------>play
pull------>pull
pushing------>push
riding------>rid
running------>runn
saying------>say
seeing------>see
sitting------>sitt
starting------>start


### 1.3 Snowball Stemmer

In [7]:
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer('english')

In [8]:
for word in words:
    print(word + "------>" + snowball_stemmer.stem(word))

congratulations------>congratul
running------>run
eat------>eat
jumping------>jump
talk------>talk
flying------>fli
build------>build
sleeping------>sleep
drive------>drive
dancing------>danc
write------>write
reading------>read
sing------>sing
working------>work
think------>think
going------>go
bring------>bring
buying------>buy
catch------>catch
climbing------>climb
cry------>cri
drawing------>draw
fall------>fall
feeling------>feel
fighting------>fight
find------>find
giving------>give
grow------>grow
helping------>help
hitting------>hit
hold------>hold
hoping------>hope
kicking------>kick
knowing------>know
learning------>learn
leaving------>leav
listen------>listen
losing------>lose
making------>make
meeting------>meet
open------>open
painting------>paint
playing------>play
pull------>pull
pushing------>push
riding------>ride
running------>run
saying------>say
seeing------>see
sitting------>sit
starting------>start


## 2. Lemmatization

#### Converts a word to its base dictionary form. Dictionary and grammar-based analysis. Always produces real words. Slower (uses vocabulary + POS tagging). More accurate and meaningful

### 2.1 WordNetLemmatizer

In [9]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [10]:
'''
Noun = n
Verb = v
Adjective = a
adverb = r

by defaul it is 'n'
'''

"\nNoun = n\nVerb = v\nAdjective = a\nadverb = r\n\nby defaul it is 'n'\n"

In [11]:
print(lemmatizer.lemmatize("going"))
print(lemmatizer.lemmatize("going",pos='v'))

going
go


In [12]:
for word in words:
    print(word + "------>" + lemmatizer.lemmatize(word,pos='v')) # Lemmatization depends on the provided POS

congratulations------>congratulations
running------>run
eat------>eat
jumping------>jump
talk------>talk
flying------>fly
build------>build
sleeping------>sleep
drive------>drive
dancing------>dance
write------>write
reading------>read
sing------>sing
working------>work
think------>think
going------>go
bring------>bring
buying------>buy
catch------>catch
climbing------>climb
cry------>cry
drawing------>draw
fall------>fall
feeling------>feel
fighting------>fight
find------>find
giving------>give
grow------>grow
helping------>help
hitting------>hit
hold------>hold
hoping------>hop
kicking------>kick
knowing------>know
learning------>learn
leaving------>leave
listen------>listen
losing------>lose
making------>make
meeting------>meet
open------>open
painting------>paint
playing------>play
pull------>pull
pushing------>push
riding------>rid
running------>run
saying------>say
seeing------>see
sitting------>sit
starting------>start


## 3. Handling Stopwords

In [13]:
paragraph = """It is in the morning when the sun is up that he usually goes out to the park. 
There, he walks around for a while, and then he sits on a bench, looking at the people who are passing by. 
Although he doesn’t talk to anyone, he feels as if he is a part of what is going on. 
It is not about what he does, but about how he feels when he is there. 
And so, day after day, he comes back, sits there, and watches the world go by."""

In [14]:
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

In [15]:
stopwords.words('english') #Stopwords in English. You can try for different languages

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [16]:
sentences = sent_tokenize(paragraph)

In [17]:
for i in range(len(sentences)):
    words = word_tokenize(sentences[i])
    words = [lemmatizer.lemmatize(word.lower()) for word in words if word not in set(stopwords.words('english'))]
    sentences[i] = " ".join(words)

In [18]:
sentences

['it morning sun usually go park .',
 'there , walk around , sits bench , looking people passing .',
 'although ’ talk anyone , feel part going .',
 'it , feel .',
 'and , day day , come back , sits , watch world go .']