In [None]:
# Notebook display setting: with "all", IPython echoes the value of every
# top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Stemming**
=====

## Porter Stemmer

In [None]:
from nltk.stem import PorterStemmer
# Shared Porter stemmer instance; the cells below call s.stem(...) on it.
s = PorterStemmer()

In [None]:
# Inflected and derived forms of "fish", stemmed one per line.
for word in ('Fishing', 'Fish', 'Fisher', 'Fishes', 'Fished'):
    print(s.stem(word))

fish
fish
fisher
fish
fish


In [None]:
# Forms of "have", including the irregular past tense.
have_forms = ['Having', 'Have', 'Had']
print(*map(s.stem, have_forms), sep='\n')

have
have
had


In [None]:
# Two related words, stemmed one per line for comparison.
for word in ('European', 'Europe'):
    print(s.stem(word))

european
europ


In [None]:
# Two unrelated words with similar spelling, stemmed one per line.
print("\n".join(s.stem(word) for word in ('policy', 'police')))

polici
polic


In [None]:
# Singular and irregular plural of the same noun.
for word in ('matrix', 'matrices'):
    stem = s.stem(word)
    print(stem)

matrix
matric


In [None]:
# Words sharing the "automat-" root, stemmed one per line.
automat_family = ['automation', 'automatic', 'automate', 'automat']
for word in automat_family:
    print(s.stem(word))

autom
automat
autom
automat


In [None]:
# Two short irregular verb forms, stemmed one per line.
print(*(s.stem(word) for word in ('was', 'saw')), sep='\n')

wa
saw


In [None]:
# Regular inflections of "wait", stemmed one per line.
e_words = ["wait", "waiting", "waited", "waits"]
stems = [s.stem(word) for word in e_words]
print("\n".join(stems))

wait
wait
wait
wait


In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the "punkt" tokenizer data before tokenizing.
nltk.download('punkt')

text = "studies studying cries cry"
# The cell imports word_tokenize, so call it directly rather than reaching
# for the same function through the nltk namespace.
tokenization = word_tokenize(text)
for token in tokenization:
    print("Stemming for {} is {}".format(token, s.stem(token)))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Stemming for studies is studi
Stemming for studying is studi
Stemming for cries is cri
Stemming for cry is cri


## Snowball Stemmer

This is very similar to the Porter stemmer (a little faster, though, and a little more aggressive while stemming). Snowball itself is a small language designed for creating stemming algorithms for use in Information Retrieval; NLTK's `SnowballStemmer` exposes the stemmers written in it.

In [2]:
from nltk.stem.snowball import SnowballStemmer
# Shared Snowball stemmer for English; the cells below call stemmer.stem(...).
stemmer = SnowballStemmer("english")

In [3]:
# Words built on "author", stemmed one per line.
author_family = ["authorize", "authorized", "authority", "authorization", "authorizing"]
for word in author_family:
    print(stemmer.stem(word))

author
author
author
author
author


In [4]:
# Irregular plural and singular of the same noun.
print(*map(stemmer.stem, ("matrices", "matrix")), sep="\n")

matric
matrix


In [5]:
# Irregular past tense next to a regular inflection of "run".
for word in ("ran", "runs"):
    print(stemmer.stem(word))

ran
run


In [6]:
# Two unrelated words with similar spelling, stemmed one per line.
print("\n".join(stemmer.stem(word) for word in ("police", "policy")))

polic
polici


In [7]:
# Two related words, stemmed one per line for comparison.
for word in ("european", "europe"):
    stem = stemmer.stem(word)
    print(stem)

european
europ


In [8]:
# Two inflections of "stock", stemmed one per line.
stock_forms = ["stocking", "stocks"]
print(*(stemmer.stem(word) for word in stock_forms), sep="\n")

stock
stock


In [9]:
# Stem a single -ing form with the Snowball stemmer.
print(stemmer.stem("caring"))


care


In [10]:
# Stem a single prefixed word ("mis-" + "inform") with the Snowball stemmer.
print(stemmer.stem("misinform"))

misinform


**Lemmatization**
=====

Lemmatisation is similar to stemming, as it produces a normalised version of the input word.

The output is a lemma, i.e. a proper dictionary word (unlike stemming, whose output need not be a real word).

The input word is lemmatised according to its Part-of-Speech (POS) tag, i.e. verb, noun, etc.

In [None]:
# WordNet is distributed as NLTK data and has to be downloaded once.
# Command-line alternative: python -m nltk.downloader wordnet
import nltk
from nltk.corpus import wordnet

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Look up the WordNet synsets for the word "dog" and show them.
syns = wordnet.synsets("dog")
print(syns)

[Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'), Synset('frank.n.02'), Synset('pawl.n.01'), Synset('andiron.n.01'), Synset('chase.v.01')]


In [None]:
# Collect the name of every lemma found in the synsets of "active", together
# with the first antonym of each lemma that has at least one.
synonyms = []
antonyms = []

for syn in wordnet.synsets("active"):
    # The loop variable is named `lemma`, not `l`: at notebook scope it is a
    # global, and `l` is the name a later cell binds to the WordNetLemmatizer.
    # Re-running this cell after that one would otherwise overwrite the
    # lemmatizer and break every l.lemmatize(...) call.
    for lemma in syn.lemmas():
        synonyms.append(lemma.name())
        lemma_antonyms = lemma.antonyms()  # look up once instead of twice
        if lemma_antonyms:
            antonyms.append(lemma_antonyms[0].name())

# Sets drop the duplicates that arise when a name appears in several synsets.
print(set(synonyms))
print(set(antonyms))

{'dynamic', 'combat-ready', 'fighting', 'alive', 'active_voice', 'active_agent', 'participating', 'active'}
{'extinct', 'passive', 'inactive', 'stative', 'quiet', 'passive_voice', 'dormant'}


In [None]:
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance; the cells below call l.lemmatize(...) on it.
l = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# Forms of "have", each lemmatized as a verb.
for word in ('having', 'have', 'had'):
    print(l.lemmatize(word, pos='v'))

have
have
have


In [None]:
# (word, part-of-speech tag) pairs: "fisher" is looked up as a noun,
# the remaining forms as verbs.
tagged_words = [
    ('fishing', 'v'),
    ('fish', 'v'),
    ('fisher', 'n'),
    ('fishes', 'v'),
    ('fished', 'v'),
]
for word, tag in tagged_words:
    print(l.lemmatize(word, pos=tag))

fish
fish
fisher
fish
fish


In [None]:
# Irregular forms of "be", each lemmatized as a verb.
be_forms = ['am', 'is', 'was']
print("\n".join(l.lemmatize(word, pos='v') for word in be_forms))

be
be
be


In [None]:
# Reuses `nltk` and the "punkt" tokenizer data set up by earlier cells.
# No `pos` argument is passed, so the lemmatizer's default part of speech
# applies (documented default: noun, 'n').
text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)
for token in tokenization:
    print("Lemma for {} is {}".format(token, l.lemmatize(token)))

Lemma for studies is study
Lemma for studying is studying
Lemma for cries is cry
Lemma for cry is cry
