# Lancaster Stemmer

Algorithm (simplified: strips a single suffix from a fixed table; not the full Lancaster rule set)

In [12]:
def lancaster_stem(word, min_stem_length=2):
    """Strip one known suffix from *word* (simplified Lancaster-style stemmer).

    The suffix rules are tried in the order listed and the first suffix that
    matches the end of the word decides the result; at most one suffix is
    ever removed.

    Args:
        word: The word to stem.
        min_stem_length: Minimum number of characters that must remain after
            the suffix is cut off (before the replacement is appended). When
            the matching rule would leave fewer, the word is returned
            unchanged, so short words such as "fly" or "ing" are not reduced
            to "f" or "". Pass 0 to disable the check.

    Returns:
        The stemmed word, or *word* unchanged when no rule applies or the
        remaining stem would be too short.
    """
    # Order matters: "ies" is listed before "es" so that "flies" becomes
    # "fly" rather than "fli".
    rules = (
        ("ing", ""), ("ly", ""), ("ed", ""), ("ious", ""), ("ies", "y"),
        ("ive", ""), ("es", ""), ("ment", ""), ("er", ""), ("ion", ""),
    )
    for suffix, replacement in rules:
        if not word.endswith(suffix):
            continue
        stem = word[:-len(suffix)]
        if len(stem) < min_stem_length:
            # Cutting the suffix would leave (almost) nothing of the word.
            return word
        return stem + replacement
    return word

# Demo input. A set has no defined iteration order, so the order of the
# stems printed below is arbitrary and can differ between interpreter runs.
words = {"running", "happily", "played", "various", "flies", "giving"}
print(list(map(lancaster_stem, words)))


['var', 'runn', 'giv', 'play', 'happi', 'fly']


Library Implementation

In [13]:
from nltk.stem import LancasterStemmer

# NLTK's Lancaster stemmer applied to the same demo words.
# The set is unordered, so the printed order is arbitrary.
lancaster = LancasterStemmer()
words = {"running", "happily", "played", "various", "flies", "giving"}
print(list(map(lancaster.stem, words)))


['vary', 'run', 'giv', 'play', 'happy', 'fli']


# Porter Stemmer

Algorithm (simplified: strips a single suffix from a fixed table; not the full Porter algorithm)

In [14]:
def porter_stem(word, min_stem_length=2):
    """Strip one known suffix from *word* (simplified Porter-style stemmer).

    The suffix rules are tried in the order listed and the first suffix that
    matches the end of the word decides the result; at most one suffix is
    ever removed.

    Args:
        word: The word to stem.
        min_stem_length: Minimum number of characters that must remain after
            the suffix is cut off (before the replacement is appended). When
            the matching rule would leave fewer, the word is returned
            unchanged, so short words such as "red" or "nation" are not
            reduced to "r" or "nate". Pass 0 to disable the check.

    Returns:
        The stemmed word, or *word* unchanged when no rule applies or the
        remaining stem would be too short.
    """
    # Order matters: "ies" is listed before "es" so that "flies" becomes
    # "fly" rather than "fli".
    rules = (
        ("ing", ""), ("ly", ""), ("ed", ""), ("ous", ""), ("ies", "y"),
        ("ive", ""), ("es", ""), ("ment", ""), ("er", ""), ("ation", "ate"),
    )
    for suffix, replacement in rules:
        if not word.endswith(suffix):
            continue
        stem = word[:-len(suffix)]
        if len(stem) < min_stem_length:
            # Cutting the suffix would leave (almost) nothing of the word.
            return word
        return stem + replacement
    return word

# Demo input. A set has no defined iteration order, so the order of the
# stems printed below is arbitrary and can differ between interpreter runs.
words = {"running", "happily", "played", "various", "flies", "giving"}
print(list(map(porter_stem, words)))


['vari', 'runn', 'giv', 'play', 'happi', 'fly']


Library Implementation

In [15]:
from nltk.stem import PorterStemmer

# NLTK's Porter stemmer applied to the same demo words.
# The set is unordered, so the printed order is arbitrary.
porter = PorterStemmer()
words = {"running", "happily", "played", "various", "flies", "giving"}
print(list(map(porter.stem, words)))


['variou', 'run', 'give', 'play', 'happili', 'fli']


# Regular Expression Based Stemmer

Algorithm

In [16]:
import re

def regex_stem(word):
    """Return *word* with one trailing suffix from a fixed list removed.

    The pattern is anchored at the end of the word. Because the search
    takes the leftmost match, the longest listed suffix that fits is the
    one removed (e.g. "flies" loses "ies", giving "fl"). Words with no
    listed suffix are returned unchanged.
    """
    trailing_suffix = re.compile(r"(ing|ly|ed|ous|ies|ive|es|ment|er|ation)$")
    return trailing_suffix.sub("", word)

# Demo input. A set has no defined iteration order, so the order of the
# stems printed below is arbitrary and can differ between interpreter runs.
words = {"running", "happily", "played", "various", "flies", "giving"}
print(list(map(regex_stem, words)))


['vari', 'runn', 'giv', 'play', 'happi', 'fl']


Library Implementation

In [17]:
from nltk.stem import RegexpStemmer

# NLTK's regular-expression stemmer, configured with the same suffix
# pattern as regex_stem above, applied to the same demo words.
# The set is unordered, so the printed order is arbitrary.
regexp = RegexpStemmer(r"(ing|ly|ed|ous|ies|ive|es|ment|er|ation)$")
words = {"running", "happily", "played", "various", "flies", "giving"}
print(list(map(regexp.stem, words)))


['vari', 'runn', 'giv', 'play', 'happi', 'fl']


# Lemmatizer

Algorithm (simplified: dictionary lookup in a small hand-written table)

In [18]:
def simple_lemmatizer(word):
    """Look up *word* in a small hand-written table of lemmas.

    Returns the listed lemma when the word is in the table; any other word
    is returned unchanged.
    """
    known_lemmas = {
        "running": "run",
        "mice": "mouse",
        "feet": "foot",
        "better": "good",
        "flies": "fly",
        "giving": "give",
        "various": "various",
    }
    if word in known_lemmas:
        return known_lemmas[word]
    return word

# Demo input. A set has no defined iteration order, so the order of the
# lemmas printed below is arbitrary and can differ between interpreter runs.
words = {"running", "happily", "played", "various", "flies", "giving"}
print(list(map(simple_lemmatizer, words)))


['various', 'run', 'give', 'played', 'happily', 'fly']


Library Implementation

In [19]:
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the NLTK data packages for this notebook; the log output below
# shows both were already up to date when this cell ran.
nltk.download('wordnet')
# NOTE(review): no visible cell calls a POS tagger, so this download looks
# unused here -- confirm before removing.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [20]:
# NLTK's WordNet lemmatizer applied to the same demo words.
# pos="v" asks for each word to be lemmatized as a verb.
# The set is unordered, so the printed order is arbitrary.
lemmatizer = WordNetLemmatizer()
words = {"running", "happily", "played", "various", "flies", "giving"}
print([
    lemmatizer.lemmatize(token, pos="v")
    for token in words
])


['various', 'run', 'give', 'play', 'happily', 'fly']
