# Stemming

Stemming is a text preprocessing technique used in natural language processing (NLP) to reduce words to their root or base form, typically by stripping suffixes with heuristic rules. The goal of stemming is to simplify and standardize words, which helps improve the performance of information retrieval, text classification, and other NLP tasks. By transforming words to their stems, NLP models can treat different forms of the same word as a single entity, reducing the complexity of the text data. Note that a stem is not guaranteed to be a valid dictionary word: in the examples below, the Porter stemmer turns "lazy" into "lazi" and "beautifully" into "beauti".

In [7]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [8]:
 corpus = "The quick brown fox jumps over the lazy dog. It is running swiftly through the green grass, while the birds are singing beautifully in the trees. The sun is shining brightly, making everything look more vibrant. Children are playing and laughing, creating a joyful atmosphere in the park. Everyone seems to be enjoying the lovely day."

In [11]:
# Fetch the 'punkt' resource before tokenizing (word_tokenize presumably
# depends on it); the [nltk_data] log lines in the cell output come from
# this call.
# NOTE(review): newer NLTK releases may look up 'punkt_tab' instead of
# 'punkt' -- confirm against the installed version.
nltk.download('punkt')
# Split the corpus into tokens; as the printed list shows, punctuation marks
# ('.', ',') come out as separate tokens.
words = word_tokenize(corpus)

print(words)

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.', 'It', 'is', 'running', 'swiftly', 'through', 'the', 'green', 'grass', ',', 'while', 'the', 'birds', 'are', 'singing', 'beautifully', 'in', 'the', 'trees', '.', 'The', 'sun', 'is', 'shining', 'brightly', ',', 'making', 'everything', 'look', 'more', 'vibrant', '.', 'Children', 'are', 'playing', 'and', 'laughing', ',', 'creating', 'a', 'joyful', 'atmosphere', 'in', 'the', 'park', '.', 'Everyone', 'seems', 'to', 'be', 'enjoying', 'the', 'lovely', 'day', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### PorterStemmer

In [12]:
# Build one Porter stemmer and run every token through it.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, words))

# The printed stems are lower-cased and are not always dictionary words
# (e.g. 'lazy' -> 'lazi', 'beautifully' -> 'beauti').
print(stemmed_words)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazi', 'dog', '.', 'it', 'is', 'run', 'swiftli', 'through', 'the', 'green', 'grass', ',', 'while', 'the', 'bird', 'are', 'sing', 'beauti', 'in', 'the', 'tree', '.', 'the', 'sun', 'is', 'shine', 'brightli', ',', 'make', 'everyth', 'look', 'more', 'vibrant', '.', 'children', 'are', 'play', 'and', 'laugh', ',', 'creat', 'a', 'joy', 'atmospher', 'in', 'the', 'park', '.', 'everyon', 'seem', 'to', 'be', 'enjoy', 'the', 'love', 'day', '.']


### RegexpStemmer

In [13]:
from nltk.stem import RegexpStemmer

# Strip a trailing 'ing', 's', 'e' or 'able'. With min=4, the short tokens in
# the printed result ('is', 'are', 'the') come through unchanged even though
# they end in one of those suffixes.
reg_stemmer = RegexpStemmer(regexp='ing$|s$|e$|able$', min=4)

In [16]:
# Apply the regex-based stemmer to each token; unlike the Porter output,
# the original casing is kept ('The' stays 'The').
regexp_words = list(map(reg_stemmer.stem, words))
print(regexp_words)

['The', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '.', 'It', 'is', 'runn', 'swiftly', 'through', 'the', 'green', 'gras', ',', 'whil', 'the', 'bird', 'are', 'sing', 'beautifully', 'in', 'the', 'tree', '.', 'The', 'sun', 'is', 'shin', 'brightly', ',', 'mak', 'everyth', 'look', 'mor', 'vibrant', '.', 'Children', 'are', 'play', 'and', 'laugh', ',', 'creat', 'a', 'joyful', 'atmospher', 'in', 'the', 'park', '.', 'Everyon', 'seem', 'to', 'be', 'enjoy', 'the', 'lovely', 'day', '.']


### SnowballStemmer

In [17]:
from nltk.stem import SnowballStemmer

# Snowball stemmer configured for English.
snowball = SnowballStemmer(language='english')

In [18]:
# Stem every token with Snowball. Compared with the Porter output, it yields
# 'swift' and 'bright' where Porter produced 'swiftli' and 'brightli'.
snowball_words = list(map(snowball.stem, words))
print(snowball_words)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazi', 'dog', '.', 'it', 'is', 'run', 'swift', 'through', 'the', 'green', 'grass', ',', 'while', 'the', 'bird', 'are', 'sing', 'beauti', 'in', 'the', 'tree', '.', 'the', 'sun', 'is', 'shine', 'bright', ',', 'make', 'everyth', 'look', 'more', 'vibrant', '.', 'children', 'are', 'play', 'and', 'laugh', ',', 'creat', 'a', 'joy', 'atmospher', 'in', 'the', 'park', '.', 'everyon', 'seem', 'to', 'be', 'enjoy', 'the', 'love', 'day', '.']
