## Stemming Vs Lemmatization

In [10]:
# import necessary libraries
import nltk

# NOTE(review): the resource names below are the classic ones. Newer NLTK
# releases may ask for 'punkt_tab' / 'averaged_perceptron_tagger_eng' instead;
# if a LookupError names one of those, download it the same way — confirm
# against the installed NLTK version.

# tokenization
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# POS tagging (pos_tag loads the averaged perceptron tagger model at call time,
# so it must be downloaded for the notebook to run on a fresh machine)
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag

# to map pos tags to wordnet tags (WordNetLemmatizer also reads this corpus,
# so it must be present before the lemmatization cell runs)
nltk.download('wordnet')
from nltk.corpus import wordnet

# stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stemming
from nltk.stem import PorterStemmer

# Lemmatization
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harika\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harika\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Sample sentence used as the shared input for both the stemming and the
# lemmatization cells below, so their outputs can be compared side by side.
text = "He glanced up from his computer when she came into his office"
text  # bare last expression: echo the sentence as the cell output

'He glanced up from his computer when she came into his office'

### Stemming

In [12]:
# Instantiate PorterStemmer()
stemmer = PorterStemmer()

# Build the stopword set once, outside the loop: stopwords.words('english')
# re-reads the word list on every call, and a set gives O(1) membership tests
# instead of scanning a list for every token.
stop_words = set(stopwords.words('english'))

tokens = word_tokenize(text)

# Stem every token that is not a stopword (the stopword check is
# case-insensitive; the stemmer receives the original token).
stem = [stemmer.stem(ele) for ele in tokens if ele.lower() not in stop_words]

### Lemmatization

In [13]:
# Lemmatization
# POS tagger dictionary: first letter of the tag returned by pos_tag
# -> the matching WordNet part-of-speech constant
pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}

#Instantiate WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Build the stopword set once, outside the loop: stopwords.words('english')
# re-reads the word list on every call, and a set gives O(1) membership tests.
stop_words = set(stopwords.words('english'))

lemma = []
# Tag the full sentence (stopwords included) so the tagger sees complete context.
pos = pos_tag(word_tokenize(text))
for ele, tag in pos:
    # Drop stopwords first; no need to map a POS for tokens we discard.
    if ele.lower() in stop_words:
        continue
    # Separate name for the WordNet POS so the loop variable `tag` is not rebound.
    wn_tag = pos_dict.get(tag[0])
    # Tags outside pos_dict have no WordNet equivalent: keep the token unchanged.
    lemma.append(wordnet_lemmatizer.lemmatize(ele, wn_tag) if wn_tag else ele)

In [15]:
# Show the original sentence next to the stemmed and lemmatized token lists
# so the two normalisation strategies can be compared directly.
for label, value in (("Text:", text), ("Stem:", stem), ("Lemma:", lemma)):
    print(label, value)

Text: He glanced up from his computer when she came into his office
Stem: ['glanc', 'comput', 'came', 'offic']
Lemma: ['glance', 'computer', 'come', 'office']
