# Stemming

In [1]:
# There are several Stemming algorithms available. We will use PorterStemmer that comes with nltk library.
from nltk.stem import PorterStemmer

In [19]:
# Instantiate the Porter stemmer once and reuse it for every example below.
stemmer = PorterStemmer()

# Stem a handful of words and print one stem per line.
# The note next to each word describes the stem shown in the cell output.
print('\n'.join(stemmer.stem(word) for word in (
    'walking',      # -> 'walk'
    'Announcing',   # -> 'announc': lowercased, and not a real word;
                    #    this shows that stemming is a crude approach
    'Mice',         # -> 'mice': an irregular plural with no trailing 's' is only lowercased,
                    #    so lemmatization is the better choice for such words
    'running',      # -> 'run'
    'was',          # -> 'wa', which is not a real word
    'unnecessary',  # -> 'unnecessari': the trailing 'y' is rewritten to 'i'
    'wary',         # -> 'wari': same trailing-'y' behaviour
    'indices',      # -> 'indic': the 'es' is stripped, again leaving a non-word
)))

walk
announc
mice
run
wa
unnecessari
wari
indic


# Lemmatization

In [1]:
# Lemmatization algorithms ship with several libraries (nltk, spaCy, ...); here we use nltk's.
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

import nltk

# WordNetLemmatizer looks words up in the WordNet corpus, so that data must be
# installed or the first lemmatize() call fails with LookupError on a fresh machine.
# nltk.download() leaves packages that are already installed and up to date alone.
nltk.download('wordnet')
# Lemmatizing raised an error asking for the 'omw-1.4' data as well,
# so it is fetched with the nltk downloader too.
nltk.download('omw-1.4')

In [3]:
# Build one lemmatizer and reuse it in the cells below.
lemmatizer = WordNetLemmatizer()

# Noun is the part of speech lemmatize() assumes when none is given; spelling it
# out yields the same result: 'mouse', the root word of 'mice'.
print(lemmatizer.lemmatize('mice', pos = wordnet.NOUN))

mouse


In [21]:
# The same word can be a noun, a verb or an adjective depending on its part of speech.
# Without a pos hint, 'going' is treated as a noun and comes back unchanged
# instead of being reduced to 'go'.
print(lemmatizer.lemmatize('going'))

# Telling the lemmatizer the part of speech gives the desired root words.
print('\n'.join(lemmatizer.lemmatize(word, pos = word_pos) for word, word_pos in (
    ('going', wordnet.VERB),    # -> 'go'
    ('walking', wordnet.VERB),  # -> 'walk'
    ('better', wordnet.ADJ),    # -> 'good', since 'better' is an adjective
)))

going
go
walk
good


In [23]:
# Sometimes we need the POS tags in order to determine whether a word is a noun, adjective,
# conjunction or another part of speech. We can use nltk's pos_tag to do the job.
# We have to download a POS tagger called 'averaged_perceptron_tagger' first.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Chaitra.b.c\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [27]:
# Let's use a sentence containing a word ('following') that can be a noun or a verb
# depending on how it is used in the sentence.
sentence = 'Andrew Tate has a world-wide following'

# Split the sentence into tokens on whitespace
tokens = sentence.split()
print(tokens)

# Tag the tokens using nltk's part-of-speech tagger;
# the result is a list of (token, tag) pairs, displayed as the cell output.
tagged_tokens = nltk.pos_tag(tokens)
tagged_tokens

['Andrew', 'Tate', 'has', 'a', 'world-wide', 'following']


[('Andrew', 'NNP'),
 ('Tate', 'NNP'),
 ('has', 'VBZ'),
 ('a', 'DT'),
 ('world-wide', 'JJ'),
 ('following', 'NN')]

POS tag list:

CC	coordinating conjunction
CD	cardinal number
DT	determiner
EX	existential there (like: "there is" ... think of it like "there exists")
FW	foreign word
IN	preposition/subordinating conjunction
JJ	adjective	'big'
JJR	adjective, comparative	'bigger'
JJS	adjective, superlative	'biggest'
LS	list marker	1)
MD	modal	could, will
NN	noun, singular	'desk'
NNS	noun plural	'desks'
NNP	proper noun, singular	'Harrison'
NNPS	proper noun, plural	'Americans'
PDT	predeterminer	'all the kids'
POS	possessive ending	parent's
PRP	personal pronoun	I, he, she
PRP$	possessive pronoun	my, his, hers
RB	adverb	very, silently
RBR	adverb, comparative	better
RBS	adverb, superlative	best
RP	particle	give up
TO	to	go 'to' the store.
UH	interjection	errrrrrrrm
VB	verb, base form	take
VBD	verb, past tense	took
VBG	verb, gerund/present participle	taking
VBN	verb, past participle	taken
VBP	verb, singular present, non-3rd person	take
VBZ	verb, 3rd person sing. present	takes
WDT	wh-determiner	which
WP	wh-pronoun	who, what
WP$	possessive wh-pronoun	whose
WRB	wh-adverb	where, when

In [36]:
# Map a tagger tag to a WordNet part of speech using a small prefix table
# that is scanned in order.
def get_wordnet_tags(token_tags):
    """Translate a POS tag string into the matching WordNet POS constant.

    Args:
        token_tags: tag string for one token, e.g. 'JJ', 'VBZ', 'RB', 'NN'.

    Returns:
        wordnet.ADJ for tags starting with 'J' or 'D', wordnet.VERB for tags
        starting with 'V', wordnet.ADV for tags starting with 'R', and
        wordnet.NOUN for everything else (including an empty string).
    """
    prefix_table = (
        (('J', 'D'), wordnet.ADJ),   # adjectives; 'D' tags (e.g. DT) are grouped here too
        (('V',), wordnet.VERB),
        (('R',), wordnet.ADV),
    )
    for prefixes, wordnet_pos in prefix_table:
        if token_tags.startswith(prefixes):
            return wordnet_pos
    # Anything not matched above falls back to noun.
    return wordnet.NOUN

In [41]:
# Lemmatize the tokens one by one: each (token, tag) pair from the tagger is
# lemmatized with the WordNet part of speech that its tag maps to.
root_word_list = [
    lemmatizer.lemmatize(token, pos = get_wordnet_tags(tag))
    for token, tag in tagged_tokens
]

# Show the input sentence next to the sentence rebuilt from the root words.
print('original sentence : ',sentence)
print('Lemmatized sentence : ',' '.join(root_word_list))

    

original sentence :  Andrew Tate has a world-wide following
Lemmatized sentence :  Andrew Tate have a world-wide following


In [28]:
# List every attribute of the wordnet object; the POS constants
# (ADJ, ADJ_SAT, ADV, NOUN, VERB) appear at the top of the output.
dir(wordnet)

['ADJ',
 'ADJ_SAT',
 'ADV',
 'MORPHOLOGICAL_SUBSTITUTIONS',
 'NOUN',
 'VERB',
 '_ENCODING',
 '_FILEMAP',
 '_FILES',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_citation',
 '_compute_max_depth',
 '_data_file',
 '_data_file_map',
 '_encoding',
 '_exception_map',
 '_fileids',
 '_get_root',
 '_key_count_file',
 '_key_synset_file',
 '_lang_data',
 '_lemma_pos_offset_map',
 '_lexnames',
 '_license',
 '_load_exception_map',
 '_load_lang_data',
 '_load_lemma_pos_offset_map',
 '_max_depth',
 '_morphy',
 '_omw_reader',
 '_pos_names',
 '_pos_numbers',
 '_readme',
 '_root',
 '_synset_from_pos_and_line',
 '_synset_from_pos_and_offset',
 '_synset_offset_cache',
 '_tagset',
 '_