## Stemming - Lemmatizer
About stemming, see: https://en.wikipedia.org/wiki/Stemming

See also: 
https://www.nltk.org/<br/>
https://pypi.python.org/pypi/stemming/1.0 ![image.png](attachment:image.png)<br/>
https://pythonspot.com/category/nltk/<br/>
https://textminingonline.com/dive-into-nltk-part-i-getting-started-with-nltk

In [None]:
!pip install --user --upgrade nltk

In [None]:
import nltk
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
# Resource names changed between NLTK releases: recent versions (3.9+) load
# 'punkt_tab' and 'averaged_perceptron_tagger_eng', older ones load the legacy
# 'punkt' and 'averaged_perceptron_tagger'. Download both generations so the
# notebook runs regardless of which NLTK version is installed.
nltk.download('punkt') # needed for the word_tokenize method (older NLTK)
nltk.download('punkt_tab') # needed for the word_tokenize method (NLTK 3.9+)
nltk.download('averaged_perceptron_tagger') # needed for part of speech (older NLTK)
nltk.download('averaged_perceptron_tagger_eng') # needed for part of speech (NLTK 3.9+)
nltk.download('wordnet') # Needed for lemmatizer

# Stemming

In [None]:
# Tokenize a few inflections of "game" and show what the Porter stemmer makes of each.
data = "game Gaming gamed games"
words = word_tokenize(data)

# `ps` is reused by the stemmer comparison cell below.
ps = PorterStemmer()

for token in words:
    print(f"{token} --> {ps.stem(token)}")

## Comparing Stemmers in NLTK

In [None]:
# Compare the Porter, Lancaster and Snowball stemmers side by side on a mixed
# list of verbs, irregular plurals, adjectives and adverbs.
data = "game Gaming gamed games is are was were man men wolves abaci good better best surprise surprisingly"
words = word_tokenize(data)

# `ps` (the Porter stemmer) was created in the previous stemming cell.
ls = LancasterStemmer()
sl = SnowballStemmer('english') # or nltk.stem.snowball.EnglishStemmer(ignore_stopwords=False)

# One row per word: the original token followed by the stem from each stemmer.
table = [[word, ps.stem(word), ls.stem(word), sl.stem(word)] for word in words]

table_pd = pd.DataFrame(table, columns=["Word", "Porter", "Lancaster", "Snowball"])
table_pd

# Part of Speech tags:
<table align="left" style="width:80%">
    <tr>
        <td style="text-align: left;">CC	coordinating conjunction</td>
        <td style="text-align: left;">CD	cardinal number</td>
    </tr>
    <tr>
        <td style="text-align: left;">DT	determiner</td>
        <td style="text-align: left;">EX	existential there (like: "there is" / "there exists")</td>
    </tr>
    <tr>
        <td style="text-align: left;">FW	foreign word</td>
        <td style="text-align: left;">IN	preposition/subordinating conjunction</td>
    </tr>
    <tr>
        <td style="text-align: left;">JJ	adjective	'big'</td>
<td style="text-align: left;">JJR	adjective, comparative	'bigger'</td>
    </tr>
    <tr>
        <td style="text-align: left;">JJS	adjective, superlative	'biggest'</td>
        <td style="text-align: left;">LS	list marker	1)</td>
    </tr>
    <tr>
        <td style="text-align: left;">MD	modal	could, will</td>
        <td style="text-align: left;">NN	noun, singular 'desk'</td>
    </tr>
    <tr>
        <td style="text-align: left;">NNS	noun plural	'desks'</td>
        <td style="text-align: left;">NNP	proper noun, singular	'Harrison'</td>
    </tr>
    <tr>
        <td style="text-align: left;">NNPS	proper noun, plural	'Americans'</td>
        <td style="text-align: left;">PDT	predeterminer	'all the kids'</td>
    </tr>
    <tr>
        <td style="text-align: left;">POS	possessive ending	parent's</td>
        <td style="text-align: left;">PRP	personal pronoun	I, he, she</td>
    </tr>
    <tr>
<td style="text-align: left;">PRP\$	possessive pronoun	my, his, hers</td>
<td style="text-align: left;">RB	adverb	very, silently</td>
</tr>
<tr>
<td style="text-align: left;">RBR	adverb, comparative	better</td>
<td style="text-align: left;">RBS	adverb, superlative	best</td>
</tr>
<tr>
<td style="text-align: left;">RP	particle	give up</td>
<td style="text-align: left;">TO	to	go 'to' the store.</td>
</tr>
<tr>
<td style="text-align: left;">UH	interjection	errrrrrrrm</td>
<td style="text-align: left;">VB	verb, base form	take</td>
</tr>
<tr>
<td style="text-align: left;">VBD	verb, past tense	took</td>
<td style="text-align: left;">VBG	verb, gerund/present participle	taking</td>
</tr>
<tr>
<td style="text-align: left;">VBN	verb, past participle	taken</td>
<td style="text-align: left;">VBP	verb, non-3rd person singular present	take</td>
</tr>
<tr>
<td style="text-align: left;">VBZ	verb, 3rd person sing. present	takes</td>
<td style="text-align: left;">WDT	wh-determiner	which</td>
</tr>
<tr>
<td style="text-align: left;">WP	wh-pronoun	who, what</td>
<td style="text-align: left;">WP\$	possessive wh-pronoun	whose</td>
    </tr>
    <tr>
        <td style="text-align: left;">WRB	wh-adverb	where, when</td>
        <td style="text-align: left;"></td>
    </tr>
</table>

In [None]:
# Lower-cased variant of the word list; `data` is read again by the lemmatizer cell below.
data = "game gaming gamed games is are was were man men wolves abaci good better best surprise surprisingly"
words = word_tokenize(data)

# Tag every token with its part of speech (last expression, so the list is displayed).
pos_tag(words)

In [None]:
# Same tagger on a full sentence, where context is available to the tagger.
pos_tag(
    word_tokenize("the wolves of wall street did not bring their abaci")
)

# Lemmatizer

### Part of Speech for the lemmatizer

In [None]:
# Map each part-of-speech tag produced by pos_tag to the WordNet category
# expected by the lemmatizer's `pos` argument.
# Every tag starts out as a noun ("when in doubt, just put noun") ...
posMapping = dict.fromkeys(
    ("CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN",
     "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS",
     "RP", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP",
     "WP$", "WRB"),
    wn.NOUN,
)
# ... then the adjective, adverb and verb tags are overridden with their own category.
posMapping.update(dict.fromkeys(("JJ", "JJR", "JJS"), wn.ADJ))
posMapping.update(dict.fromkeys(("RB", "RBR", "RBS", "WRB"), wn.ADV))
posMapping.update(dict.fromkeys(("MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"), wn.VERB))

In [None]:
# Lemmatize the word list, passing each token's part of speech to the lemmatizer.
lemmatiser = WordNetLemmatizer()
words = word_tokenize(data)
result = nltk.pos_tag(words)

for token, tag in result:
    # pos_tag can emit tags that are not in posMapping (punctuation, symbols, ...);
    # fall back to noun instead of raising KeyError.
    lemma = lemmatiser.lemmatize(token, pos=posMapping.get(tag, wn.NOUN))
    print(f"  {token}({tag}) --> {lemma}")

In [None]:
# Lemmatize a full sentence, using the tag assigned to each token in context.
result = nltk.pos_tag(word_tokenize("the wolves of wall street did not bring their abaci"))

for token, tag in result:
    # pos_tag can emit tags that are not in posMapping (punctuation, symbols, ...);
    # fall back to noun instead of raising KeyError.
    lemma = lemmatiser.lemmatize(token, pos=posMapping.get(tag, wn.NOUN))
    print(f"  {token}({tag}) --> {lemma}")