<a href="https://colab.research.google.com/github/jaison-1920/nlp/blob/main/Tokenization%2CStemming%2CLemmatization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using NLTK library

## Tokenization

In [None]:
# Fetch the 'punkt' resource, which NLTK's sentence and word tokenizers rely on.
# It only needs to be downloaded once per environment.
# NOTE(review): newer NLTK releases may ask for 'punkt_tab' instead — confirm
# against the installed version.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Sample text fed to each tokenizer in this notebook. The leading space and the
# misspelling "Artificail" are part of the data and appear in the outputs.
corpus = """ Hello! Welcome, My name is Jaison Philip.
I love to study A.I. Artificail intelligence is world's new trend.
"""

In [None]:
# Show the raw text exactly as it will be passed to the tokenizers.
print(corpus)

 Hello! Welcome, My name is Jaison Philip.
I love to study A.I. Artificail intelligence is world's new trend.



In [None]:
# Sentence tokenization: split the corpus into a list of sentence strings.
from nltk.tokenize import sent_tokenize
document = sent_tokenize(corpus)
# Bare expression as the last line so the notebook displays the list.
document

[' Hello!',
 'Welcome, My name is Jaison Philip.',
 'I love to study A.I.',
 "Artificail intelligence is world's new trend."]

In [None]:
# Word tokenization: split the whole corpus into word and punctuation tokens.
# Abbreviations such as "A.I" and the clitic "'s" are kept as single tokens.
from nltk.tokenize import word_tokenize
word_tokenize(corpus)

['Hello',
 '!',
 'Welcome',
 ',',
 'My',
 'name',
 'is',
 'Jaison',
 'Philip',
 '.',
 'I',
 'love',
 'to',
 'study',
 'A.I',
 '.',
 'Artificail',
 'intelligence',
 'is',
 'world',
 "'s",
 'new',
 'trend',
 '.']

In [None]:
# Print each sentence produced by the sentence tokenizer on its own line.
for sentence in document:
  print(sentence)

 Hello!
Welcome, My name is Jaison Philip.
I love to study A.I.
Artificail intelligence is world's new trend.


In [None]:
# Word-tokenize sentence by sentence, printing one token list per sentence.
for sentence in document:
  print(word_tokenize(sentence))

['Hello', '!']
['Welcome', ',', 'My', 'name', 'is', 'Jaison', 'Philip', '.']
['I', 'love', 'to', 'study', 'A.I', '.']
['Artificail', 'intelligence', 'is', 'world', "'s", 'new', 'trend', '.']


In [None]:
# wordpunct_tokenize separates every punctuation character from the words.
# Tokens that word_tokenize kept whole are therefore broken apart:
# "A.I" becomes 'A', '.', 'I' and "'s" becomes "'", 's'.
from nltk.tokenize import wordpunct_tokenize
wordpunct_tokenize(corpus)

['Hello',
 '!',
 'Welcome',
 ',',
 'My',
 'name',
 'is',
 'Jaison',
 'Philip',
 '.',
 'I',
 'love',
 'to',
 'study',
 'A',
 '.',
 'I',
 '.',
 'Artificail',
 'intelligence',
 'is',
 'world',
 "'",
 's',
 'new',
 'trend',
 '.']

In [None]:
# TreebankWordTokenizer is a class, so create an instance before tokenizing.
from nltk.tokenize import TreebankWordTokenizer
tree = TreebankWordTokenizer()
tree.tokenize(corpus)
# Only the final period of the whole text is split off as its own token.
# Periods that end the earlier sentences stay attached to the preceding word
# ('Philip.', 'A.I.').

['Hello',
 '!',
 'Welcome',
 ',',
 'My',
 'name',
 'is',
 'Jaison',
 'Philip.',
 'I',
 'love',
 'to',
 'study',
 'A.I.',
 'Artificail',
 'intelligence',
 'is',
 'world',
 "'s",
 'new',
 'trend',
 '.']

## Stemming

#### Stemming strips prefixes or suffixes from a word to reduce it to its root stem
#### e.g. eating --> eat
#### ===================
#### PorterStemmer, SnowballStemmer and RegexpStemmer are some of the stemmers available in NLTK

In [None]:
from nltk.stem import PorterStemmer

In [27]:
# Sample vocabulary run through the stemmer.
words = [
    'eaten', 'eating', 'writing', 'eats', 'writes',
    'programs', 'programming', 'history', 'finally', 'finalize',
]
stemmer = PorterStemmer()
for word in words:
    print(f"{word}---->{stemmer.stem(word)}")
# Porter leaves 'eaten' unchanged and turns 'history' into 'histori',
# so the resulting stem is not always a real word.

eaten---->eaten
eating---->eat
writing---->write
eats---->eat
writes---->write
programs---->program
programming---->program
history---->histori
finally---->final
finalize---->final


In [31]:
from nltk.stem import RegexpStemmer
# Remove any of these suffixes when they appear at the end of a word.
reg_stemmer = RegexpStemmer(r'ing$|s$|e$|able$|en$')
for word in words:
    print(f"{word}---->{reg_stemmer.stem(word)}")
# The pattern is applied purely textually, with no linguistic knowledge:
# 'writing' -> 'writ', 'programming' -> 'programm', 'finalize' -> 'finaliz'.
# That makes this stemmer only as good as the regex it is given.

eaten---->eat
eating---->eat
writing---->writ
eats---->eat
writes---->write
programs---->program
programming---->programm
history---->history
finally---->finally
finalize---->finaliz


In [35]:
from nltk.stem import SnowballStemmer
snow = SnowballStemmer('english')
for word in words:
    print(f"{word}---->{snow.stem(word)}")
# Stemming still yields a root stem rather than a proper root word
# ('eaten' stays as is, 'history' -> 'histori'), but Snowball is
# regarded here as a better stemmer than Porter.

eaten---->eaten
eating---->eat
writing---->write
eats---->eat
writes---->write
programs---->program
programming---->program
history---->histori
finally---->final
finalize---->final


In [36]:
# Porter stemmer on two adverbs: the trailing 'y' becomes 'i', but the
# '-ly' ending itself is not removed.
tuple(stemmer.stem(adverb) for adverb in ('fairly', 'sportingly'))

('fairli', 'sportingli')

In [37]:
# Snowball stemmer on 'fairly' and 'sportingly': the adverbial endings are
# stripped completely, giving the cleaner stems 'fair' and 'sport'.
tuple(snow.stem(adverb) for adverb in ('fairly', 'sportingly'))

('fair', 'sport')

## Lemmatization
#### Lemmatization is the process of reducing a word to its root word (lemma)
#### e.g. eating ---> eat
#### e.g. ate ----> eat
#### NLTK's WordNetLemmatizer is used for lemmatization
#### Its lemmatize() method takes 2 parameters ---> the word and 'pos' (part of speech)

In [39]:
# Download the WordNet data that WordNetLemmatizer looks words up in.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# Lemmatize 'eating' as a verb (pos='v'); the result is 'eat'.
lemmatizer.lemmatize('eating',pos='v')

[nltk_data] Downloading package wordnet to /root/nltk_data...


'eat'

In [40]:
# Irregular past tense: as a verb, 'ate' is mapped to its lemma 'eat'.
lemmatizer.lemmatize('ate',pos='v')

'eat'

In [46]:
# The misspelled 'pleasent' comes back unchanged when looked up as a noun.
# NOTE(review): presumably because the word is not found in WordNet — confirm.
lemmatizer.lemmatize('pleasent',pos='n')

'pleasent'

In [48]:
# Irregular verb: with pos='v', 'went' is mapped to its lemma 'go'.
lemmatizer.lemmatize('went',pos='v')
# Values accepted by the pos argument:
#   'v' -> verb
#   'a' -> adjective
#   'n' -> noun
#   'r' -> adverb

'go'