#### 09-02-2022

In [None]:
import nltk
# Fetch the NLTK resources requested by this notebook, in the same order;
# the last download's return value stays the cell's displayed result.
for resource in ('punkt', 'wordnet'):
  downloaded = nltk.download(resource)
downloaded

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
# Lemmatize every token of the sentence. No part of speech is passed to
# lemmatize(), so the lemmatizer applies its default POS to every word.
text = "She jumped inot the river and breathed heavily"
tokenizer = word_tokenize(text)  # note: this holds the token list itself
wordnet = WordNetLemmatizer()

for word in tokenizer:
  lemma = wordnet.lemmatize(word)
  print(word, "-->", lemma)

She --> She
jumped --> jumped
inot --> inot
the --> the
river --> river
and --> and
breathed --> breathed
heavily --> heavily


In [None]:
# Same experiment on a second sentence: tokenize, then lemmatize each token
# without supplying a part of speech.
text = "I am running and I usually use don't drink water in between"
tokenizer = word_tokenize(text)  # note: this holds the token list itself
wordnet = WordNetLemmatizer()

for word in tokenizer:
  lemma = wordnet.lemmatize(word)
  print(word, "-->", lemma)

I --> I
am --> am
running --> running
and --> and
I --> I
usually --> usually
use --> use
do --> do
n't --> n't
drink --> drink
water --> water
in --> in
between --> between


In [None]:
# Lemmatization with POS tagging: download the averaged perceptron tagger data first.
nltk.download("averaged_perceptron_tagger")

from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize,pos_tag

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# Lemmatize each token using the part of speech predicted by pos_tag.
text = "She jumped inot the river and breathed heavily"
wordnet = WordNetLemmatizer()

# pos_tag emits Penn Treebank tags, while lemmatize() expects one of
# 'a' (adjective), 'r' (adverb), 'n' (noun) or 'v' (verb). Adjective tags
# start with 'J' (JJ, JJR, JJS), so simply lower-casing the first letter can
# never produce 'a'; map the first letter explicitly instead.
penn_to_wordnet = {'J': 'a', 'R': 'r', 'N': 'n', 'V': 'v'}

for token,tag in pos_tag(word_tokenize(text)):
  # Any other tag (pronouns, determiners, punctuation, ...) falls back to noun.
  pos = penn_to_wordnet.get(tag[:1], 'n')
  print(token,"-->", wordnet.lemmatize(token,pos))

She --> She
jumped --> jump
inot --> inot
the --> the
river --> river
and --> and
breathed --> breathe
heavily --> heavily


In [None]:
# POS-aware lemmatization of a second sentence.
text = "I am running and I usually use to runs"
wordnet = WordNetLemmatizer()

# pos_tag emits Penn Treebank tags, while lemmatize() expects one of
# 'a' (adjective), 'r' (adverb), 'n' (noun) or 'v' (verb). Adjective tags
# start with 'J' (JJ, JJR, JJS), so simply lower-casing the first letter can
# never produce 'a'; map the first letter explicitly instead.
penn_to_wordnet = {'J': 'a', 'R': 'r', 'N': 'n', 'V': 'v'}

for token,tag in pos_tag(word_tokenize(text)):
  # Any other tag (pronouns, determiners, punctuation, ...) falls back to noun.
  pos = penn_to_wordnet.get(tag[:1], 'n')
  print(token,"-->", wordnet.lemmatize(token,pos))

I --> I
am --> be
running --> run
and --> and
I --> I
usually --> usually
use --> use
to --> to
runs --> run


In [None]:
from nltk.stem import SnowballStemmer,PorterStemmer,WordNetLemmatizer
from nltk import word_tokenize,pos_tag
 
# Compare Snowball stems with POS-aware WordNet lemmas for a few single words.
snowball = SnowballStemmer(language='english')
porter = PorterStemmer()
wordnet = WordNetLemmatizer()

text = ["better","Caring","are","am","worse","struggling",'meeting']

# pos_tag emits Penn Treebank tags, while lemmatize() expects one of
# 'a', 'r', 'n' or 'v'. Adjective tags start with 'J' (JJ, JJR, JJS), so they
# need an explicit mapping to 'a'; lower-casing the first letter never gives 'a'.
penn_to_wordnet = {'J': 'a', 'R': 'r', 'N': 'n', 'V': 'v'}

# Size the first column from the data: with a fixed width of 10 a 10-character
# word such as "struggling" ran straight into the stem column.
word_width = max(len(word) for word in ["Word", *text]) + 2
row = "{0:" + str(word_width) + "}{1:20}{2:30}"

print(row.format("Word","Snowball Stemmer","Wordnet Lemmatizer"))
for token,tag in pos_tag(text):
    # Any tag outside the mapping falls back to noun.
    pos = penn_to_wordnet.get(tag[:1], 'n')
    print(row.format(token,snowball.stem(token),wordnet.lemmatize(token,pos)))

Word      Snowball Stemmer    Wordnet Lemmatizer            
better    better              well                          
Caring    care                Caring                        
are       are                 be                            
am        am                  be                            
worse     wors                worse                         
strugglingstruggl             struggle                      
meeting   meet                meeting                       


In [None]:
from nltk import sent_tokenize, word_tokenize

# The sample text is called `quote` rather than `string` so that it cannot be
# confused with, or rebound by, the standard-library `string` module.
quote = "Books! And cleverness! There are more important things - friendship and bravery and - oh Harry - be careful!"

# Split the text into sentences and display them.
sequences = sent_tokenize(quote)
sequences

['Books!',
 'And cleverness!',
 'There are more important things - friendship and bravery and - oh Harry - be careful!']

In [None]:
# Word-tokenize every sentence, keeping one token list per sentence.
seq_tokens = list(map(word_tokenize, sequences))
seq_tokens

[['Books', '!'],
 ['And', 'cleverness', '!'],
 ['There',
  'are',
  'more',
  'important',
  'things',
  '-',
  'friendship',
  'and',
  'bravery',
  'and',
  '-',
  'oh',
  'Harry',
  '-',
  'be',
  'careful',
  '!']]

In [None]:
import string

# string.punctuation is one str, so `token not in string.punctuation` is a
# substring test: it only removes tokens that happen to be a contiguous slice
# of that str and lets multi-character punctuation tokens such as "..." or
# "--" through. Check the characters of each token individually instead.
punctuation_chars = set(string.punctuation)

# Keep the per-sentence structure; drop tokens made up solely of punctuation.
no_punct_seq_tokens = [
    [
        token for token in seq_token
        if not all(char in punctuation_chars for char in token)
    ]
    for seq_token in seq_tokens
]

In [None]:
no_punct_seq_tokens

[['Books'],
 ['And', 'cleverness'],
 ['There',
  'are',
  'more',
  'important',
  'things',
  'friendship',
  'and',
  'bravery',
  'and',
  'oh',
  'Harry',
  'be',
  'careful']]

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
# Flatten the per-sentence token lists into one list of lemmas.
# No part of speech is passed, so lemmatize() uses its default POS.
lm = WordNetLemmatizer()
lemm_tokens = []
for seq in no_punct_seq_tokens:
  lemm_tokens.extend(lm.lemmatize(token) for token in seq)

In [None]:
lemm_tokens

['Books',
 'And',
 'cleverness',
 'There',
 'are',
 'more',
 'important',
 'thing',
 'friendship',
 'and',
 'bravery',
 'and',
 'oh',
 'Harry',
 'be',
 'careful']

In [None]:
from nltk.stem import PorterStemmer
# Flatten the per-sentence token lists into one list of Porter stems and show it.
stemmer = PorterStemmer()
stem_tokens = []
for seq in no_punct_seq_tokens:
  stem_tokens.extend(stemmer.stem(token) for token in seq)
stem_tokens

['book',
 'and',
 'clever',
 'there',
 'are',
 'more',
 'import',
 'thing',
 'friendship',
 'and',
 'braveri',
 'and',
 'oh',
 'harri',
 'be',
 'care']

In [None]:
# Print the lemmatizer and stemmer results one after the other for comparison.
for label, tokens in (
    ("Lemmatized tokens : ", lemm_tokens),
    ("Stem tokens : ", stem_tokens),
):
  print(label, tokens)

Lemmatized tokens :  ['Books', 'And', 'cleverness', 'There', 'are', 'more', 'important', 'thing', 'friendship', 'and', 'bravery', 'and', 'oh', 'Harry', 'be', 'careful']
Stem tokens :  ['book', 'and', 'clever', 'there', 'are', 'more', 'import', 'thing', 'friendship', 'and', 'braveri', 'and', 'oh', 'harri', 'be', 'care']
