In [1]:
# Setup: fetch the NLTK data packages used below, then import the helpers.
import nltk
nltk.download('punkt')  # tokenizer data (unpacked as tokenizers/punkt.zip per the cell output)
nltk.download('wordnet')  # WordNet corpus (unpacked as corpora/wordnet.zip per the cell output)
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
lemmatizer = WordNetLemmatizer()

def lemmatize_word(text, pos='v'):
    """Tokenize ``text`` and lemmatize every token under one part of speech.

    Args:
        text: Raw string; it is split into tokens with ``word_tokenize``.
        pos: WordNet part-of-speech code passed to ``lemmatizer.lemmatize``
            for every token (e.g. ``'n'``, ``'v'``, ``'a'``, ``'r'``).
            Defaults to ``'v'``, the value that was previously hard-coded,
            so existing one-argument calls behave exactly as before.

    Returns:
        A list with one lemma per token, in token order.
    """
    # The same POS is applied to all tokens; words that are not of that
    # class are returned unchanged by the lemmatizer.
    return [lemmatizer.lemmatize(token, pos=pos) for token in word_tokenize(text)]

# Demo input. Called with only the text, lemmatize_word lemmatizes every token as a verb,
# so in the displayed result 'uses', 'types' and 'processes' are reduced while
# 'methods' and 'algorithms' stay plural.
text = 'data science uses scientific methods algorithms and many types of processes'

lemmatize_word(text)

['data',
 'science',
 'use',
 'scientific',
 'methods',
 'algorithms',
 'and',
 'many',
 'type',
 'of',
 'process']

In [None]:
# Baseline demo: lemmatize() is called without a POS argument, so the
# lemmatizer's default part of speech is used for every token.
text = "She jumped into the river and breathed heavily"
wordnet = WordNetLemmatizer()
tokenizer = word_tokenize(text)
for token, lemma in zip(tokenizer, map(wordnet.lemmatize, tokenizer)):
    print(token, lemma, sep=" --> ")

She --> She
jumped --> jumped
into --> into
the --> the
river --> river
and --> and
breathed --> breathed
heavily --> heavily


In [None]:
# Same baseline on a sentence with several verb forms: no POS argument is
# passed to lemmatize(), so each token is looked up under the default POS.
text = "I am running and I usually use to runs"
wordnet = WordNetLemmatizer()
tokenizer = word_tokenize(text)
for token, lemma in zip(tokenizer, map(wordnet.lemmatize, tokenizer)):
    print(token, lemma, sep=" --> ")

I --> I
am --> am
running --> running
and --> and
I --> I
usually --> usually
use --> use
to --> to
runs --> run


In [None]:
# Baseline on derived forms ("generally", "generalization"): lemmatize() is
# again called without a POS argument, and each token/lemma pair is printed.
text = "I am generally an artist, I like to draw generalization of art"
wordnet = WordNetLemmatizer()
tokenizer = word_tokenize(text)
for token, lemma in zip(tokenizer, map(wordnet.lemmatize, tokenizer)):
    print(token, lemma, sep=" --> ")

I --> I
am --> am
generally --> generally
an --> an
artist --> artist
, --> ,
I --> I
like --> like
to --> to
draw --> draw
generalization --> generalization
of --> of
art --> art


In [None]:
# Fetch the perceptron POS-tagger data (unpacked under taggers/ per the cell output)
# before the cells below that call pos_tag.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
text = "She jumped into the river and breathed heavily"
wordnet = WordNetLemmatizer()
# pos_tag returns Penn Treebank tags; the first letter identifies the word
# class. Adjective tags start with 'J' (JJ, JJR, JJS), so they need an explicit
# entry: simply lower-casing the first letter can never yield WordNet's 'a'.
penn_to_wordnet = {'J': 'a', 'R': 'r', 'N': 'n', 'V': 'v'}
for token, tag in pos_tag(word_tokenize(text)):
  # Any other tag (pronouns, determiners, punctuation, ...) falls back to noun.
  pos = penn_to_wordnet.get(tag[0], 'n')
  print(wordnet.lemmatize(token, pos))

She
jump
into
the
river
and
breathe
heavily


In [None]:
text = "I am running and I usually use to runs"
wordnet = WordNetLemmatizer()
# pos_tag returns Penn Treebank tags; the first letter identifies the word
# class. Adjective tags start with 'J' (JJ, JJR, JJS), so they need an explicit
# entry: simply lower-casing the first letter can never yield WordNet's 'a'.
penn_to_wordnet = {'J': 'a', 'R': 'r', 'N': 'n', 'V': 'v'}
for token, tag in pos_tag(word_tokenize(text)):
  # Any other tag (pronouns, determiners, punctuation, ...) falls back to noun.
  pos = penn_to_wordnet.get(tag[0], 'n')
  print(token, "-->", wordnet.lemmatize(token, pos))

I --> I
am --> be
running --> run
and --> and
I --> I
usually --> usually
use --> use
to --> to
runs --> run


In [None]:
from nltk.stem import SnowballStemmer,PorterStemmer,WordNetLemmatizer
from nltk import word_tokenize,pos_tag
 
# Stemmers and lemmatizer compared below (porter and wordnet are also reused by later cells).
snowball = SnowballStemmer(language='english')
porter = PorterStemmer()
wordnet = WordNetLemmatizer()

text = ["better","Caring","are","am","worse","struggling",'meeting']

# First letter of a Penn Treebank tag -> WordNet POS code. Adjective tags start
# with 'J' (JJ, JJR, JJS), so lower-casing the first letter would never give 'a'
# and adjectives would silently be lemmatized as nouns.
penn_to_wordnet = {'J': 'a', 'R': 'r', 'N': 'n', 'V': 'v'}

# The word column is 12 wide: at width 10 the 10-letter "struggling" filled the
# column completely and ran straight into the stemmer column.
row = "{0:12}{1:20}{2:30}"

print(row.format("Word","Snowball Stemmer","Wordnet Lemmatizer"))
for token,tag in pos_tag(text):
    # Tags outside the four mapped classes fall back to noun.
    pos = penn_to_wordnet.get(tag[0], 'n')
    print(row.format(token,snowball.stem(token),wordnet.lemmatize(token,pos)))

Word      Snowball Stemmer    Wordnet Lemmatizer            
better    better              well                          
Caring    care                Caring                        
are       are                 be                            
am        am                  be                            
worse     wors                worse                         
strugglingstruggl             struggle                      
meeting   meet                meeting                       


In [None]:
# Sample passage containing '!' and '-' marks; the following cells split it into
# sentences and word tokens.
text = "Books! And cleverness! There are more important things - friendship and bravery and - oh Harry -  be careful!"

In [None]:
from nltk import sent_tokenize
# Split the passage into sentences (three for this text, per the displayed result).
sequences = sent_tokenize(text)
sequences

['Books!',
 'And cleverness!',
 'There are more important things - friendship and bravery and - oh Harry -  be careful!']

In [None]:
# Word-tokenize each sentence separately: one token list per sentence.
seq_tokens = list(map(word_tokenize, sequences))
seq_tokens

[['Books', '!'],
 ['And', 'cleverness', '!'],
 ['There',
  'are',
  'more',
  'important',
  'things',
  '-',
  'friendship',
  'and',
  'bravery',
  'and',
  '-',
  'oh',
  'Harry',
  '-',
  'be',
  'careful',
  '!']]

In [None]:
import string

# Drop a token only when every character in it is punctuation.
# `token in string.punctuation` is a substring test against one long string, so
# multi-character tokens such as '...', '--' or the quote tokens '``' and "''"
# would slip through; checking character by character removes them as well.
punctuation_chars = set(string.punctuation)
no_punct_seq_tokens = [
    [token for token in seq_token if not punctuation_chars.issuperset(token)]
    for seq_token in seq_tokens
]
no_punct_seq_tokens

[['Books'],
 ['And', 'cleverness'],
 ['There',
  'are',
  'more',
  'important',
  'things',
  'friendship',
  'and',
  'bravery',
  'and',
  'oh',
  'Harry',
  'be',
  'careful']]

In [None]:
# Flatten the per-sentence token lists into one list of lemmas; lemmatize() is
# called without a POS argument for every token.
lemm_tokens = [
    lemma
    for seq in no_punct_seq_tokens
    for lemma in map(wordnet.lemmatize, seq)
]
lemm_tokens

['Books',
 'And',
 'cleverness',
 'There',
 'are',
 'more',
 'important',
 'thing',
 'friendship',
 'and',
 'bravery',
 'and',
 'oh',
 'Harry',
 'be',
 'careful']

In [None]:
# Flatten the per-sentence token lists into one list of Porter stems, for
# comparison with the lemmas produced from the same tokens.
stem_tokens = [
    stem
    for seq in no_punct_seq_tokens
    for stem in map(porter.stem, seq)
]
stem_tokens

['book',
 'and',
 'clever',
 'there',
 'are',
 'more',
 'import',
 'thing',
 'friendship',
 'and',
 'braveri',
 'and',
 'oh',
 'harri',
 'be',
 'care']