## Tokenization


In [None]:
import nltk

nltk.download("punkt_tab")
from nltk.tokenize import word_tokenize, sent_tokenize

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/hugorodriguez/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
sentences = "Her cat's name is Luna. Her dog's name is max"
sent_tokenize(sentences)

["Her cat's name is Luna.", "Her dog's name is max"]

In [4]:
sentence = "Her cat's name is Luna"
word_tokenize(sentence)

['Her', 'cat', "'s", 'name', 'is', 'Luna']

In [5]:
sentence_2 = "Her cat's name is Luna and her dog's name is max"
word_tokenize(sentence_2)

['Her',
 'cat',
 "'s",
 'name',
 'is',
 'Luna',
 'and',
 'her',
 'dog',
 "'s",
 'name',
 'is',
 'max']

In [None]:
import contractions
import re

nltk.download("stopwords")
from nltk.corpus import stopwords

en_stopwords = stopwords.words("english")

sentence_2 = "Her cat's name is Luna and her dog's name is max and I'm Hugo"

sentence_no_stopwords = " ".join(
    [word for word in sentence_2.split() if word not in en_stopwords]
)


sentence_2_fix = contractions.fix(sentence_no_stopwords)
sentence_2_fix = re.sub(r"'s\b", "", sentence_2_fix).lower()

sentence_2_fix

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hugorodriguez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'her cat name luna dog name max i am hugo'

In [14]:
word_tokenize(sentence_2_fix)

['her', 'cat', 'name', 'luna', 'dog', 'name', 'max', 'i', 'am', 'hugo']

## Stemming


In [2]:
from nltk.stem import PorterStemmer

In [3]:
ps = PorterStemmer()

In [None]:
connect_tokens = ["connecting", "connect", "connectivity", "connect", "connects"]

In [None]:
for t in connect_tokens:
    print(t, ": ", ps.stem(t))

connecting :  connect
connect :  connect
connectivity :  connect
connect :  connect
connects :  connect


In [None]:
learn_tokens = ["learned", "learning", "learn", "learns", "learner", "learners"]

In [None]:
for t in learn_tokens:
    print(t, ": ", ps.stem(t))

learned :  learn
learning :  learn
learn :  learn
learns :  learn
learner :  learner
learners :  learner


In [None]:
likes_tokens = ["likes", "better", "worse", "studies"]

In [None]:
for t in likes_tokens:
    print(t, ": ", ps.stem(t))

likes :  like
better :  better
worse :  wors
studies :  studi


## Lemmatization


In [None]:
nltk.download("wordnet")
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hugorodriguez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
lemmatizer = WordNetLemmatizer()

In [None]:
for t in connect_tokens:
    print(t, ": ", lemmatizer.lemmatize(t))

connecting :  connecting
connect :  connect
connectivity :  connectivity
connect :  connect
connects :  connects


In [None]:
for t in learn_tokens:
    print(t, ": ", lemmatizer.lemmatize(t))

learned :  learned
learning :  learning
learn :  learn
learns :  learns
learner :  learner
learners :  learner


In [None]:
for t in likes_tokens:
    print(t, ": ", lemmatizer.lemmatize(t))

likes :  like
better :  better
worse :  worse
studies :  study
