In [1]:
import sys
import re
import string
from itertools import chain

import pandas as pd
import nltk
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import(
    word_tokenize,
    wordpunct_tokenize,
    WhitespaceTokenizer,
    RegexpTokenizer,
    sent_tokenize
)

In [2]:
# Show the interpreter build so readers can reproduce the outputs below.
python_build = sys.version
print(python_build)

3.11.9 | packaged by conda-forge | (main, Apr 19 2024, 18:36:13) [GCC 12.3.0]


In [7]:
# Run the line below once if this is your first time using NLTK on this machine.
# It downloads all NLTK data (corpora and models), which covers everything this notebook uses.
# nltk.download("all")

Text normalization is the process of making text uniform. It entails tokenization, removal of stopwords and punctuation, lemmatization/stemming, etc. **The level of text normalization is task dependent**: in some tasks, less normalization is better.

# Tokenization

In [8]:
# Sample passage packed with tokenizer edge cases: honorifics and abbreviations
# (Dr., Ph.D., B.C.), quoted speech, contractions/possessives, an email address,
# a URL, a currency amount, a decimal with a unit symbol, and a trailing ellipsis.
text = """
Dr. Smith, a renowned historian with a Ph.D. from Harvard, said, "The 20th century was transformative; it's undeniable." Mr. Johnson, CEO of TechInnovations, agrees: "In the 1990s, technology's impact began accelerating." They were discussing this over email (contact@techinnovations.com), emphasizing the importance of innovation. Meanwhile, Ms. Davis, an expert in ancient cultures, found artifacts worth $2,500,000 in Italy. Interestingly, these artifacts date back to 300 B.C. Visit our website for more: http://www.historyartifacts.com. Did you know? The average temperature in New York has risen by 1.5°C since 1950. This fact, among others, was highlighted in the recent study published by the Environmental Research Institute; however, the implications are yet to be fully understood...
"""

Using NLTK's recommended word tokenizer. Under the hood, this function first applies sentence tokenization, then applies word tokenization on those sentences.

In [9]:
# Tokenize with NLTK's recommended word tokenizer and list one token per line.
tokens_1 = word_tokenize(text)
print("\n".join(tokens_1))

Dr.
Smith
,
a
renowned
historian
with
a
Ph.D.
from
Harvard
,
said
,
``
The
20th
century
was
transformative
;
it
's
undeniable
.
''
Mr.
Johnson
,
CEO
of
TechInnovations
,
agrees
:
``
In
the
1990s
,
technology
's
impact
began
accelerating
.
''
They
were
discussing
this
over
email
(
contact
@
techinnovations.com
)
,
emphasizing
the
importance
of
innovation
.
Meanwhile
,
Ms.
Davis
,
an
expert
in
ancient
cultures
,
found
artifacts
worth
$
2,500,000
in
Italy
.
Interestingly
,
these
artifacts
date
back
to
300
B.C
.
Visit
our
website
for
more
:
http
:
//www.historyartifacts.com
.
Did
you
know
?
The
average
temperature
in
New
York
has
risen
by
1.5°C
since
1950
.
This
fact
,
among
others
,
was
highlighted
in
the
recent
study
published
by
the
Environmental
Research
Institute
;
however
,
the
implications
are
yet
to
be
fully
understood
...


A simpler, regex-based tokenizer that splits text on punctuation and whitespace.

In [10]:
# Regex-based split: runs of word characters and runs of punctuation become
# separate tokens (note how `."` and `://` stay together in the output).
tokens_2 = wordpunct_tokenize(text)
print("\n".join(tokens_2))

Dr
.
Smith
,
a
renowned
historian
with
a
Ph
.
D
.
from
Harvard
,
said
,
"
The
20th
century
was
transformative
;
it
'
s
undeniable
."
Mr
.
Johnson
,
CEO
of
TechInnovations
,
agrees
:
"
In
the
1990s
,
technology
'
s
impact
began
accelerating
."
They
were
discussing
this
over
email
(
contact
@
techinnovations
.
com
),
emphasizing
the
importance
of
innovation
.
Meanwhile
,
Ms
.
Davis
,
an
expert
in
ancient
cultures
,
found
artifacts
worth
$
2
,
500
,
000
in
Italy
.
Interestingly
,
these
artifacts
date
back
to
300
B
.
C
.
Visit
our
website
for
more
:
http
://
www
.
historyartifacts
.
com
.
Did
you
know
?
The
average
temperature
in
New
York
has
risen
by
1
.
5
°
C
since
1950
.
This
fact
,
among
others
,
was
highlighted
in
the
recent
study
published
by
the
Environmental
Research
Institute
;
however
,
the
implications
are
yet
to
be
fully
understood
...


An even simpler tokenizer that splits only on whitespace. It does the same thing as the built-in `str.split()` method. In fact, NLTK recommends doing just that:

>In general, users should use the string split() method instead.

In [11]:
# Whitespace-only split: punctuation stays attached to the neighbouring word.
w_space_tokenizer = WhitespaceTokenizer()
tokens_3 = w_space_tokenizer.tokenize(text)
print("\n".join(tokens_3))

Dr.
Smith,
a
renowned
historian
with
a
Ph.D.
from
Harvard,
said,
"The
20th
century
was
transformative;
it's
undeniable."
Mr.
Johnson,
CEO
of
TechInnovations,
agrees:
"In
the
1990s,
technology's
impact
began
accelerating."
They
were
discussing
this
over
email
(contact@techinnovations.com),
emphasizing
the
importance
of
innovation.
Meanwhile,
Ms.
Davis,
an
expert
in
ancient
cultures,
found
artifacts
worth
$2,500,000
in
Italy.
Interestingly,
these
artifacts
date
back
to
300
B.C.
Visit
our
website
for
more:
http://www.historyartifacts.com.
Did
you
know?
The
average
temperature
in
New
York
has
risen
by
1.5°C
since
1950.
This
fact,
among
others,
was
highlighted
in
the
recent
study
published
by
the
Environmental
Research
Institute;
however,
the
implications
are
yet
to
be
fully
understood...


We can even write our own tokenizer if the occasion arises:

In [12]:
# NOTE: this pattern is intentionally imperfect and is here for demonstration only.
# Regex alternation is ordered: branches are tried left to right, so the generic
# `\w+` branch consumes the pieces of emails, URLs and decimals before the more
# specific branches listed after it get a chance. Characters that no branch
# matches (e.g. "@", "/", "°") are silently dropped from the token list.
#
# The pattern is a raw string so that regex escapes such as `\.` and `\d` are not
# treated as (invalid) Python string escapes, which raise a warning on newer Pythons.
# The string value itself is unchanged. re.VERBOSE lets the pattern carry
# whitespace and inline `#` comments.
pattern = r"""
(?:[A-Z]\.)+               # all caps abbreviations 
| \w+(?:-\w+)*             # hypenated words
| \$?\d{1,3}(?:,\d{3})*(?:\.\d+)?%?  # monetary amounts $2,500,000 or 82%
| \.{3}                    # ellipsis
| [.,;"'?():_`-]           # punctuation characters
| \w+@\w+\.\w{2,3}         # emails
| http[s]?://\S+           # urls
"""

regex_tokenizer = RegexpTokenizer(pattern, flags=re.VERBOSE)
tokens_4 = regex_tokenizer.tokenize(text)

print(*tokens_4, sep="\n")

Dr
.
Smith
,
a
renowned
historian
with
a
Ph
.
D.
from
Harvard
,
said
,
"
The
20th
century
was
transformative
;
it
'
s
undeniable
.
"
Mr
.
Johnson
,
CEO
of
TechInnovations
,
agrees
:
"
In
the
1990s
,
technology
'
s
impact
began
accelerating
.
"
They
were
discussing
this
over
email
(
contact
techinnovations
.
com
)
,
emphasizing
the
importance
of
innovation
.
Meanwhile
,
Ms
.
Davis
,
an
expert
in
ancient
cultures
,
found
artifacts
worth
$2,500,000
in
Italy
.
Interestingly
,
these
artifacts
date
back
to
300
B.C.
Visit
our
website
for
more
:
http
:
www
.
historyartifacts
.
com
.
Did
you
know
?
The
average
temperature
in
New
York
has
risen
by
1
.
5
C
since
1950
.
This
fact
,
among
others
,
was
highlighted
in
the
recent
study
published
by
the
Environmental
Research
Institute
;
however
,
the
implications
are
yet
to
be
fully
understood
...


We can also tokenize into sentences, also referred to as sentence segmentation. This is very useful on many occasions. For example, if we have to chunk text for processing, we may want to chunk the text in a way that doesn't split sentences.

In [13]:
# Segment the passage into sentences and show one sentence per line.
sentences = sent_tokenize(text)
print("\n".join(sentences))


Dr. Smith, a renowned historian with a Ph.D. from Harvard, said, "The 20th century was transformative; it's undeniable."
Mr. Johnson, CEO of TechInnovations, agrees: "In the 1990s, technology's impact began accelerating."
They were discussing this over email (contact@techinnovations.com), emphasizing the importance of innovation.
Meanwhile, Ms. Davis, an expert in ancient cultures, found artifacts worth $2,500,000 in Italy.
Interestingly, these artifacts date back to 300 B.C.
Visit our website for more: http://www.historyartifacts.com.
Did you know?
The average temperature in New York has risen by 1.5°C since 1950.
This fact, among others, was highlighted in the recent study published by the Environmental Research Institute; however, the implications are yet to be fully understood...


## Stemming & Lemmatization

Stemming and lemmatization both try to reduce word inflections but go about it in different ways. Now that we have words (tokens), we can apply these methods.

In [14]:
# Instantiate the two stemmers and the WordNet lemmatizer compared in the tables below.
porter = nltk.stem.PorterStemmer()
lancaster = nltk.stem.LancasterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()

In [15]:
# Tokens for the stemming/lemmatization comparison, from the recommended tokenizer.
tokens = word_tokenize(text, language="english")

In [16]:
# Side-by-side comparison: one row per token, one column per normalizer.
# The lemmatizer is called without a POS tag here.
df = pd.DataFrame(
    {
        "original": tokens,
        "porter": list(map(porter.stem, tokens)),
        "lancaster": list(map(lancaster.stem, tokens)),
        "lemmas": list(map(lemmatizer.lemmatize, tokens)),
    }
)

For the below output, does lemmatization perform as expected?

In [17]:
# Temporarily raise pandas' row limit so up to 200 rows render instead of the
# default truncated view; the option is restored when the block exits.
max_rows_shown = 200
with pd.option_context("display.max_rows", max_rows_shown):
    display(df)

Unnamed: 0,original,porter,lancaster,lemmas
0,Dr.,dr.,dr.,Dr.
1,Smith,smith,smi,Smith
2,",",",",",",","
3,a,a,a,a
4,renowned,renown,renown,renowned
5,historian,historian,hist,historian
6,with,with,with,with
7,a,a,a,a
8,Ph.D.,ph.d.,ph.d.,Ph.D.
9,from,from,from,from


Lemmatization seems to be underperforming. The reason is that NLTK's lemmatizer relies on each word's part-of-speech tag, which has to be provided manually (when it is omitted, the word is treated as a noun).

In [18]:
# Tag every token with its part of speech -> list of (token, tag) tuples.
pos_tags = nltk.tag.pos_tag(tokens)


def get_wordnet_pos(treebank_tag, skip_value="n"):
    """Map a treebank POS tag to the matching WordNet POS constant.

    The decision is made on the tag's leading letter:
    J -> adjective, V -> verb, N -> noun, R -> adverb.

    Args:
        treebank_tag: Tag string as returned by the POS tagger.
        skip_value: Value returned for tags outside the four groups above.

    Returns:
        The corresponding ``wordnet`` constant, or ``skip_value`` when the
        tag does not start with J, V, N or R.
    """
    tag_starts_with = treebank_tag.startswith

    # Guard clauses: the first matching prefix wins.
    if tag_starts_with('J'):
        return wordnet.ADJ
    if tag_starts_with('V'):
        return wordnet.VERB
    if tag_starts_with('N'):
        return wordnet.NOUN
    if tag_starts_with('R'):
        return wordnet.ADV

    # Determiners, punctuation, numbers, etc. have no WordNet counterpart.
    return skip_value
    

# Pair each token with the WordNet POS value derived from its treebank tag.
pos_tags_wordnet = [
    (word, get_wordnet_pos(treebank_tag)) for word, treebank_tag in pos_tags
]

With the tags now known, we can apply lemmatization again.

In [19]:
# Same comparison table as before, plus a column where the lemmatizer is given
# each token's WordNet POS tag.
df = pd.DataFrame(
    {
        "original": tokens,
        "porter": list(map(porter.stem, tokens)),
        "lancaster": list(map(lancaster.stem, tokens)),
        "lemmas": list(map(lemmatizer.lemmatize, tokens)),
        "lemmas_improved": [
            lemmatizer.lemmatize(*token_and_pos) for token_and_pos in pos_tags_wordnet
        ],
    }
)

In [20]:
# Temporarily raise pandas' row limit so up to 200 rows render instead of the
# default truncated view; the option is restored when the block exits.
max_rows_shown = 200
with pd.option_context("display.max_rows", max_rows_shown):
    display(df)

Unnamed: 0,original,porter,lancaster,lemmas,lemmas_improved
0,Dr.,dr.,dr.,Dr.,Dr.
1,Smith,smith,smi,Smith,Smith
2,",",",",",",",",","
3,a,a,a,a,a
4,renowned,renown,renown,renowned,renowned
5,historian,historian,hist,historian,historian
6,with,with,with,with,with
7,a,a,a,a,a
8,Ph.D.,ph.d.,ph.d.,Ph.D.,Ph.D.
9,from,from,from,from,from


## Stop Words, Case Folding, & Punctuation

Stopwords are words that can be filtered out because they contribute little meaning to the text. They are usually the most common words in the language, such as "the", "a", "which", "on", etc. A few notes on stopword removal:

1. Removing them reduces noise in the data
2. Removing them improves the efficiency of data processing
3. Removing them can increase performance
4. Although stopword lists are common in different libraries, they may differ and can be customized to fit your particular application

In [21]:
# inspecting stopword list
# Peek at NLTK's English stopword list (as a set, so duplicates collapse).
{*stopwords.words("english")}

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [22]:
# Start from NLTK's English stopword list.
stop_words = set(stopwords.words("english"))
# why do this? word_tokenize splits contractions (e.g. "it's" -> "it", "'s"), so the
# stop list also needs those pieces in order to match the tokens produced above.
tokenized_stop_words = set(
    chain.from_iterable(word_tokenize(word) for word in stop_words)
)
stop_words = stop_words.union(tokenized_stop_words)

# ASCII punctuation plus the `` and '' quote tokens that word_tokenize emits.
punctuation = set(string.punctuation)
special_punctuation = {"``", "''"}
all_punctuation = punctuation.union(special_punctuation)

In [23]:
# Tag the ORIGINAL token sequence first. The tagger uses neighbouring words as
# context, so tagging a sequence in which stopwords and punctuation have already
# been replaced by "----" would degrade the tags of the words we keep.
pos_tags = nltk.pos_tag(tokens) # returns a list of (token, tag) tuples

# Mask stopwords (case-insensitively) and punctuation with a visible placeholder
# rather than dropping them, so every row stays aligned with `tokens`.
placeholder = "----"
tokens_clean = [
    placeholder if token.lower() in stop_words or token in all_punctuation else token
    for token in tokens
]

# Pair each (possibly masked) token with the WordNet POS of the original token.
pos_tags_wordnet = [
    (clean_token, get_wordnet_pos(tag))
    for clean_token, (_, tag) in zip(tokens_clean, pos_tags)
]

df = pd.DataFrame(
    {
        "original": tokens,
        "porter": [porter.stem(token) for token in tokens],
        "lancaster": [lancaster.stem(token) for token in tokens],
        "lemmas": [lemmatizer.lemmatize(token) for token in tokens],
        # masked rows show the placeholder; kept rows are lemmatized with their POS
        "lemmas_improved": [lemmatizer.lemmatize(token, pos) for token, pos in pos_tags_wordnet]
    }
)

In [24]:
# Temporarily raise pandas' row limit so up to 200 rows render instead of the
# default truncated view; the option is restored when the block exits.
max_rows_shown = 200
with pd.option_context("display.max_rows", max_rows_shown):
    display(df)

Unnamed: 0,original,porter,lancaster,lemmas,lemmas_improved
0,Dr.,dr.,dr.,Dr.,Dr.
1,Smith,smith,smi,Smith,Smith
2,",",",",",",",",----
3,a,a,a,a,----
4,renowned,renown,renown,renowned,renowned
5,historian,historian,hist,historian,historian
6,with,with,with,with,----
7,a,a,a,a,----
8,Ph.D.,ph.d.,ph.d.,Ph.D.,Ph.D.
9,from,from,from,from,----
