In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('stopwords')#
#nltk.download('wordnet')

In [2]:
document = ''' Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions
between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.
Tokenization is a crucial step in NLP, where text is divided into smaller units called tokens. Part-of-speech tagging (POS tagging) assigns
grammatical information to tokens, such as whether they are nouns, verbs, adjectives, etc. Stop words are common words that are often
filtered out during NLP tasks as they typically do not carry much meaning. Stemming and lemmatization are techniques used to reduce words to their
base or root forms.'''


In [3]:
tokens = word_tokenize(document)
tokens

['Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'is',
 'a',
 'subfield',
 'of',
 'linguistics',
 ',',
 'computer',
 'science',
 ',',
 'and',
 'artificial',
 'intelligence',
 'concerned',
 'with',
 'the',
 'interactions',
 'between',
 'computers',
 'and',
 'human',
 'language',
 ',',
 'in',
 'particular',
 'how',
 'to',
 'program',
 'computers',
 'to',
 'process',
 'and',
 'analyze',
 'large',
 'amounts',
 'of',
 'natural',
 'language',
 'data',
 '.',
 'Tokenization',
 'is',
 'a',
 'crucial',
 'step',
 'in',
 'NLP',
 ',',
 'where',
 'text',
 'is',
 'divided',
 'into',
 'smaller',
 'units',
 'called',
 'tokens',
 '.',
 'Part-of-speech',
 'tagging',
 '(',
 'POS',
 'tagging',
 ')',
 'assigns',
 'grammatical',
 'information',
 'to',
 'tokens',
 ',',
 'such',
 'as',
 'whether',
 'they',
 'are',
 'nouns',
 ',',
 'verbs',
 ',',
 'adjectives',
 ',',
 'etc',
 '.',
 'Stop',
 'words',
 'are',
 'common',
 'words',
 'that',
 'are',
 'often',
 'filtered',
 'out',
 'during',
 'NLP',
 'tas

In [4]:
Pos_tags = pos_tag(tokens)
Pos_tags

[('Natural', 'JJ'),
 ('language', 'NN'),
 ('processing', 'NN'),
 ('(', '('),
 ('NLP', 'NNP'),
 (')', ')'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('subfield', 'NN'),
 ('of', 'IN'),
 ('linguistics', 'NNS'),
 (',', ','),
 ('computer', 'NN'),
 ('science', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('artificial', 'JJ'),
 ('intelligence', 'NN'),
 ('concerned', 'VBN'),
 ('with', 'IN'),
 ('the', 'DT'),
 ('interactions', 'NNS'),
 ('between', 'IN'),
 ('computers', 'NNS'),
 ('and', 'CC'),
 ('human', 'JJ'),
 ('language', 'NN'),
 (',', ','),
 ('in', 'IN'),
 ('particular', 'JJ'),
 ('how', 'WRB'),
 ('to', 'TO'),
 ('program', 'NN'),
 ('computers', 'NNS'),
 ('to', 'TO'),
 ('process', 'VB'),
 ('and', 'CC'),
 ('analyze', 'VB'),
 ('large', 'JJ'),
 ('amounts', 'NNS'),
 ('of', 'IN'),
 ('natural', 'JJ'),
 ('language', 'NN'),
 ('data', 'NNS'),
 ('.', '.'),
 ('Tokenization', 'NN'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('crucial', 'JJ'),
 ('step', 'NN'),
 ('in', 'IN'),
 ('NLP', 'NNP'),
 (',', ','),
 ('where', 'WRB'),
 ('text', 'N

In [5]:
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [6]:
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
filtered_tokens

['Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'subfield',
 'linguistics',
 ',',
 'computer',
 'science',
 ',',
 'artificial',
 'intelligence',
 'concerned',
 'interactions',
 'computers',
 'human',
 'language',
 ',',
 'particular',
 'program',
 'computers',
 'process',
 'analyze',
 'large',
 'amounts',
 'natural',
 'language',
 'data',
 '.',
 'Tokenization',
 'crucial',
 'step',
 'NLP',
 ',',
 'text',
 'divided',
 'smaller',
 'units',
 'called',
 'tokens',
 '.',
 'Part-of-speech',
 'tagging',
 '(',
 'POS',
 'tagging',
 ')',
 'assigns',
 'grammatical',
 'information',
 'tokens',
 ',',
 'whether',
 'nouns',
 ',',
 'verbs',
 ',',
 'adjectives',
 ',',
 'etc',
 '.',
 'Stop',
 'words',
 'common',
 'words',
 'often',
 'filtered',
 'NLP',
 'tasks',
 'typically',
 'carry',
 'much',
 'meaning',
 '.',
 'Stemming',
 'lemmatization',
 'techniques',
 'used',
 'reduce',
 'words',
 'base',
 'root',
 'forms',
 '.']

In [7]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

In [8]:
print("Stemmed Tokens:")
for token in stemmed_tokens:
    print(token)

Stemmed Tokens:
natur
languag
process
(
nlp
)
subfield
linguist
,
comput
scienc
,
artifici
intellig
concern
interact
comput
human
languag
,
particular
program
comput
process
analyz
larg
amount
natur
languag
data
.
token
crucial
step
nlp
,
text
divid
smaller
unit
call
token
.
part-of-speech
tag
(
po
tag
)
assign
grammat
inform
token
,
whether
noun
,
verb
,
adject
,
etc
.
stop
word
common
word
often
filter
nlp
task
typic
carri
much
mean
.
stem
lemmat
techniqu
use
reduc
word
base
root
form
.


In [9]:
print("Lemmatized Tokens:")
for token in lemmatized_tokens:
    print(token)

Lemmatized Tokens:
Natural
language
processing
(
NLP
)
subfield
linguistics
,
computer
science
,
artificial
intelligence
concerned
interaction
computer
human
language
,
particular
program
computer
process
analyze
large
amount
natural
language
data
.
Tokenization
crucial
step
NLP
,
text
divided
smaller
unit
called
token
.
Part-of-speech
tagging
(
POS
tagging
)
assigns
grammatical
information
token
,
whether
noun
,
verb
,
adjective
,
etc
.
Stop
word
common
word
often
filtered
NLP
task
typically
carry
much
meaning
.
Stemming
lemmatization
technique
used
reduce
word
base
root
form
.
