In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [7]:
display(nltk.download("punkt_tab"))
display(nltk.download("stopwords"))

[nltk_data] Downloading package punkt_tab to /home/kshku/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

[nltk_data] Downloading package stopwords to /home/kshku/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
def preprocess_text(text):
    # Tokenize
    tokens = word_tokenize(text)
    print("Tokenized words:", tokens)

    # Filteration (removeing punctuations and special characters)
    filtered_tokens = [word for word in tokens if word.isalnum()]
    print("Filtered tokens:", tokens)

    # Script validation (Removing non-English words/characters)
    validated_token = [word for word in filtered_tokens if re.match("^[A-Za-z]+$", word)]
    print("Script validation tokens:", validated_token)

    # Stop word removal
    stop_words = set(stopwords.words("english"))
    tokens_without_stopwords = [word for word in filtered_tokens if word.lower() not in stop_words]
    print("Tokens after stop word removal:", tokens_without_stopwords)

    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in tokens_without_stopwords]
    print("Stemmed tokens:", stemmed_tokens)
    return stemmed_tokens

In [9]:
text = "Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence"

In [10]:
preprocess_text(text)

Tokenized words: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'fascinating', 'field', 'of', 'Artificial', 'Intelligence']
Filtered tokens: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'fascinating', 'field', 'of', 'Artificial', 'Intelligence']
Script validation tokens: ['Natural', 'Language', 'Processing', 'NLP', 'is', 'a', 'fascinating', 'field', 'of', 'Artificial', 'Intelligence']
Tokens after stop word removal: ['Natural', 'Language', 'Processing', 'NLP', 'fascinating', 'field', 'Artificial', 'Intelligence']
Stemmed tokens: ['natur', 'languag', 'process', 'nlp', 'fascin', 'field', 'artifici', 'intellig']


['natur',
 'languag',
 'process',
 'nlp',
 'fascin',
 'field',
 'artifici',
 'intellig']