In [1]:
# Import necessary libraries
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Sample text that is pushed through every preprocessing stage below
text = "Text preprocessing is an important step in natural language processing. It involves tokenization, filtration, script validation, stop word removal, and stemming."

# Tokenization: break the raw text into word and punctuation tokens
tokens = word_tokenize(text)

# Filtration: delete every character that is not an ASCII letter or digit,
# then lowercase what is left. A token consisting only of punctuation
# collapses to an empty string at this stage.
_non_alnum = re.compile(r'[^a-zA-Z0-9]')
filtered_tokens = [_non_alnum.sub('', tok).lower() for tok in tokens]

# Script Validation: keep only tokens made up entirely of Latin letters.
# This also discards the empty strings left behind by filtration and any
# token that still contains a digit.
_latin_only = re.compile('^[a-zA-Z]+$')
latin_tokens = list(filter(_latin_only.match, filtered_tokens))

# Stop Word Removal: drop tokens found in NLTK's English stop-word list
# (held in a set so each membership test is a hash lookup)
stop_words = set(stopwords.words('english'))
filtered_tokens_no_stop = [word for word in latin_tokens if word not in stop_words]

# Stemming: reduce each remaining word to its root form with the Porter stemmer
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens_no_stop))

# Display the results: the original text first, then the output of each
# preprocessing stage under its own heading (each heading starts with a
# newline so the sections are separated by a blank line).
print("Original Text:")
print(text)
for heading, stage_output in (
    ("\nTokenization:", tokens),
    ("\nFiltration:", filtered_tokens),
    ("\nScript Validation:", latin_tokens),
    ("\nStop Word Removal:", filtered_tokens_no_stop),
    ("\nStemming:", stemmed_tokens),
):
    print(heading)
    print(stage_output)


Original Text:
Text preprocessing is an important step in natural language processing. It involves tokenization, filtration, script validation, stop word removal, and stemming.

Tokenization:
['Text', 'preprocessing', 'is', 'an', 'important', 'step', 'in', 'natural', 'language', 'processing', '.', 'It', 'involves', 'tokenization', ',', 'filtration', ',', 'script', 'validation', ',', 'stop', 'word', 'removal', ',', 'and', 'stemming', '.']

Filtration:
['text', 'preprocessing', 'is', 'an', 'important', 'step', 'in', 'natural', 'language', 'processing', '', 'it', 'involves', 'tokenization', '', 'filtration', '', 'script', 'validation', '', 'stop', 'word', 'removal', '', 'and', 'stemming', '']

Script Validation:
['text', 'preprocessing', 'is', 'an', 'important', 'step', 'in', 'natural', 'language', 'processing', 'it', 'involves', 'tokenization', 'filtration', 'script', 'validation', 'stop', 'word', 'removal', 'and', 'stemming']

Stop Word Removal:
['text', 'preprocessing', 'important', 's