In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# ---------------------------------------------------------
# 1. SETUP & DICTIONARY DOWNLOADS
# ---------------------------------------------------------
# We need to download specific data "packages" for the computer to understand rules.
# 'punkt' is for sentence structure, 'stopwords' is for filler words,
# and 'wordnet' is a dictionary for finding word roots.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [2]:
# ---------------------------------------------------------
# 2. THE INPUT (Raw Text)
# ---------------------------------------------------------
# Imagine an AI Agent receives this request. It's full of "noise":
# mixed cases, punctuation, and emojis that the AI doesn't need for math.
sentence = "The students are learning Artificial Intelligence quickly and happily! ðŸ¤–"
print(f"ðŸŒŸ ORIGINAL REQUEST: {sentence}\n" + "="*50)


ðŸŒŸ ORIGINAL REQUEST: The students are learning Artificial Intelligence quickly and happily! ðŸ¤–


In [3]:
# ---------------------------------------------------------
# 3. LOWERCASE & TOKENIZATION
# ---------------------------------------------------------
# Why: AI treats "The" and "the" as different things. We lowercase for consistency.
# Tokenization: We split the string into a list of individual words (tokens).
tokens = word_tokenize(sentence.lower())
print(f"âœ… STEP 1: TOKENIZATION\nWords are now separated: {tokens}\n")


âœ… STEP 1: TOKENIZATION
Words are now separated: ['the', 'students', 'are', 'learning', 'artificial', 'intelligence', 'quickly', 'and', 'happily', '!', 'ðŸ¤–']



In [4]:
# ---------------------------------------------------------
# 4. STOPWORD & SYMBOL REMOVAL
# ---------------------------------------------------------
# Why: "The", "are", and "and" are 'Stopwords'. They take up memory but
# don't help the AI understand the core meaning.
# '.isalpha()' ensures we remove symbols, numbers, and emojis.
stop_words = set(stopwords.words('english'))
filtered_tokens = [w for w in tokens if w.isalpha() and w not in stop_words]
print(f"âœ… STEP 2: NOISE REMOVAL\nOnly the 'meaningful' words remain: {filtered_tokens}\n")


âœ… STEP 2: NOISE REMOVAL
Only the 'meaningful' words remain: ['students', 'learning', 'artificial', 'intelligence', 'quickly', 'happily']



In [5]:
# ---------------------------------------------------------
# 5. STEMMING
# ---------------------------------------------------------
# Why: This is a fast, aggressive way to find a word's root by chopping the ends off.
# Notice how it changes 'artificial' to 'artifici'. It's fast but a bit messy.
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in filtered_tokens]
print(f"âœ… STEP 3: STEMMING\nWords chopped to their roots: {stemmed}\n")



âœ… STEP 3: STEMMING
Words chopped to their roots: ['student', 'learn', 'artifici', 'intellig', 'quickli', 'happili']



In [6]:
# ---------------------------------------------------------
# 6. LEMMATIZATION
# ---------------------------------------------------------
# Why: This is "smarter" than stemming. It uses a dictionary to find the
# real base form (e.g., 'happily' becomes 'happy').
# This is much better for Agentic AI because it preserves the word's logic.
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(w) for w in filtered_tokens]
print(f"âœ… STEP 4: LEMMATIZATION\nSmart base forms for AI planning: {lemmatized}\n")

print("="*50 + "\nðŸš€ SUCCESS: The text is now ready to be turned into VECTORS (Numbers)!")

âœ… STEP 4: LEMMATIZATION
Smart base forms for AI planning: ['student', 'learning', 'artificial', 'intelligence', 'quickly', 'happily']

ðŸš€ SUCCESS: The text is now ready to be turned into VECTORS (Numbers)!
