In [1]:
# Import necessary libraries
from nltk.tokenize import word_tokenize # For splitting text into tokens
from nltk.corpus import stopwords # For stop word removal
from nltk.stem import PorterStemmer # For stemming words
from nltk.stem import WordNetLemmatizer # For lemmatizing words
from nltk import pos_tag, download # For POS tagging
from nltk.corpus import wordnet # For WordNet integration (lemmatization)
import re # For special character removal

In [2]:
# Fetch the NLTK data packages used by the later cells. Each call logs its
# progress and reports "already up-to-date" when the package is present.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead - confirm against the installed version.
download('punkt') # Tokenizer data needed by word_tokenize
download('stopwords') # Stop word lists read via stopwords.words(...)
download('wordnet') # WordNet data backing WordNetLemmatizer
download('averaged_perceptron_tagger') # Tagger model used by pos_tag

[nltk_data] Downloading package punkt to C:\Users\Jarshana
[nltk_data]     Shrestha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Jarshana
[nltk_data]     Shrestha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Jarshana
[nltk_data]     Shrestha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Jarshana
[nltk_data]     Shrestha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [14]:
# Sample sentence that the following cells tokenize, filter, stem and lemmatize
# (students can replace it with their own text)
text = "AI is transforming industries rapidly! From healthcare to education, its applications are endless."

In [15]:
# Tokenization - Splitting text into individual word and punctuation tokens.
# Punctuation marks come out as separate tokens (e.g. '!' and ','); the
# special-character removal cell filters them out later.
tokens = word_tokenize(text)
print("\nTokens:", tokens)


Tokens: ['AI', 'is', 'transforming', 'industries', 'rapidly', '!', 'From', 'healthcare', 'to', 'education', ',', 'its', 'applications', 'are', 'endless', '.']


In [16]:
# Stop Word Removal - dropping very common words (e.g. "is", "to") that add
# little semantic value
stop_words = set(stopwords.words('english'))  # a set gives fast membership checks

# Compare in lower case so capitalised stop words such as "From" are removed too
tokens_without_stopwords = list(
    filter(lambda token: token.lower() not in stop_words, tokens)
)
print("\nTokens After Stop Word Removal:", tokens_without_stopwords)


Tokens After Stop Word Removal: ['AI', 'transforming', 'industries', 'rapidly', '!', 'healthcare', 'education', ',', 'applications', 'endless', '.']


In [17]:
# Special Character Removal - Removing punctuation or special symbols.
# Every character that is neither a word character nor whitespace is stripped
# from each token; tokens that end up empty (e.g. '!' or ',') are dropped.
# Each token is substituted exactly once and the result is reused for the
# emptiness check, rather than running the same re.sub twice per token.
# Note: \w also matches '_', so underscores are kept.
tokens_cleaned = [
    cleaned
    for cleaned in (re.sub(r'[^\w\s]', '', word) for word in tokens_without_stopwords)
    if cleaned
]

print("\nTokens After Special Character Removal:", tokens_cleaned)


Tokens After Special Character Removal: ['AI', 'transforming', 'industries', 'rapidly', 'healthcare', 'education', 'applications', 'endless']


In [18]:
# Stemming - cutting each cleaned token down to its Porter stem.
# Stems are lower-cased and need not be dictionary words (e.g. 'industri').
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, tokens_cleaned))
print("\nStemmed Tokens:", stemmed_tokens)


Stemmed Tokens: ['ai', 'transform', 'industri', 'rapidli', 'healthcar', 'educ', 'applic', 'endless']


In [19]:
# Lemmatization with POS Tagging - word normalization guided by part of speech.
# NOTE: the helper below tags each word on its own, so the POS passed to the
# lemmatizer is not derived from the surrounding sentence.
lemmatizer = WordNetLemmatizer()

# Helper function to get WordNet POS tag
def get_wordnet_pos(word):
    """Return the WordNet POS constant to use when lemmatizing ``word``.

    The word is tagged on its own (a one-element list is handed to
    ``pos_tag``), so neighbouring words do not influence the tag.

    Args:
        word: A single token (string).

    Returns:
        ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or ``wordnet.ADV``,
        selected by the first letter of the tag ("J", "N", "V", "R");
        ``wordnet.NOUN`` for any other tag.
    """
    tag = pos_tag([word])[0][1][0].upper()  # First letter of the POS tag
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)  # Default to noun if tag is not found

In [20]:
# Apply lemmatization - look up each cleaned token's WordNet POS, then
# lemmatize the token with that POS
lemmatized_tokens = [
    lemmatizer.lemmatize(token, pos=get_wordnet_pos(token))
    for token in tokens_cleaned
]

print("\nLemmatized Tokens (with POS):", lemmatized_tokens)


Lemmatized Tokens (with POS): ['AI', 'transform', 'industry', 'rapidly', 'healthcare', 'education', 'application', 'endless']


In [21]:
# Final Reflection - print the output of every preprocessing stage, one per line
print("\n**Final Results:**")
print(
    *(
        f"{label} {stage_tokens}"
        for label, stage_tokens in (
            ("Original Tokens:", tokens),
            ("Tokens After Stop Word Removal:", tokens_without_stopwords),
            ("Tokens After Special Character Removal:", tokens_cleaned),
            ("Stemmed Tokens:", stemmed_tokens),
            ("Lemmatized Tokens:", lemmatized_tokens),
        )
    ),
    sep="\n",
)


**Final Results:**
Original Tokens: ['AI', 'is', 'transforming', 'industries', 'rapidly', '!', 'From', 'healthcare', 'to', 'education', ',', 'its', 'applications', 'are', 'endless', '.']
Tokens After Stop Word Removal: ['AI', 'transforming', 'industries', 'rapidly', '!', 'healthcare', 'education', ',', 'applications', 'endless', '.']
Tokens After Special Character Removal: ['AI', 'transforming', 'industries', 'rapidly', 'healthcare', 'education', 'applications', 'endless']
Stemmed Tokens: ['ai', 'transform', 'industri', 'rapidli', 'healthcar', 'educ', 'applic', 'endless']
Lemmatized Tokens: ['AI', 'transform', 'industry', 'rapidly', 'healthcare', 'education', 'application', 'endless']


____________________

In [22]:
# New sentence - rebinds `text`, so the cells below operate on this input
text = "The sun rises in the east, setting a golden glow over the horizon."

In [23]:
# Tokenization - split the new sentence into word and punctuation tokens
# (rebinds `tokens` from the previous example)
tokens = word_tokenize(text)
print("\nTokens:", tokens)


Tokens: ['The', 'sun', 'rises', 'in', 'the', 'east', ',', 'setting', 'a', 'golden', 'glow', 'over', 'the', 'horizon', '.']


In [24]:
# Stop Word Removal - keep only tokens whose lower-cased form is not an
# English stop word (so "The" is removed as well as "the")
stop_words = {*stopwords.words('english')}
tokens_without_stopwords = [
    token
    for token in tokens
    if not (token.lower() in stop_words)
]

print("\nTokens After Stop Word Removal:", tokens_without_stopwords)


Tokens After Stop Word Removal: ['sun', 'rises', 'east', ',', 'setting', 'golden', 'glow', 'horizon', '.']


In [25]:
# Special Character Removal - strip every character that is neither a word
# character nor whitespace, and drop tokens that become empty (e.g. ',' or '.').
# The substitution runs once per token and its result is reused for the
# emptiness check, rather than calling re.sub twice for each token.
# Note: \w also matches '_', so underscores are kept.
tokens_cleaned = [
    cleaned
    for cleaned in (re.sub(r'[^\w\s]', '', word) for word in tokens_without_stopwords)
    if cleaned
]

print("\nTokens After Special Character Removal:", tokens_cleaned)


Tokens After Special Character Removal: ['sun', 'rises', 'east', 'setting', 'golden', 'glow', 'horizon']


In [26]:
# Stemming - reduce every cleaned token to its Porter stem
stemmer = PorterStemmer()
stemmed_tokens = [*map(stemmer.stem, tokens_cleaned)]

print("\nStemmed Tokens:", stemmed_tokens)


Stemmed Tokens: ['sun', 'rise', 'east', 'set', 'golden', 'glow', 'horizon']


In [27]:
# Lemmatization with POS Tagging - creates a fresh WordNetLemmatizer, rebinding
# the `lemmatizer` name used by the lemmatization cell that follows
lemmatizer = WordNetLemmatizer()

# Helper function for POS tagging
def get_wordnet_pos(word):
    """Map the POS tag of ``word`` (tagged in isolation) to a WordNet POS constant.

    This repeats the helper of the same name defined in an earlier cell;
    running this cell simply rebinds the name.

    Args:
        word: A single token (string).

    Returns:
        The WordNet constant for adjective, noun, verb or adverb, picked by
        the first letter of the tag; noun when the letter is anything else.
    """
    _, tagged_pos = pos_tag([word])[0]
    initial = tagged_pos[0].upper()
    wordnet_pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    # Any other initial falls back to noun
    return wordnet_pos_by_initial.get(initial, wordnet.NOUN)

In [28]:
# Lemmatize every cleaned token using the WordNet POS returned by get_wordnet_pos
lemmatized_tokens = list(
    map(
        lambda token: lemmatizer.lemmatize(token, get_wordnet_pos(token)),
        tokens_cleaned,
    )
)

print("\nLemmatized Tokens (with POS):", lemmatized_tokens)


Lemmatized Tokens (with POS): ['sun', 'rise', 'east', 'set', 'golden', 'glow', 'horizon']


In [29]:
# Final Reflection - list the result of each preprocessing stage for the new sentence
print("\n**Final Results:**")
print(
    "\n".join(
        " ".join((label, str(stage_tokens)))
        for label, stage_tokens in (
            ("Original Tokens:", tokens),
            ("Tokens After Stop Word Removal:", tokens_without_stopwords),
            ("Tokens After Special Character Removal:", tokens_cleaned),
            ("Stemmed Tokens:", stemmed_tokens),
            ("Lemmatized Tokens:", lemmatized_tokens),
        )
    )
)


**Final Results:**
Original Tokens: ['The', 'sun', 'rises', 'in', 'the', 'east', ',', 'setting', 'a', 'golden', 'glow', 'over', 'the', 'horizon', '.']
Tokens After Stop Word Removal: ['sun', 'rises', 'east', ',', 'setting', 'golden', 'glow', 'horizon', '.']
Tokens After Special Character Removal: ['sun', 'rises', 'east', 'setting', 'golden', 'glow', 'horizon']
Stemmed Tokens: ['sun', 'rise', 'east', 'set', 'golden', 'glow', 'horizon']
Lemmatized Tokens: ['sun', 'rise', 'east', 'set', 'golden', 'glow', 'horizon']
