In [1]:
import html
import re

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages this script relies on. Each call is a no-op
# (apart from a log line) once the package is already installed locally.
# NOTE(review): newer NLTK releases may expect differently named packages
# (e.g. 'punkt_tab') -- confirm against the installed NLTK version.
NLTK_RESOURCES = (
    'punkt',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

# ---------------------------
# Read HTML file
# ---------------------------
# NOTE: the path is relative to the current working directory; open() raises
# FileNotFoundError when sample.html is not located there.
with open("sample.html", "r", encoding="utf-8") as file:
    html_text = file.read()

# ---------------------------
# Remove HTML tags
# ---------------------------
# Drop <script>/<style> elements together with their contents: their bodies
# are code, not document text, and would otherwise survive as tokens.
text = re.sub(
    r'<(script|style)\b.*?</\1\s*>',
    ' ',
    html_text,
    flags=re.DOTALL | re.IGNORECASE,
)

# Strip the remaining tags. DOTALL lets a tag span several lines, and every
# tag is replaced by a space so that words separated only by markup
# (e.g. "<p>one</p><p>two</p>") are not fused into a single token.
text = re.sub(r'<.*?>', ' ', text, flags=re.DOTALL)

# Decode character references (&amp;, &nbsp;, ...) so their names do not leak
# into the text as words such as "amp" or "nbsp" once punctuation is removed.
text = html.unescape(text)

# ---------------------------
# Remove punctuation & special characters
# Keep only letters and spaces
# ---------------------------
text = re.sub(r'[^a-zA-Z\s]', '', text)

# Convert to lowercase
text = text.lower()

# ---------------------------
# Tokenization
# ---------------------------
# Split the cleaned, lowercased text into individual word tokens.
tokens = word_tokenize(text)

# ---------------------------
# Stopword removal
# ---------------------------
# Hold the English stopword list in a set so each membership test is cheap.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)
filtered_tokens = [
    token
    for token in tokens
    if token not in stop_words
]

# ---------------------------
# POS tagging + Lemmatization
# ---------------------------
# Single lemmatizer instance reused for every token further down.
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    """Map a POS tag to the matching WordNet part-of-speech constant.

    The decision is made on the tag's leading letter: 'J' -> adjective,
    'V' -> verb, 'R' -> adverb. Every other tag (including those starting
    with 'N') maps to noun, which is also the fallback.
    """
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    # Noun tags and anything unrecognised share the same result.
    return wordnet.NOUN

# Tag the complete token sequence first, then drop stopwords from the tagged
# result. The tagger draws on neighbouring words as context, so tagging the
# already stopword-stripped list would hide that context from it. The words
# kept here are the same, in the same order, as in filtered_tokens.
pos_tags = [
    (word, tag)
    for word, tag in nltk.pos_tag(tokens)
    if word not in stop_words
]

# Lemmatize every kept word with the WordNet POS derived from its tag.
lemmatized_tokens = [
    lemmatizer.lemmatize(word, get_wordnet_pos(tag))
    for word, tag in pos_tags
]

# ---------------------------
# Output
# ---------------------------
print("Original HTML:\n", html_text)
print("\nClean Text:\n", text)
print("\nTokens:\n", tokens)
print("\nAfter Stopword Removal:\n", filtered_tokens)
print("\nPOS Tags:\n", pos_tags)
print("\nLemmatized Tokens:\n", lemmatized_tokens)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\masup\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\masup\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\masup\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\masup\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


FileNotFoundError: [Errno 2] No such file or directory: 'sample.html'