In [3]:
# 2. Import libraries
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Initialize stemmer and stopwords
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Today's top India news article (RBI rate cut example)
text = """The Reserve Bank of India (RBI), under Governor Sanjay Malhotra, has implemented
its most significant interest rate cut in five years, signaling a bold shift toward aggressive
economic growth support."""

# 3. Tokenization (spaCy)
print(" Tokenization:")
doc = nlp(text)
tokens = [token.text for token in doc]
print(tokens)
print("\n")

# 4. Stemming (NLTK)
print("Stemming:")
words = word_tokenize(text)
for word in words:
    if word.isalpha():  # ignore punctuation/numbers
        print(f"{word} -> {stemmer.stem(word)}")
print("\n")

# 5. Lemmatization (spaCy)
print("Lemmatization:")
for token in doc:
    if token.is_alpha:
        print(f"{token.text} -> {token.lemma_}")
print("\n")

# 6. Stop Word Removal (NLTK)
print("Stop Word Removal:")
filtered = [word.lower() for word in word_tokenize(text) if word.isalpha() and word.lower() not in stop_words]
print(filtered)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


🔹 Tokenization:
['The', 'Reserve', 'Bank', 'of', 'India', '(', 'RBI', ')', ',', 'under', 'Governor', 'Sanjay', 'Malhotra', ',', 'has', 'implemented', '\n', 'its', 'most', 'significant', 'interest', 'rate', 'cut', 'in', 'five', 'years', ',', 'signaling', 'a', 'bold', 'shift', 'toward', 'aggressive', '\n', 'economic', 'growth', 'support', '.']


🔹 Stemming:
The -> the
Reserve -> reserv
Bank -> bank
of -> of
India -> india
RBI -> rbi
under -> under
Governor -> governor
Sanjay -> sanjay
Malhotra -> malhotra
has -> ha
implemented -> implement
its -> it
most -> most
significant -> signific
interest -> interest
rate -> rate
cut -> cut
in -> in
five -> five
years -> year
signaling -> signal
a -> a
bold -> bold
shift -> shift
toward -> toward
aggressive -> aggress
economic -> econom
growth -> growth
support -> support


🔹 Lemmatization:
The -> the
Reserve -> Reserve
Bank -> Bank
of -> of
India -> India
RBI -> RBI
under -> under
Governor -> Governor
Sanjay -> Sanjay
Malhotra -> Malhotra
has -> h