# Q1. Write a Python function that takes a paragraph as input and performs the following tasks step by step:

Tokenize the paragraph into words.

Remove punctuation and lowercase the words.

Apply stemming.

Apply lemmatization.

# 🛠 Your Task:
Use NLTK or spaCy for tokenization, stemming, and lemmatization.
Print output at each stage so you can compare.

In [8]:
sample = """\"Natural Language Processing (NLP) is amazing!\" said Sarah.
\"But... how does it actually work?\" she wondered.
Well, it's not magic—it's math, data, and a lot of clever algorithms.
With tools like NLTK, spaCy, and transformers, we can build chatbots, summarize articles, or even translate languages! Isn’t that incredible?"""


#Tokenization
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')

tokenized_words=word_tokenize(sample)
print('Tokenized Paragraph:',tokenized_words)


#Lowercase + Punctuation
import string
cleaned_words=[word.lower() for word in tokenized_words if word not in string.punctuation]
print('Lowercase + Punctuation:',cleaned_words)

#Stemming
from nltk.stem import PorterStemmer
stemmer=PorterStemmer()
stemmed_words=[stemmer.stem(word) for word in cleaned_words]
print('Stemming:',stemmed_words)

#Lemmatization
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lematizer=WordNetLemmatizer()
lemmatized_words=[lematizer.lemmatize(word) for word in cleaned_words]
print('Lemmatization:',lemmatized_words)





[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Tokenized Paragraph: ['``', 'Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'amazing', '!', "''", 'said', 'Sarah', '.', '``', 'But', '...', 'how', 'does', 'it', 'actually', 'work', '?', "''", 'she', 'wondered', '.', 'Well', ',', 'it', "'s", 'not', 'magic—it', "'s", 'math', ',', 'data', ',', 'and', 'a', 'lot', 'of', 'clever', 'algorithms', '.', 'With', 'tools', 'like', 'NLTK', ',', 'spaCy', ',', 'and', 'transformers', ',', 'we', 'can', 'build', 'chatbots', ',', 'summarize', 'articles', ',', 'or', 'even', 'translate', 'languages', '!', 'Isn', '’', 't', 'that', 'incredible', '?']
Lowercase + Punctuation: ['``', 'natural', 'language', 'processing', 'nlp', 'is', 'amazing', "''", 'said', 'sarah', '``', 'but', '...', 'how', 'does', 'it', 'actually', 'work', "''", 'she', 'wondered', 'well', 'it', "'s", 'not', 'magic—it', "'s", 'math', 'data', 'and', 'a', 'lot', 'of', 'clever', 'algorithms', 'with', 'tools', 'like', 'nltk', 'spacy', 'and', 'transformers', 'we', 'can', 'build', 'chat


# Problem 2: POS Tag and N-Gram Explorer
**📘 Question:**
**Create a function that:**

Tokenizes a given sentence

Tags each token with its Part of Speech (POS)

Extracts N-grams (bigrams & trigrams)

# 🛠 Your Task:
Use nltk.pos_tag() for POS tagging

Use nltk.ngrams() or your own function to get bigrams/trigrams

Print tokens with tags, bigrams, and trigrams

In [10]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ngrams
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

text="The little girl is playing in the garden."

#Tokenization

tokens=word_tokenize(text)
print('Tokens:',tokens)

#POS Tagging
pos_tags=pos_tag(tokens)
print('POS Tags:',pos_tags)

#N-grams
bi_grams=list(ngrams(tokens,2))
tri_grams=list(ngrams(tokens,3))
print('Bigrams:',bi_grams)
print('Trigrams:',tri_grams)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


Tokens: ['The', 'little', 'girl', 'is', 'playing', 'in', 'the', 'garden', '.']
POS Tags: [('The', 'DT'), ('little', 'JJ'), ('girl', 'NN'), ('is', 'VBZ'), ('playing', 'VBG'), ('in', 'IN'), ('the', 'DT'), ('garden', 'NN'), ('.', '.')]
Bigrams: [('The', 'little'), ('little', 'girl'), ('girl', 'is'), ('is', 'playing'), ('playing', 'in'), ('in', 'the'), ('the', 'garden'), ('garden', '.')]
Trigrams: [('The', 'little', 'girl'), ('little', 'girl', 'is'), ('girl', 'is', 'playing'), ('is', 'playing', 'in'), ('playing', 'in', 'the'), ('in', 'the', 'garden'), ('the', 'garden', '.')]


# Problem 3: TF-IDF Feature Extractor
**📘 Question:**
**Build a small search engine-like program that:**

Takes a list of small documents (5–6 sentences each)

Calculates TF-IDF scores for each word in the corpus

For a given word, prints the top 3 documents where the word is most important (highest TF-IDF)

# 🛠 Your Task:
Use sklearn.feature_extraction.text.TfidfVectorizer

Accept a search word and sort the scores across documents

Print top 3 matching documents

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
import numpy as np
from collections import Counter


corpus = [
    "I love NLP and machine learning.",
    "NLP is a subfield of AI.",
    "Deep learning and NLP are connected.",
    "Machine learning powers AI.",
    "Natural language processing is NLP."
]

# tokens=[word_tokenize(doc) for doc in corpus]
# single=[word for sublist in tokens for word in sublist]
# clean= [token for token in single if token not in string.punctuation]
# word_f=Counter(clean)
# final= np.array(list(word_f.items()))
# print(final)

search_word = "NLP"
vectorizer=TfidfVectorizer()
tfidf_matrix=vectorizer.fit_transform(corpus)
feature_names=vectorizer.get_feature_names_out()
print(feature_names)
print(tfidf_matrix.toarray())
print(tfidf_matrix.shape)

# Step 5: Check if the word is in the vocabulary
if search_word.lower() not in feature_names:
    print(f"'{search_word}' not found in vocabulary.")
else:
    # Step 6: Find the column index of the search word
    word_index = np.where(feature_names == search_word.lower())[0][0]
    print(word_index)

    # Step 7: Extract the TF-IDF scores for this word across all documents
    scores = tfidf_matrix[:, word_index].toarray().flatten()
    print(scores)

    # # Step 8: Sort and get top 3 indices
    top_indices = scores.argsort()[::-1][:3]

    # # Step 9: Print the top 3 documents
    print(f"Top 3 documents where '{search_word}' is most important:\n")
    for i in top_indices:
        if scores[i] > 0:
            print(f"Doc {i+1} (Score: {scores[i]:.4f}): {corpus[i]}")
        else:
            print(f"Doc {i+1}: (No relevance for '{search_word}')")



['ai' 'and' 'are' 'connected' 'deep' 'is' 'language' 'learning' 'love'
 'machine' 'natural' 'nlp' 'of' 'powers' 'processing' 'subfield']
[[0.         0.46063063 0.         0.         0.         0.
  0.         0.38236504 0.5709398  0.46063063 0.         0.32165752
  0.         0.         0.         0.        ]
 [0.42408634 0.         0.         0.         0.         0.42408634
  0.         0.         0.         0.         0.         0.29613871
  0.52564409 0.         0.         0.52564409]
 [0.         0.38389033 0.47582217 0.47582217 0.47582217 0.
  0.         0.31866365 0.         0.         0.         0.26806991
  0.         0.         0.         0.        ]
 [0.48648432 0.         0.         0.         0.         0.
  0.         0.40382593 0.         0.48648432 0.         0.
  0.         0.60298477 0.         0.        ]
 [0.         0.         0.         0.         0.         0.40500406
  0.50199209 0.         0.         0.         0.50199209 0.28281359
  0.         0.         0.5