In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\isatyamks\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\isatyamks\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\isatyamks\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
def preprocess_text(text):
    """Tokenizes the text into sentences and cleans each sentence."""
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))
    clean_sentences = []

    for sentence in sentences:
        words = word_tokenize(sentence)
        filtered_words = [word for word in words if word.lower() not in stop_words and word not in string.punctuation]
        clean_sentences.append(' '.join(filtered_words))

    return sentences, clean_sentences

In [5]:
def extract_keywords(sentences):
    """Extracts keywords from sentences using TF-IDF."""
    vectorizer = TfidfVectorizer(max_features=5)
    tfidf_matrix = vectorizer.fit_transform(sentences)
    keywords = vectorizer.get_feature_names_out()
    return keywords

In [7]:
def generate_questions(sentence):
    """Generates questions based on simple syntactic rules."""
    words = word_tokenize(sentence)
    pos_tags = nltk.pos_tag(words)
    questions = []
        # Rule 1: If sentence has a proper noun (NNP), form a 'What' question
    for i, (word, tag) in enumerate(pos_tags):
        if tag == 'NNP':
            question = sentence.replace(word, f"What is {word}?")
            questions.append(question)
            break

    # Rule 2: If sentence starts with a verb, form a 'How' question
    if pos_tags[0][1].startswith('VB'):
        question = f"How does {sentence}?"
        questions.append(question)

    return questions

In [8]:
def rank_questions(questions, keywords):
    """Ranks questions based on relevance to extracted keywords."""
    ranked = []
    for question in questions:
        score = sum(1 for word in word_tokenize(question) if word.lower() in keywords)
        ranked.append((question, score))

    # Sort by score
    ranked = sorted(ranked, key=lambda x: x[1], reverse=True)
    return [q for q, _ in ranked[:5]]

In [9]:
def main(text):
    """Processes the input text and returns the top 5 questions."""
    sentences, clean_sentences = preprocess_text(text)
    keywords = extract_keywords(clean_sentences)
    all_questions = []

    for sentence in sentences:
        questions = generate_questions(sentence)
        all_questions.extend(questions)

    top_questions = rank_questions(all_questions, keywords)
    return top_questions

In [16]:
paragraph = (
        "My name is Satyam and i am a computer science student"
        "i live in bihar which is a state of India"
        "I love machine learning and artificial intelligence so much"
    )

questions = main(paragraph)
print("Top 5 Questions:")
for i, q in enumerate(questions, 1):
    print(f"{i}. {q}")


Top 5 Questions:
1. My name is What is Satyam? and i am a computer science studenti live in bihar which is a state of IndiaI love machine learning and artificial intelligence so much


In [17]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\isatyamks\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [14]:
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\isatyamks\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

In [18]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

ModuleNotFoundError: No module named 'transformers'

In [21]:
!pip install tansformer

Collecting transformers
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub<1.0,>=0.24.0 (from transformers)
  Downloading huggingface_hub-0.26.5-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp312-none-win_amd64.whl.metadata (3.9 kB)
Downloading transformers-4.47.0-py3-none-any.whl (10.1 MB)
   ---------------------------------------- 0.0/10.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/10.1 MB ? eta -:--:--
   ----- ---------------------------------- 1.3/10.1 MB 5.2 MB/s eta 0:00:02
   ------ --------------------------------- 1.6/10.1 MB 5.6 MB/s eta 0:00:02
   ----------- ---------------------------- 2.9/10.1 MB 4.5 MB/s eta 0:00:02
   -------------- ------------------------- 3.7/10.1 MB 4.4 MB/s eta 0:00:02
   --------------

In [5]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

In [11]:
model_name = "t5-small" 
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)


In [12]:
def generate_questions_t5(paragraph, max_questions=5):
    """Generates questions using the T5 model."""
    # Prepare the input for the model
    input_text = f"generate questions: {paragraph}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # Generate output using the model
    outputs = model.generate(
    input_ids,
    max_length=128,
    num_beams=10,  # Increase beam search width
    temperature=0.7,  # Encourage more creative outputs
    top_p=0.9,  # Nucleus sampling for diverse outputs
    num_return_sequences=max_questions
    )


    # Decode and format the output
    questions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return questions

In [13]:
pqaragraph = (
        "Machine learning is a subset of artificial intelligence that enables systems to learn patterns from data and make predictions. "
        "It is widely used in various domains like healthcare, finance, and technology. "
        "Supervised, unsupervised, and reinforcement learning are the primary categories of machine learning.")
questions = generate_questions_t5(pqaragraph)
print("Top Questions:")
for i, q in enumerate(questions, 1):
    print(f"{i}. {q}")


Top Questions:
1. machine learning: Machine learning is a subset of artificial intelligence that enables systems to learn patterns from data and make predictions. It is widely used in various domains like healthcare, finance, and technology. Supervised, unsupervised, and reinforcement learning are the primary categories of machine learning.
2. questions: Machine learning is a subset of artificial intelligence that enables systems to learn patterns from data and make predictions. It is widely used in various domains like healthcare, finance, and technology. Supervised, unsupervised, and reinforcement learning are the primary categories of machine learning.
3. : Machine learning is a subset of artificial intelligence that enables systems to learn patterns from data and make predictions. It is widely used in various domains like healthcare, finance, and technology. Supervised, unsupervised, and reinforcement learning are the primary categories of machine learning.
4. questions: Machine le