In [None]:
#############   USE   THIS ONE  FOR STEP 1
import os
import re
import json
import fitz  # PyMuPDF
from docx import Document

def extract_text_from_txt(path):
    with open(path, 'r', encoding='utf-8', errors='ignore') as file:
        return file.read()

def extract_text_from_docx(path):
    doc = Document(path)
    return '\n'.join(para.text.strip() for para in doc.paragraphs if para.text.strip())

def extract_text_from_pdf(path):
    text = ''
    try:
        with fitz.open(path) as doc:
            for page in doc:
                text += page.get_text()
    except Exception as e:
        print(f"Warning: Failed to read PDF '{path}': {e}")
    return text

def load_text(file_path):
    ext = os.path.splitext(file_path)[1].lower()
    loaders = {'.pdf': extract_text_from_pdf, '.docx': extract_text_from_docx, '.txt': extract_text_from_txt}
    if ext not in loaders:
        raise ValueError(f"Unsupported file format: {ext}")
    return loaders[ext](file_path)

def clean_raw_text(raw_text):
    # Basic cleaning: remove page numbers, bullets, weird characters, multiple spaces, multiple newlines
    text = re.sub(r'\n\d+\n', '\n', raw_text)  # Remove isolated page numbers
    text = re.sub(r'[•●▪■\u2022\uf0b7]', '', text)  # Remove common bullets
    text = re.sub(r'[ \t]+', ' ', text)  # Replace multiple spaces/tabs with single space
    text = re.sub(r'\n{3,}', '\n\n', text)  # Limit newlines to max two

    # Remove obvious OCR garbage or corrupted words (customize as needed)
    text = re.sub(r'\bCo\s*i\s*ant\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F]+', '', text)  # Remove control chars

    # Strip trailing/leading spaces on each line
    lines = [line.strip() for line in text.splitlines()]
    return "\n".join(lines)

def is_probable_header(line):
    line = line.strip()
    # Exclude numbered or lettered list items like '1) ...', 'a) ...', '2. ...'
    if re.match(r'^(\d+[\.\)]|[a-zA-Z][\.\)])\s', line):
        return False

    # Ends with colon or dash
    if re.search(r'[:\-]\s*$', line):
        return True

    # Mostly capitalized words, short line
    tokens = line.split()
    if 1 <= len(tokens) <= 15:
        upper_words = sum(1 for t in tokens if t and t[0].isupper())
        if upper_words / len(tokens) > 0.7:
            return True

    # Fully uppercase (ignore short lines)
    if len(line) > 3 and line.isupper():
        return True

    return False


def structure_text_to_sections(text):
    lines = text.splitlines()
    sections = []
    current_header = None
    current_content = []

    for idx, line in enumerate(lines):
        if not line.strip():
            # Empty line, consider as paragraph break - add current content if any
            if current_content:
                # Append paragraph (join with space)
                current_content.append('')  # Add paragraph break as empty string
            continue

        if is_probable_header(line):
            # Save previous section if exists
            if current_header or current_content:
                content_text = " ".join(p for p in current_content if p).strip()
                if current_header is None:
                    # No header before content? Use default header
                    current_header = "Document"
                sections.append({
                    "header": current_header,
                    "content": content_text
                })
                current_content = []
            current_header = line.rstrip(':-').strip()
        else:
            current_content.append(line.strip())

    # Add last section
    if current_header or current_content:
        content_text = " ".join(p for p in current_content if p).strip()
        if current_header is None:
            current_header = "Document"
        sections.append({
            "header": current_header,
            "content": content_text
        })

    # Remove empty content sections if any
    sections = [s for s in sections if s['content'].strip() != '']

    return sections

def save_json(data, path):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

def main(filepath):
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File '{filepath}' does not exist.")

    print(f"Processing file: {filepath}")

    raw_text = load_text(filepath)
    cleaned_text = clean_raw_text(raw_text)
    sections = structure_text_to_sections(cleaned_text)

    if not sections:
        print("⚠️ Warning: No sections found after processing.")
    else:
        print(f"✅ Found {len(sections)} sections.")

    # Save to JSON
    save_json(sections, "structured_output.json")
    print("Structured JSON saved to 'structured_output.json'")

    # Print first 3 sections as preview
    print("\n--- Sample output (first 3 sections) ---\n")
    print(json.dumps(sections[:3], ensure_ascii=False, indent=2))

if __name__ == "__main__":
    main("ex1.pdf")  # Change to your file path


Processing file: ex1.pdf
✅ Found 9 sections.
Structured JSON saved to 'structured_output.json'

--- Sample output (first 3 sections) ---

[
  {
    "header": "Software Engineering Section 1",
    "content": "Difference between Software and Computer programs Software engineering is intended to support professional software development, rather than individual programming. It includes techniques that support program specification, design, and evolution, none of which are normally relevant for personal software development. Many people think that software is simply another word for computer programs. However, when we are talking about software engineering, software is not just the programs themselves but also all associated documentation and configuration data that is required to make these programs operate correctly. A professionally developed software system is often more than a single program. The system usually consists of a number of separate programs and configuration files that are 

In [2]:
###################### USE THIS ONE FOR STEP 2
import json
import re
import spacy
from keybert import KeyBERT
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util

# Load models once
nlp = spacy.load("en_core_web_sm")
kw_model = KeyBERT()
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
ner_pipeline = pipeline(
    "ner",
    model="dslim/bert-base-NER",
    tokenizer="dslim/bert-base-NER",
    aggregation_strategy="simple"
)

def clean_sentence(sentence):
    # Remove leading numbering or bullets in sentences and newlines inside
    sentence = re.sub(r'^[\d\)\.\-\s]+', '', sentence)
    sentence = sentence.replace('\n', ' ').strip()
    return sentence

def segment_sentences(text):
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents if len(sent.text.strip()) > 15]

def extract_keywords(sentence, top_n=5):
    kws = kw_model.extract_keywords(sentence, keyphrase_ngram_range=(1, 3), stop_words='english', top_n=top_n)
    return [kw[0] for kw in kws]

def extract_entities(sentence):
    entities = ner_pipeline(sentence)
    # Aggregate unique entity texts, filter punctuation and spaces
    unique_entities = list({ent['word'].strip() for ent in entities if ent['word'].strip() and not ent['word'].isspace()})
    return unique_entities

def compute_relevance(sentence, context_emb):
    sent_emb = sentence_model.encode(sentence, convert_to_tensor=True)
    score = util.pytorch_cos_sim(sent_emb, context_emb).item()
    return round(score, 4)

def process_section(section, global_context_emb):
    """
    Process a single section {header, content}:
    - Segment content to sentences
    - For each sentence, extract keywords, entities, compute relevance vs global context
    """
    header = section.get('header', '').strip()
    content = section.get('content', '').strip()
    combined_text = f"{header}. {content}"  # Use header+content as local context if needed
    
    # Compute local context embedding for relevance comparison (optional: could use global context too)
    local_context_emb = sentence_model.encode(combined_text, convert_to_tensor=True)
    
    sentences = segment_sentences(content)
    results = []
    for i, sent in enumerate(sentences):
        clean_sent = clean_sentence(sent)
        keywords = extract_keywords(clean_sent)
        entities = extract_entities(clean_sent)
        answer_candidates = entities if entities else keywords
        relevance_score = compute_relevance(clean_sent, global_context_emb)
        
        results.append({
            "header": header,
            "sentence_num": i + 1,
            "sentence": sent,
            "clean_sentence": clean_sent,
            "keywords": keywords,
            "entities": entities,
            "answer_candidates": answer_candidates,
            "relevance_score": relevance_score
        })
    return results

def load_structured_json(path="structured_output.json"):
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data

def build_global_context(sections, max_chars=2048):
    # Combine headers + contents to build a global context string for relevance
    combined_text = " ".join(f"{sec.get('header','')} {sec.get('content','')}" for sec in sections)
    return combined_text[:max_chars]

def main():
    structured_data = load_structured_json("structured_output.json")
    global_context_text = build_global_context(structured_data)
    global_context_emb = sentence_model.encode(global_context_text, convert_to_tensor=True)

    all_results = []
    for section in structured_data:
        section_results = process_section(section, global_context_emb)
        all_results.extend(section_results)

    # Optionally filter by relevance threshold (e.g., 0.5)
    filtered_results = [r for r in all_results if r["relevance_score"] >= 0.5]

    # Save output for MCQ generation
    with open("processed_stage2.json", "w", encoding="utf-8") as f:
        json.dump(filtered_results, f, indent=2, ensure_ascii=False)

    print(f"Processed {len(all_results)} sentences, filtered to {len(filtered_results)} by relevance >= 0.5")
    print("Output saved to 'processed_stage2.json'")

if __name__ == "__main__":
    main()





Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Processed 76 sentences, filtered to 21 by relevance >= 0.5
Output saved to 'processed_stage2.json'


In [3]:
######################## USE THIS ONE FOR STEP 3
import json
import random
from typing import List
import torch
from transformers import pipeline
from nltk.corpus import wordnet
from sentence_transformers import SentenceTransformer, util
import nltk
import re

# Ensure nltk data is downloaded
nltk.download("wordnet")
nltk.download("omw-1.4")

# Load the QG model with proper task
qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-small-qg-hl")

# Load embedding model for distractors
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Input JSON
input_file = r"C:\Users\RusRus\processed_stage2.json"
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)


def generate_question_qg(context: str, answer: str) -> str:
    """Generate a question using valhalla/t5-small-qg-hl with <hl> highlighting."""
    if answer not in context:
        return ""  # Avoid incorrect highlighting
    highlighted = context.replace(answer, f"<hl> {answer} <hl>")
    prompt = f"generate question: {highlighted}"
    try:
        result = qg_pipeline(prompt, max_length=64, do_sample=False)
        return result[0]["generated_text"].strip() if result else ""
    except Exception as e:
        print(f"QG error: {e}")
        return ""


def get_wordnet_distractors(word: str) -> List[str]:
    """Generate distractors using WordNet."""
    distractors = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            name = lemma.name().replace("_", " ").lower()
            if name != word.lower():
                distractors.add(name)
    return list(distractors)[:5]


def get_embedding_distractors(correct_answer: str, context: str, top_k=5) -> List[str]:
    """Find better distractors using sentence embeddings and word filtering."""
    words = list(set(context.lower().split()))
    words = [w.strip(".,()[]") for w in words if len(w) > 3 and w.lower() != correct_answer.lower()]
    words = [w for w in words if w.isalpha()]  # Remove punctuation, numbers, etc.

    if not words:
        return []

    try:
        correct_embedding = embedding_model.encode(correct_answer, convert_to_tensor=True)
        candidate_embeddings = embedding_model.encode(words, convert_to_tensor=True)
        similarities = util.pytorch_cos_sim(correct_embedding, candidate_embeddings)[0]
        sorted_indices = similarities.argsort(descending=True)  # Get most similar
        distractors = []
        for idx in sorted_indices:
            candidate = words[idx]
            if candidate.lower() != correct_answer.lower() and candidate not in distractors:
                distractors.append(candidate)
            if len(distractors) == top_k:
                break
        return distractors
    except Exception as e:
        print(f"Distractor generation error: {e}")
        return []


def clean_question(question: str, answer: str) -> str:
    """Ensure the answer is not directly embedded in the question."""
    q_lower = question.lower()
    a_lower = answer.lower()
    if a_lower in q_lower:
        return question.replace(answer, "_____").strip()
    return question.strip()


def replace_abbreviation_in_question(question: str, answer: str, abbreviation_candidates: List[str]) -> str:
    """
    Replace any detected abbreviation or acronym in the question with the full answer.

    abbreviation_candidates: list of abbreviations/acronyms extracted from the sentence that relate to the answer.
    """
    # To avoid partial replacement, do whole word match with regex
    for abbr in abbreviation_candidates:
        pattern = re.compile(r'\b' + re.escape(abbr) + r'\b', flags=re.IGNORECASE)
        if pattern.search(question):
            question = pattern.sub(answer, question)
    return question


def find_abbreviations(sentence: str, answer: str) -> List[str]:
    """
    Heuristic: find abbreviations in sentence inside parentheses next to answer.

    Example: "Network Interface Card (NIC)" → extract "NIC"
    """
    pattern = re.compile(rf"{re.escape(answer)}\s*\(([^)]+)\)", flags=re.IGNORECASE)
    match = pattern.search(sentence)
    if match:
        # Return all uppercase abbreviations or short acronyms split by comma/semicolon if any
        abbrs = [abbr.strip() for abbr in re.split(r'[;,]', match.group(1))]
        # Filter to plausible abbreviations (usually uppercase or short)
        return [a for a in abbrs if len(a) <= 6 and a.isupper()]
    return []


# Generate MCQs
output_mcqs = []
mcq_target = 7  # Number of MCQs desired
skipped_entries = 0

for entry in data:
    if len(output_mcqs) >= mcq_target:
        break

    sentence = entry.get("clean_sentence", "").strip()
    answer = entry.get("answer_candidates", [])[0] if entry.get("answer_candidates") else ""

    if not sentence or not answer:
        skipped_entries += 1
        continue

    question = generate_question_qg(sentence, answer)
    if not question:
        skipped_entries += 1
        continue

    # Detect abbreviations related to answer
    abbreviations = find_abbreviations(sentence, answer)

    # Replace abbreviation in question with full answer text
    if abbreviations:
        question = replace_abbreviation_in_question(question, answer, abbreviations)

    question = clean_question(question, answer)

    # Relaxed filtering: min question length 3 words, allow no question mark (some models omit)
    if len(question.split()) < 3:
        skipped_entries += 1
        continue

    if answer.lower() in question.lower():
        # If answer still leaks into question, skip
        skipped_entries += 1
        continue

    # Get distractors
    distractors = get_embedding_distractors(answer, sentence)
    if len(distractors) < 3:
        distractors += get_wordnet_distractors(answer)
    distractors = list(set(distractors))[:3]

    if len(distractors) < 3:
        skipped_entries += 1
        continue

    options = distractors + [answer]
    random.shuffle(options)

    output_mcqs.append({
        "header": entry.get("header", "General"),
        "question": question,
        "options": options,
        "answer": answer,
        "source_sentence": sentence
    })

# Save output
output_file = r"C:\Users\RusRus\improved_mcqs.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(output_mcqs, f, indent=2)

print(f"✅ MCQ generation complete. Generated {len(output_mcqs)} MCQs, skipped {skipped_entries} entries due to quality/filtering.")
print(f"Output saved to: {output_file}")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\RusRus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\RusRus\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Device set to use cpu
Both `max_new_tokens` (=256) and `max_length`(=64) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/tran

✅ MCQ generation complete. Generated 7 MCQs, skipped 11 entries due to quality/filtering.
Output saved to: C:\Users\RusRus\improved_mcqs.json


In [None]:
# SECOND OPTION FOR STEP 3 , MORE STRICT FILTERING
# import json
# import random
# from typing import List
# import torch
# from transformers import pipeline
# from nltk.corpus import wordnet
# from sentence_transformers import SentenceTransformer, util
# import nltk

# # Ensure nltk data is downloaded
# nltk.download("wordnet")
# nltk.download("omw-1.4")

# # Load the QG model with proper task
# qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-small-qg-hl")

# # Load embedding model for distractors
# embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# # Input JSON
# input_file = r"C:\Users\RusRus\processed_stage2.json"
# with open(input_file, "r", encoding="utf-8") as f:
#     data = json.load(f)

# def generate_question_qg(context: str, answer: str) -> str:
#     """Generate a question using valhalla/t5-small-qg-hl with <hl> highlighting."""
#     if answer not in context:
#         return ""  # Avoid incorrect highlighting
#     highlighted = context.replace(answer, f"<hl> {answer} <hl>")
#     prompt = f"generate question: {highlighted}"
#     try:
#         result = qg_pipeline(prompt, max_length=64, do_sample=False)
#         return result[0]["generated_text"].strip() if result else ""
#     except Exception as e:
#         print(f"QG error: {e}")
#         return ""

# def get_wordnet_distractors(word: str) -> List[str]:
#     """Generate distractors using WordNet"""
#     distractors = set()
#     for syn in wordnet.synsets(word):
#         for lemma in syn.lemmas():
#             name = lemma.name().replace("_", " ").lower()
#             if name != word.lower():
#                 distractors.add(name)
#     return list(distractors)[:5]

# def get_embedding_distractors(correct_answer: str, context: str, top_k=5) -> List[str]:
#     """Find better distractors using sentence embeddings and word filtering."""
#     words = list(set(context.lower().split()))
#     words = [w.strip(".,()[]") for w in words if len(w) > 3 and w.lower() != correct_answer.lower()]
#     words = [w for w in words if w.isalpha()]  # Remove punctuation, numbers, etc.

#     if not words:
#         return []

#     try:
#         correct_embedding = embedding_model.encode(correct_answer, convert_to_tensor=True)
#         candidate_embeddings = embedding_model.encode(words, convert_to_tensor=True)
#         similarities = util.pytorch_cos_sim(correct_embedding, candidate_embeddings)[0]
#         sorted_indices = similarities.argsort(descending=True)  # Get most similar, not least
#         distractors = []
#         for idx in sorted_indices:
#             candidate = words[idx]
#             if candidate.lower() != correct_answer.lower() and candidate not in distractors:
#                 distractors.append(candidate)
#             if len(distractors) == top_k:
#                 break
#         return distractors
#     except Exception as e:
#         print(f"Distractor generation error: {e}")
#         return []


# def clean_question(question: str, answer: str) -> str:
#     """Ensure the answer is not directly embedded in the question."""
#     q_lower = question.lower()
#     a_lower = answer.lower()
#     if a_lower in q_lower:
#         return question.replace(answer, "_____").strip()
#     return question.strip()

# # Generate MCQs
# output_mcqs = []
# for entry in data:
#     sentence = entry.get("clean_sentence", "").strip()
#     answer = entry.get("answer_candidates", [])[0] if entry.get("answer_candidates") else ""

#     if not sentence or not answer:
#         continue

#     question = generate_question_qg(sentence, answer)
#     if not question:
#         continue

#     question = clean_question(question, answer)
#     if answer.lower() in question.lower():
#         continue  # Skip if answer still leaks into question

#     # Get distractors
#     distractors = get_embedding_distractors(answer, sentence)
#     if len(distractors) < 3:
#         distractors += get_wordnet_distractors(answer)
#     distractors = list(set(distractors))[:3]

#     if len(distractors) < 3:
#         continue

#     options = distractors + [answer]
#     random.shuffle(options)

#     output_mcqs.append({
#         "header": entry.get("header", "General"),
#         "question": question,
#         "options": options,
#         "answer": answer,
#         "source_sentence": sentence
#     })

# # Save output
# output_file = r"C:\Users\RusRus\improved_mcqs.json"
# with open(output_file, "w", encoding="utf-8") as f:
#     json.dump(output_mcqs, f, indent=2)

# print(f"✅ MCQ generation complete. Output saved to: {output_file}")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\RusRus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\RusRus\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Device set to use cpu
Both `max_new_tokens` (=256) and `max_length`(=64) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/tran

✅ MCQ generation complete. Output saved to: C:\Users\RusRus\improved_mcqs.json
