In [None]:
import json
import re
import warnings

import fitz  # PyMuPDF

# Define the chapter mapping from textbook page ranges
chapter_pages = {
    "Some Basic Concepts of Chemistry": (1, 18),
    "Structure of Atom": (19, 38),
    "Classification of Elements and Periodicity in Properties": (39, 52),
    "Chemical Bonding and Molecular Structure": (53, 76),
    "States of Matter": (77, 94),
    "Thermodynamics": (95, 110),
    "Equilibrium": (111, 134),
    "Redox Reactions": (135, 146),
    "Hydrogen": (147, 156),
    "The s-Block Elements": (157, 168),
    "The p-Block Elements (Group 13 and 14)": (169, 180),
    "Organic Chemistry – Some Basic Principles and Techniques": (181, 202),
    "Hydrocarbons": (203, 226),
    "Environmental Chemistry": (227, 238),
    "The Solid State": (239, 254),
    "Solutions": (255, 274),
    "Electrochemistry": (275, 294),
    "Chemical Kinetics": (295, 316),
    "Surface Chemistry": (317, 332),
    "General Principles and Processes of Isolation of Elements": (333, 344),
    "The p-Block Elements (Group 15, 16, 17 and 18)": (345, 366),
    "The d-and f-Block Elements": (367, 382),
    "Coordination Compounds": (383, 404),
    "Haloalkanes and Haloarenes": (405, 422),
    "Alcohols, Phenols and Ethers": (423, 440),
    "Aldehydes, Ketones and Carboxylic acids": (441, 462),
    "Amines": (463, 478),
    "Biomolecules": (479, 498),
    "Polymers": (499, 510),
    "Chemistry in Everyday Life": (511, 520),
}

# Build a lookup: book page -> chapter
chapter_map = {}
for name, (start, end) in chapter_pages.items():
    for page in range(start, end + 1):
        chapter_map[page] = name

def get_chapter_from_book_page(book_page):
    return chapter_map.get(book_page, "Unknown Chapter")

def decode_unicode(text):
    try:
        return text.encode('utf-8').decode('unicode_escape')
    except Exception:
        return text

def extract_pdf_text(pdf_path, start_page=4):
    doc = fitz.open(pdf_path)
    pages = []
    for i in range(start_page, len(doc)):
        text = doc[i].get_text()
        book_page = i + 1  # Assuming page 1 of PDF = book page 1
        pages.append((book_page, f"\n--- Page {book_page} ---\n" + text))
    return pages

def parse_questions(text, chapter_name):
    result = []

    question_blocks = re.split(r'\n(?=\d+\.)', text)

    for block in question_blocks:
        q_match = re.match(r'(\d+)\.\n(.*)', block, re.DOTALL)
        if not q_match:
            continue

        q_number = q_match.group(1)
        q_content = decode_unicode(q_match.group(2).strip())

        parts = re.split(r'\(a\)', q_content, maxsplit=1)
        if len(parts) < 2:
            continue

        question_text = decode_unicode(parts[0].strip())
        options_text = "(a)" + parts[1]

        option_matches = re.findall(r'\(([a-dA-D])\)\n(.*?)(?=\n\([a-dA-D]\)|\n\d+\.|\Z)', options_text, re.DOTALL)
        options = {}
        for opt in option_matches:
            letter = opt[0].lower()
            opt_text = decode_unicode(opt[1].strip().replace('\n', ' '))
            options[letter] = opt_text

        entry = {
            'chapter': decode_unicode(chapter_name),
            'question': question_text,
            'options': options,
        }
        result.append(entry)
    return result

def main():
    pdf_file_path = "Chemistry-Question-Bank.pdf"
    pages = extract_pdf_text(pdf_file_path, start_page=4)

    questions_data = []
    for book_page, page_text in pages:
        chapter_name = get_chapter_from_book_page(book_page)
        page_questions = parse_questions(page_text, chapter_name)
        questions_data.extend(page_questions)

    with open("questions_output.json", "w", encoding='utf-8') as f:
        json.dump(questions_data, f, indent=2, ensure_ascii=False)

    print("Extraction complete. Parsed questions:")
    for q in questions_data[:3]:
        print(q, "\n" + "-" * 40)

    return questions_data

if __name__ == '__main__':
    questions_data = main()


Extraction complete. Parsed questions:
{'chapter': 'Some Basic Concepts of Chemistry', 'question': 'A mixture of sand and iodine can be separated by', 'options': {'a': 'crystallisation', 'b': 'distillation', 'c': 'sublimation', 'd': 'fractionation'}} 
----------------------------------------
{'chapter': 'Some Basic Concepts of Chemistry', 'question': 'Difference in density is the basis of', 'options': {'a': 'ultrafiltration', 'b': 'molecular sieving', 'c': 'molecular attraction', 'd': 'gravity separation'}} 
----------------------------------------
{'chapter': 'Some Basic Concepts of Chemistry', 'question': 'Which of the following is an example of a heterogeneous\nsubstance?', 'options': {'a': 'Bottled water', 'b': 'Table salt', 'c': 'Pieces of copper', 'd': 'Candle'}} 
----------------------------------------


  return text.encode('utf-8').decode('unicode_escape')
  return text.encode('utf-8').decode('unicode_escape')


In [70]:
# Spot-check a single parsed entry by index.
questions_data[2501]

{'chapter': 'The d-and f-Block Elements',
 'question': 'Which one of the following ions has electronic\nconfiguration [Ar] 3d6 ?',
 'options': {'a': '3+ Ni',
  'b': '3+ Mn',
  'c': '3+ Fe',
  'd': '3+ Co (At. Nos. Mn = 25, Fe = 26, Co = 27, Ni = 28)'}}

In [71]:
# Number of parsed entries before any filtering.
len(questions_data)

3802

In [72]:
# Filter out questions with any empty field
filtered_questions = [
    q for q in questions_data if q['question'].strip() and q['chapter'].strip()
]

# Save the cleaned data
with open("questions_output_cleaned.json", "w") as f:
    json.dump(filtered_questions, f, indent=2)

print("Filtered and saved. Sample cleaned questions:")
for q in filtered_questions[:3]:
    print(q, "\n" + "-"*40)

Filtered and saved. Sample cleaned questions:
{'chapter': 'Some Basic Concepts of Chemistry', 'question': 'A mixture of sand and iodine can be separated by', 'options': {'a': 'crystallisation', 'b': 'distillation', 'c': 'sublimation', 'd': 'fractionation'}} 
----------------------------------------
{'chapter': 'Some Basic Concepts of Chemistry', 'question': 'Difference in density is the basis of', 'options': {'a': 'ultrafiltration', 'b': 'molecular sieving', 'c': 'molecular attraction', 'd': 'gravity separation'}} 
----------------------------------------
{'chapter': 'Some Basic Concepts of Chemistry', 'question': 'Which of the following is an example of a heterogeneous\nsubstance?', 'options': {'a': 'Bottled water', 'b': 'Table salt', 'c': 'Pieces of copper', 'd': 'Candle'}} 
----------------------------------------


In [73]:
# Number of entries left after dropping blank questions/chapters.
len(filtered_questions)

3115