In [178]:
from docx import Document
import re
import os
import json
from copy import deepcopy

# Load the Word document; the path is relative to the notebook's working
# directory. `doc` is the parsed python-docx Document object.
doc = Document("questions.docx")

In [179]:
# Blank question record used as a template. It holds a mutable list, so it
# must be copied (deep copy) before being filled in, never mutated directly.
DUMMY_QUESTION = dict(
    question="",          # question text
    answers=[],           # answer option texts, in document order
    correct_answer=None,  # index into `answers`; None until one is found
)

In [180]:
# Dump every image embedded in the document package into `output_folder` and
# record which relationship id (rId) refers to which image file name.
rels = doc.part.rels

output_folder = "images"

# exist_ok=True replaces the check-then-create pair: no race between the
# existence test and the mkdir, and the cell can be re-run safely.
os.makedirs(output_folder, exist_ok=True)

# Maps relationship id (e.g. "rId5") -> image file name (e.g. "image3.jpeg").
idx_to_image = {}

for rel in rels.values():
    # External relationships (hyperlinks and the like) have no target part in
    # the package; python-docx raises when `target_part` is accessed on them.
    # Skip them even if their URL happens to contain the word "image".
    if rel.is_external or "image" not in rel.target_ref:
        continue

    # target_ref is a package-relative path; keep only its last component.
    image_name = rel.target_ref.split("/")[-1]
    idx_to_image[rel.rId] = image_name

    with open(os.path.join(output_folder, image_name), "wb") as img_file:
        img_file.write(rel.target_part.blob)

print(idx_to_image)

{'rId2': 'image1.jpeg', 'rId3': 'image1.jpeg', 'rId4': 'image2.jpeg', 'rId5': 'image3.jpeg', 'rId6': 'image4.jpeg', 'rId7': 'image5.jpeg', 'rId8': 'image6.jpeg', 'rId9': 'image7.jpeg', 'rId10': 'image8.jpeg', 'rId11': 'image9.jpeg', 'rId12': 'image10.jpeg', 'rId13': 'image11.jpeg', 'rId14': 'image12.jpeg', 'rId15': 'image13.jpeg', 'rId16': 'image14.jpeg', 'rId17': 'image15.jpeg', 'rId18': 'image16.jpeg', 'rId19': 'image17.jpeg', 'rId20': 'image18.jpeg'}


In [181]:
# Walk the document paragraph by paragraph and group paragraphs into question
# records shaped like DUMMY_QUESTION (plus optional "explanation" / "image").
# Classification is driven by the formatting of each paragraph's FIRST run:
#   * bold and containing "Câu Hỏi"   -> header that starts a new question
#   * highlight colour 3              -> explanation text
#   * paragraph contains a drawing    -> image attached to the question
#   * non-bold, numbered list item    -> answer option (highlight 6 = correct)
#   * non-bold, before any answer     -> part of the question text

# Highlight colour codes compared against run.font.highlight_color.
# Per python-docx's WD_COLOR_INDEX these are TURQUOISE (3) and RED (6).
HIGHLIGHT_EXPLANATION = 3
HIGHLIGHT_CORRECT_ANSWER = 6

curr_question = None
questions = []


for paragraph in doc.paragraphs:
    if len(paragraph.runs) == 0:
        continue

    run = paragraph.runs[0]

    # A question header closes the previous record and opens a fresh one.
    if run.bold and "Câu Hỏi" in run.text:
        if curr_question is not None:
            questions.append(curr_question)

        curr_question = deepcopy(DUMMY_QUESTION)
        curr_answer_idx = -1  # not read anywhere in this cell; kept as-is
        continue

    # Nothing before the first header belongs to a question. This guard must
    # run before the highlight check below, otherwise a highlighted paragraph
    # in the preamble would try to subscript None.
    if curr_question is None:
        continue

    if run.font.highlight_color == HIGHLIGHT_EXPLANATION:
        # Only the first run's text is stored as the explanation.
        curr_question["explanation"] = run.text
        continue

    # Serialise the paragraph XML once; it is inspected several times below.
    paragraph_xml = paragraph._element.xml

    if "w:drawing" in paragraph_xml:
        # The picture reference looks like <a:blip r:embed="rId##" .../>.
        embed_match = re.search(r'r:embed="(rId\d+)"', paragraph_xml)
        if embed_match:
            curr_question["image"] = idx_to_image[embed_match.group(1)]
        continue

    if run.font.bold:
        continue

    if "w:numPr" in paragraph_xml:
        # Numbered list item -> answer option. Blank items are ignored
        # entirely, so a blank highlighted item can no longer mark the
        # previous answer (or index -1) as the correct one.
        if paragraph.text.strip():
            curr_question["answers"].append(paragraph.text)

            if run.font.highlight_color == HIGHLIGHT_CORRECT_ANSWER:
                curr_question["correct_answer"] = len(curr_question["answers"]) - 1

    elif len(curr_question["answers"]) == 0:
        # Plain text before the first answer extends the question text.
        curr_question["question"] += paragraph.text

# Flush the last open question, if the document contained any header at all.
if curr_question is not None:
    questions.append(curr_question)

# Keep only complete records: at least one answer and a known correct answer.
# The `is not None` test tolerates a placeholder None entry, which the parsing
# step can leave behind when the document contains no question header.
questions = [
    q
    for q in questions
    if q is not None and len(q["answers"]) > 0 and q["correct_answer"] is not None
]

print(len(questions))

# ensure_ascii=False writes the Vietnamese text as raw characters, so the file
# encoding must be pinned to UTF-8 rather than left to the platform default.
# The `with` block guarantees the handle is flushed and closed.
with open("questions.json", "w", encoding="utf-8") as json_file:
    json.dump(questions, json_file, indent=4, ensure_ascii=False)

200
