# Sample Question Extraction

In [None]:
#!pip install pymupdf

In [None]:
import fitz
import re
import os
import json
import google.generativeai as genai

# Read the Gemini API key from the environment so the secret is never stored
# in the notebook. Raises KeyError if GOOGLE_API_KEY is not set.
# NOTE(review): the key that was previously hard-coded on this line should be
# treated as leaked and revoked.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [None]:


# Prompt used when extracting a PDF grouped by IELTS question type.
# It is rendered with str.format(text=...): "{text}" is the only placeholder,
# and the doubled braces "{{" / "}}" become literal braces in the JSON skeleton.
PROMPT_TEMPLATE_TYPE = """
You are an expert IELTS Listening data extractor.
Below is text extracted from an IELTS Listening test PDF.

Your task:
1. Identify and separate the text according to the following IELTS question types:
   - Form Completion
   - Multiple Choice
   - Short Answer Question
   - Sentence Completion
   - Matching 1
   - Matching 2
   - Labelling
   - Note Completion
2. For each question type, extract and organize the following:
   - "Question": all question text for that type
   - "Answer": all answer keys or provided answers
   - "Transcript": all relevant transcript text for that type
3. If any section is missing, leave it as an empty string "".
4. Return only **valid JSON** (no markdown, no explanations) in this exact structure:

{{
  "Form Completion": {{"Question": "", "Answer": "", "Transcript": ""}},
  "Multiple Choice": {{"Question": "", "Answer": "", "Transcript": ""}},
  "Short Answer Question": {{"Question": "", "Answer": "", "Transcript": ""}},
  "Sentence Completion": {{"Question": "", "Answer": "", "Transcript": ""}},
  "Matching 1": {{"Question": "", "Answer": "", "Transcript": ""}},
  "Matching 2": {{"Question": "", "Answer": "", "Transcript": ""}},
  "Labelling": {{"Question": "", "Answer": "", "Transcript": ""}},
  "Note Completion": {{"Question": "", "Answer": "", "Transcript": ""}}
}}

Here is the extracted PDF text:
---
{text}
---
Output only valid JSON, without markdown formatting or additional comments.
"""

# Prompt used when extracting a PDF grouped by test section (Section 1-4).
# It is rendered with str.format(text=...): "{text}" is the only placeholder,
# and the doubled braces "{{" / "}}" become literal braces in the JSON skeleton.
PROMPT_TEMPLATE_SECTION = """
You are an expert IELTS Listening data extractor.
Below is text extracted from an IELTS Listening test PDF.

Your task:
1. Identify and separate the text according to each Section (Section 1, Section 2, Section 3, Section 4).
2. For each section, extract and organize the following:
   - "Question": all question text for that section
   - "Answer": all answer keys or provided answers
   - "Transcript": all relevant transcript text for that section
3. If any section is missing, leave it as an empty string "".
4. Return only **valid JSON** (no markdown, no explanations) in this exact structure:

{{
  "Section 1": {{"Question": "", "Answer": "", "Transcript": ""}},
  "Section 2": {{"Question": "", "Answer": "", "Transcript": ""}},
  "Section 3": {{"Question": "", "Answer": "", "Transcript": ""}},
  "Section 4": {{"Question": "", "Answer": "", "Transcript": ""}}
}}

Here is the extracted PDF text:
---
{text}
---
Output only valid JSON, without markdown formatting or additional comments.
"""

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of the PDF at ``pdf_path``.

    Pages are read in document order with ``page.get_text("text")`` and
    concatenated without any separator.
    """
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text("text") for page in document]
    return "".join(page_texts)

def safe_json_parse(response_text):
    """Parse JSON out of a model response, tolerating common wrapping noise.

    The response may be wrapped in Markdown code fences or surrounded by
    extra commentary. Parsing is attempted in this order:

    1. the span from the first ``{`` to the last ``}`` (or the whole text
       when no such span exists);
    2. only the first complete JSON object, which covers responses whose
       trailing commentary itself contains a ``}``;
    3. a heuristic repair that escapes quotes which look like unescaped
       inner quotes.

    Args:
        response_text: Raw text returned by the model. ``None``, non-string
            and blank values are treated as a parse failure.

    Returns:
        The decoded JSON value, or ``None`` if nothing could be parsed (a
        diagnostic is printed in that case).
    """
    if not isinstance(response_text, str) or not response_text.strip():
        print("JSON parsing failed: empty or non-text response")
        print("\nRaw output from model:\n", response_text)
        return None

    # One pass removes both "```json" and bare "```" fence markers.
    cleaned = re.sub(r"```(?:json)?", "", response_text).strip()

    # Greedy span: first "{" through the last "}".
    match = re.search(r'\{[\s\S]*\}', cleaned)
    if match:
        cleaned = match.group(0)

    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        pass

    if match:
        # The greedy span may have swallowed trailing text that contains a
        # "}". raw_decode stops at the end of the first complete object.
        try:
            parsed, _ = json.JSONDecoder().raw_decode(cleaned)
            return parsed
        except json.JSONDecodeError:
            pass

    # Last resort: escape quotes that appear to sit inside a string value.
    # This is a heuristic and can fail on otherwise unusual input.
    repaired = re.sub(r'(?<!\\)"(?=\s*[^":,{}\[\]]+\s*[:,\]])', r'\\"', cleaned)
    try:
        return json.loads(repaired)
    except json.JSONDecodeError as e:
        print("JSON parsing failed:", e)
        print("\nRaw output from model:\n", response_text)
        return None


def extract_ielts_data(text, mode="type"):
    """Ask Gemini to structure extracted PDF text and return the parsed JSON.

    ``mode == "type"`` selects the per-question-type prompt; any other value
    selects the per-section prompt. The model reply is passed through
    ``safe_json_parse``, so the result is whatever that helper returns
    (``None`` when the reply cannot be parsed).
    """
    if mode == "type":
        template = PROMPT_TEMPLATE_TYPE
    else:
        template = PROMPT_TEMPLATE_SECTION
    filled_prompt = template.format(text=text)

    reply = genai.GenerativeModel("gemini-2.0-flash").generate_content(filled_prompt)
    return safe_json_parse(reply.text)

def process_pdf(pdf_path, mode="type"):
    """Extract structured IELTS data from one PDF and save it as JSON.

    The PDF text is extracted, sent to the model via ``extract_ielts_data``
    and, if a result comes back, written as UTF-8 JSON:

    * paths containing ``sample_type.pdf`` are written to
      ``sample_question_type/sample_type.json``;
    * every other PDF is written to ``extracted_test/<pdf stem>.json``.

    Args:
        pdf_path: Path of the PDF to process.
        mode: Forwarded to ``extract_ielts_data`` ("type" or "section").

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    print(f"\nProcessing {pdf_path} in mode: {mode}")
    text = extract_text_from_pdf(pdf_path)
    structured_data = extract_ielts_data(text, mode=mode)

    if not structured_data:
        print(f"Failed to extract data from {pdf_path}")
        return

    if "sample_type.pdf" in pdf_path:
        output_file = os.path.join("sample_question_type", "sample_type.json")
    else:
        # splitext swaps only the real extension; str.replace would also
        # rewrite a ".pdf" occurring elsewhere in the file name.
        stem, _ = os.path.splitext(os.path.basename(pdf_path))
        output_file = os.path.join("extracted_test", stem + ".json")

    # Create the directory that is actually written to, not only
    # "extracted_test".
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(structured_data, f, indent=2, ensure_ascii=False)

    print(f"Saved structured data to {output_file}")


In [5]:
# Build the question-type reference from the dedicated sample PDF.
process_pdf("sample_question_type/sample_type.pdf", mode="type")
# Process sample_1.pdf through sample_9.pdf section by section
# (range(1, 10) stops at 9).
for i in range(1,10):
    pdf_path = f"sample_test/sample_{i}.pdf"
    process_pdf(pdf_path, mode="section")

Saved structured data to sample_question_type\sample_type.json

Processing sample_test/sample_1.pdf in mode: section
Saved structured data to extracted_test\sample_1.json

Processing sample_test/sample_2.pdf in mode: section
Saved structured data to extracted_test\sample_2.json

Processing sample_test/sample_3.pdf in mode: section
Saved structured data to extracted_test\sample_3.json

Processing sample_test/sample_4.pdf in mode: section
Saved structured data to extracted_test\sample_4.json

Processing sample_test/sample_5.pdf in mode: section
Saved structured data to extracted_test\sample_5.json

Processing sample_test/sample_6.pdf in mode: section
Saved structured data to extracted_test\sample_6.json

Processing sample_test/sample_7.pdf in mode: section
Saved structured data to extracted_test\sample_7.json

Processing sample_test/sample_8.pdf in mode: section
Saved structured data to extracted_test\sample_8.json

Processing sample_test/sample_9.pdf in mode: section
Saved structured da

# Question Type Determination


In [9]:
def load_json(file_path):
    """Read the UTF-8 JSON file at ``file_path`` and return the decoded value."""
    with open(file_path, "r", encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)

def save_json(data, file_path):
    """Write ``data`` to ``file_path`` as UTF-8 JSON.

    The output uses a 2-space indent and keeps non-ASCII characters
    unescaped. An existing file is overwritten.
    """
    with open(file_path, "w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=2, ensure_ascii=False)

def summarize_reference(reference_data):
    """Map each question type to the first 200 characters of its "Question" text.

    Entries without a "Question" key contribute an empty string. The short
    excerpt is only meant as compact context for a prompt.
    """
    return {
        question_type: entry.get("Question", "")[:200]
        for question_type, entry in reference_data.items()
    }

def classify_question_type(reference_data, section_data):
    """Ask Gemini which IELTS question type a section most closely matches.

    The prompt contains a truncated summary of the reference sample (see
    ``summarize_reference``) plus the first 1000 characters of the section's
    "Question" and the first 500 characters of its "Answer" and "Transcript".

    Returns whatever ``safe_json_parse`` yields for the model reply; the
    prompt requests a JSON object with a "QuestionType" key.
    """
    reference_summary = summarize_reference(reference_data)

    prompt = f"""
        You are an IELTS Listening expert.

        Below is a brief description of IELTS question types based on a reference sample:
        {json.dumps(reference_summary, indent=2)}

        Now you are given one section from another IELTS Listening test.
        Identify which question type it most closely matches.

        Choose from:
        - Form Completion
        - Multiple Choice
        - Short Answer Question
        - Sentence Completion
        - Matching 1
        - Matching 2
        - Labelling
        - Note Completion

        Output valid JSON:
        {{
        "QuestionType": "<one of the above types>"
        }}

        Here is the section content:

        Question:
        {section_data.get("Question", '')[:1000]}

        Answer:
        {section_data.get("Answer", '')[:500]}

        Transcript:
        {section_data.get("Transcript", '')[:500]}
        """

    reply = genai.GenerativeModel("gemini-2.5-flash").generate_content(prompt)
    return safe_json_parse(reply.text)

def process_classification(reference_file, input_folder):
    """Add a "QuestionType" to every section of each JSON file in a folder.

    Every ``*.json`` file in ``input_folder`` except ``sample_type.json`` is
    loaded, each of its sections is classified with
    ``classify_question_type``, and the file is rewritten in place.

    A section's "QuestionType" is set to the model's answer, to "Unknown"
    when the reply has no usable "QuestionType", or to "Error" when the
    classification call raises. Sections that are not JSON objects (for
    example an empty string for a missing section) are skipped unchanged.

    Args:
        reference_file: Path of the JSON file with the reference question types.
        input_folder: Folder containing the per-test JSON files to update.
    """
    reference_data = load_json(reference_file)

    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(".json") or filename == "sample_type.json":
            continue

        file_path = os.path.join(input_folder, filename)
        print(f"\nProcessing {filename}")

        data = load_json(file_path)
        if not isinstance(data, dict):
            print(f"Skipping {filename}: expected a JSON object of sections")
            continue

        for section_name, section_data in data.items():
            if not isinstance(section_data, dict):
                # A non-dict section cannot hold a "QuestionType" key; the
                # item assignment below would raise and abort the whole run.
                print(f"  - Skipping {section_name}: section is not a JSON object")
                continue

            print(f"  - Classifying {section_name}")
            try:
                result = classify_question_type(reference_data, section_data)
            except Exception as e:
                # Deliberate best-effort boundary: one failed model call must
                # not stop the remaining sections and files.
                print(f"Skipping {section_name} due to error: {e}")
                section_data["QuestionType"] = "Error"
                continue

            if isinstance(result, dict) and "QuestionType" in result:
                section_data["QuestionType"] = result["QuestionType"]
            else:
                section_data["QuestionType"] = "Unknown"

        save_json(data, file_path)
        print(f"Updated question types saved -> {file_path}")

# process_pdf writes the question-type reference to
# sample_question_type/sample_type.json, so read it from there.
# NOTE(review): the previous path pointed at extracted_test/sample_type.json,
# which the extraction step shown in this notebook never creates.
process_classification(
    reference_file="sample_question_type/sample_type.json",
    input_folder="extracted_test"
)


Processing sample_1.json
  - Classifying Section 1
  - Classifying Section 2
  - Classifying Section 3
  - Classifying Section 4
Updated question types saved -> extracted_test\sample_1.json

Processing sample_2.json
  - Classifying Section 1
  - Classifying Section 2
  - Classifying Section 3
  - Classifying Section 4
Updated question types saved -> extracted_test\sample_2.json

Processing sample_3.json
  - Classifying Section 1
  - Classifying Section 2
  - Classifying Section 3
  - Classifying Section 4
Updated question types saved -> extracted_test\sample_3.json

Processing sample_4.json
  - Classifying Section 1
  - Classifying Section 2
  - Classifying Section 3
  - Classifying Section 4
Updated question types saved -> extracted_test\sample_4.json

Processing sample_5.json
  - Classifying Section 1
  - Classifying Section 2
  - Classifying Section 3
  - Classifying Section 4
Updated question types saved -> extracted_test\sample_5.json

Processing sample_6.json
  - Classifying Sec