# Question Generator

In [1]:
# Import necessary libraries
import os
import re
import json
import pandas as pd
import google.generativeai as genai
import textstat

from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Initialize GenAI
# SECURITY: the API key is read from the GOOGLE_API_KEY environment variable
# rather than being embedded in the notebook. Any key that was previously
# committed in this cell should be treated as leaked and revoked.
# A missing variable raises KeyError here, before any request is made.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
model = genai.GenerativeModel("gemini-2.5-flash")

In [51]:
# Configuration
# Input CSV files read by the "Load Data" cell.
QUESTION_TYPE_CSV = "model_training/processed_data/questionType.csv"
WORD_CSV = "model_training/processed_data/ielts_vocab.csv"
TRAINING_CSV = "model_training/processed_data/training_set.csv"
# Master JSON file: generate_full_set stores every run in it under a timestamp key.
GENERATED_JSON = "model_training/generated_questions/generated_questions.json"
# NOTE: despite its name this is a JSON file; generate_full_set overwrites it
# with the output of the current run only.
TEMP_CSV = "model_training/generated_questions/temp_generated_questions.json"

# Maximum number of generation attempts per question block.
MAX_ATTEMPT = 5
# Reward at which generate_full_set stops retrying (it awards one point for
# each of its four transcript checks).
REWARD_GOAL = 4

In [52]:
# Load Data
# Question-type metadata; generate_full_set looks rows up by the "typeID" column.
question_type_df = pd.read_csv(QUESTION_TYPE_CSV)
# Vocabulary list; calculate_common_word_ratio reads its "Words" column.
common_vocab_df = pd.read_csv(WORD_CSV)
# Reference set; calculate_similarity reads its "transcript" column when present.
training_df = pd.read_csv(TRAINING_CSV)

In [66]:
# Mock Data Only
# Maps a section number to the list of question blocks requested for it.
# Each entry supplies the keys generate_full_set reads: "typeID" (looked up in
# question_type_df), "theme", "topic" and "spec" (all inserted into the prompt).
section_choices = {
    1: [
        {
            "typeID": "T001",
            "theme": "Booking a hotel room",
            "topic": "Asking about room facilities",
            "spec": "Include questions about availability and prices."
        },
        {
            "typeID": "T002",
            "theme": "Booking a restaurant",
            "topic": "Asking about restaurant food",
            "spec": "Include questions about taste and prices."
        }
    ],
    2: [
        {
            "typeID": "T004",
            "theme": "Campus tour",
            "topic": "Identifying main buildings",
            "spec": "Include left/right directions."
        }
    ],
    3: [
        {
            "typeID": "T006",
            "theme": "Group project planning",
            "topic": "Assigning tasks",
            "spec": "Include four students."
        }
    ],
    4: [
        {
            "typeID": "T009",
            "theme": "AI in communication",
            "topic": "Impact of AI",
            "spec": "Use academic style and one real example."
        }
    ]
}

In [None]:
# Prompt Template
# Filled in with str.format() by generate_full_set. Every {placeholder} below
# must be supplied as a keyword argument there; a literal brace added to this
# text would have to be doubled ({{ }}) to survive formatting.
PROMPT_TEMPLATE = """
You are an expert IELTS Listening question generator.
Create realistic IELTS Listening questions and transcripts following the official format.

--- QUESTION REQUIREMENTS ---
Section: {section}
Question Type: {typeID} - {type_name}
Question Numbers: {question_range}
Number of Questions: {question_count}

Theme: {theme}
Specific Topic: {specific_topic}
Additional Specifications from Test Creator: {specifications}

Instructions to Display: {instruction}
Expected Answer Format: {answer_format}
Format Rules: {format}
Key Listening Skills: {key_skills}
Typical Duration: {avg_duration}
Expected Transcript Length: {avg_script_length} words
Audio Speed: {audio_speed}
Key Features: {key_features}

--- OUTPUT REQUIREMENTS ---
1. Produce exactly {question_count} questions.
2. Output MUST be valid JSON ONLY, with these keys:
   "Section", "Type", "Instructions", "Diagram",
   "Questions", "Answers", "Options", "Transcript".
3. "Questions" must be a list of strings.
4. "Answers" must be a list of strings of equal length.
5. The type should be T001, T002, T003 and so on only.
6. Question instruction should be the instructions to display and expected answer format.
7. For multiple-choice types, include "Options" (list of lists).
8. The diagram should be drawn in the characters and plain text only. You should handle the space and next line correctly.
9. Transcript MUST naturally reference ALL question numbers in {question_range}.
10. The transcript should include the introduction as the exact IELTS listening test. Do not include other explanations including question numbers and pause. Only the Narrator, People and their conversation.
11. No Markdown. No explanations. JSON ONLY.

Return the JSON format only.
"""

In [78]:
# JSON Parser
def safe_json_parse(raw):
    """Parse model output as JSON, tolerating Markdown code fences.

    Args:
        raw: Text returned by the model. Falsy values (None, "") are
            treated as "no output".

    Returns:
        The decoded JSON value, or None when there is no input or the text
        is not valid JSON.
    """
    if not raw:
        return None
    # Models often wrap JSON in ```json ... ``` fences despite instructions.
    cleaned = raw.replace("```json", "").replace("```", "").strip()
    try:
        return json.loads(cleaned)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; RecursionError covers
        # pathologically nested input. Anything else (e.g. KeyboardInterrupt)
        # is deliberately allowed to propagate.
        return None

In [83]:
# Reward Functions
def calculate_readability_score(text):
    """Return the Flesch Reading Ease score textstat computes for *text*.

    The input is converted with str() first, so non-string values are
    scored on their string representation.
    """
    as_string = str(text)
    score = textstat.flesch_reading_ease(as_string)
    return score

def is_in_average_word_count(text, section_label):
    """Check whether *text* has a word count typical for its section.

    Args:
        text: Transcript to measure; falsy values yield False.
        section_label: Section identifier. Accepts "Section 2", "2" or the
            integer 2; the first number found in the label selects the limits.
            Previously only the exact "Section N" form matched, so integer
            labels silently skipped the check.

    Returns:
        True when the word count lies within the inclusive limits for the
        section. Labels without a known section number are unconstrained
        (limits 0..99999).
    """
    if not text:
        return False
    word_count = len(re.findall(r'\b\w+\b', str(text)))
    expected = {
        1: (500, 700),
        2: (600, 800),
        3: (800, 1000),
        4: (1000, 1200),
    }
    number_match = re.search(r"\d+", str(section_label))
    section_number = int(number_match.group()) if number_match else None
    low, high = expected.get(section_number, (0, 99999))
    return low <= word_count <= high

def calculate_common_word_ratio(text):
    """Return the share of words in *text* that are absent from the vocabulary list.

    NOTE: despite the function name, the value returned is the fraction of
    words NOT found in common_vocab_df["Words"] (compared in lower case).
    Text without any words yields 0.
    """
    vocabulary = set(common_vocab_df["Words"].str.lower())
    tokens = [token.lower() for token in re.findall(r'\b\w+\b', str(text))]
    if not tokens:
        return 0
    missing = sum(1 for token in tokens if token not in vocabulary)
    return missing / len(tokens)

def _transcript_from_saved_item(item):
    """Return the transcript text of one saved entry, or None if its type is unsupported."""
    if isinstance(item, dict):
        return str(item.get("Transcript", ""))
    if isinstance(item, str):
        return item
    if isinstance(item, list):
        return " ".join(map(str, item))
    return None


def _saved_transcripts(saved_data):
    """Flatten the content of the generated-questions JSON into transcript strings."""
    if isinstance(saved_data, dict):
        if "Transcript" in saved_data:
            # The root is a single section result.
            items = [saved_data]
        else:
            # Root maps a run key to that run's list of section results.
            # Iterating the dict directly would only yield the keys.
            items = []
            for sections in saved_data.values():
                if isinstance(sections, list):
                    items.extend(sections)
                else:
                    items.append(sections)
    elif isinstance(saved_data, list):
        items = saved_data
    else:
        items = []

    transcripts = []
    for item in items:
        transcript = _transcript_from_saved_item(item)
        if transcript is not None:
            transcripts.append(transcript)
    return transcripts


def calculate_similarity(text):
    """Return the highest TF-IDF cosine similarity between *text* and known transcripts.

    The comparison corpus is built from the "transcript" column of training_df
    (when that column exists) and from every transcript stored in
    GENERATED_JSON (when that file exists).

    Args:
        text: Candidate transcript; a list is joined with spaces, anything
            else is converted with str().

    Returns:
        The maximum similarity as a float, or 0.0 when there is nothing to
        compare against.
    """
    existing_texts = []

    # 1. From training_df
    if "transcript" in training_df.columns:
        existing_texts += training_df["transcript"].dropna().astype(str).tolist()

    # 2. From generated JSON
    if os.path.exists(GENERATED_JSON):
        with open(GENERATED_JSON, "r", encoding="utf-8") as f:
            saved_data = json.load(f)
        existing_texts.extend(_saved_transcripts(saved_data))

    # No existing corpus
    if not existing_texts:
        return 0.0

    # Normalise input text
    if isinstance(text, list):
        text = " ".join(map(str, text))
    text = str(text)

    # The candidate goes last so it can be separated from the corpus rows.
    corpus = existing_texts + [text]
    vec = TfidfVectorizer().fit_transform(corpus)
    sims = cosine_similarity(vec[-1], vec[:-1]).flatten()

    return float(sims.max()) if len(sims) else 0.0

In [84]:
# Question number calculation
def get_question_counts(types, total=10):
    """Split a section's questions as evenly as possible between its question types.

    Args:
        types: Sequence of type IDs, in the order they appear in the section.
        total: Number of questions in the section (10 by default).

    Returns:
        Dict mapping each type ID to its question count. One type receives
        all questions, two types receive 5 each (for the default total), and
        any remainder goes to the earliest types. An empty *types* yields an
        empty dict. Repeated type IDs share a single dict entry.
    """
    if not types:
        return {}
    base, remainder = divmod(total, len(types))
    counts = {}
    for position, type_id in enumerate(types):
        counts[type_id] = base + (1 if position < remainder else 0)
    return counts

def number_ranges(counts, section_num):
    """Assign consecutive question-number ranges to the types of one section.

    Numbering starts at (section_num - 1) * 10 + 1 and follows the iteration
    order of *counts*. Each value is a "first-last" string.
    """
    first = (section_num - 1) * 10 + 1
    spans = {}
    for type_id, how_many in counts.items():
        last = first + how_many - 1
        spans[type_id] = f"{first}-{last}"
        first = last + 1
    return spans

In [85]:
# Model Call
def model_generate(prompt):
    """Send *prompt* to the configured model and return its reply parsed as JSON.

    Returns:
        The decoded JSON value, or None when the reply carries no text or
        the text is not valid JSON.
    """
    response = model.generate_content(prompt)
    try:
        raw_text = response.text
    except ValueError:
        # The `.text` accessor raises ValueError when the response holds no
        # text part (for example a blocked or empty candidate). Report that
        # as "no result" so the caller's retry loop can continue.
        return None
    return safe_json_parse(raw_text)

def generate_full_set(section_choices):
    """Generate one question block per requested entry and persist the whole set.

    For every entry the model is queried up to MAX_ATTEMPT times. Each valid
    reply earns one reward point per passed transcript check (readability,
    word count, vocabulary ratio, similarity); the best-scoring reply is kept
    and retrying stops early once REWARD_GOAL is reached. If no attempt
    returns a JSON object, a placeholder block is stored instead.

    Args:
        section_choices: Mapping of section label (a number, or any label
            containing one) to a list of dicts with the keys "typeID",
            "theme", "topic" and "spec".

    Returns:
        {timestamp_key: [block, ...]} for this run. The same structure is
        written to TEMP_CSV, and the run is added to GENERATED_JSON.
    """
    all_results = []

    # Minute-resolution key under which this run is stored.
    dt_key = datetime.now().strftime("%Y_%m_%d_%H_%M")

    for section_label, entries in section_choices.items():
        section_num = int(re.search(r"\d+", str(section_label)).group())
        # The word-count limits are keyed "Section N"; the raw label may be a
        # bare number, so pass the canonical form to the check.
        word_count_label = f"Section {section_num}"

        # Map typeIDs to counts
        types = [e["typeID"] for e in entries]
        counts = get_question_counts(types)
        ranges = number_ranges(counts, section_num)

        for entry in entries:
            typeID = entry["typeID"]
            theme = entry["theme"]
            topic = entry["topic"]
            spec = entry["spec"]

            # Find the type info by typeID
            type_row = question_type_df[question_type_df["typeID"] == typeID]
            if type_row.empty:
                print(f" WARNING: typeID '{typeID}' not found in question_type_df. Using placeholder info.")
                type_info = {
                    "type": f"Unknown Type ({typeID})",
                    "instruction": "Follow standard instructions.",
                    "answer_format": "List of answers",
                    "format": "Text",
                    "key_skills": "Listening",
                    "avg_duration": "3-4 min",
                    "avg_script_length": "600",
                    "key_features": "IELTS standard",
                    "audio_speed": "Normal"
                }
            else:
                type_info = type_row.iloc[0]

            q_type_name = type_info["type"]
            question_count = counts[typeID]
            question_range = ranges[typeID]

            # The prompt does not change between attempts, so build it once.
            prompt = PROMPT_TEMPLATE.format(
                section=section_label,
                question_range=question_range,
                question_count=question_count,
                typeID=typeID,
                type_name=q_type_name,
                theme=theme,
                specific_topic=topic,
                specifications=spec,
                instruction=type_info["instruction"],
                answer_format=type_info["answer_format"],
                format=type_info["format"],
                key_skills=type_info["key_skills"],
                avg_duration=type_info["avg_duration"],
                avg_script_length=type_info["avg_script_length"],
                key_features=type_info["key_features"],
                audio_speed=type_info["audio_speed"],
            )

            best_reward = -99
            best_json = None

            for attempt in range(1, MAX_ATTEMPT + 1):
                print(f"\n[GENERATING] {section_label} - {q_type_name} Attempt {attempt}")

                model_json = model_generate(prompt)

                if not isinstance(model_json, dict):
                    print("  Invalid JSON, using placeholder")
                    continue

                transcript = model_json.get("Transcript", "")
                reward = 0
                if calculate_readability_score(transcript) >= 55: reward += 1
                if is_in_average_word_count(transcript, word_count_label): reward += 1
                if calculate_common_word_ratio(transcript) >= 0.1: reward += 1
                if calculate_similarity(transcript) <= 0.85: reward += 1

                print(f" -> Reward: {reward}")

                if reward > best_reward:
                    best_reward = reward
                    best_json = model_json

                if reward == REWARD_GOAL:
                    break

            # Fallback placeholder
            if best_json is None:
                best_json = {
                    "Section": section_label,
                    # Same "<typeID> - <name>" shape as the model output shown
                    # in this notebook, so that code reading the leading type
                    # code from "Type" also works for placeholders.
                    "Type": f"{typeID} - {q_type_name}",
                    "Instructions": type_info["instruction"],
                    "Diagram": None,
                    "Questions": [f"Placeholder Q{i}" for i in range(1, question_count+1)],
                    "Answers": [f"Answer_{i}" for i in range(1, question_count+1)],
                    "Options": [None]*question_count,
                    "Transcript": f"Placeholder transcript {question_range}"
                }

            all_results.append(best_json)

    wrapped_output = {dt_key: all_results}

    # Make sure the output folders exist before writing.
    for output_path in (TEMP_CSV, GENERATED_JSON):
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    # Save temp copy
    with open(TEMP_CSV, "w", encoding="utf-8") as f:
        json.dump(wrapped_output, f, indent=2, ensure_ascii=False)

    # Save or append into master JSON
    if os.path.exists(GENERATED_JSON):
        with open(GENERATED_JSON, "r", encoding="utf-8") as f:
            existing = json.load(f)
    else:
        existing = {}

    existing[dt_key] = all_results

    with open(GENERATED_JSON, "w", encoding="utf-8") as f:
        json.dump(existing, f, indent=2, ensure_ascii=False)

    print(f"\n Full question set saved under key {dt_key} in {GENERATED_JSON}")
    return wrapped_output

In [86]:
# Main Pipeline
# Runs the generator on the mock selections. The result maps this run's
# timestamp key to the list of generated blocks.
questions = generate_full_set(section_choices)


[GENERATING] 1 - Table Completion Attempt 1
 -> Reward: 4

[GENERATING] 1 - Form Completion Attempt 1
 -> Reward: 4

[GENERATING] 2 - Map Labelling Attempt 1
 -> Reward: 4

[GENERATING] 3 - Matching Attempt 1
 -> Reward: 4

[GENERATING] 4 - Sentence Completion Attempt 1
 -> Reward: 3

[GENERATING] 4 - Sentence Completion Attempt 2
 -> Reward: 3

[GENERATING] 4 - Sentence Completion Attempt 3
 -> Reward: 3

[GENERATING] 4 - Sentence Completion Attempt 4
 -> Reward: 3

[GENERATING] 4 - Sentence Completion Attempt 5
 -> Reward: 3

 Full question set saved under key 2025_12_07_03_02 in model_training/generated_questions/generated_questions.json


# View Question Generated

In [None]:
# Dump the raw result of generate_full_set for a quick visual check.
print(questions)

{'2025_12_07_02_28': [{'Section': '1', 'Type': 'T001 - Table Completion', 'Instructions': 'Complete the table below.\nWrite ONE WORD AND / OR A NUMBER for each answer.', 'Diagram': '                          Grand Hotel\n                     Room Booking Information\n-------------------------------------------------------------------\nRoom Type    Key Facilities         Availability       Price (per night)\n-------------------------------------------------------------------\nStandard     - Free (1) ____________\nDouble       - Air conditioning\n             - Mini-bar\n-------------------------------------------------------------------\nDeluxe       - Private balcony      From 25th March    £ (5) ____________\nSuite        - (2) ____________ bathtub\n             - Fast Wi-Fi\n             - Kitchenette\n-------------------------------------------------------------------\nExecutive    - King-size bed        Available from\nRoom         - Work desk            1st (3) ____________\n     

# Save to Files

In [3]:
# Import necessary libraries
import os
import re
import json

from datetime import datetime
from fpdf import FPDF

In [4]:
# Configurations
# JSON file read by get_key_and_sections; expected to be a dict keyed by a
# timestamp string.
TEMP_JSON = "model_training/generated_questions/temp_generated_questions.json"

# TrueType fonts registered with the PDF object by the export functions.
DEJAVUSANS_FONT = "frontend/fonts/DejaVuSans.ttf"
SPACEMONO_FONT = "frontend/fonts/SpaceMono-Regular.ttf"

# Logo image drawn on the title page and in page headers.
IELTS_LOGO = "frontend/static/images/ielts_logo.png"

In [3]:
# Determine next set folder
# Make sure the base folder exists
base_folder = "sets"
os.makedirs(base_folder, exist_ok=True)

# Collect the numbers of entries named "set<N>..." and create the folder for
# the next number (set1 when none exist yet).
existing = [
    int(found.group(1))
    for found in (re.match(r"set(\d+)", name) for name in os.listdir(base_folder))
    if found
]
next_set = max(existing, default=0) + 1
set_folder = os.path.join(base_folder, f"set{next_set}")
os.makedirs(set_folder, exist_ok=True)

In [None]:
def get_key_and_sections():
    """Load the temporary generated-questions file.

    Returns:
        (key, sections): the first key of the JSON root and the list of
        section dicts stored under it. Entries that were saved as JSON
        strings are decoded.

    Raises:
        ValueError: if the JSON root is not a dict, is empty, or the value
            under the key is not a list.
    """
    with open(TEMP_JSON, "r", encoding="utf-8") as file:
        data = json.load(file)

    if not isinstance(data, dict):
        raise ValueError("JSON root must be a dict containing the timestamp key.")
    if not data:
        # next(iter(...)) on an empty dict would surface as a bare StopIteration.
        raise ValueError("JSON root is empty; no generated set found.")

    # Extract first key
    key = next(iter(data))
    raw_sections = data[key]
    if not isinstance(raw_sections, list):
        raise ValueError(f"Value under key '{key}' must be a list of sections.")

    # Normalise the section list: decode entries stored as JSON text.
    sections = [
        json.loads(item) if isinstance(item, str) else item
        for item in raw_sections
    ]

    return key, sections

In [None]:
def format_date_from_key(key):
    """Turn a "YYYY_MM_DD_..." key into a display date such as "07 December 2025".

    Only the first three underscore-separated fields are used; anything after
    them (hour, minute) is ignored.
    """
    leading_fields = key.split("_", 3)[:3]
    parsed = datetime.strptime("_".join(leading_fields), "%Y_%m_%d")
    return parsed.strftime("%d %B %Y")

In [None]:
class PDF(FPDF):
    """FPDF document with helpers for laying out an IELTS listening paper.

    The methods select the font families "DejaVu" and "SpaceMono" by name, so
    both must be registered on the instance (add_font) before any of the
    writing helpers is called.
    """

    # X position at which body content starts.
    LEFT_CONTENT_MARGIN = 20

    @staticmethod
    def _option_text(option):
        """Render one option as text; list/tuple options are joined with spaces."""
        if isinstance(option, (list, tuple)):
            return " ".join(map(str, option))
        return f"{option}"

    # Header
    def header(self):
        """Page header hook: draw the logo on every page except the first."""
        if self.page_no() > 1:
            self.image(IELTS_LOGO, x=20, y=20, w=20)
        # Body text always starts below the logo area.
        self.set_y(30)

    # Title
    def title_page(self, set_number, date_str):
        """Draw the cover page (frame, logo, title, set number, date, rules).

        A new page is added at the end, so content written afterwards starts
        on the page following the cover.
        """
        self.set_line_width(0.8)
        self.rect(10, 10, 190, 277)

        # Centre the 65-wide logo on a 210-wide page.
        self.image(IELTS_LOGO, x=(210 - 65) / 2, y=28, w=65)

        self.set_y(95)
        self.set_font("DejaVu", "B", 30)
        self.multi_cell(0, 12, "Listening Test", align="C")
        self.ln(4)

        self.set_font("DejaVu", "", 16)
        self.multi_cell(0, 10, f"Set {set_number}", align="C")
        self.ln(2)

        self.set_font("DejaVu", "", 12)
        self.multi_cell(0, 8, date_str, align="C")
        self.ln(10)

        # Instruction box
        box_x = 20
        box_y = 150
        box_w = 170
        box_h = 60

        self.set_line_width(0.6)
        self.rect(box_x, box_y, box_w, box_h)

        self.set_xy(box_x + 10, box_y + 10)
        self.set_font("DejaVu", "", 11)
        instructions_text = (
            "• You will hear four recordings.\n"
            "• Write your answers on the question paper.\n"
            "• You will have time to read the questions before you listen.\n"
            "• Use a pencil. Write clearly and follow instructions.\n"
            "• At the end, you will have 10 minutes to transfer your answers."
        )
        self.multi_cell(box_w - 20, 6, instructions_text)

        self.add_page()

    # Part Header
    def part_header(self, part_number):
        """Write the bold "Part N" heading."""
        self.set_font("DejaVu", "B", 16)
        self.set_x(self.LEFT_CONTENT_MARGIN)
        self.multi_cell(0, 10, f"Part {part_number}", align="L")
        self.ln(5)

    # Instructions
    def write_instructions(self, instructions):
        """Write the instruction text of a question block."""
        self.set_x(self.LEFT_CONTENT_MARGIN)
        self.set_font("DejaVu", "", 12)
        self.multi_cell(0, 6, instructions)
        self.ln(4)

    # Body
    # 1. Question only
    def write_questions(self, questions):
        """Write each question on its own line(s)."""
        self.set_font("DejaVu", "", 10)
        for q in questions:
            self.set_x(self.LEFT_CONTENT_MARGIN)
            self.multi_cell(0, 6, q)
        self.ln(4)

    # 2. MCQ
    def write_mcq(self, questions, options):
        """Write multiple-choice questions, each followed by its options.

        When *options* holds one list per question, every question is
        followed by its own choices only. Otherwise *options* is treated as a
        single list shared by all questions. None is accepted and means
        "no options".
        """
        self.set_x(self.LEFT_CONTENT_MARGIN)
        self.set_font("DejaVu", "", 10)
        options = options or []
        per_question = len(options) == len(questions) and all(
            isinstance(o, (list, tuple)) for o in options
        )
        for index, q in enumerate(questions):
            self.set_x(self.LEFT_CONTENT_MARGIN)
            self.multi_cell(0, 6, f"{q}")
            choices = options[index] if per_question else options
            for o in choices:
                self.set_x(self.LEFT_CONTENT_MARGIN)
                self.multi_cell(0, 6, self._option_text(o))
        self.ln(4)

    # 3. Matching
    def write_matching(self, questions, options):
        """Write the option box followed by one answer line per question."""
        self.set_font("DejaVu", "", 10)
        self.set_x(self.LEFT_CONTENT_MARGIN)
        self.multi_cell(0, 6, "-----------------------------------------")
        for o in options or []:
            self.set_x(self.LEFT_CONTENT_MARGIN)
            self.multi_cell(0, 6, self._option_text(o))
        self.set_x(self.LEFT_CONTENT_MARGIN)
        self.multi_cell(0, 6, "-----------------------------------------")
        self.ln(2)

        for q in questions:
            self.set_x(self.LEFT_CONTENT_MARGIN)
            self.multi_cell(0, 6, f"{q}: ____________________")
        self.ln(4)

    # 4. With Diagram
    def write_diagram(self, diagram, questions):
        """Write a plain-text diagram followed by an answer line per question."""
        self.set_x(self.LEFT_CONTENT_MARGIN)
        # The diagram is character art, so it is set in the SpaceMono font.
        self.set_font("SpaceMono", "", 8)
        self.multi_cell(0, 3, diagram)
        self.set_font("DejaVu", "", 10)
        self.ln(2)
        for q in questions:
            self.set_x(self.LEFT_CONTENT_MARGIN)
            self.multi_cell(0, 6, f"{q}. ____________________")
        self.ln(4)

    # Answers
    def write_answers(self):
        """Write the "Answers" heading."""
        self.set_x(self.LEFT_CONTENT_MARGIN)
        self.set_font("DejaVu", "B", 14)
        self.multi_cell(0, 6, "Answers")
        self.ln(2)

    def write_answers_line(self, answers):
        """Write "<number>. <answer>" lines for an iterable of (number, answer) pairs."""
        self.set_font("DejaVu", "", 11)
        line_height = 6
        bottom_margin = 25

        for num, ans in answers:
            self.set_x(self.LEFT_CONTENT_MARGIN)
            full_text = f"{num}. {ans}"

            block_height = line_height

            # Start a new page when the line would not fit above the footer area.
            if self.get_y() + block_height + bottom_margin > self.h:
                self.add_page()
                self.set_x(self.LEFT_CONTENT_MARGIN)

            self.multi_cell(0, line_height, full_text)
            self.ln(2)

        self.set_x(self.LEFT_CONTENT_MARGIN)
        self.ln(4)

    # Transcript
    def write_transcripts(self):
        """Write the "Transcripts" heading."""
        self.set_x(self.LEFT_CONTENT_MARGIN)
        self.set_font("DejaVu", "B", 14)
        self.multi_cell(0, 6, "Transcripts")
        self.ln(2)

    def write_transcripts_line(self, transcripts):
        """Write a transcript paragraph by paragraph (split on newlines).

        *transcripts* is normally a string. A list/tuple is treated as one
        paragraph per item and None as an empty transcript, so unexpected
        model output does not abort the export.
        """
        self.set_font("DejaVu", "", 11)

        if transcripts is None:
            transcripts = ""
        elif isinstance(transcripts, (list, tuple)):
            transcripts = "\n".join(map(str, transcripts))
        else:
            transcripts = str(transcripts)

        paragraphs = transcripts.split("\n")
        line_height = 6
        bottom_margin = 25

        for para in paragraphs:
            effective_width = self.w - self.r_margin - self.l_margin

            # Rough estimate of the paragraph height, based on the width of
            # "A", used to move a paragraph to a new page when it is
            # estimated not to fit on the current one.
            approx_char_per_line = int(effective_width / (self.get_string_width("A") * 1.05))
            lines_needed = max(1, (len(para) // approx_char_per_line) + 1)
            block_height = lines_needed * line_height

            if self.get_y() + block_height + bottom_margin > self.h:
                self.add_page()

            self.multi_cell(0, line_height, para)
            self.ln(2)

    # Footer
    def footer(self):
        """Page footer hook: draw the page frame and the centred page number."""
        self.set_line_width(0.8)
        self.rect(10, 10, 190, 277)
        self.set_y(-25)
        self.set_font('Arial', 'I', 8)
        self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')

    # Break Line
    def break_line(self):
        """Write a dashed separator line."""
        self.set_x(self.LEFT_CONTENT_MARGIN)
        self.set_font("DejaVu", "B", 14)
        self.multi_cell(0, 6, "-------------------------------------------------------------------------------------------")
        self.ln(2)

In [None]:
# Full Set -> Question + Answers + Transcript PDF
def export_full_pdf():
    """Write full_set.pdf (questions, answers and transcripts) into set_folder.

    Question blocks are rendered according to the leading type code of their
    "Type" field. Blocks whose code is missing or not recognised are rendered
    as plain questions instead of being left out.
    """
    key, sections = get_key_and_sections()
    formatted_date = format_date_from_key(key)
    full_pdf_path = os.path.join(set_folder, "full_set.pdf")

    pdf = PDF()
    pdf.add_font("DejaVu", "", DEJAVUSANS_FONT, uni=True)
    pdf.add_font("DejaVu", "B", DEJAVUSANS_FONT, uni=True)
    pdf.add_font("SpaceMono", "", SPACEMONO_FONT, uni=True)

    pdf.set_left_margin(25)
    pdf.set_right_margin(25)
    pdf.set_top_margin(10)

    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.alias_nb_pages()

    pdf.add_page()
    pdf.title_page(next_set, formatted_date)

    diagram_types = ("T001", "T003", "T004", "T008", "T011")
    previous_section = None
    first_part = True

    for section in sections:
        section_num = str(section.get("Section", "")).strip()
        # Stored values may be null, so fall back to empty values.
        instructions = section.get("Instructions") or ""
        questions = section.get("Questions") or []
        diagram = section.get("Diagram") or ""
        options = section.get("Options") or []
        # "Type" looks like "T001 - Table Completion"; keep the leading code.
        type_parts = str(section.get("Type") or "").split()
        type_code = type_parts[0] if type_parts else ""

        # Every block starts on its own page; the title page already added
        # the page used by the first block.
        if not first_part:
            pdf.add_page()
        first_part = False

        # Print the part heading only when the section changes.
        if section_num != previous_section:
            pdf.part_header(section_num)

        pdf.write_instructions(instructions)

        has_diagram = isinstance(diagram, str) and diagram.strip() != ""
        if type_code in ("T005", "T007"):
            pdf.write_mcq(questions, options)
        elif type_code == "T006":
            pdf.write_matching(questions, options)
        elif type_code in diagram_types and has_diagram:
            pdf.write_diagram(diagram, questions)
        else:
            # Plain question types, diagram types without a diagram, and
            # unrecognised type codes.
            pdf.write_questions(questions)

        previous_section = section_num

    # Print Answers
    pdf.add_page()
    question_number = 1
    pdf.write_answers()

    for section in sections:
        section_num = str(section.get("Section", "")).strip()
        answers = section.get("Answers") or []

        pdf.part_header(section_num)

        # Answers are numbered continuously across all blocks.
        for ans in answers:
            pdf.write_answers_line([(question_number, ans)])
            question_number += 1

        pdf.break_line()

    # Print Transcripts
    pdf.add_page()
    pdf.write_transcripts()

    for section in sections:
        section_num = str(section.get("Section", "")).strip()
        transcripts = section.get("Transcript") or ""

        pdf.part_header(section_num)

        pdf.write_transcripts_line(transcripts)
        pdf.break_line()

    pdf.output(full_pdf_path)

export_full_pdf()

In [None]:
# 2. Questions Only PDF
def export_questions_pdf():
    """Write questions.pdf (cover page and question blocks only) into set_folder.

    Question blocks are rendered according to the leading type code of their
    "Type" field. Blocks whose code is missing or not recognised are rendered
    as plain questions instead of being left out.
    """
    key, sections = get_key_and_sections()
    formatted_date = format_date_from_key(key)
    questions_pdf_path = os.path.join(set_folder, "questions.pdf")

    pdf = PDF()
    pdf.add_font("DejaVu", "", DEJAVUSANS_FONT, uni=True)
    pdf.add_font("DejaVu", "B", DEJAVUSANS_FONT, uni=True)
    pdf.add_font("SpaceMono", "", SPACEMONO_FONT, uni=True)

    pdf.set_left_margin(25)
    pdf.set_right_margin(25)
    pdf.set_top_margin(10)

    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.alias_nb_pages()

    pdf.add_page()
    # Use the number of the set folder being written (was hard-coded to 1).
    pdf.title_page(set_number=next_set, date_str=formatted_date)

    diagram_types = ("T001", "T003", "T004", "T008", "T011")
    previous_section = None
    first_part = True

    for section in sections:
        section_num = str(section.get("Section", "")).strip()
        # Stored values may be null, so fall back to empty values.
        instructions = section.get("Instructions") or ""
        questions = section.get("Questions") or []
        diagram = section.get("Diagram") or ""
        options = section.get("Options") or []
        # "Type" looks like "T001 - Table Completion"; keep the leading code.
        type_parts = str(section.get("Type") or "").split()
        type_code = type_parts[0] if type_parts else ""

        # Every block starts on its own page; the title page already added
        # the page used by the first block.
        if not first_part:
            pdf.add_page()
        first_part = False

        # Print the part heading only when the section changes.
        if section_num != previous_section:
            pdf.part_header(section_num)

        pdf.write_instructions(instructions)

        has_diagram = isinstance(diagram, str) and diagram.strip() != ""
        if type_code in ("T005", "T007"):
            pdf.write_mcq(questions, options)
        elif type_code == "T006":
            pdf.write_matching(questions, options)
        elif type_code in diagram_types and has_diagram:
            pdf.write_diagram(diagram, questions)
        else:
            # Plain question types, diagram types without a diagram, and
            # unrecognised type codes.
            pdf.write_questions(questions)

        previous_section = section_num

    pdf.output(questions_pdf_path)

# The function must be called; the bare name only evaluated the function object.
export_questions_pdf()

In [None]:
# 3. Transcript Only TXT
def export_transcript_txt():
    """Write transcript.txt (header plus every block's transcript) into set_folder."""
    key, sections = get_key_and_sections()
    formatted_date = format_date_from_key(key)
    transcript_txt_path = os.path.join(set_folder, "transcript.txt")

    with open(transcript_txt_path, "w", encoding="utf-8") as file:
        # Header
        file.write("                               IELTS Listening Test \n")
        # f-string so the set number is substituted (the placeholder used to
        # be written out literally).
        file.write(f"                                        Set {next_set}        \n")
        file.write(f"                                   {formatted_date}\n\n")

        # Body
        previous_section = None

        for section in sections:

            section_num = str(section.get("Section", "")).strip()
            transcript = section.get("Transcript", "")

            # Print the part heading only when the section changes.
            if section_num != previous_section:
                file.write(f"Part {section_num}\n")
            else:
                file.write("\n")

            file.write(f"{transcript}\n")

            file.write("\n -------------------------------------------------------------------------------------------------\n")

            previous_section = section_num

        file.write("                               End of Paper")

    print("TXT generated:", transcript_txt_path)

export_transcript_txt()

TypeError: string indices must be integers, not 'str'

In [None]:
# 4. Questions Only TXT
def export_question_txt():
    """Write questions.txt (header plus every question block) into set_folder.

    Blocks are rendered according to the leading type code of their "Type"
    field. Blocks whose code is missing or not recognised are written as
    plain questions instead of being left out.
    """
    key, sections = get_key_and_sections()
    formatted_date = format_date_from_key(key)
    question_txt_path = os.path.join(set_folder, "questions.txt")

    def option_text(option):
        # List options are flattened to one space-separated line.
        if isinstance(option, (list, tuple)):
            return " ".join(map(str, option))
        return f"{option}"

    diagram_types = ("T001", "T003", "T004", "T008", "T011")

    with open(question_txt_path, "w", encoding="utf-8") as file:

        # Header
        file.write("                               IELTS Listening Test \n")
        # f-string so the set number is substituted (the placeholder used to
        # be written out literally).
        file.write(f"                                        Set {next_set}       \n")
        file.write(f"                                   {formatted_date}\n\n")

        # Body
        previous_section = None

        for section in sections:

            section_num = str(section.get("Section", "")).strip()
            # Stored values may be null, so fall back to empty values.
            instructions = section.get("Instructions") or ""
            questions = section.get("Questions") or []
            diagram = section.get("Diagram") or ""
            options = section.get("Options") or []
            # "Type" looks like "T001 - Table Completion"; keep the leading code.
            type_parts = str(section.get("Type") or "").split()
            type_code = type_parts[0] if type_parts else ""

            # Print the part heading only when the section changes.
            if section_num != previous_section:
                file.write(f"Part {section_num}\n")
            else:
                file.write("\n")

            file.write(f"{instructions}\n\n")

            has_diagram = isinstance(diagram, str) and diagram.strip() != ""

            if type_code in ("T005", "T007"):
                # One option list per question -> each question gets only its
                # own choices; otherwise the options are shared by all
                # questions. Questions are written even when there are no
                # options.
                per_question = len(options) == len(questions) and all(
                    isinstance(o, (list, tuple)) for o in options
                )
                for index, q in enumerate(questions):
                    file.write(f"{q}\n")
                    choices = options[index] if per_question else options
                    for o in choices:
                        file.write(f"{option_text(o)}\n")
                    file.write("\n")

            elif type_code == "T006":
                file.write("--------------------------------\n")
                for o in options:
                    file.write(f"    {option_text(o)}\n")
                file.write("--------------------------------\n\n")
                for q in questions:
                    file.write(f"{q} _____________________\n")

            elif type_code in diagram_types and has_diagram:
                file.write(f"{diagram}\n\n")
                file.write("Answers: \n")
                for q in questions:
                    file.write(f"{q}. ________________\n")

            else:
                # Plain question types, diagram types without a diagram, and
                # unrecognised type codes.
                for q in questions:
                    file.write(f"{q}\n")

            file.write("\n -------------------------------------------------------------------------------------------------\n")

            previous_section = section_num

        file.write("                               End of Paper")

export_question_txt()