In [9]:
# Import necessary libraries
import pandas as pd
import random
import re
import json
import os
import google.generativeai as genai
import textstat
import numpy as np

from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
# Initialize genai
genai.configure(api_key="AIzaSyBS-2pbdjYouOkcqHaX4ZI5HHPpSSmq3iw")
model = genai.GenerativeModel("gemini-2.5-flash")

### Random Choose Question Type

In [11]:
question_types_df = pd.read_csv('processed_data/questionType.csv')
assignments = {}

for section in sorted(question_types_df['part'].unique()):
    section_types = question_types_df[question_types_df['part'] == section]['type'].tolist()
    k = random.choice([1, 2])
    selected = random.sample(section_types, k=min(k, len(section_types)))
    
    assignments[f"Section {section}"] = selected

print(assignments)

{'Section 1': ['Table Completion', 'Form Completion'], 'Section 2': ['Map Labelling'], 'Section 3': ['Flow Chart Completion'], 'Section 4': ['Sentence Completion', 'Sentence Completion']}


### Question Generator

In [12]:
PROMPT_TEMPLATE = """
You are an expert IELTS Listening question generator.
Generate a realistic IELTS Listening question according to the following details:

Question Type: {typeID} - {type_name}
Theme: {theme}
Specific Topic: {specific_topic}
Specifications: {specifications}
Instructions: {instruction}
Format: {format}
Answer Format: {answer_format}
Key Skills: {key_skills}
Average Duration: {avg_duration}
Average Script Length: {avg_script_length}
Key Features: {key_features}
Audio Speed: {audio_speed}

Requirements:
1. Generate questions, answers, and the audio transcript.
2. If the question type requires a diagram (Map, Plan, Flow Chart), generate a simple diagram (as text description or URL placeholder).
3. Return the output in JSON format:
{{
  "Type": [],
  "Instructions": [],
  "Diagram": [],
  "Questions": [],
  "Answers": [],
  "Transcript": ""
}}
Type -> Question types with type names only
Instructions -> Instructions for the question based on the references above
Diagram -> Diagram description or URL placeholder (if applicable, else null)
Questions -> List of questions
Answers -> List of answers
Transcript -> Full audio transcript. The transcript should include the introductions as a real IELTS Listening test.
4. Ensure the JSON is properly formatted. Do not include any explanations or additional text outside the JSON.
5. Do not add other fields other than the ones mentioned in the JSON format above.
"""

In [13]:
questions = []

def safe_json_parse(response_text):
    cleaned = re.sub(r"```(?:json)?", "", response_text)
    cleaned = cleaned.replace("```", "").strip()
    match = re.search(r'\{[\s\S]*\}', cleaned)
    if match:
        cleaned = match.group(0)
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        print("JSON parsing failed. Raw model output:\n", response_text)
        return None
    
def generate_question(typeID, type_name, theme, specific_topic, specifications, instruction, answer_format, format, key_skills, avg_duration, avg_script_length, key_features, audio_speed):
    prompt = PROMPT_TEMPLATE.format(
        typeID=typeID,
        type_name=type_name,
        theme=theme,
        specific_topic=specific_topic,
        specifications=specifications,
        instruction=instruction,
        answer_format=answer_format,
        format=format,
        key_skills=key_skills,
        avg_duration=avg_duration,
        avg_script_length=avg_script_length,
        key_features=key_features,
        audio_speed=audio_speed
    )
    
    response = model.generate_content(prompt)

    clean_response = safe_json_parse(response.text)
    
    return clean_response

def set_question(theme, specific_topic, specifications):
    for section, types in assignments.items():
        print(f"\n{section} Questions:")
        for q_type in types:
            type_info = question_types_df[question_types_df['type'] == q_type].iloc[0]
            question_data = generate_question(
                typeID=type_info['type'],
                type_name=type_info['type'],
                theme=theme,
                specific_topic=specific_topic,
                specifications=specifications,
                instruction=type_info['instruction'],
                answer_format=type_info['answer_format'],
                format=type_info['format'],
                key_skills=type_info['key_skills'],
                avg_duration=type_info['avg_duration'],
                avg_script_length=type_info['avg_script_length'],
                key_features=type_info['key_features'],
                audio_speed=type_info['audio_speed']
            )
            questions.append(question_data)
            print(question_data)

In [14]:
theme = "Education"
specific_topic = "University Lectures"
specifications = "Academic context, formal tone"

set_question(theme, specific_topic, specifications)


Section 1 Questions:
{'Type': ['Table Completion'], 'Instructions': 'Complete the table below.\nWrite ONE WORD AND / OR A NUMBER for each answer.', 'Diagram': None, 'Questions': ['1. Introduction to ____________________', '2. Read Chapter 1 of ____________________', '3. Modern ____________________ History', '4. ____________________ PM', '5. ____________________ and Society', '6. Main Building ____________________', '7. Familiarize with current economic ____________________'], 'Answers': ['Psychology', 'Psychology', 'European', '3', 'Economics', '104', 'news'], 'Transcript': "You will hear a conversation between Sarah, a new university student, and Dr. Evans, an academic advisor, discussing Sarah's lecture schedule for her first semester.\nFirst, you have some time to look at questions 1 to 7.\n\n(Pause 30 seconds)\n\nNow listen carefully and answer questions 1 to 7.\n\nDr. Evans: Good morning, Sarah. Welcome to the university, and congratulations on securing your place here. Please ha

In [15]:
print(questions)

[{'Type': ['Table Completion'], 'Instructions': 'Complete the table below.\nWrite ONE WORD AND / OR A NUMBER for each answer.', 'Diagram': None, 'Questions': ['1. Introduction to ____________________', '2. Read Chapter 1 of ____________________', '3. Modern ____________________ History', '4. ____________________ PM', '5. ____________________ and Society', '6. Main Building ____________________', '7. Familiarize with current economic ____________________'], 'Answers': ['Psychology', 'Psychology', 'European', '3', 'Economics', '104', 'news'], 'Transcript': "You will hear a conversation between Sarah, a new university student, and Dr. Evans, an academic advisor, discussing Sarah's lecture schedule for her first semester.\nFirst, you have some time to look at questions 1 to 7.\n\n(Pause 30 seconds)\n\nNow listen carefully and answer questions 1 to 7.\n\nDr. Evans: Good morning, Sarah. Welcome to the university, and congratulations on securing your place here. Please have a seat.\nSarah: Th

### Save to Files

In [16]:
def normalize_question(q):
    if q is None:
        return None
    
    clean = {}

    clean['DateTime_Generated'] = datetime.now().strftime("%Y_%m_%d_%H_%M")

    # Flatten Type
    t = q.get("Type")
    if isinstance(t, list):
        clean["Type"] = " ".join(str(x) for x in t)
    else:
        clean["Type"] = t

    # Flatten Instructions
    instr = q.get("Instructions")
    if isinstance(instr, list):
        clean["Instructions"] = " ".join(str(x) for x in instr)
    else:
        clean["Instructions"] = instr

    clean["Questions"] = q.get("Questions")
    clean["Answers"] = q.get("Answers")
    clean["Diagram"] = q.get("Diagram")
    clean["Transcript"] = q.get("Transcript")

    return clean


question_df = pd.DataFrame([normalize_question(q) for q in questions])
question_df.head()

Unnamed: 0,DateTime_Generated,Type,Instructions,Questions,Answers,Diagram,Transcript
0,2025_11_21_10_03,Table Completion,Complete the table below.\nWrite ONE WORD AND ...,"[1. Introduction to ____________________, 2. R...","[Psychology, Psychology, European, 3, Economic...",,"You will hear a conversation between Sarah, a ..."
1,2025_11_21_10_03,Form Completion,Complete the form below. Write ONE WORD AND / ...,[UNIVERSITY GUEST LECTURE REGISTRATION\n\nStud...,"[Smith, 98765, 07700900505, uniwest, Computer,...",,MAN: You will hear a conversation between a st...
2,2025_11_21_10_03,Map Labelling,"Label the map below. Write the correct letter,...","[1. Lecture Theatre One, 2. Student Common Roo...","[D, E, F, G, H, I]","Imagine a long rectangular building, with a ma...","MALE SPEAKER: Good morning, everyone, and a ve..."
3,2025_11_21_10_03,Flow Chart Completion,Complete the flow chart below. Choose SIX answ...,"[1, 2, 3, 4, 5, 6]","[G, B, H, I, E, F]",## Process for Delivering a Successful Academi...,## IELTS Listening Test: Section 3\n\nYou will...
4,2025_11_21_10_03,Sentence Completion,Complete the sentences below using NO MORE THA...,"[1. In the traditional paradigm, the universit...","[purveyor of information, facilitating learnin...",,You will hear a lecture on the evolving role o...


In [18]:
def save_csv(df):
    os.makedirs("processed_data", exist_ok=True)
    filename = "processed_data/generated_question.csv"

    if os.path.exists(filename):
        df.to_csv(filename, mode='a', header=False, index=False)
    else:
        df.to_csv(filename, index=False)

    print(f"Saved generated questions to {filename}")

def create_set_folder():
    os.makedirs("set", exist_ok=True)

    existing = [d for d in os.listdir("set") if d.startswith("set")]
    next_id = len(existing) + 1

    folder_path = f"set/set{next_id}"
    os.makedirs(folder_path, exist_ok=True)

def write_text_files(folder, df):
    sections = {
        1: df.iloc[0],
        2: df.iloc[1],
        3: df.iloc[2],
        4: df.iloc[3]
    }

    # Questions Only
    with open(f"{folder}/questions.txt", "w", encoding="utf-8") as f:
        for sec, row in sections.items():
            f.write(f"Section {sec}\n")
            f.write("Instructions:\n")
            f.write(str(row["Instructions"]) + "\n\n")
            for q in row["Questions"]:
                f.write(str(q) + "\n")
            f.write("\n")

    # Full set: Instructions + Questions + Answers + Transcript
    with open(f"{folder}/full_set.txt", "w", encoding="utf-8") as f:
        for sec, row in sections.items():
            f.write(f"Section {sec}\n")
            f.write("Instructions:\n")
            f.write(str(row["Instructions"]) + "\n\n")

            f.write("Questions:\n")
            for q in row["Questions"]:
                f.write(str(q) + "\n")
            f.write("\n")

            f.write("Answers:\n")
            for a in row["Answers"]:
                f.write(str(a) + "\n")
            f.write("\n")

            f.write("Transcript:\n")
            f.write(str(row["Transcript"]) + "\n")
            f.write("\n\n")

    # Transcripts Only
    with open(f"{folder}/transcripts.txt", "w", encoding="utf-8") as f:
        for sec, row in sections.items():
            f.write(f"Section {sec}\n")
            f.write(str(row["Transcript"]) + "\n\n")

def save_all_outputs(df):
    save_csv(df)
    create_set_folder()
    existing = [d for d in os.listdir("set") if d.startswith("set")]
    next_id = len(existing)
    folder_path = f"set/set{next_id}"
    write_text_files(folder_path, df)

save_all_outputs(question_df)


Saved generated questions to processed_data/generated_question.csv


### Validation

In [19]:
train_df = pd.read_csv("processed_data/training_set.csv")
gen_df = pd.read_csv("processed_data/generated_question.csv")
word_df = pd.read_csv("processed_data/ielts_vocab.csv")

In [20]:
common_vocab = set(word_df["Words"].astype(str).str.lower().tolist())
print(common_vocab)



In [21]:
# Validation Function
def avg_word_length(text):
    if pd.isna(text):
        return 0
    words = re.findall(r'\b\w+\b', text)
    return np.mean([len(word) for word in words]) if words else 0

def uncommon_word_ratio(text, common_words):
    if pd.isna(text):
        return 0
    words = [w.lower() for w in re.findall(r'\b\w+\b', text)]
    if not words:
        return 0
    uncommon = [w for w in words if w not in common_words]
    return len(uncommon) / len(words)

def readability_score(text):
    if not isinstance(text, str) or text.strip() == "":
        return 0
    return textstat.flesch_reading_ease(text)

train_corpus = " ".join(train_df["transcript"].dropna().astype(str).tolist())

vectorizer = TfidfVectorizer(stop_words='english')
tfidf_vectors = vectorizer.fit_transform(
    [train_corpus] + gen_df["Transcript"].fillna("").astype(str).tolist()
)

similarities = cosine_similarity(tfidf_vectors[0:1], tfidf_vectors[1:]).flatten()

In [22]:
gen_tfidf = vectorizer.transform(gen_df["Transcript"].fillna("").astype(str).tolist())
gen_similarity_matrix = cosine_similarity(gen_tfidf)

duplicate_flags = []
for i in range(len(gen_similarity_matrix)):
    sims = gen_similarity_matrix[i]
    sims[i] = 0  # ignore itself
    if np.max(sims) > 0.9:
        duplicate_flags.append(True)
    else:
        duplicate_flags.append(False)

gen_df["is_duplicate_question"] = duplicate_flags
gen_df["max_question_similarity"] = [
    sorted([sim for sim in row if sim < 1.0], reverse=True)[0]
    for row in gen_similarity_matrix
]

In [23]:
gen_df["avg_word_length"] = gen_df["Transcript"].apply(avg_word_length)
gen_df["uncommon_word_ratio"] = gen_df["Transcript"].apply(
    lambda txt: uncommon_word_ratio(txt, common_vocab)
)
gen_df["readability_score"] = gen_df["Transcript"].apply(readability_score)
gen_df["style_similarity"] = similarities


In [24]:
gen_df.head()

Unnamed: 0,DateTime_Generated,Type,Instructions,Questions,Answers,Diagram,Transcript,is_duplicate_question,max_question_similarity,avg_word_length,uncommon_word_ratio,readability_score,style_similarity
0,2025_11_21_10_03,Table Completion,Complete the table below.\nWrite ONE WORD AND ...,"['1. Introduction to ____________________', '2...","['Psychology', 'Psychology', 'European', '3', ...",,"You will hear a conversation between Sarah, a ...",True,0.139318,4.464548,0.199267,61.145777,0.148855
1,2025_11_21_10_03,Form Completion,Complete the form below. Write ONE WORD AND / ...,['UNIVERSITY GUEST LECTURE REGISTRATION\n\nStu...,"['Smith', '98765', '07700900505', 'uniwest', '...",,MAN: You will hear a conversation between a st...,True,0.096394,4.420635,0.236508,64.562687,0.12177
2,2025_11_21_10_03,Map Labelling,"Label the map below. Write the correct letter,...","['1. Lecture Theatre One', '2. Student Common ...","['D', 'E', 'F', 'G', 'H', 'I']","Imagine a long rectangular building, with a ma...","MALE SPEAKER: Good morning, everyone, and a ve...",True,0.102648,4.851117,0.193548,50.017366,0.158097
3,2025_11_21_10_03,Flow Chart Completion,Complete the flow chart below. Choose SIX answ...,"['1', '2', '3', '4', '5', '6']","['G', 'B', 'H', 'I', 'E', 'F']",## Process for Delivering a Successful Academi...,## IELTS Listening Test: Section 3\n\nYou will...,True,0.139318,4.816523,0.233429,54.584885,0.126499
4,2025_11_21_10_03,Sentence Completion,Complete the sentences below using NO MORE THA...,"['1. In the traditional paradigm, the universi...","['purveyor of information', 'facilitating lear...",,You will hear a lecture on the evolving role o...,True,0.3465,6.30033,0.216172,2.267702,0.062046
