In [3]:
''' 
Task: 
- Make sure is 40 questions
'''

' \nTask: \n- Make sure is 40 questions\n'

In [29]:
# Import necessary libraries
import pandas as pd
import random
import re
import json
import json5
import os
import google.generativeai as genai
import textstat

from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Initialize genai
genai.configure(api_key="AIzaSyBS-2pbdjYouOkcqHaX4ZI5HHPpSSmq3iw")
model = genai.GenerativeModel("gemini-2.5-flash")

In [6]:
# Initializer
QUESTION_TYPE_CSV = "model_training/processed_data/questionType.csv"
TRAINING_CSV = "model_training/processed_data/training_set.csv"
GENERATED_CSV = "model_training/processed_data/generated_question.csv"
WORD_CSV = "model_training/processed_data/ielts_vocab.csv"

MAX_ATTEMPT = 5
REWARD_GOAL = 4

assignments = {}
questions = []

question_type_df = pd.read_csv(QUESTION_TYPE_CSV)
common_vocab_df = pd.read_csv(WORD_CSV)
training_df = pd.read_csv(TRAINING_CSV)

In [7]:
# Choose Question TypeError
def choose_question_type():
    for section in sorted(question_type_df['part'].unique()):
        section_types = question_type_df[question_type_df['part'] == section]['type'].tolist()
        k = random.choice([1,2])
        selected = random.sample(section_types, k=min(k,len(section_types)))
        assignments[f"Section {section}"] = selected
    return assignments

In [8]:
# PROMPT TEMPLATE
PROMPT_TEMPLATE = """
You are an expert IELTS Listening question generator.
Generate a realistic IELTS Listening question according to the following details:

Question Type: {typeID} - {type_name}
Theme: {theme}
Specific Topic: {specific_topic}
Specifications: {specifications}
Instructions: {instruction}
Format: {format}
Answer Format: {answer_format}
Key Skills: {key_skills}
Average Duration: {avg_duration}
Average Script Length: {avg_script_length}
Key Features: {key_features}
Audio Speed: {audio_speed}

Requirements:
1. Generate questions, answers, and the audio transcript.
2. If the question type requires a diagram (Map, Plan, Flow Chart), generate a simple diagram by using text and characters.
3. Return the output in JSON format:
{{
  "Type": [],
  "Instructions": [],
  "Questions": [],
  "Answers": [],
  "Transcript": ""
}}
Type -> Question types with type names only
Instructions -> Instructions for the question based on the references above
Questions -> A simple diagram for map, plan or flow chart if applicable and list of questions.
Answers -> List of answers
Transcript -> Full audio transcript. The transcript should include the introductions as a real IELTS Listening test. The length must between {avg_script_length}.
4. Ensure the JSON is properly formatted. Do not include any explanations or additional text outside the JSON.
5. Do not add other fields other than the ones mentioned in the JSON format above.
"""

In [9]:
# Validation Function
def calculate_readability_score(text):
    return textstat.flesch_reading_ease(text)

def is_in_average_word_count(text, section):
    def calculate_total_word_count():
        nonlocal text
        if isinstance(text, (list, pd.Series)):
            text = " ".join(map(str, text))

        words = re.findall(r'\b\w+\b', str(text))
        return len(words)
    
    expected_ranges = {
        "Section 1": (500, 700),
        "Section 2": (600, 800),
        "Section 3": (800, 1000),
        "Section 4": (1000, 1200)
    }

    total_words = calculate_total_word_count()

    low, high = expected_ranges[section]
    return low <= total_words <= high

def calculate_common_word_ratio(text):
    common_vocab = set(common_vocab_df["Words"].astype(str).str.lower().tolist())
    
    if pd.isna(text) or not str(text).strip():
        return 0
    
    words = [w.lower() for w in re.findall(r'\b\w+\b', str(text))]
    if not words:
        return 0
    
    uncommon = [w for w in words if w not in common_vocab]
    return len(uncommon) / len(words)

def calculate_similarity(text):
    existing_texts = []

    if 'transcript' in training_df.columns:
        existing_texts += training_df['transcript'].dropna().astype(str).tolist()

    if os.path.exists(GENERATED_CSV):
        generated_df = pd.read_csv(GENERATED_CSV)
        if 'Transcript' in generated_df.columns:
            existing_texts += generated_df['Transcript'].dropna().astype(str).tolist()

        if not existing_texts:
            return 0.0
    
    corpus = existing_texts + [str(text)]

    vectorizer = TfidfVectorizer().fit(corpus)
    vectors = vectorizer.transform(corpus)

    sim_scores = cosine_similarity(vectors[-1], vectors[:-1]).flatten()

    max_sim = sim_scores.max() if len(sim_scores) > 0 else 0.0
    return max_sim


In [10]:
# Utility Function
def safe_json_parse(raw_text):
    start = raw_text.find("{")
    end = raw_text.rfind("}")
    
    if start == -1 or end == -1:
        return None

    json_block = raw_text[start:end+1]
    cleaned = json_block
    cleaned = cleaned.replace("```json", "").replace("```", "")
    cleaned = re.sub(r"\*\*(.*?)\*\*", r"\1", cleaned)
    cleaned = re.sub(r",\s*([\]}])", r"\1", cleaned)
    cleaned = cleaned.strip()

    try:
        parsed = json5.loads(cleaned)
        return parsed

    except Exception as e:
        return None

In [11]:
# Convert to DataFrame
def convert_to_df(questions, section):
    def normalize_question(q):
        if q is None:
            return None
        
        clean = {}

        clean['DateTime_Generated'] = datetime.now().strftime("%Y_%m_%d_%H_%M")

        if section == "":
            clean['Section'] = section
        else:
            clean['Section'] = q.get('Section')
            
        # Flatten Type
        t = q.get("Type")
        if isinstance(t, list):
            clean["Type"] = " ".join(str(x) for x in t)
        else:
            clean["Type"] = t

        # Flatten Instructions
        instr = q.get("Instructions")
        if isinstance(instr, list):
            clean["Instructions"] = " ".join(str(x) for x in instr)
        else:
            clean["Instructions"] = instr

        clean["Questions"] = q.get("Questions")
        clean["Answers"] = q.get("Answers")
        clean["Transcript"] = q.get("Transcript")

        return clean
    
    return pd.DataFrame([normalize_question(q) for q in questions])

In [12]:
# Generate Question
def generate_question_model(typeID, type_name, theme, specific_topic, specifications, instruction, answer_format, format, key_skills, avg_duration, avg_script_length, key_features, audio_speed):
    prompt = PROMPT_TEMPLATE.format(
        typeID=typeID,
        type_name=type_name,
        theme=theme,
        specific_topic=specific_topic,
        specifications=specifications,
        instruction=instruction,
        answer_format=answer_format,
        format=format,
        key_skills=key_skills,
        avg_duration=avg_duration,
        avg_script_length=avg_script_length,
        key_features=key_features,
        audio_speed=audio_speed
    )
    
    response = model.generate_content(prompt)

    clean_response = safe_json_parse(response.text)
    
    return clean_response

def generate_question(theme, specific_topic, specifications):
    assignments = choose_question_type()
    for section, types in assignments.items():
        print(f"\n{section} Questions:")
        for q_type in types:
            type_info = question_type_df[question_type_df['type'] == q_type].iloc[0]

            for x in range(MAX_ATTEMPT):
                print(f"  Try Attempt: {x}")
                question_data = generate_question_model(
                    typeID=type_info['type'],
                    type_name=type_info['type'],
                    theme=theme,
                    specific_topic=specific_topic,
                    specifications=specifications,
                    instruction=type_info['instruction'],
                    answer_format=type_info['answer_format'],
                    format=type_info['format'],
                    key_skills=type_info['key_skills'],
                    avg_duration=type_info['avg_duration'],
                    avg_script_length=type_info['avg_script_length'],
                    key_features=type_info['key_features'],
                    audio_speed=type_info['audio_speed']
                )
                
                temp_df = convert_to_df([question_data], section)
                if 'Transcript' in temp_df.columns and pd.notna(temp_df['Transcript'].iloc[0]):
                    transcript_text = temp_df['Transcript'].iloc[0]
                    reability_score = calculate_readability_score(transcript_text)
                    avg_word_length = is_in_average_word_count(transcript_text, section)
                    common_word_ratio = calculate_common_word_ratio(transcript_text)
                    similarity = calculate_similarity(transcript_text)
                    reward = 0
                else:
                    transcript_text = ""
                    reability_score = 0
                    avg_word_length = False
                    common_word_ratio = 0
                    similarity = 0
                    reward = 0

                print(f"    Current Score: {reability_score} | {avg_word_length} | {common_word_ratio} | {similarity}")

                # Calculate Reward
                if reability_score >= 60:
                    reward += 1
                if avg_word_length:
                    reward += 1
                if common_word_ratio >= 0.1:
                    reward += 1
                if similarity <= 0.85:
                    reward += 1

                print(f"    Current Reward: {reward}")

                if reward == REWARD_GOAL:
                    break

            questions.append(temp_df)

In [13]:
# Main Pipeline
theme = "Education"
specific_topic = "University Lectures"
specifications = "Academic context, formal tone"

generate_question(theme, specific_topic, specifications)


Section 1 Questions:
  Try Attempt: 0
    Current Score: 72.87268558468857 | True | 0.2833935018050541 | 0.467939455502918
    Current Reward: 4

Section 2 Questions:
  Try Attempt: 0
    Current Score: 61.288388463641354 | False | 0.18421052631578946 | 0.6050274123924136
    Current Reward: 3
  Try Attempt: 1
    Current Score: 57.00673174872668 | False | 0.18115942028985507 | 0.6163843612919224
    Current Reward: 2
  Try Attempt: 2
    Current Score: 56.19629900213829 | False | 0.1753958587088916 | 0.6080322619666219
    Current Reward: 2
  Try Attempt: 3
    Current Score: 43.41327615780449 | False | 0.18885096700796358 | 0.6317757514840734
    Current Reward: 2
  Try Attempt: 4
    Current Score: 0 | False | 0 | 0
    Current Reward: 1

Section 3 Questions:
  Try Attempt: 0
    Current Score: 0 | False | 0 | 0
    Current Reward: 1
  Try Attempt: 1
    Current Score: 14.846695979899494 | False | 0.25895765472312704 | 0.476714777574597
    Current Reward: 2
  Try Attempt: 2
    Cu

In [None]:
# Save to Flie
def save_csv(df):
    os.makedirs("model_training/processed_data", exist_ok=True)
    if os.path.exists(GENERATED_CSV):
        df.to_csv(GENERATED_CSV, mode='a', header=False, index=False)
    else:
        df.to_csv(GENERATED_CSV, index=False)
    print(f"Saved generated questions to {GENERATED_CSV}")

def create_set_folder():
    os.makedirs("set", exist_ok=True)
    existing = [d for d in os.listdir("set") if d.startswith("set")]
    next_id = len(existing) + 1
    folder_path = f"set/set{next_id}"
    os.makedirs(folder_path, exist_ok=True)
    return folder_path

def write_text_files(folder, df):
    sections = {
        1: df.iloc[0],
        2: df.iloc[1],
        3: df.iloc[2],
        4: df.iloc[3]
    }

    # Questions Only
    with open(f"{folder}/questions.txt", "w", encoding="utf-8") as f:
        for sec, row in sections.items():
            f.write(f"Section {sec}\n")
            f.write("Instructions:\n")
            f.write(str(row["Instructions"]) + "\n\n")
            for q in row["Questions"]:
                f.write(str(q) + "\n")
            f.write("\n")

    # Full set: Instructions + Questions + Answers + Transcript
    with open(f"{folder}/full_set.txt", "w", encoding="utf-8") as f:
        for sec, row in sections.items():
            f.write(f"Section {sec}\n")

            f.write("Instructions:\n")
            f.write(str(row["Instructions"]) + "\n\n")

            f.write("Questions:\n")
            for q in row["Questions"]:
                f.write(str(q) + "\n")
            f.write("\n")

            f.write("Answers:\n")
            for a in row["Answers"]:
                f.write(str(a) + "\n")
            f.write("\n")

            f.write("Transcript:\n")
            f.write(str(row["Transcript"]) + "\n")
            f.write("\n\n")

    # Transcripts Only
    with open(f"{folder}/transcripts.txt", "w", encoding="utf-8") as f:
        for sec, row in sections.items():
            f.write(f"Section {sec}\n")
            f.write(str(row["Transcript"]) + "\n\n")

def save_all_outputs(questions):
    if all(isinstance(q, pd.DataFrame) for q in questions):
        df = pd.concat(questions, ignore_index=True)
    else:
        df = pd.DataFrame(questions)
    
    save_csv(df)
    folder_path = create_set_folder()
    write_text_files(folder_path, df)
    print(f"All outputs saved to folder: {folder_path}")


save_all_outputs()

NameError: name 'questions' is not defined