In [1]:
# Test for continuous learning question generator

In [None]:
import pandas as pd
import random
import re
import json
import os
import google.generativeai as genai
import textstat
import numpy as np

from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Initialize genai
# The API key is read from the GOOGLE_API_KEY environment variable so that no
# credential is stored in the notebook source, where it would be shared and
# committed together with the code.
_api_key = os.environ.get("GOOGLE_API_KEY")
if not _api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it before running this notebook."
    )
genai.configure(api_key=_api_key)
model = genai.GenerativeModel("gemini-2.5-flash")

# Variable Initialization
# "Section <part>" -> list of question types, filled by choose_question_type.
assignments = {}
# Parsed model replies, appended by set_question.
questions = []

# Prompt Template
# Filled in by generate_question via str.format: every {name} placeholder is
# replaced by the keyword argument of the same name. The doubled braces
# ({{ and }}) around the JSON skeleton are str.format escapes, so they come out
# as literal { and } in the prompt that is sent to the model.
PROMPT_TEMPLATE = """
You are an expert IELTS Listening question generator.
Generate a realistic IELTS Listening question according to the following details:

Question Type: {typeID} - {type_name}
Theme: {theme}
Specific Topic: {specific_topic}
Specifications: {specifications}
Instructions: {instruction}
Format: {format}
Answer Format: {answer_format}
Key Skills: {key_skills}
Average Duration: {avg_duration}
Average Script Length: {avg_script_length}
Key Features: {key_features}
Audio Speed: {audio_speed}

Requirements:
1. Generate questions, answers, and the audio transcript.
2. If the question type requires a diagram (Map, Plan, Flow Chart), generate a simple diagram (as text description or URL placeholder).
3. Return the output in JSON format:
{{
  "Type": [],
  "Instructions": [],
  "Diagram": [],
  "Questions": [],
  "Answers": [],
  "Transcript": ""
}}
Type -> Question types with type names only
Instructions -> Instructions for the question based on the references above
Diagram -> Diagram description or URL placeholder (if applicable, else null)
Questions -> List of questions
Answers -> List of answers
Transcript -> Full audio transcript. The transcript should include the introductions as a real IELTS Listening test.
4. Ensure the JSON is properly formatted. Do not include any explanations or additional text outside the JSON.
5. Do not add other fields other than the ones mentioned in the JSON format above.
"""

# Randomly choose the question types for each part
def choose_question_type(input="processed_data/questionType.csv"):
    """Pick one or two question types at random for every part in the CSV.

    The CSV at ``input`` is read with pandas and must provide ``part`` and
    ``type`` columns. For each distinct part (visited in sorted order) the
    picks are stored in the module-level ``assignments`` dict under the key
    ``"Section <part>"``. Nothing is returned.
    """
    catalogue = pd.read_csv(input)

    for part in sorted(catalogue['part'].unique()):
        candidates = catalogue.loc[catalogue['part'] == part, 'type'].tolist()
        # Decide how many types to draw, capped by how many the part offers.
        wanted = random.choice([1, 2])
        picks = random.sample(candidates, k=min(wanted, len(candidates)))

        assignments[f"Section {part}"] = picks

# Parse JSON safely
def safe_json_parse(response_text):
    """Extract and decode the JSON object embedded in a model reply.

    Markdown code fences are stripped first. Decoding then starts at the
    first ``{`` and stops at the end of the first complete JSON value, so
    prose or stray braces that follow the object do not break parsing. When
    the text contains no ``{`` at all, the whole cleaned text is decoded.

    Args:
        response_text: Raw reply text. ``None`` or any non-string value is
            treated as an unparseable reply instead of raising.

    Returns:
        The decoded JSON value, or ``None`` (after printing the raw reply)
        when nothing could be decoded.
    """
    if not isinstance(response_text, str):
        print("JSON parsing failed. Raw model output:\n", response_text)
        return None

    # One pattern removes both the opening "```json" and the closing "```".
    cleaned = re.sub(r"```(?:json)?", "", response_text).strip()
    start = cleaned.find("{")
    try:
        if start == -1:
            return json.loads(cleaned)
        # raw_decode ignores whatever follows the first complete value,
        # unlike json.loads, which rejects trailing data.
        value, _ = json.JSONDecoder().raw_decode(cleaned[start:])
        return value
    except json.JSONDecodeError:
        print("JSON parsing failed. Raw model output:\n", response_text)
        return None
    
# Generate a question set with Gemini
def generate_question(typeID, type_name, theme, specific_topic, specifications, instruction, answer_format, format, key_skills, avg_duration, avg_script_length, key_features, audio_speed):
    """Ask the Gemini model for one question set and return the parsed reply.

    Every argument is substituted into the placeholder of the same name in
    ``PROMPT_TEMPLATE``. The prompt is sent through the module-level
    ``model`` and the reply text is handed to ``safe_json_parse``; whatever
    that function returns is returned unchanged.
    """
    fields = {
        "typeID": typeID,
        "type_name": type_name,
        "theme": theme,
        "specific_topic": specific_topic,
        "specifications": specifications,
        "instruction": instruction,
        "answer_format": answer_format,
        "format": format,
        "key_skills": key_skills,
        "avg_duration": avg_duration,
        "avg_script_length": avg_script_length,
        "key_features": key_features,
        "audio_speed": audio_speed,
    }
    prompt = PROMPT_TEMPLATE.format(**fields)

    reply = model.generate_content(prompt)
    return safe_json_parse(reply.text)

# Generate a question set for every assigned type
def set_question(theme, specific_topic, specifications, input="processed_data/questionType.csv"):
    """Generate one question set per assigned type and collect the results.

    Walks the module-level ``assignments`` mapping, looks up the first CSV row
    whose ``type`` equals each chosen type, and calls ``generate_question``
    with that row's metadata. Each result is appended to the module-level
    ``questions`` list and printed. Nothing is returned.
    """
    catalogue = pd.read_csv(input)
    # CSV columns that are forwarded unchanged under the same keyword name.
    passthrough = (
        'instruction',
        'answer_format',
        'format',
        'key_skills',
        'avg_duration',
        'avg_script_length',
        'key_features',
        'audio_speed',
    )

    for section, types in assignments.items():
        print(f"\n{section} Questions:")
        for q_type in types:
            row = catalogue[catalogue['type'] == q_type].iloc[0]
            generated = generate_question(
                # NOTE(review): the ID and the name are both read from the
                # 'type' column - confirm whether a separate ID column exists.
                typeID=row['type'],
                type_name=row['type'],
                theme=theme,
                specific_topic=specific_topic,
                specifications=specifications,
                **{column: row[column] for column in passthrough},
            )
            questions.append(generated)
            print(generated)

# Flatten a parsed reply into a DataFrame-ready record
def normalize_question(q):
    """Turn one parsed model reply into a flat record.

    ``None`` is passed through as ``None``. Otherwise the record carries a
    ``DateTime_Generated`` stamp (current local time, ``%Y_%m_%d_%H_%M``),
    the ``Type`` and ``Instructions`` values joined with single spaces when
    they are lists, and the ``Questions``, ``Answers``, ``Diagram`` and
    ``Transcript`` values exactly as found in ``q`` (``None`` when absent).
    """
    if q is None:
        return None

    def as_text(value):
        # Lists collapse to one space-separated string; anything else is
        # returned untouched.
        if isinstance(value, list):
            return " ".join(str(part) for part in value)
        return value

    # Key order matters: it becomes the column order of the saved CSV.
    return {
        'DateTime_Generated': datetime.now().strftime("%Y_%m_%d_%H_%M"),
        "Type": as_text(q.get("Type")),
        "Instructions": as_text(q.get("Instructions")),
        "Questions": q.get("Questions"),
        "Answers": q.get("Answers"),
        "Diagram": q.get("Diagram"),
        "Transcript": q.get("Transcript"),
    }

def save_csv(df):
    """Append ``df`` to ``processed_data/generated_question.csv``.

    The ``processed_data`` directory is created when missing. The first write
    creates the file with a header row; later writes append rows without
    repeating the header. The index is never written.
    """
    os.makedirs("processed_data", exist_ok=True)
    filename = "processed_data/generated_question.csv"

    already_there = os.path.exists(filename)
    df.to_csv(
        filename,
        mode='a' if already_there else 'w',
        header=not already_there,
        index=False,
    )

    print(f"Saved generated questions to {filename}")

def create_set_folder():
    """Create the next numbered output folder under ``set/`` and return it.

    The number is one more than the count of entries in ``set/`` whose name
    starts with "set".

    NOTE: numbering is count-based, so if earlier folders were removed the
    computed name may already exist; that folder is then reused.

    Returns:
        The path of the folder, e.g. ``"set/set3"``.
    """
    os.makedirs("set", exist_ok=True)

    existing = [d for d in os.listdir("set") if d.startswith("set")]
    next_id = len(existing) + 1

    folder_path = f"set/set{next_id}"
    os.makedirs(folder_path, exist_ok=True)
    # Hand the path back so callers need not re-derive it by listing the
    # directory a second time.
    return folder_path

def _as_lines(value):
    """Return ``value`` as a list of items that are written one per line."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    # pandas stores a missing cell as float NaN, the only float that is not
    # equal to itself.
    if isinstance(value, float) and value != value:
        return []
    return [value]


def _write_lines(handle, values):
    """Write every item of ``values`` on its own line, then a blank line."""
    for item in _as_lines(values):
        handle.write(str(item) + "\n")
    handle.write("\n")


def write_text_files(folder, df):
    """Write the generated set as three text files inside ``folder``.

    * ``questions.txt``   - instructions and questions
    * ``full_set.txt``    - instructions, questions, answers and transcript
    * ``transcripts.txt`` - transcripts only

    Every row of ``df`` becomes one "Section N" entry, numbered from 1 in row
    order, so frames with fewer or more than four rows are written in full.
    A missing Questions/Answers cell is written as an empty list.

    Args:
        folder: Existing directory to write into.
        df: DataFrame with "Instructions", "Questions", "Answers" and
            "Transcript" columns.
    """
    sections = list(enumerate(df.to_dict("records"), start=1))

    # Questions Only
    with open(f"{folder}/questions.txt", "w", encoding="utf-8") as f:
        for sec, row in sections:
            f.write(f"Section {sec}\n")
            f.write("Instructions:\n")
            f.write(str(row["Instructions"]) + "\n\n")
            _write_lines(f, row["Questions"])

    # Full set: Instructions + Questions + Answers + Transcript
    with open(f"{folder}/full_set.txt", "w", encoding="utf-8") as f:
        for sec, row in sections:
            f.write(f"Section {sec}\n")
            f.write("Instructions:\n")
            f.write(str(row["Instructions"]) + "\n\n")

            f.write("Questions:\n")
            _write_lines(f, row["Questions"])

            f.write("Answers:\n")
            _write_lines(f, row["Answers"])

            f.write("Transcript:\n")
            f.write(str(row["Transcript"]) + "\n")
            f.write("\n\n")

    # Transcripts Only
    with open(f"{folder}/transcripts.txt", "w", encoding="utf-8") as f:
        for sec, row in sections:
            f.write(f"Section {sec}\n")
            f.write(str(row["Transcript"]) + "\n\n")

def save_all_outputs(df):
    """Persist ``df`` to the CSV log and to a fresh ``set/setN`` folder.

    Calls ``save_csv``, then ``create_set_folder``, then ``write_text_files``
    with the folder that was just created.
    """
    save_csv(df)
    folder_path = create_set_folder()
    if folder_path is None:
        # create_set_folder may not report the path it created; in that case
        # derive it from the folder count, which now includes the new folder.
        existing = [d for d in os.listdir("set") if d.startswith("set")]
        folder_path = f"set/set{len(existing)}"
    write_text_files(folder_path, df)

# Testing Purpose
theme = "Education"
specific_topic = "University Lectures"
specifications = "Academic context, formal tone"

# Main pipeline
choose_question_type()
set_question(theme, specific_topic, specifications)

# Save to files
# A generation whose reply could not be parsed is stored as None, and pandas
# cannot build a frame from a list that mixes dicts with None, so those
# entries are dropped before the frame is built.
records = [normalize_question(q) for q in questions]
question_df = pd.DataFrame([record for record in records if record is not None])
if question_df.empty:
    print("No questions were generated; nothing to save.")
else:
    save_all_outputs(question_df)

  from .autonotebook import tqdm as notebook_tqdm


{'Section 1': ['Plan Labelling', 'Form Completion'], 'Section 2': ['Multiple Choice', 'Map Labelling'], 'Section 3': ['Flow Chart Completion', 'Multiple Choice'], 'Section 4': ['Note Completion']}
