In [1]:
# Test for continuous learning question generator

In [None]:
import os
import re
import json
import random
import pandas as pd
import numpy as np
import textstat
import google.generativeai as genai
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Initialize genai
# SECURITY: the API key must never be committed to source control. It is read
# from the GOOGLE_API_KEY environment variable instead. Any key that was
# previously hard-coded in this file should be considered leaked and revoked.
_api_key = os.environ.get("GOOGLE_API_KEY")
if not _api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY environment variable is not set; "
        "export it before running this notebook."
    )
genai.configure(api_key=_api_key)
model = genai.GenerativeModel("gemini-2.5-flash")

# File paths
QUESTION_TYPE_CSV = "processed_data/questionType.csv"
SAMPLE_QUESTIONS_CSV = "processed_data/training_set.csv"
IELTS_VOCAB_CSV = "processed_data/ielts_vocab.csv"

# Reward hyperparameters
# Inclusive bounds on the readability grade checked by calculate_reward.
READABILITY_MIN = 6
READABILITY_MAX = 12
# Misspelled legacy name kept as an alias so existing references keep working.
READBILITY_MAX = READABILITY_MAX
# Minimum fraction of words that must come from the vocabulary list.
VOCAB_RATIO_MIN = 0.03
# A candidate earns the similarity point only when its maximum cosine
# similarity to the samples is strictly below this value.
MAX_SIMILARITY = 0.90
# NOTE(review): REWARD_GOAL and MAX_ATTEMPTS are not referenced in the visible
# part of the file; presumably they drive a generate-and-retry loop -- confirm.
REWARD_GOAL = 3
MAX_ATTEMPTS = 5
# Misspelled legacy name kept as an alias so existing references keep working.
MAX_ATTEMPS = MAX_ATTEMPTS

# Seed both RNGs so runs are reproducible.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Prompt sent to the generative model. Single-brace placeholders such as
# {typeID} are filled in with str.format() by generate_from_model; the doubled
# braces ({{ and }}) around the JSON skeleton are str.format() escapes that
# render as literal { and } in the final prompt.
PROMPT_TEMPLATE = """
You are an expert IELTS Listening question generator.
Generate a realistic IELTS Listening question according to the following details:

Question Type: {typeID} - {type_name}
Theme: {theme}
Specific Topic: {specific_topic}
Specifications: {specifications}
Instructions: {instruction}
Format: {format}
Answer Format: {answer_format}
Key Skills: {key_skills}
Average Duration: {avg_duration}
Average Script Length: {avg_script_length}
Key Features: {key_features}
Audio Speed: {audio_speed}

Requirements:
1. Generate questions, answers, and the audio transcript.
2. If the question type requires a diagram (Map, Plan, Flow Chart), generate a simple diagram (as text description or URL placeholder).
3. Return the output in JSON format:
{{
  "Type": [],
  "Instructions": [],
  "Diagram": [],
  "Questions": [],
  "Answers": [],
  "Transcript": ""
}}
Type -> Question types with type names only
Instructions -> Instructions for the question based on the references above
Diagram -> Diagram description or URL placeholder (if applicable, else null)
Questions -> List of questions
Answers -> List of answers
Transcript -> Full audio transcript. The transcript should include the introductions as a real IELTS Listening test.
4. Ensure the JSON is properly formatted. Do not include any explanations or additional text outside the JSON.
5. Do not add other fields other than the ones mentioned in the JSON format above.
"""

# Utility Functions
def safe_json_parse(response_text):
    """Extract and decode a JSON document from raw model output.

    Markdown code fences (with or without a ``json`` tag) are removed, and
    if the remaining text contains a ``{ ... }`` span (first opening brace to
    last closing brace) only that span is decoded.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The decoded JSON value, or ``None`` when decoding fails (the raw
        output is printed in that case).
    """
    # Drop code fences so only the payload remains.
    stripped = re.sub(r"```(?:json)?", "", response_text).replace("```", "").strip()

    # Prefer the outermost brace-delimited span when one exists.
    found = re.search(r'\{[\s\S]*\}', stripped)
    payload = found.group(0) if found else stripped

    try:
        parsed = json.loads(payload)
    except json.JSONDecodeError:
        print("JSON parsing failed. Raw model output:\n", response_text)
        return None
    return parsed
    
def generate_from_model(typeID, type_name, theme, specific_topic, specifications, instruction, answer_format, format, key_skills, avg_duration, avg_script_length, key_features, audio_speed):
    """Fill the prompt template, query the model and parse its JSON reply.

    Every argument is substituted into the placeholder of the same name in
    ``PROMPT_TEMPLATE``.

    Returns:
        Whatever ``safe_json_parse`` returns for the model's response text:
        the decoded JSON value, or ``None`` if it could not be parsed.
    """
    template_fields = {
        "typeID": typeID,
        "type_name": type_name,
        "theme": theme,
        "specific_topic": specific_topic,
        "specifications": specifications,
        "instruction": instruction,
        "answer_format": answer_format,
        "format": format,
        "key_skills": key_skills,
        "avg_duration": avg_duration,
        "avg_script_length": avg_script_length,
        "key_features": key_features,
        "audio_speed": audio_speed,
    }
    filled_prompt = PROMPT_TEMPLATE.format(**template_fields)

    reply = model.generate_content(filled_prompt)
    return safe_json_parse(reply.text)

# Load Data and Prepare TF-IDF
def load_vocab(csv_path=None):
    """Load the vocabulary list as a set of normalized words.

    Reads the ``Words`` column of the CSV, lower-cases and strips each entry,
    and discards missing or empty values.

    Args:
        csv_path: Optional path to the vocabulary CSV. Defaults to
            ``IELTS_VOCAB_CSV`` when omitted.

    Returns:
        A ``set`` of non-empty, lower-cased, whitespace-stripped words.

    Raises:
        FileNotFoundError: If the CSV does not exist.
        KeyError: If the CSV has no ``Words`` column.
    """
    path = IELTS_VOCAB_CSV if csv_path is None else csv_path
    df = pd.read_csv(path)
    # String methods live on the .str accessor; a Series has no .lower().
    # dropna() first so missing cells do not end up in the set as NaN.
    words = df['Words'].dropna().astype(str).str.lower().str.strip()
    return {w for w in words if w}

def load_sample_questions(csv_path=None):
    """Load the sample-question table, guaranteeing the expected columns.

    Args:
        csv_path: Optional path to the samples CSV. Defaults to
            ``SAMPLE_QUESTIONS_CSV`` when omitted.

    Returns:
        A DataFrame containing at least the columns ``questionID``,
        ``question``, ``answer``, ``transcript`` and ``typeID``. Columns
        missing from the file are added and filled with empty strings. If the
        file does not exist or is completely empty, an empty DataFrame with
        exactly those columns is returned.
    """
    required_columns = ['questionID', 'question', 'answer', 'transcript', 'typeID']
    path = SAMPLE_QUESTIONS_CSV if csv_path is None else csv_path

    try:
        df = pd.read_csv(path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No usable samples yet: hand back an empty, correctly-shaped table.
        return pd.DataFrame(columns=required_columns)

    for column in required_columns:
        if column not in df.columns:
            df[column] = ""
    return df
    
def build_vectorizer(sample_texts):
    """Fit a TF-IDF vectorizer on the sample texts.

    When ``sample_texts`` is empty, the vectorizer is fitted on a single
    placeholder document instead so that a fitted vectorizer is still
    returned.

    Args:
        sample_texts: Sized collection of text documents.

    Returns:
        A ``(vectorizer, vectors)`` pair: the fitted ``TfidfVectorizer`` and
        the TF-IDF matrix of the documents it was fitted on.
    """
    tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
    # Explicit length check (not truthiness) to decide whether to fall back.
    corpus = ["no samples available"] if len(sample_texts) == 0 else sample_texts
    matrix = tfidf.fit_transform(corpus)
    return tfidf, matrix

# Reward
def readability_score(text):
    """Return the Flesch-Kincaid grade level of *text*.

    Args:
        text: The text to score.

    Returns:
        The value of ``textstat.flesch_kincaid_grade(text)``, or the sentinel
        ``999.0`` if scoring raises for any reason. Scoring is best-effort:
        the sentinel lies far outside the accepted readability range, so a
        failed score simply does not earn the readability point.
    """
    try:
        return textstat.flesch_kincaid_grade(text)
    except Exception:
        return 999.0


# The function was originally defined under this misspelled name while
# calculate_reward calls `readability_score`; keep the old name as an alias so
# any existing references continue to work.
readbility_score = readability_score
    
def check_vocab_usage(text, vocabulary):
    """Return the fraction of alphabetic words in *text* found in *vocabulary*.

    The text is lower-cased before tokenizing, so *vocabulary* is expected to
    hold lower-case entries for matches to occur.

    Args:
        text: Text to analyse.
        vocabulary: Container supporting ``in`` membership tests.

    Returns:
        ``matches / total_words`` as a float, or ``0.0`` when the text
        contains no alphabetic words.
    """
    tokens = re.findall(r"\b[a-zA-Z]+\b", text.lower())
    total = len(tokens)
    if total == 0:
        return 0.0
    hits = len([token for token in tokens if token in vocabulary])
    return hits / total

def similarity_to_existing(text, vectorizer, sample_vectors):
    """Return the highest cosine similarity between *text* and the samples.

    Args:
        text: Candidate text to compare.
        vectorizer: Fitted vectorizer exposing ``transform``.
        sample_vectors: Vectors of the existing samples.

    Returns:
        The maximum cosine similarity as a ``float``, or ``0.0`` if any step
        of the computation raises.
    """
    try:
        query_vector = vectorizer.transform([text])
        return float(np.max(cosine_similarity(query_vector, sample_vectors)))
    except Exception:
        # Best-effort: any failure is reported as "no similarity".
        return 0.0
    
def calculate_reward(result, vocab_set, vectorizer, sample_vectors):
    """Score a generated question set on vocabulary, readability and novelty.

    One point is awarded for each check that passes:
      * vocabulary ratio of transcript + questions >= ``VOCAB_RATIO_MIN``
      * readability grade within ``[READABILITY_MIN, READABILITY_MAX]``
        (scored on the transcript, or on the questions if there is none)
      * maximum similarity to existing samples < ``MAX_SIMILARITY``
        (measured on the questions, or on the transcript if there are none)

    NOTE(review): this relies on the module-level names ``readability_score``
    and ``READABILITY_MAX`` -- confirm both are defined with exactly these
    spellings.

    Args:
        result: Parsed model output (mapping with ``"Transcript"`` and
            ``"Questions"`` keys), or a falsy value if parsing failed.
        vocab_set: Vocabulary used for the vocabulary-ratio check.
        vectorizer: Fitted vectorizer used for the similarity check.
        sample_vectors: Vectors of the existing samples.

    Returns:
        A tuple ``(reward, vocab_ratio, read_grade, max_sim)``. When there is
        nothing to score, returns ``(0, 0.0, 999.0, 1.0)``.
    """
    nothing_to_score = (0, 0.0, 999.0, 1.0)
    if not result:
        return nothing_to_score

    transcript = result.get("Transcript", "") or ""
    question_text = " ".join(str(item) for item in (result.get("Questions", []) or []))

    combined = (transcript + "\n" + question_text).strip()
    if not combined:
        return nothing_to_score

    vocab_ratio = check_vocab_usage(combined, vocab_set)
    read_grade = readability_score(transcript or question_text)
    max_sim = similarity_to_existing(question_text or transcript, vectorizer, sample_vectors)

    checks = (
        vocab_ratio >= VOCAB_RATIO_MIN,
        READABILITY_MIN <= read_grade <= READABILITY_MAX,
        max_sim < MAX_SIMILARITY,
    )
    reward = sum(1 for passed in checks if passed)

    return reward, vocab_ratio, read_grade, max_sim