In [None]:
''' 
AIzaSyBS-2pbdjYouOkcqHaX4ZI5HHPpSSmq3iw
'''

In [16]:
# -----------------------------
# IELTS Listening Question Generator
# -----------------------------
import os
import re
import json
import json5
import random
import pandas as pd
import google.generativeai as genai
import textstat
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# -----------------------------
# CONFIG
# -----------------------------
genai.configure(api_key="AIzaSyBS-2pbdjYouOkcqHaX4ZI5HHPpSSmq3iw")  # replace with your key
model = genai.GenerativeModel("gemini-2.5-flash")

QUESTION_TYPE_CSV = "model_training/processed_data/questionType.csv"
TRAINING_CSV = "model_training/processed_data/training_set.csv"
GENERATED_CSV = "model_training/processed_data/generated_question.csv"
WORD_CSV = "model_training/processed_data/ielts_vocab.csv"

MAX_ATTEMPT = 5
REWARD_GOAL = 4

assignments = {}
questions = []

# Load reference data
question_type_df = pd.read_csv(QUESTION_TYPE_CSV)
common_vocab_df = pd.read_csv(WORD_CSV)
training_df = pd.read_csv(TRAINING_CSV)

# -----------------------------
# PROMPT TEMPLATE
# -----------------------------
PROMPT_TEMPLATE = """
You are an expert IELTS Listening question generator.
Generate realistic IELTS Listening questions for the requested section and question range.

SECTION: {section}
Question Numbers: {question_range}
Number of questions: {question_count}
Question Type: {typeID} - {type_name}
Theme: {theme}
Specific Topic: {specific_topic}
Specifications: {specifications}
Instructions: {instruction}
Answer Format: {answer_format}
Format: {format}

REQUIREMENTS:
1) You MUST produce exactly {question_count} Questions, Answers, and for multiple choice: Options array in JSON.
2) Each Options array must correspond to its question, e.g., Options[0] = ["A. ...","B. ...",...].
3) The Transcript MUST mention the correct question range, e.g. "Now you have time to look at questions {question_range}".
4) Return ONLY JSON in this schema:

{{
  "Section": "{section}",
  "Type": "{type_name}",
  "Instructions": "{instruction}",
  "Diagram": "<optional diagram>",
  "Questions": ["question text 1", "...", "question text N"],
  "Options": [ ["A. ...","B. ..."], [...], ... ], 
  "Answers": ["answer1","answer2",...],
  "Transcript": "<full transcript text>"
}}

Do not include any explanatory text outside JSON.
"""

# -----------------------------
# UTILITY FUNCTIONS
# -----------------------------
def choose_question_type():
    for section in sorted(question_type_df['part'].unique()):
        section_types = question_type_df[question_type_df['part'] == section]['type'].tolist()
        k = random.choice([1,2])
        selected = random.sample(section_types, k=min(k,len(section_types)))
        assignments[f"Section {section}"] = selected
    return assignments

def calculate_readability_score(text):
    return textstat.flesch_reading_ease(text)

def is_in_average_word_count(text, section_label):
    def count_words(txt):
        if isinstance(txt, (list, pd.Series)):
            txt = " ".join(map(str, txt))
        return len(re.findall(r'\b\w+\b', str(txt)))
    ranges = {"Section 1": (500,700),"Section 2":(600,800),"Section 3":(800,1000),"Section 4":(1000,1200)}
    total = count_words(text)
    low, high = ranges.get(section_label,(0,1e9))
    return low <= total <= high

def calculate_common_word_ratio(text):
    vocab = set(common_vocab_df["Words"].astype(str).str.lower())
    if pd.isna(text) or not str(text).strip(): return 0
    words = [w.lower() for w in re.findall(r'\b\w+\b', str(text))]
    if not words: return 0
    return len([w for w in words if w not in vocab]) / len(words)

def calculate_similarity(text):
    existing_texts = []
    if 'transcript' in training_df.columns:
        existing_texts += training_df['transcript'].dropna().astype(str).tolist()
    if os.path.exists(GENERATED_CSV):
        df = pd.read_csv(GENERATED_CSV)
        if 'Transcript' in df.columns:
            existing_texts += df['Transcript'].dropna().astype(str).tolist()
    if not existing_texts: return 0.0
    corpus = existing_texts + [str(text)]
    vecs = TfidfVectorizer().fit(corpus).transform(corpus)
    sim_scores = cosine_similarity(vecs[-1], vecs[:-1]).flatten()
    return sim_scores.max() if len(sim_scores) else 0.0

def safe_json_parse(raw_text):
    if not raw_text: return None
    text = raw_text.replace("```json","").replace("```","").strip()
    obj_match = re.search(r'({[\s\S]*})', text)
    if not obj_match: return None
    try:
        return json5.loads(obj_match.group(1))
    except:
        return None

def convert_to_df(model_json, section_label):
    if model_json is None: return pd.DataFrame([None])
    if isinstance(model_json, dict): records = [model_json]
    elif isinstance(model_json, list): records = model_json
    else: return pd.DataFrame([None])

    normalized = []
    for q in records:
        if not q: continue
        clean = {}
        clean['DateTime_Generated'] = datetime.now().strftime("%Y_%m_%d_%H_%M")
        clean['Section'] = q.get('Section', section_label)
        clean['Type'] = " ".join(q['Type']) if isinstance(q.get("Type"), list) else q.get("Type")
        clean['Instructions'] = " ".join(q['Instructions']) if isinstance(q.get("Instructions"), list) else q.get("Instructions")
        clean['Diagram'] = q.get("Diagram", None)
        clean['Questions'] = q.get("Questions")
        clean['Options'] = q.get("Options", [None]*len(q.get("Questions",[])))
        clean['Answers'] = q.get("Answers")
        clean['Transcript'] = q.get("Transcript")
        normalized.append(clean)
    return pd.DataFrame(normalized)

def get_question_counts(types):
    if len(types) == 1: return {types[0]:10}
    elif len(types) == 2: return {types[0]:5, types[1]:5}
    else: return {t: max(1,10//len(types)) for t in types}

def question_number_ranges_for_section(counts, section_number):
    base = (section_number-1)*10+1
    ranges = {}
    current = base
    for typ, cnt in counts.items():
        start = current
        end = current+cnt-1
        ranges[typ] = f"{start}-{end}"
        current = end+1
    return ranges

def generate_question_model(typeID,type_name,section,question_range,question_count,theme,specific_topic,specifications,instruction,answer_format,format,key_skills,avg_duration,avg_script_length,key_features,audio_speed):
    prompt = PROMPT_TEMPLATE.format(
        section=section,
        question_range=question_range,
        question_count=question_count,
        typeID=typeID,
        type_name=type_name,
        theme=theme,
        specific_topic=specific_topic,
        specifications=specifications,
        instruction=instruction,
        answer_format=answer_format,
        format=format,
        key_skills=key_skills,
        avg_duration=avg_duration,
        avg_script_length=avg_script_length,
        key_features=key_features,
        audio_speed=audio_speed
    )
    response = model.generate_content(prompt)
    return safe_json_parse(response.text)

# -----------------------------
# MAIN GENERATION LOOP
# -----------------------------
def generate_question(theme, specific_topic, specifications):
    assignments = choose_question_type()
    for section_label, types in assignments.items():
        sec_num = int(re.search(r'(\d+)', section_label).group(1))
        print(f"\n=== {section_label} ===")
        counts = get_question_counts(types)
        ranges = question_number_ranges_for_section(counts, sec_num)

        for q_type in types:
            type_info = question_type_df[question_type_df['type']==q_type].iloc[0]
            question_count = counts[q_type]
            question_range = ranges[q_type]
            print(f"\nGenerating TYPE: {q_type} | Questions {question_range} (count={question_count})")
            best_reward = -1
            best_record = None
            for attempt in range(MAX_ATTEMPT):
                print(f" Attempt {attempt+1}/{MAX_ATTEMPT}")
                model_json = generate_question_model(
                    typeID=type_info['type'], type_name=type_info['type'], section=section_label,
                    question_range=question_range, question_count=question_count,
                    theme=theme, specific_topic=specific_topic, specifications=specifications,
                    instruction=type_info['instruction'], answer_format=type_info['answer_format'],
                    format=type_info['format'], key_skills=type_info['key_skills'],
                    avg_duration=type_info['avg_duration'], avg_script_length=type_info['avg_script_length'],
                    key_features=type_info['key_features'], audio_speed=type_info['audio_speed']
                )

                if not isinstance(model_json, dict): continue
                q_list = model_json.get("Questions")
                a_list = model_json.get("Answers")
                if not isinstance(q_list,list) or not isinstance(a_list,list): continue
                if len(q_list)!=question_count or len(a_list)!=question_count: continue
                transcript = model_json.get("Transcript","")
                if str(question_range).split('-')[0] not in transcript: continue

                reward = 0
                if calculate_readability_score(transcript)>=60: reward+=1
                if is_in_average_word_count(transcript, section_label): reward+=1
                if calculate_common_word_ratio(transcript)>=0.1: reward+=1
                if calculate_similarity(transcript)<=0.85: reward+=1

                if reward>best_reward:
                    best_reward=reward
                    best_record=model_json
                if reward==REWARD_GOAL: break

            if best_record is None:
                placeholder={"Section":section_label,"Type":type_info['type'],
                             "Instructions":type_info['instruction'],"Diagram":None,
                             "Questions":[f"Placeholder Q{i}" for i in range(1,question_count+1)],
                             "Options":[[f"A{i}","B{i}","C{i}","D{i}"] for i in range(1,question_count+1)],
                             "Answers":[f"Answer{i}" for i in range(1,question_count+1)],
                             "Transcript":f"Placeholder transcript mentioning questions {question_range}."}
                best_record=placeholder

            temp_df = convert_to_df(best_record, section_label)
            questions.append(temp_df)
    print("\n=== Generation COMPLETE ===")

# -----------------------------
# SAVE FILES
# -----------------------------
def make_json_safe(obj):
    try: return json.dumps(obj, ensure_ascii=False)
    except: return str(obj)

def save_csv(df):
    os.makedirs("model_training/processed_data", exist_ok=True)
    safe_df = df.copy()
    for col in ["Questions","Answers","Transcript","Instructions","Options"]:
        safe_df[col]=safe_df[col].apply(make_json_safe)
    if os.path.exists(GENERATED_CSV):
        safe_df.to_csv(GENERATED_CSV, mode='a', header=False, index=False)
    else:
        safe_df.to_csv(GENERATED_CSV,index=False)
    print(f"Saved generated questions to {GENERATED_CSV}")

def create_set_folder():
    os.makedirs("set", exist_ok=True)
    existing = [d for d in os.listdir("set") if d.startswith("set")]
    next_id = len(existing)+1
    folder = f"set/set{next_id}"
    os.makedirs(folder, exist_ok=True)
    return folder

def write_text_files(folder, df):
    df_sorted = df.sort_values(by="Section")
    question_counter = 1

    def strip_leading_numbers(text):
        return re.sub(r'^\s*\d+\.\s*', '', str(text))

    # QUESTIONS ONLY
    with open(f"{folder}/questions.txt","w",encoding="utf-8") as f:
        for _, row in df_sorted.iterrows():
            f.write(f"SECTION {row['Section']}\n\n")
            questions = row["Questions"]
            options_list = row.get("Options",[None]*len(questions))
            instructions = row.get("Instructions","")
            q_start = question_counter
            q_end = question_counter + len(questions)-1
            f.write(f"Questions {q_start}-{q_end}\n\n")
            f.write(instructions.strip()+"\n\n")
            for i,q_text in enumerate(questions):
                q_text_clean = strip_leading_numbers(q_text)
                f.write(f"{question_counter}. {q_text_clean}\n")
                opts = options_list[i] if options_list[i] else []
                for opt in opts: f.write(f"   {opt}\n")
                question_counter+=1
            f.write("\n\n")

    # FULL SET
    question_counter=1
    with open(f"{folder}/full_set.txt","w",encoding="utf-8") as f:
        for _, row in df_sorted.iterrows():
            f.write(f"SECTION {row['Section']}\n\n")
            questions = row["Questions"]
            options_list = row.get("Options",[None]*len(questions))
            answers = row["Answers"]
            transcript = row.get("Transcript","")
            instructions = row.get("Instructions","")
            q_start = question_counter
            q_end = question_counter + len(questions)-1
            f.write(f"Questions {q_start}-{q_end}\n\n")
            f.write("Instructions:\n"+instructions.strip()+"\n\n")
            f.write("Questions:\n")
            for i,q_text in enumerate(questions):
                q_text_clean = strip_leading_numbers(q_text)
                f.write(f"{question_counter}. {q_text_clean}\n")
                opts = options_list[i] if options_list[i] else []
                for opt in opts: f.write(f"   {opt}\n")
                question_counter+=1
            f.write("\nAnswers:\n")
            for i,a in enumerate(answers):
                f.write(f"{q_start+i}. {a}\n")
            f.write("\nTranscript:\n"+transcript.strip()+"\n\n\n")

    # TRANSCRIPTS ONLY
    with open(f"{folder}/transcripts.txt","w",encoding="utf-8") as f:
        for _, row in df_sorted.iterrows():
            f.write(f"SECTION {row['Section']}\n")
            f.write(str(row["Transcript"])+"\n\n")

def save_all_outputs(questions):
    if all(isinstance(q,pd.DataFrame) for q in questions):
        df=pd.concat(questions,ignore_index=True)
    else:
        df=pd.DataFrame(questions)
    save_csv(df)
    folder=create_set_folder()
    write_text_files(folder,df)
    print(f"All outputs saved to folder: {folder}")

# -----------------------------
# RUN
# -----------------------------
theme="Education"
specific_topic="University Lectures"
specifications="Academic context, formal tone"

generate_question(theme,specific_topic,specifications)
save_all_outputs(questions)



=== Section 1 ===

Generating TYPE: Table Completion | Questions 1-10 (count=10)
 Attempt 1/5


ParserError: Error tokenizing data. C error: Expected 7 fields in line 31, saw 8
