In [2]:
import os
import json
import re

from config import GEMINI_API_KEY, TAVILY_API_KEY

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts.prompt import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

In [3]:
# Seed the process environment with the API keys imported from `config`.
# A variable that is already present in the environment is left untouched.
os.environ.update(
    {
        env_name: secret
        for env_name, secret in (
            ("GOOGLE_API_KEY", GEMINI_API_KEY),
            ("TAVILY_API_KEY", TAVILY_API_KEY),
        )
        if env_name not in os.environ
    }
)

In [4]:
# Few-shot prompt for questions about SUTD's pillars / specializations.
# It takes no input variables; the doubled braces render as literal braces.
# The instruction now asks for a JSON *object* so that it agrees with the
# example block (the previous wording asked for a JSON array while showing an
# object keyed by index).
# NOTE: later cells rebind `direct_question_generation_prompt_template`.
direct_question_generation_prompt_template = PromptTemplate(
    input_variables=[],
    template="""
Here are some example questions a prospective student might ask about the **pillars or specializations at the Singapore University of Technology and Design (SUTD)**.  
Please generate a **mix of basic to advanced questions**, and output them as a **JSON object mapping string indices to question strings**.

**Examples:**
{{
    "0": "What pillars are there in SUTD?",
    "1": "How do I choose a pillar, and when do students typically make this decision?",
    "2": "Can I take modules from other pillars even after I choose my specialization?",
    "3": "What kind of projects will I work on in the Engineering Product Development (EPD) pillar?",
    "4": "What career paths do graduates from the Architecture and Sustainable Design (ASD) pillar usually pursue?",
    "5": "How does the Engineering Systems and Design (ESD) pillar differ from industrial engineering in other universities?",
    "6": "What programming languages or tools will I learn in the Computer Science and Design (CSD) pillar?",
    "7": "How is AI integrated into the Design and Artificial Intelligence (DAI) pillar, and are there industry partnerships involved?",
    "8": "Is it possible to switch pillars after I've been allocated one?",
    "9": "Are there any interdisciplinary projects that involve students from multiple pillars?"
}}

Now generate 5-10 new questions in the same JSON format, keeping a healthy mix of beginner and advanced queries.
""",
)

In [32]:
# Zero-shot prompt for questions about SUTD's Fifth Row (clubs, sports, arts).
# It takes no input variables; the doubled braces render as literal braces.
# Fixes relative to the previous wording:
#   * asks for a JSON object (the format sketch is an object, not an array),
#   * removes a stray, unbalanced `**` after "(SUTD)",
#   * drops the trailing comma that made the format sketch invalid JSON.
direct_question_generation_prompt_template = PromptTemplate(
    input_variables=[],
    template="""
Here are some example questions a prospective student might ask about the **Fifth Row (Clubs, Sports, Arts)** at the Singapore University of Technology and Design (SUTD).  
Please generate a **mix of basic to advanced questions**, and output them as a **JSON object mapping string indices to question strings**. Please include some variety in the keywords such as using SUTD's unique naming conventions such as "Fifth Rows" or "Freshmores".

Now generate 5-10 new questions in JSON format.
{{
    "0": "QUESTION_0"
}}
""",
)

In [45]:
# Question themes, keyed by their position in the list as a string ("0".."9").
themes = {
    str(position): title
    for position, title in enumerate(
        [
            "SUTD's Pillars and Specializations",
            "Social Events and Activities at SUTD",
            "Interdisciplinary Collaboration and Projects at SUTD",
            "Global Partnerships and Opportunities at SUTD (e.g., MIT, Zhejiang)",
            "Career Prospects and Industry Connections for SUTD Graduates",
            "Innovation and Entrepreneurship at SUTD",
            "Student Life and Community at SUTD",
            "Fifth Rows",
            "Admissions Process and Requirements for SUTD",
            "Research and Development at SUTD",
        ]
    )
}

In [46]:
# Themed prompt: `THEME` is substituted per call; the doubled braces render as
# literal braces. Fixes relative to the previous wording:
#   * asks for a JSON object (the code that consumes the result calls
#     `.values()` on it, and the format sketch is an object, not an array),
#   * removes a stray, unbalanced `**` after "(SUTD)",
#   * drops the trailing comma that made the format sketch invalid JSON.
direct_question_generation_prompt_template = PromptTemplate(
    input_variables=["THEME"],
    template="""
Here are some example questions a prospective student might ask about the **{THEME}** at the Singapore University of Technology and Design (SUTD).  
Please generate a **mix of basic to advanced questions**, and output them as a **JSON object mapping string indices to question strings**. Please balance some variety in the keywords such as using SUTD's unique naming conventions such as "Fifth Rows" or "Freshmores".

Now generate 5-10 new questions in JSON format.
{{
    "0": "QUESTION_0"
}}
""",
)

In [47]:
# Parser applied to the model's reply.
json_output_parser = JsonOutputParser()

# Gemini chat model; a non-zero temperature gives some randomness between runs.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.5)

# Pipeline: model call followed by JSON parsing.
chain = llm | json_output_parser

In [48]:
# Defined here (same value as the later write-out cell) so that this cell does
# not depend on a cell further down having been executed first.
output_file = "questions2.jsonl"


def _extract_questions(parsed):
    """Flatten parsed model output into a list of questions.

    Args:
        parsed: Result of the JSON parser. Handled shapes are a dict (its
            values are the questions) or a list whose items are dicts
            and/or plain strings.

    Returns:
        A list of questions in encounter order; an empty list when the
        input is empty or has an unrecognized shape.
    """
    if isinstance(parsed, dict):
        return list(parsed.values())
    if isinstance(parsed, list):
        collected = []
        for item in parsed:
            if isinstance(item, dict):
                collected.extend(item.values())
            elif isinstance(item, str):
                # A bare array of question strings is also accepted.
                collected.append(item)
        return collected
    return []


for theme in themes.values():
    prompt = direct_question_generation_prompt_template.format(THEME=theme)

    themed_questions = _extract_questions(chain.invoke(prompt))
    if not themed_questions:
        # Nothing usable came back for this theme; move on to the next one.
        continue

    # Open the file once per theme rather than once per question. Append mode
    # keeps whatever earlier runs have already written.
    with open(output_file, "a", encoding="utf-8") as f:
        for question in themed_questions:
            json.dump({"question": question}, f)
            f.write("\n")

In [34]:
# The template bound at this point in the notebook declares a THEME variable,
# so it must be supplied here; the "Fifth Rows" theme is used for this
# single-prompt run.
questions = llm.invoke(
    direct_question_generation_prompt_template.format(THEME=themes["7"])
)

In [35]:
# Show the raw model response before it is parsed as JSON.
print(questions)

content='```json\n{\n    "0": "What Fifth Row activities are most popular among Freshmores?",\n    "1": "Can you provide a comprehensive list of all Fifth Row clubs and sports available at SUTD?",\n    "2": "How does SUTD support student-led initiatives for creating new Fifth Row clubs or activities?",\n    "3": "What are the time commitments typically involved in participating in different Fifth Row activities?",\n    "4": "Are there opportunities for Fifth Row activities to collaborate with external organizations or companies?",\n    "5": "How does SUTD ensure inclusivity and accessibility in Fifth Row activities for students with diverse backgrounds and abilities?",\n    "6": "What resources (e.g., funding, equipment, mentorship) are available to Fifth Row clubs and sports teams?",\n    "7": "How does participation in Fifth Row activities contribute to the overall SUTD student experience and personal development?",\n    "8": "Are there any Fifth Row activities that focus on social i

In [36]:
# Run the model response through the JSON output parser.
# NOTE(review): this rebinds `questions` from the raw response to the parsed
# value, so re-running this cell on its own feeds the already-parsed value
# back into the parser.
questions = json_output_parser.invoke(questions)

In [37]:
output_file = "questions2.jsonl"

# If the parsed result is a list, use its first element; an empty list yields
# no questions instead of raising IndexError.
if isinstance(questions, list):
    questions = questions[0] if questions else {}

# Append one JSON object per line (JSON Lines); append mode keeps earlier runs.
with open(output_file, "a", encoding="utf-8") as f:
    for question in questions.values():
        json.dump({"question": question}, f)
        f.write("\n")