In [164]:
import os
import json
import re

from config import GEMINI_API_KEY, TAVILY_API_KEY

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts.prompt import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.tools.tavily_search import TavilySearchResults

from pprint import pprint

In [165]:
# Export the API keys from config into the process environment, but never
# overwrite a value that is already present there.
for env_name, fallback_value in (
    ("GOOGLE_API_KEY", GEMINI_API_KEY),
    ("TAVILY_API_KEY", TAVILY_API_KEY),
):
    if env_name not in os.environ:
        os.environ[env_name] = fallback_value


In [None]:
# Prompt that asks the model for 5-8 themes as a JSON object keyed "0", "1", ...
# It takes no input variables. Doubled braces ({{ }}) are template escapes that
# render as literal single braces in the final prompt text.
# The few-shot example must itself be valid JSON because the response is fed
# to a JSON output parser.
theme_prompt = PromptTemplate(
    input_variables=[],
    template="""
You are helping to generate lexicon-rich question templates for prospective students exploring the Singapore University of Technology and Design (SUTD). Generate a JSON object containing 5-8 key themes or points of interest that prospective university students commonly inquire about when considering the SUTD university experience. The keys of the JSON object should be sequential integers starting from 0, and the values should be descriptive strings representing the themes.

You're not limited to these, an example of the desired output format:
{{
  "0": "Interdisciplinary Curriculum",
  "1": "Design-Centric and Hands-On Learning",
  "2": "Campus Culture and Student Life",
  "3": "Graduate Employability and Industry Connections",
  "4": "Pillars and Specialisations",
  "5": "Housing and Campus Facilities",
  "6": "Admissions Criteria, Financial Aid, and Scholarships",
  "7": "Fifth Row (Clubs, Sports, Arts)"
}}

The output should be in a JSON-like format within the curly braces.
    """
)

In [167]:
# Parser that converts the model's JSON text into Python objects.
json_output_parser = JsonOutputParser()

# Gemini chat model used for every generation step in this notebook.
llm = ChatGoogleGenerativeAI(
    temperature=0.5,  # We want some randomness
    model="gemini-2.0-flash",
)

# Pipe the model's response straight into the JSON parser.
chain = llm | json_output_parser

In [168]:
# Render the variable-free theme prompt, then run it through model + JSON parser.
theme_prompt_text = theme_prompt.format()
theme_prompt_results = chain.invoke(theme_prompt_text)

In [169]:
# Pretty-print the parsed theme mapping produced by the chain.
pprint(theme_prompt_results)

{'0': "SUTD's Unique Curriculum Structure and Learning Approach",
 '1': 'Career Opportunities, Industry Partnerships, and Alumni Network',
 '2': 'Student Life, Campus Culture, and Extracurricular Activities (Fifth '
      'Row)',
 '3': 'Admissions Requirements, Application Process, and Portfolio Advice',
 '4': 'Financial Aid, Scholarships, and Tuition Fees',
 '5': 'Research Opportunities and Faculty Expertise',
 '6': 'Housing Options, Campus Facilities, and Location',
 '7': 'Design Thinking and Innovation Focus'}


In [None]:
# Manual override: replaces the model-generated themes with a hand-picked
# subset, keyed by stringified position ("0", "1", ...) like the model output.
theme_prompt_results = {
    str(position): theme_name
    for position, theme_name in enumerate(
        [
            "Undergraduate Pillars and Specializations",
            "Fifth Row (Clubs, Sports, Arts)",
        ]
    )
}

In [None]:
"""
Example output

theme_prompt_results = {
    '0': 'Academic Programs and Faculty',
    '1': 'Career Services and Internship Opportunities',
    '2': 'Campus Life and Student Culture',
    '3': 'Tuition, Financial Aid, and Scholarships',
    '4': 'Location and Campus Environment',
    '5': 'Housing and Accommodation Options',
    '6': 'Student Support Services (Counseling, Health)',
    '7': 'Research Opportunities (Undergraduate)',
    '8': 'Safety and Security on Campus'
}
"""

"\nExample output\n\ntheme_prompt_results = {\n    '0': 'Academic Programs and Faculty',\n    '1': 'Career Services and Internship Opportunities',\n    '2': 'Campus Life and Student Culture',\n    '3': 'Tuition, Financial Aid, and Scholarships',\n    '4': 'Location and Campus Environment',\n    '5': 'Housing and Accommodation Options',\n    '6': 'Student Support Services (Counseling, Health)',\n    '7': 'Research Opportunities (Undergraduate)',\n    '8': 'Safety and Security on Campus'\n}\n"

In [None]:
# Few-shot prompt that asks the model for 3-5 sub-themes and one question
# template per sub-theme for a given THEME.
# Only {THEME} is a real template variable. Every doubled brace ({{ }}) is an
# escape that renders as a literal single brace, so the model sees placeholders
# such as {CLUB} and JSON objects wrapped in { }.
# NOTE(review): the later extraction cell keeps only templates that contain
# exactly one placeholder, but this prompt does not state that constraint.
question_prompt_template = PromptTemplate(
    input_variables=["THEME"],
    template="""
You are helping to generate lexicon-rich question templates for prospective students exploring the Singapore University of Technology and Design (SUTD). Each question should contain a lexicon placeholder (e.g., {{CLUB}}, {{PROGRAM_TYPE}}, {{SEMESTER_YEAR}}) that can later be substituted with concrete values retrieved from the web.

Here are some themes relevant to prospective students, along with sample question templates:

Theme: Student Life and Culture  
{{
  "Social Events": "What kind of social events or traditions does SUTD host during {{SEMESTER_OR_PERIOD}}?",
  "Campus Culture": "What is student culture like for those studying in {{PROGRAM_TYPE}} at SUTD?",
  "Arts and Culture": "What opportunities are there for students interested in {{ARTS_ACTIVITY}} at SUTD?"
}}

Theme: Admissions and Applications  
{{
  "Application Requirements": "What are the application requirements for {{PROGRAM_TYPE}} programs at SUTD?",
  "Essay Guidance": "Do you have any advice for writing the application essay for {{PROGRAM_TYPE}} at SUTD?",
  "Standardized Tests": "What are the policies regarding {{TEST_TYPE}} scores when applying to SUTD?",
  "Application Deadline": "What is the application deadline for entry in {{SEMESTER_YEAR}}?"
}}

Theme: {THEME}
{{
  "{{SUB_THEME_1_KEY}}": "{{SUB_THEME_1_QUERY}}",
  "{{SUB_THEME_2_KEY}}": "{{SUB_THEME_2_QUERY}}",
  "{{SUB_THEME_3_KEY}}": "{{SUB_THEME_3_QUERY}}",
  "{{SUB_THEME_4_KEY}}": "{{SUB_THEME_4_QUERY}}"
}}
Generate 3-5 relevant sub-themes (keys) and corresponding question templates (values) for the theme: {THEME}. Use placeholders like {{COURSE}}, {{FACILITY}}, {{HOSTEL}}, {{CLUB}}, {{SEMESTER_YEAR}}, etc., to keep the questions lexicon-ready. The output should be in a JSON-like format within the curly braces.
"""
)

# One model call per theme; the parsed JSON (sub-theme -> question template)
# is stored under the theme's name.
question_templates = {}
for theme in theme_prompt_results.values():
    rendered_question_prompt = question_prompt_template.format(THEME=theme)
    question_templates[theme] = chain.invoke(rendered_question_prompt)

In [173]:
# Map each usable question template to the name of its single placeholder,
# e.g. "What does {CLUB} do in SUTD?" -> "CLUB".
question_template_lexicon_pairs = {}

for subtheme_templates in question_templates.values():
    # The chain returns whatever JSON the model produced, so the shape is not
    # guaranteed; only a mapping of sub-theme -> template is usable here.
    if not isinstance(subtheme_templates, dict):
        continue

    for question_template in subtheme_templates.values():
        # Nested objects, lists, numbers or nulls cannot be scanned as text.
        if not isinstance(question_template, str):
            continue

        # Capture the name inside each single-brace placeholder.
        matches = re.findall(r"\{([^}]+)\}", question_template)

        # make sure only 1 match, otherwise skip
        if len(matches) == 1:
            question_template_lexicon_pairs[question_template] = matches[0]

print(len(question_template_lexicon_pairs.keys()))
print(question_template_lexicon_pairs.values())

2
dict_values(['PILLAR', 'SPECIALIZATION'])


In [174]:
# Few-shot prompt that turns one (question template, placeholder) pair into a
# web search query, returned as {"search_query": "..."}.
# Doubled braces are template escapes that render as literal single braces.
# The final "LEXICON_PLACEHOLDER:" line wraps the variable in literal braces
# ({{{...}}} renders as {NAME}) so that it has the same form as the few-shot
# examples and as the placeholder inside the question template itself.
search_query_prompt_template = PromptTemplate(
    input_variables=["QUESTION_TEMPLATE_TO_SEARCH", "LEXICON_PLACEHOLDER_TO_FILL"],
    template="""
You are an expert in generating effective web search queries for finding information related to the Singapore University of Technology and Design (SUTD). Your goal is to create a concise web search query that will retrieve possible values for the lexicon placeholder `{{LEXICON_PLACEHOLDER}}` within the context of SUTD.

Here are a few examples:

**Example 1:**
Question Template: "What does {{CLUB}} do in SUTD?"
LEXICON_PLACEHOLDER: {{CLUB}}
Search Query: {{"search_query": "SUTD student clubs organizations list"}}

**Example 2:**
Question Template: "Who are the professors teaching {{COURSE}} at SUTD?"
LEXICON_PLACEHOLDER: {{COURSE}}
Search Query: {{"search_query": "SUTD course catalog undergraduate graduate"}}

Now, generate a search query for the following:

Question Template: "{QUESTION_TEMPLATE_TO_SEARCH}"
LEXICON_PLACEHOLDER: {{{LEXICON_PLACEHOLDER_TO_FILL}}}
Search Query:
"""
)

In [175]:
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_message

# (question template, placeholder name) -> parsed search-query JSON from the model.
lexicon_question_search_query_pairs = {}

@retry(
    stop=stop_after_attempt(25),
    wait=wait_exponential(multiplier=1, min=1, max=60),
    # tenacity applies `match` with re.match, which is anchored at the start of
    # the message. Allow any prefix (including newlines) so rate-limit errors
    # are retried even when the token is not the first thing in the message.
    retry=retry_if_exception_message(match=r"(?s).*(429|ResourceExhausted)")
)
def invoke_chain(chain, prompt):
    """Invoke a runnable with retries on rate-limit errors.

    Args:
        chain: Any object exposing ``invoke(prompt)`` (the model or a pipeline).
        prompt: The fully rendered prompt to send.

    Returns:
        Whatever ``chain.invoke(prompt)`` returns.

    Raises:
        Any exception whose message does not look like a rate-limit error is
        raised immediately; rate-limit errors are retried with exponential
        backoff for up to 25 attempts before tenacity gives up.
    """
    return chain.invoke(prompt)

for question, lexicon in question_template_lexicon_pairs.items():
    try:
        lexicon_search_query = invoke_chain(
            chain,
            search_query_prompt_template.format(
                QUESTION_TEMPLATE_TO_SEARCH=question,
                LEXICON_PLACEHOLDER_TO_FILL=lexicon,
            ),
        )
    except Exception as exc:
        # Best-effort: one failed question must not abort the batch, but the
        # failure is reported instead of being silently discarded. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print(f"Skipping search query for {question!r}: {type(exc).__name__}: {exc}")
        continue

    lexicon_question_search_query_pairs[(question, lexicon)] = lexicon_search_query

In [176]:
# Show each question template with its placeholder and generated search query,
# one field per line, followed by a blank separator line.
for (question, lexicon), search_query in lexicon_question_search_query_pairs.items():
    print(
        f"Question: {question}",
        f"Lexicon: {lexicon}",
        f"Search Query: {search_query}",
        "",
        sep="\n",
    )

Question: What specialization options are available after completing the {PILLAR} pillar at SUTD?
Lexicon: PILLAR
Search Query: {'search_query': 'SUTD pillars specializations after pillar'}

Question: What are some common career paths for graduates who specialized in {SPECIALIZATION} at SUTD?
Lexicon: SPECIALIZATION
Search Query: {'search_query': 'SUTD degree programs majors specializations'}



In [177]:
# Number of web results requested per search query.
TAVILY_MAX_RESULTS = 3

# Tavily web-search tool used to fetch context for lexicon extraction.
search = TavilySearchResults(max_results=TAVILY_MAX_RESULTS)

In [178]:
"""
example_search = search.invoke("What does Climbers Club do in SUTD?")

pprint(example_search)
"""

'\nexample_search = search.invoke("What does Climbers Club do in SUTD?")\n\npprint(example_search)\n'

In [179]:
# Few-shot prompt that extracts concrete placeholder values from retrieved
# web-search context and returns them as a JSON object keyed "0", "1", ...
# Doubled braces are template escapes that render as literal single braces;
# only the three names in input_variables are substituted.
# The introduction is written generically: escaped names such as
# {{QUESTION_TEMPLATE}} would reach the model as unfilled literal text.
# The "no values" case asks for an empty JSON object ({{}} renders as {}).
lexicon_extraction_prompt_template = PromptTemplate(
    input_variables=["QUESTION_TEMPLATE_TO_PARSE", "LEXICON_PLACEHOLDER_TO_EXTRACT", "RETRIEVED_CONTEXT"],
    template="""
You are an expert in information extraction. Your goal is to parse text retrieved from a web search (or other source) and identify specific values that can fill the lexicon placeholder in a question template.

Extract all relevant and distinct values from the retrieved context that can be used to replace the placeholder in the question template. Return the values as a JSON object where each value is numbered starting from "0". If no relevant values are found in the context, return an empty JSON object: {{}}

Example 1:
Question Template: "What does {{CLUB}} do in SUTD?"
LEXICON_PLACEHOLDER: {{CLUB}}
Retrieved Context: "SUTD offers a wide range of student clubs including the SUTD Robotics Club, SUTD Design Society, SUTD Photography Club, and the SUTD Debate Club..."
Extracted Values:
{{
  "0": "SUTD Robotics Club",
  "1": "SUTD Design Society",
  "2": "SUTD Photography Club",
  "3": "SUTD Debate Club"
}}

Example 2:
Question Template: "Who are the professors teaching {{COURSE}} at SUTD?"
LEXICON_PLACEHOLDER: {{COURSE}}
Retrieved Context: "The undergraduate course catalog lists Introduction to Programming taught by Prof. Lee, and Calculus I taught by Dr. Tan..."
Extracted Values:
{{
  "0": "Introduction to Programming",
  "1": "Calculus I"
}}

Now, apply this to the following:

Question Template: "{QUESTION_TEMPLATE_TO_PARSE}"
LEXICON_PLACEHOLDER: {{{LEXICON_PLACEHOLDER_TO_EXTRACT}}}
Retrieved Context:
--- START OF CONTEXT ---
{RETRIEVED_CONTEXT}
--- END OF CONTEXT ---

Extracted Values:
"""
)

In [180]:
# question template -> raw (unparsed) model response listing placeholder values.
question_lexicon_list_pairs = {}

for (question, lexicon), search_query in lexicon_question_search_query_pairs.items():
    # The search query is parsed model output, so its shape is not guaranteed.
    # Skip entries that are not {"search_query": "<non-empty text>"} rather
    # than letting a KeyError/TypeError abort the whole loop.
    query_text = search_query.get("search_query") if isinstance(search_query, dict) else None
    if not isinstance(query_text, str) or not query_text.strip():
        print(f"Skipping {question!r}: unusable search query {search_query!r}")
        continue

    lexicon_search_query_result = search.invoke(query_text)

    prompt = lexicon_extraction_prompt_template.format(
        QUESTION_TEMPLATE_TO_PARSE=question,
        LEXICON_PLACEHOLDER_TO_EXTRACT=lexicon,
        RETRIEVED_CONTEXT=lexicon_search_query_result,
    )

    # Call model instead of chain, because the JSON output parser seems to bug
    # out with particularly long context; the response is parsed in a later cell.
    results = invoke_chain(llm, prompt)

    question_lexicon_list_pairs[question] = results

In [181]:
# Expand every question template into concrete questions by substituting each
# extracted value for the template's placeholder.
questions = []

# Matches one single-brace placeholder such as {CLUB}.
placeholder_pattern = re.compile(r"\{[^}]+\}")

for question, raw_response in question_lexicon_list_pairs.items():
    try:
        lexicon_list = json_output_parser.invoke(raw_response)
    except Exception as exc:
        # Best-effort: an unparseable response yields no questions for this
        # template, but the failure is reported instead of silently dropped.
        print(f"Could not parse values for {question!r}: {type(exc).__name__}: {exc}")
        lexicon_list = []

    if isinstance(lexicon_list, dict):
        lexicon_list = list(lexicon_list.values())
    elif not isinstance(lexicon_list, list):
        # Anything else (None, a bare string, a number) has no usable values.
        lexicon_list = []

    for lexicon in lexicon_list:
        # Only non-empty text can be substituted into the question.
        if not isinstance(lexicon, str) or not lexicon.strip():
            continue
        # A callable replacement inserts the value verbatim; a string
        # replacement would have backslashes and group references interpreted.
        questions.append(
            placeholder_pattern.sub(lambda _match, value=lexicon: value, question, count=1)
        )

pprint(questions)

['What specialization options are available after completing the Architecture '
 'and Sustainable Design (ASD) pillar at SUTD?',
 'What specialization options are available after completing the Engineering '
 'Product Development (EPD) pillar at SUTD?',
 'What specialization options are available after completing the Engineering '
 'Systems and Design (ESD) pillar at SUTD?',
 'What specialization options are available after completing the Information '
 'Systems Technology and Design (ISTD) pillar at SUTD?',
 'What are some common career paths for graduates who specialized in '
 'architecture and engineering at SUTD?',
 'What are some common career paths for graduates who specialized in design '
 'and artificial intelligence at SUTD?',
 'What are some common career paths for graduates who specialized in '
 'Engineering at SUTD?',
 'What are some common career paths for graduates who specialized in '
 'Humanities at SUTD?',
 'What are some common career paths for graduates who specializ

In [182]:
output_file = "questions.jsonl"

# Append one JSON object per line. The file is opened in append mode, so
# existing lines are kept and every run adds its questions after them.
with open(output_file, "a", encoding="utf-8") as f:
    f.writelines(json.dumps({"question": question}) + "\n" for question in questions)