#### Task A2

In [None]:
import pandas as pd
import os
import json
import pyterrier as pt
from llm_hlp import next_queries_gemini, next_queries_gpt

In [None]:
# Load the LLM API keys from local token files into the environment variables
# that the OpenAI / Gemini helpers are expected to read.
# The file contents are stripped: token files usually end with a newline, and a
# key carrying trailing whitespace is not the key that was issued.
for env_var, token_file in (
    ("OPENAI_API_KEY", 'OpenAI_token.txt'),
    ("GEMINI_API_KEY", 'Gemini_token.txt'),
):
    with open(token_file, 'r') as file:
        content = file.read().strip()
    os.environ[env_var] = content

In [None]:
# Initialise PyTerrier's Java side only if it is not already running, so this
# cell can be re-executed without initialising twice.
java_already_started = pt.java.started()
if not java_already_started:
    pt.java.init()

In [None]:
# Open the Terrier index stored under ./example_data/index_CORE and keep a
# handle to its meta index, which later cells use to map a 'docno' to an
# internal document id and to read the stored "title" of that document.
index_path = os.path.abspath("./example_data/index_CORE") # modify if needed
index_ref = pt.IndexRef.of(index_path)
index = pt.IndexFactory.of(index_ref)
meta_ind = index.getMetaIndex()

In [None]:
# Task description placed at the start of the text sent to the LLM.
# Replace this example with your own prompt and describe it in your strategy.
example_prompt = (
    "You are tasked with writing potential next queries "
    "for a multiple given queries and already clicked on documents."
)

# Output-format instructions appended after the prompt. The placeholder
# %NUMOFQUERIES% is substituted with the number of queries still needed.
# Only change these instructions IF YOU KNOW WHAT YOU ARE DOING;
# otherwise please leave them as they are.
prompt_instructions = (
    'You are required to return EXACTLY %NUMOFQUERIES% potential next queries, NOTHING ELSE. '
    'Queries need to be distinct from each other. '
    'Order them in descending order by probability of them being the next query. '
    'Return these next %NUMOFQUERIES% queries as a Python-style list. '
    'These are the previous queries (preceded by "?") as well as the '
    'corresponding clicked on documents (preceded by ">"):\n'
)

In [None]:
# Run configuration -- set these variables before executing the cells below.

# Backend switch: True calls next_queries_gpt, False calls next_queries_gemini.
use_gpt = False
modelname = 'gemini-2.0-flash' # 'gpt-4.1-nano' # 'gemini-2.0-flash' #'gemini-2.5-flash-preview-05-20' # 'gemini-2.0-flash'

# Your identifier; it becomes part of the results file name.
name = 'this-is-my-name'

# Free-text description of the approach, followed by the exact prompt,
# instruction text and model name so the run is documented in the results file.
strategy = ''.join([
    'This is my strategy ......',
    '_____ PROMPT: ', example_prompt,
    ' _____ INSTRUCTION: ', prompt_instructions,
    ' _____ LLM: ', modelname,
])

In [None]:
# On the first run, seed the results file with the participant's name and
# strategy. An already existing file is left untouched so that previously
# stored content is never overwritten by re-running this cell.
results_file = 'task_A_2_automated_' + modelname + '--' + name + '.json'
results_file_missing = not os.path.exists(results_file)
if results_file_missing:
    with open(results_file, 'w') as f:
        json.dump({'name': name, 'strategy': strategy}, f, indent = 2)

In [None]:
# Ask the LLM for next-query suggestions for ONE query group per execution of
# this cell: the first group that does not yet have the full number of stored
# suggestions is processed, the results file is rewritten, and the loop stops.
# Re-run the cell until every group is complete.
results_path = 'task_A_2_automated_' + modelname + '--' + name + '.json'
target_num_queries = 10  # suggestions wanted per query group

# The CSV has no header row, so columns are addressed by position:
# 2 = id used to group rows into one prompt, 3 = query text,
# 5 = clicked documents, parsed below as a bracketed ', '-separated string.
queries = pd.read_csv('predetermined_queries_Task_A_test.csv', header = None)

with open(results_path) as f:
    next_queries = json.load(f)

for qu_id in queries[2].unique():
    squ_id = str(qu_id)  # the results file is keyed by the id as a string
    next_new_queries = next_queries.get(squ_id, [])
    if len(next_new_queries) >= target_num_queries:
        continue  # this group is already complete

    # Build the history shown to the LLM: every query as "?: <text>", followed
    # by ">: <title>" for each of its clicked documents, then a "--" separator.
    history_parts = []
    for _, row in queries[queries[2] == qu_id].iterrows():
        history_parts.append('?: ' + row[3] + '\n')

        # [1:-1] drops the surrounding brackets of the stored list string.
        for relevant_doc in row[5][1:-1].split(', '):
            doc_no = meta_ind.getDocument('docno', str(relevant_doc))
            # Negative ids are skipped -- presumably the docno is not in the
            # index (this also covers the empty string from an empty list).
            if doc_no > -1:
                doc_cont = meta_ind.getItem("title", doc_no)
                history_parts.append('>: ' + doc_cont + '\n')

        history_parts.append('--\n')
    text_for_llm = ''.join(history_parts)

    # Snapshot of what was stored before this run; the LLM is told not to
    # repeat these and is only asked for the number still missing.
    already_entered_queries = list(next_new_queries)
    num_missing = target_num_queries - len(already_entered_queries)

    if already_entered_queries:
        curr_prompt_instructions = (
            ' These queries have already been written as potential next ones, YOU CANNOT REPEAT THEM: '
            + ', '.join(already_entered_queries) + '. ' + prompt_instructions
        )
    else:
        curr_prompt_instructions = prompt_instructions

    # Show the full text that is sent to the model.
    print(example_prompt + ' ' + curr_prompt_instructions.replace('%NUMOFQUERIES%', str(num_missing)) + ' ' + text_for_llm)

    # Both helpers are called with the same five arguments. (The GPT call
    # previously had '.' instead of ',' before modelname, which raised an
    # AttributeError and dropped the model name.)
    query_llm = next_queries_gpt if use_gpt else next_queries_gemini
    # NOTE(review): error_occurred is not acted on here; an incomplete result
    # is reported by the length check further down.
    potential_next_queries, error_occurred = query_llm(
        text_for_llm, example_prompt, num_missing, curr_prompt_instructions, modelname
    )

    # Never store more than the target number, even if the LLM returns extra.
    for new_query in potential_next_queries:
        if len(next_new_queries) >= target_num_queries:
            break
        next_new_queries.append(new_query)
    next_queries[squ_id] = next_new_queries

    # Persist immediately so progress survives an interrupted session.
    with open(results_path, 'w') as f:
        json.dump(next_queries, f, indent = 2)

    print('These ' + str(len(next_new_queries)) + ' options have been entered by the LLM as potential next queries:')
    for position, stored_query in enumerate(next_new_queries, start=1):
        print(str(position) + ' -- ' + stored_query)

    # Only ask for a re-run when the group is still short of the target.
    if len(next_new_queries) < target_num_queries:
        print('Please run this query again, there have only been ' + str(len(next_new_queries)) + ' queries.')

    print('Thanks!')
    break

# The results file also holds the 'name' and 'strategy' entries, so 35
# top-level keys presumably means 33 query groups are stored -- TODO confirm.
if len(next_queries) == 35:
    print('Nice job, well done! Please send your `task_A_2_automated_' + modelname + '--' + name + '.json` file to Christin. Thank you so much! :)')