In [1]:
import ast
import json
from collections import defaultdict

from datasets import load_dataset

from LLMs import set_model
from utils import questionDecompose, evidenceExtractor, run_RAG, postprocess, evidenceExtractor_multi
# Load the 'fullwiki' configuration of the HotpotQA dataset; later cells sample
# their evaluation questions from ds['train'].
# NOTE(review): trust_remote_code=True permits the dataset's loading script to
# run locally -- confirm the source is trusted before re-running.
ds = load_dataset('hotpotqa/hotpot_qa', 'fullwiki', trust_remote_code=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Evaluation setup: draw a fixed, seeded sample of k training questions and
# build the LLM client shared by every experiment cell below.
import random

seed, k = 2025, 300

# Seed the global RNG so the same k row indices are drawn on every run.
random.seed(seed)
train_split = ds['train']
indices = random.sample(range(len(train_split)), k)
test_ds = train_split.select(indices)

# `name` is also embedded in the result file names written by later cells.
name = "gpt-4o-mini"
model = set_model(name)

In [3]:

# Experiment: decompose each question into sub-questions, select ONE evidence
# paragraph per sub-question, then answer the original question using only the
# selected paragraphs. One JSON line per question is appended to a results file
# and the exact-match score is printed at the end.
total_cnt=0
cnt=0
preds=[]
labels=[]

for data in test_ds:
    question, answer, sentences = data['question'], data['answer'], data['context']['sentences']
    print(f"{total_cnt+1}/{len(test_ds)} Question: {question}")
    subQuestions=questionDecompose(model, question)
    print(f"subQuestions: {subQuestions}")
    print(f"# of subQuestions: {len(subQuestions)}")

    # Number the paragraphs from 0: `context` is the numbered text shown to the
    # model, `context_dict` maps the same number (as a string) to the paragraph.
    context=""
    context_dict={}
    for idx, sent_ls in enumerate(sentences):
        paragraph = " ".join(sent_ls)
        context+=f"{idx}. {paragraph}\n\n"
        context_dict[str(idx)] = paragraph

    # extract evidence number (one per sub-question; duplicates collapse in the set)
    relevance_num=set()
    for subQ in subQuestions:
        num = evidenceExtractor(model, context, context_dict, subQ)
        print(f"{subQ} {num}")
        relevance_num.add(num)

    # generate fine_grained_context
    # Sort numerically: iterating a set of strings has no stable order across
    # interpreter runs, which made the prompt (and the log) non-reproducible.
    relevance_num = sorted(relevance_num, key=int)
    fine_grained_context="".join(context_dict[num]+'\n\n' for num in relevance_num)

    pred = run_RAG(model=model, context=fine_grained_context, question=question)
    pred, answer = postprocess(pred, label=answer)
    preds.append(pred)
    labels.append(answer)
    # NOTE(review): the file is opened in append mode, so re-running this cell
    # adds a second copy of every record to the same file.
    with open(f'./{name}_hotpotQA_train_k:{k}_seed:{seed}.jsonl', 'a') as f:
        json_file = {
            'id': total_cnt,
            'question': question,
            'answer': answer,
            'pred': pred,
            'subQuestions': subQuestions,
            'original_context': context,
            'fine_grained_context': fine_grained_context,
            # Paragraph numbers are 0-based (they come from enumerate above), so
            # they index the title list directly. The previous `int(num)-1`
            # logged the wrong title and mapped paragraph 0 to the last title.
            'chosen_titles': [data['context']['title'][int(num)] for num in relevance_num],
            'correct': pred == answer
        }
        f.write(json.dumps(json_file)+'\n')
    print(f"pred: {pred}\tlabel: {answer}")
    if pred == answer:
        cnt+=1
    total_cnt+=1
print(f"exact_score: {cnt}/{total_cnt}")

1/300 Question: Who wrote the book that The Universe Versus Alex Woods has been compared to?
subQuestions: ['What book has The Universe Versus Alex Woods been compared to?', 'Who wrote that book?']
# of subQuestions: 2
What book has The Universe Versus Alex Woods been compared to? 1
Who wrote that book? 1
Mark Haddon wrote "The Curious Incident of the Dog in the Night-Time."
pred: mark haddon	label: mark haddon
2/300 Question: Chris Tarrant, known for his screamer, announced he was leaving Fremantle Football Club in which year?
subQuestions: ['In which year did Chris Tarrant leave Fremantle Football Club?']
# of subQuestions: 1
In which year did Chris Tarrant leave Fremantle Football Club? 8
2010
pred: 2010	label: 2010
3/300 Question: When was the self-described "neoceltic pagan folk" band based in the Netherlands formed?
subQuestions: ['When was the band based in the Netherlands formed?', 'What is the genre of the band?']
# of subQuestions: 2
When was the band based in the Netherlands

In [4]:

# Experiment: no question decomposition. Ask the extractor for a LIST of
# evidence paragraph numbers for the whole question, then answer from only
# those paragraphs. One JSON line per question is appended to a results file
# and the exact-match score is printed at the end.
total_cnt=0
cnt=0
preds_multi=[]
labels_multi=[]

for data in test_ds:
    question, answer, sentences = data['question'], data['answer'], data['context']['sentences']
    print(f"{total_cnt+1}/{len(test_ds)} Question: {question}")

    # Number the paragraphs from 0: `context` is the numbered text shown to the
    # model, `context_dict` maps the same number (as a string) to the paragraph.
    context=""
    context_dict={}
    for idx, sent_ls in enumerate(sentences):
        paragraph = " ".join(sent_ls)
        context+=f"{idx}. {paragraph}\n\n"
        context_dict[str(idx)] = paragraph

    # extract evidence number
    raw_evidence = evidenceExtractor_multi(model, context, context_dict, question)
    print(raw_evidence)
    # The extractor output is model-generated text such as "[1, 4]". Parse it
    # with literal_eval, which accepts only Python literals, instead of eval(),
    # which would execute arbitrary expressions found in the model output.
    evidence_ids = ast.literal_eval(raw_evidence)

    # generate fine_grained_context
    # De-duplicate, then sort numerically: iterating a set of strings has no
    # stable order across interpreter runs, which made results non-reproducible.
    relevance_num = sorted({str(i) for i in evidence_ids}, key=int)
    fine_grained_context="".join(context_dict[num]+'\n\n' for num in relevance_num)

    pred = run_RAG(model=model, context=fine_grained_context, question=question)
    pred, answer = postprocess(pred, label=answer)
    # Record into this experiment's own lists. The original appended to
    # `preds`/`labels`, mixing these results into another experiment's lists
    # and leaving preds_multi/labels_multi empty.
    preds_multi.append(pred)
    labels_multi.append(answer)
    # NOTE(review): the file is opened in append mode, so re-running this cell
    # adds a second copy of every record to the same file.
    with open(f'./multi_{name}_hotpotQA_train_k:{k}_seed:{seed}.jsonl', 'a') as f:
        json_file = {
            'id': total_cnt,
            'question': question,
            'answer': answer,
            'pred': pred,
            'original_context': context,
            'fine_grained_context': fine_grained_context,
            # Paragraph numbers are 0-based (they come from enumerate above), so
            # they index the title list directly. The previous `int(num)-1`
            # logged the wrong title and mapped paragraph 0 to the last title.
            'chosen_titles': [data['context']['title'][int(num)] for num in relevance_num],
            'correct': pred == answer
        }
        f.write(json.dumps(json_file)+'\n')
    print(f"pred: {pred}\tlabel: {answer}")
    if pred == answer:
        cnt+=1
    total_cnt+=1
print(f"exact_score: {cnt}/{total_cnt}")

1/300 Question: Who wrote the book that The Universe Versus Alex Woods has been compared to?
[1, 4]
Mark Haddon wrote the book that The Universe Versus Alex Woods has been compared to.
pred: mark haddon	label: mark haddon
2/300 Question: Chris Tarrant, known for his screamer, announced he was leaving Fremantle Football Club in which year?
[8]
2010
pred: 2010	label: 2010
3/300 Question: When was the self-described "neoceltic pagan folk" band based in the Netherlands formed?
[9]
I don't know.
pred: i don't know	label: 2008
4/300 Question: In Boneland, Colin is now a Professor at an observatory established by whom?
[3, 5]
The Jodrell Bank Observatory was established by Sir Bernard Lovell.
pred: sir bernard lovell	label: sir bernard lovell
5/300 Question: Which film was directed by Vincent McEveety, Charley and the Angel orThe Haunted Mansion?
[5]
Charley and the Angel
pred: charley and the angel	label: charley and the angel
6/300 Question: What driver won the 53 lap race, who was part of 

In [5]:

# Experiment: decompose each question into sub-questions, ask the extractor
# for a LIST of evidence paragraph numbers per sub-question, take the union,
# then answer the original question from only those paragraphs. One JSON line
# per question is appended to a results file and the exact-match score is
# printed at the end.
total_cnt=0
cnt=0
preds=[]
labels=[]

for data in test_ds:
    question, answer, sentences = data['question'], data['answer'], data['context']['sentences']
    print(f"{total_cnt+1}/{len(test_ds)} Question: {question}")
    subQuestions=questionDecompose(model, question)
    print(f"subQuestions: {subQuestions}")
    print(f"# of subQuestions: {len(subQuestions)}")

    # Number the paragraphs from 0: `context` is the numbered text shown to the
    # model, `context_dict` maps the same number (as a string) to the paragraph.
    context=""
    context_dict={}
    for idx, sent_ls in enumerate(sentences):
        paragraph = " ".join(sent_ls)
        context+=f"{idx}. {paragraph}\n\n"
        context_dict[str(idx)] = paragraph

    # extract evidence number (union over all sub-questions)
    relevance_num=set()
    for subQ in subQuestions:
        raw_evidence = evidenceExtractor_multi(model, context, context_dict, subQ)
        print(f"{subQ} {raw_evidence}")
        # The extractor output is model-generated text such as "[1, 9]". Parse
        # it with literal_eval, which accepts only Python literals, instead of
        # eval(), which would execute arbitrary expressions in the model output.
        for i in ast.literal_eval(raw_evidence):
            relevance_num.add(str(i))

    # generate fine_grained_context
    # Sort numerically: iterating a set of strings has no stable order across
    # interpreter runs, which made the prompt (and the log) non-reproducible.
    relevance_num = sorted(relevance_num, key=int)
    fine_grained_context="".join(context_dict[num]+'\n\n' for num in relevance_num)

    pred = run_RAG(model=model, context=fine_grained_context, question=question)
    pred, answer = postprocess(pred, label=answer)
    preds.append(pred)
    labels.append(answer)
    # NOTE(review): the file is opened in append mode, so re-running this cell
    # adds a second copy of every record to the same file.
    with open(f'./brave_me_{name}_hotpotQA_train_k:{k}_seed:{seed}.jsonl', 'a') as f:
        json_file = {
            'id': total_cnt,
            'question': question,
            'answer': answer,
            'pred': pred,
            'subQuestions': subQuestions,
            'original_context': context,
            'fine_grained_context': fine_grained_context,
            # Paragraph numbers are 0-based (they come from enumerate above), so
            # they index the title list directly. The previous `int(num)-1`
            # logged the wrong title and mapped paragraph 0 to the last title.
            'chosen_titles': [data['context']['title'][int(num)] for num in relevance_num],
            'correct': pred == answer
        }
        f.write(json.dumps(json_file)+'\n')
    print(f"pred: {pred}\tlabel: {answer}")
    if pred == answer:
        cnt+=1
    total_cnt+=1
print(f"exact_score: {cnt}/{total_cnt}")

1/300 Question: Who wrote the book that The Universe Versus Alex Woods has been compared to?
subQuestions: ['What is the book that The Universe Versus Alex Woods has been compared to?', 'Who wrote that book?']
# of subQuestions: 2
What is the book that The Universe Versus Alex Woods has been compared to? [1]
Who wrote that book? [1, 9]
The book has been compared to "The Curious Incident of the Dog in the Night-Time."
pred: "the curious incident of the dog in the night-time."	label: mark haddon
2/300 Question: Chris Tarrant, known for his screamer, announced he was leaving Fremantle Football Club in which year?
subQuestions: ['In which year did Chris Tarrant leave Fremantle Football Club?']
# of subQuestions: 1
In which year did Chris Tarrant leave Fremantle Football Club? [8]
2010
pred: 2010	label: 2010
3/300 Question: When was the self-described "neoceltic pagan folk" band based in the Netherlands formed?
subQuestions: ['When was the band based in the Netherlands formed?', 'What is th

In [6]:
# Re-print the exact-match score from the `cnt` / `total_cnt` values currently
# held in the kernel. Nothing is recomputed here, so the number shown depends
# on which evaluation cell ran last (presumably the one directly above --
# TODO confirm).
#
# Disabled alternative scoring, kept for reference: it would re-score
# `preds` / `labels` after stripping one trailing '.' from each prediction and
# comparing case-insensitively.
# cnt=0
# total_cnt=0
# for p, l in zip(preds, labels):
#     if p.endswith('.'):
#         p = p[:-1]
#     if p.lower() == l.lower():
#         cnt+=1
#     total_cnt+=1
print(f"exact_score: {cnt}/{total_cnt}")
    

exact_score: 162/300


In [7]:
# Baseline: answer each question from the full, unfiltered context (every
# paragraph, un-numbered), with no decomposition and no evidence selection.
# Predictions and labels are kept in preds_og / labels_og for later re-scoring.
cnt = 0
total_cnt = 0
preds_og = []
labels_og = []
for data in test_ds:
    question = data['question']
    answer = data['answer']
    sentences = data['context']['sentences']
    print(f"{total_cnt+1}/{len(test_ds)} Question: {question}")

    # Each paragraph is its sentences joined by spaces, followed by a blank line.
    context = "".join(" ".join(sent_ls) + '\n\n' for sent_ls in sentences)

    pred = run_RAG(model=model, context=context, question=question)
    pred, answer = postprocess(pred, label=answer)
    preds_og.append(pred)
    labels_og.append(answer)

    print(f"pred: {pred}\tlabel: {answer}")
    # Exact match on the post-processed prediction and label.
    if pred == answer:
        cnt += 1
    total_cnt += 1
print(f"exact_score: {cnt}/{total_cnt}")

1/300 Question: Who wrote the book that The Universe Versus Alex Woods has been compared to?
The book that The Universe Versus Alex Woods has been compared to is The Curious Incident of the Dog in the Night-Time, written by Mark Haddon.
pred: the curious incident of the dog in the night-time, written by mark haddon	label: mark haddon
2/300 Question: Chris Tarrant, known for his screamer, announced he was leaving Fremantle Football Club in which year?
Chris Tarrant announced he was leaving Fremantle Football Club at the end of the 2010 AFL season.
pred: 2010	label: 2010
3/300 Question: When was the self-described "neoceltic pagan folk" band based in the Netherlands formed?
Omnia was formed in 1999.
pred: 1999	label: 2008
4/300 Question: In Boneland, Colin is now a Professor at an observatory established by whom?
Colin is now a Professor at the Jodrell Bank Observatory, which was established by Sir Bernard Lovell.
pred: sir bernard lovell	label: sir bernard lovell
5/300 Question: Which f

31/100

In [8]:
# Re-score the baseline predictions more leniently: drop a single trailing
# period from the prediction and compare case-insensitively with the label.
cnt = 0
total_cnt = 0
for pred_text, gold in zip(preds_og, labels_og):
    trimmed = pred_text[:-1] if pred_text.endswith('.') else pred_text
    if trimmed.lower() == gold.lower():
        cnt += 1
    total_cnt += 1
print(f"exact_score: {cnt}/{total_cnt}")
    

exact_score: 53/100


35/100