In [98]:
import os
from neo4j import GraphDatabase
import pandas as pd
from dotenv import load_dotenv

load_dotenv()

graph = GraphDatabase.driver(
	os.environ['NEO4J_URI'],
	auth=(os.environ['NEO4J_USER'], os.environ['NEO4J_PWD'])
)
database = os.environ['NEO4J_DATABASE']


def get_single_facts_by_title(k: int):
    query = '''
    MATCH (f:Fact)
	WHERE NOT (f.title IN $titles_to_ignore)
    WITH DISTINCT f.title AS fact_title
    CALL (fact_title) {
		MATCH (b:Bird)-[:HAS_FACT]->(f:Fact {title: fact_title})
		WITH DISTINCT b, fact_title
		CALL (b, fact_title) {
			MATCH (b)-[:HAS_FACT]->(f:Fact {title: fact_title})
			WITH f
			ORDER BY rand()
			LIMIT 1
			RETURN f
		}
		WITH b, f
		ORDER BY rand()
		LIMIT $k
		RETURN 
  			b.name AS bird_name, 
     		f.title AS title, 
       		f.text AS text
    }
    RETURN bird_name, title, text
    '''
    with graph.session(database=database) as session:
        records = session.run(
            query, 
            k=int(k),
			titles_to_ignore=[
				'About the Author(s)',
				'Acknowledgments',
				'Introduction',
				'Introduction?login'
			]
        )
        return [{ **r } for r in records]


test_facts = get_single_facts_by_title(1)
test_facts


[{'bird_name': 'Rufous-sided Gerygone',
  'title': 'Identification',
  'text': '10 cm. Has short pale supercilium not extending behind eye, narrow pale eyering; crown olive-brown, rest of upper\xadparts, including upperwing, reddish-brown; tail red-brown, inconspicuous pale subterminal spots on outer rectrices; white below, pale rufous-ochre on breast side and flanks; iris variously red-brown, dark grey or yellow-grey; bill blackish-grey; legs dark greyish. Sexes alike. Juvenile has underparts washed pale yellow. Races differ minimally in size and plumage colour: kuehni darker and duller above, with greyer brown crown and somewhat redder sides and flanks; fulvescens very similar to previous, but slightly less reddish on upperparts and on sides and flanks; senex extremely similar to previous, possibly slightly duller and greyer above, with darker, ashier crown; keyensis slightly larger, but otherwise barely separable.'},
 {'bird_name': 'Western Barn Owl',
  'title': 'Similar Species',
 

In [100]:
def get_fact_node_counts():
    query = '''
    MATCH (f:Fact)
	WHERE NOT (f.title IN $titles_to_ignore)
	RETURN DISTINCT f.title AS fact_title, COUNT(*) AS cnt
    '''
    with graph.session(database=database) as session:
        records = session.run(
            query, 
			titles_to_ignore=[
				'About the Author(s)',
				'Acknowledgments',
				'Introduction',
				'Introduction?login'
			]
        )
        return [{ **r } for r in records]
    

fact_counts_df = pd.DataFrame(get_fact_node_counts())
fact_counts_df.head(5)

Unnamed: 0,fact_title,cnt
0,Identification,8248
1,Similar Species,360
2,Systematics History,8341
3,Geographic Variation,291
4,Subspecies,8391


In [92]:
import json
from openai import OpenAI
from tqdm.notebook import tqdm

from dotenv import load_dotenv

load_dotenv()

client = OpenAI()

single_facts = get_single_facts_by_title(10)

single_fact_sys_prompt = '''
You are a question & answer generator. Your job is to take a text input that represents a fact about a particular bird, and return a Q&A pair.
- Your input will be in the following JSON format: {
	"bird_name": "<<the bird's common name>>",
	"fact_title": "<<the title that describes what the fact is about",
	"text": "<<the text of the fact about that bird>>"
}
- You will output the following JSON: {
	"question": "<<A question that could be reliably answered, only using the information provided in the text>>",
	"answer": "<<The answer to that question, based only on the text provided>>"
}

Here is an example:
-- input: { "bird_name": "Ostrich", "fact_title": "Location", "text": "The Ostrich is the largest bird in Australia" }
-- output: { "question": "What is the largest bird in Australia?", "answer": "Ostrich" }

Follow these guidelines:
- Do not return a question that asks to describe the structure, layout, or any meta-information about the source material itself, e.g. "What sections are covered in the text?".
- While the question can be complex, keep the answer as brief as possible. In the answer, do not embellish, and only cover the points necessary to answer the question.
- Do not ask any open-ended questions that could be potentially have different answers than what is provided in the fact text, e.g. "What is a distinguishing feature of <<bird>>?".
- Do not provide an explanation, follow the structured output format exactly.
'''


def query_model(sys_prompt: str, user_prompt: str, model='gpt-4.1', temperature=0.5, max_output_tokens=300) -> str:
	resp = client.responses.create(
		model=model,
		instructions=sys_prompt,
		input=[
			{'role': 'user',   'content': [{'type': 'input_text', 'text': user_prompt}]},
		],
		temperature=temperature,
		max_output_tokens=max_output_tokens
	)

	return resp.output_text


def generate_basic_qa(sys_prompt: str, bird_row: dict[str, str]) -> dict[str, str]:
	user_prompt = json.dumps({
		'bird_name': bird_row['bird_name'], 
		'fact_title': bird_row['title'], 
		'text': bird_row['text']
	})

	response = query_model(sys_prompt, user_prompt)
	return json.loads(response)


single_fact_qas = []
for single_fact in tqdm(single_facts):
		qa = generate_basic_qa(single_fact_sys_prompt, single_fact)
		single_fact_qas.append(qa)

print(single_fact_qas[:5])

  0%|          | 0/681 [00:00<?, ?it/s]

[{'question': 'What distinguishes the Boat-billed Heron from Nycticorax nycticorax?', 'answer': 'The Boat-billed Heron is overall much whiter in appearance and has an incredibly broad, flattened and thick bill.'}, {'question': 'How can the Plain Prinia be distinguished from P. sylvatica?', 'answer': 'By its smaller size, less bulky appearance, narrower-based tail, weaker legs, smaller bill, more prominent supercilium, and lack of dark mottling on the lower malar region.'}, {'question': "What is a distinguishing feature of Monteiro's Bushshrike compared to M. blanchoti?", 'answer': 'It has dark grey eyes (not yellow) and a more extensive pale face patch.'}, {'question': 'What are the key identification features of the Western Olivaceous Flatbill?', 'answer': 'Large-headed, dark-olive flatbill with white eyering, dusky wings with yellow-edged coverts and secondaries, yellow-edged tertials, dusky tail with pale margins, pale gray or yellow throat, grayish olive breast, pale yellow belly a

In [95]:
import pandas as pd

qa_df = pd.DataFrame({
	'question': [ qa['question'] for qa in single_fact_qas ],
	'answer':  [ qa['answer'] for qa in single_fact_qas ],
	'qa_type': 'single_fact',
	'ground_truth': single_facts
})

display(qa_df.head(5))
qa_df.to_csv('../data/eval_qas.csv')

Unnamed: 0,question,answer,qa_type,ground_truth
0,What distinguishes the Boat-billed Heron from ...,The Boat-billed Heron is overall much whiter i...,single_fact,"{'bird_name': 'Boat-billed Heron', 'title': 'I..."
1,How can the Plain Prinia be distinguished from...,"By its smaller size, less bulky appearance, na...",single_fact,"{'bird_name': 'Plain Prinia', 'title': 'Identi..."
2,What is a distinguishing feature of Monteiro's...,It has dark grey eyes (not yellow) and a more ...,single_fact,"{'bird_name': 'Monteiro's Bushshrike', 'title'..."
3,What are the key identification features of th...,"Large-headed, dark-olive flatbill with white e...",single_fact,"{'bird_name': 'Western Olivaceous Flatbill', '..."
4,What are some distinguishing features of the E...,The European Honey-buzzard differs from true b...,single_fact,"{'bird_name': 'European Honey-buzzard', 'title..."


In [107]:
compare_sample_size = 20

facts_for_compare_df = pd.DataFrame(get_single_facts_by_title(compare_sample_size))

compare_w_counts = facts_for_compare_df.merge(fact_counts_df, left_on='title', right_on='fact_title', how='left')
facts_for_compare_df = compare_w_counts[compare_w_counts.cnt > compare_sample_size].drop(columns=['fact_title', 'cnt'])

print(len(facts_for_compare_df))
facts_for_compare_df.head(5)

1180


Unnamed: 0,bird_name,title,text
0,Mindanao Cuckooshrike,Identification,~26.6 cm (1). Male is gray on head and upperpa...
1,Chestnut-rumped Heathwren,Identification,12·5–16 cm; 17 g. A terrestrial acanthizid wit...
2,Striated Softtail,Identification,17–18 cm. Small furnariid with shape like Cran...
3,Victorin's Warbler,Identification,15–17 cm; 10 g. A short-winged warbler with lo...
4,Collared Scops-Owl,Identification,The Collared Scops-Owl occurs in grayish brown...


In [137]:
def fact_compare(fact_texts: list[dict[str, str]], p=3) -> dict[str, str]:
	comparison_sys_prompt = f'''
You are a question & answer generator. Your job is to take a text input that represents a series of facts about birds, and return a series of {p} Q&A pairs that can only be answered with those facts.
The facts could be about the same bird, or different birds. Each fact has a "fact_title" key, which describes what the fact is about, e.g. "Breeding".

- Your input will be an array in the following JSON format: [{{
	"bird_name": "<<the bird's common name>>",
	"fact_title": "<<the title that describes what the fact is about",
	"text": "<<the text of the fact about that bird>>"
}}, {{ "bird_name": "...", "fact_title": "...", "text": "..." }}, ... ]

- You will output the following JSON: {{
	"question": "<<A question that could be reliably answered using only the information provided in the text>>",
	"answer": "<<The answer to that question, based only on the text provided>>"
}}

Here is an example:
-- input: [
	{{ "bird_name": "Ostrich", "fact_title": "Location", "text": "The Ostrich is the largest bird in Australia" }},
	{{ "bird_name": "Dodo", "fact_title": "Conservation Status", "text": "The Dodo is extinct, formerly the largest bird in Australia" }}
]
-- output: [
    {{ "question": "Which one is currently the largest bird in Australia, the Ostrich or the Dodo?", "answer": "Ostrich" }}
	...{p-1} more entries
]

Follow these guidelines:
- Do not return questions that ask to describe the structure, layout, or any meta-information about the source material itself, e.g. "What sections are covered in the text?" would be incorrect.
- While the question can be complex, keep the answers as brief as possible. In the answer, do not embellish, and only cover the points necessary to answer each question.
- Do not ask any questions that are too vague, and could have a different answer if new information was provided. e.g. "Which bird is blue?" would be too vague.
- Make sure you incorporate information from all of the facts in your response, a good Q&A pair should only be answerable with each fact provided.
- Make sure you output {p} Q&A pairs, following the structured output format exactly, in valid JSON. Do not provide any explanations.
'''

	response = query_model(comparison_sys_prompt, json.dumps(fact_texts), temperature=0.5)
	return json.loads(response)

#- Do not ask any open-ended questions that could be potentially have different answers than what is provided in the fact text, e.g. "What is a distinguishing feature of <<bird>>?".
#- In your answer, refer to any birds by name, DO NOT implicitly refer to any bird facts, e.g. "the first bird has a..." would be an example of an INCORRECT answer.

fact_1 = facts_for_compare_df.iloc[2]
print(fact_1.bird_name, fact_1.title, fact_1.text)
fact_2 = facts_for_compare_df.iloc[5]
print(fact_2.bird_name, fact_2.title, fact_2.text)
fact_3 = facts_for_compare_df.iloc[10]
print(fact_3.bird_name, fact_3.title, fact_3.text)

fact_compare([
	{ 'bird_name': fact_1.bird_name, 'fact_title': fact_1.title, 'text': fact_1.text },
	{ 'bird_name': fact_2.bird_name, 'fact_title': fact_2.title, 'text': fact_2.text },
	{ 'bird_name': fact_3.bird_name, 'fact_title': fact_3.title, 'text': fact_3.text },
])

Striated Softtail Identification 17–18 cm. Small furnariid with shape like Cranioleuca but patterned more like Asthenes. Has indistinct buff supercilium, pale lores, rest of face brownish with broad pale buff to tawny-buff streaks; crown and upper­parts dark reddish-brown with conspicuous brown-bordered reddish-rufous to rufous buff shaft streaks; wings mostly dark rufous, remiges with dark fuscous tips; tail graduated, rectrices with shafts slightly stiffened basally, tips blunt, pale tawny to cinnamon-rufous; chin and upper throat orange-rufous to yellowish-orange; underparts dull brown, streaked buff, streaks fading on belly and undertail-coverts; iris deep chestnut brown; upper mandible dusky horn to silvery grey, lower mandible bluish-grey with dusky horn tip; tarsus and toes dull greenish-grey. Sexes alike. Juvenile lacks throat patch, has less well-defined streaks on back, is more mottled than streaked below.
Philippine Scops-Owl Identification 23–28 cm; c. 200–310 g (1). With O

[{'question': 'Which bird among the Striated Softtail, Philippine Scops-Owl, and Ornate Lorikeet has the largest size according to the identification facts provided?',
  'answer': 'Philippine Scops-Owl'},
 {'question': 'Which bird is described as having a throat and breast that are red broadly barred with deep blue?',
  'answer': 'Ornate Lorikeet'},
 {'question': "Which bird's juvenile lacks a throat patch and has less well-defined streaks on the back compared to the adult?",
  'answer': 'Striated Softtail'}]

In [124]:
import itertools as it

def get_combinations(df: pd.DataFrame, k: int, num_samples: int, group_col='title') -> tuple[pd.Series]:
	def tuples_for_col(group):
		tuples = []
		for perm in it.islice(it.permutations(group.index, k), num_samples):
			tuples.append(tuple(group.loc[idx] for idx in perm))
		return tuples

	return (
		df.groupby(group_col, group_keys=False)
				.apply(tuples_for_col)
	)


comparison_samples = get_combinations(facts_for_compare_df, 2, 5)
print(len(comparison_samples), len(comparison_samples[0]), len(comparison_samples[0][0]))
print('NUM SAMPLES:', len(comparison_samples)*len(comparison_samples[0]))
[ samp.to_dict() for samp in comparison_samples[0][0] ]

59 5 2
NUM SAMPLES: 295


  .apply(tuples_for_col)
  print(len(comparison_samples), len(comparison_samples[0]), len(comparison_samples[0][0]))
  print('NUM SAMPLES:', len(comparison_samples)*len(comparison_samples[0]))
  [ samp.to_dict() for samp in comparison_samples[0][0] ]


[{'bird_name': 'Giant Coot',
  'title': 'Agonistic Behavior',
  'text': 'In Fuerte Baquedano, Northern Chile, immatures of this species harassed Common Gallinule juveniles, attacking them with their feet. Sometimes the adults also intimidated Common Gallinule (2).'},
 {'bird_name': 'Sula Lorikeet',
  'title': 'Agonistic Behavior',
  'text': 'In a study of agonistic behavior in captive lorikeets using mirror-image stimulation, it was found that the "Yellow-and-green Lorikeet" (then considered conspecific with the Yellow-cheeked Lorikeet, and race not specified) had a smaller repertoire of distinct agonistic gestures than most other species, and it tended to use unritualized gestures more than most other species (22).'}]

In [None]:
from tqdm.notebook import tqdm
from json import JSONDecodeError

hop_2_samples = get_combinations(facts_for_compare_df, 2, 5)
hop_3_samples = get_combinations(facts_for_compare_df, 3, 5)
hop_4_samples = get_combinations(facts_for_compare_df, 4, 5)


def get_qas_for_comparison(samples: list[list[tuple[str, str]]], p=3) -> pd.DataFrame:
	all_qas = []
	for title_division in tqdm(samples):
		for sample in title_division:
			qa_input = [ fact.to_dict() for fact in sample ]	
			retries = 3
			trying = True
			while retries > 0 and trying:
				try:
					qas = fact_compare(qa_input, p=p)
					for qa in qas:
						all_qas.append({ **qa, 'ground_truth': [ s.to_dict() for s in sample ]})
					trying = False
				except JSONDecodeError:
					print('JSON parsing failed, retrying...')
					retries -= 1
			if retries == 0:
				raise Exception(f'JSON parsing failed 3 times')
			
	return pd.DataFrame(all_qas)

print('GETTING 2 HOP SAMPLES...')
hop_2_df = get_qas_for_comparison(hop_2_samples)
hop_2_df['qa_type'] = '2_hop'
print(f'FETCHED {len(hop_2_df)} 2 HOP SAMPLES')
print('GETTING 3 HOP SAMPLES...')
hop_3_df = get_qas_for_comparison(hop_3_samples)
hop_3_df['qa_type'] = '3_hop'
print(f'FETCHED {len(hop_3_df)} 3 HOP SAMPLES')
print('GETTING 4 HOP SAMPLES...')
hop_4_df = get_qas_for_comparison(hop_4_samples)
hop_4_df['qa_type'] = '4_hop'
print(f'FETCHED {len(hop_4_df)} 4 HOP SAMPLES')

GETTING 2 HOP SAMPLES...


  .apply(tuples_for_col)
  .apply(tuples_for_col)
  .apply(tuples_for_col)


  0%|          | 0/59 [00:00<?, ?it/s]

JSON parsing failed, retrying...
JSON parsing failed, retrying...
FETCHED 885 2 HOP SAMPLES
GETTING 3 HOP SAMPLES...


  0%|          | 0/59 [00:00<?, ?it/s]

JSON parsing failed, retrying...
FETCHED 885 3 HOP SAMPLES
GETTING 4 HOP SAMPLES...


  0%|          | 0/59 [00:00<?, ?it/s]

JSON parsing failed, retrying...
JSON parsing failed, retrying...
FETCHED 885 4 HOP SAMPLES


In [146]:
import pandas as pd

eval_qas = pd.read_csv('../data/eval_qas.csv', index_col=0)
print(f'OLD EVAL QAS LENGTH: {len(eval_qas)}')
eval_qas = pd.concat([eval_qas, hop_2_df, hop_3_df, hop_4_df], ignore_index=True)
print(f'NEW QAS LENGTH WITH NEW DATA: {len(eval_qas)}')
eval_qas.to_csv('../data/eval_qas_with_hops.csv')

OLD EVAL QAS LENGTH: 681
NEW QAS LENGTH WITH NEW DATA: 3336


In [2]:
import os
from neo4j import GraphDatabase
from dotenv import load_dotenv

load_dotenv()

graph = GraphDatabase.driver(
    os.environ['NEO4J_URI'],
    auth=(os.environ['NEO4J_USER'], os.environ['NEO4J_PWD'])
)
database = os.environ['NEO4J_DATABASE']


def vertex_sampling_w_contraction(N: int):
    with graph.session(database=database) as session:

        def start_node():
            rec = session.run('''
                MATCH (n)
                WHERE NOT n:Reference
                RETURN elementId(n) AS id
                ORDER BY rand()
                LIMIT 1    
            ''').single()
            return rec['id']

        def contract_neighborhood(in_neighborhood: set[str]):
            recs = session.run('''
                MATCH (base)-[]-(neighbor)
                WHERE 
                    elementId(base) IN $n_ids
                    AND NOT elementId(neighbor) IN $n_ids
                    AND NOT base:Reference
                    AND NOT neighbor:Reference
				WITH DISTINCT labels(neighbor)[0] AS neighbor_label, collect(neighbor) AS nodes_in_label
				WITH collect({label: neighbor_label, nodes: nodes_in_label}) AS label_groups
				WITH label_groups[toInteger(rand() * size(label_groups))] AS chosen_group
				WITH chosen_group.nodes AS nodes_in_chosen_label
				WITH nodes_in_chosen_label[toInteger(rand() * size(nodes_in_chosen_label))] AS picked_neighbor
				RETURN elementId(picked_neighbor) AS id
            ''', n_ids=list(in_neighborhood)).value()
            return recs[0] if recs else None
        '''
                RETURN elementId(nbr) AS id
                ORDER BY rand()
                LIMIT 1
			'''

        def fetch_nodes(ids: set[str]):
            result = session.run('''
                MATCH (n)
                WHERE elementId(n) IN $ids
                RETURN elementId(n) AS id, labels(n) AS labels, n AS node
            ''', ids=list(ids))
            nodes = []
            for record in result:
                props = dict(record["node"])
                props["node_label"] = record["labels"][0] if record["labels"] else None
                nodes.append(props)
            return nodes

        sampled = set([start_node()])
        num_sampled = 1

        while num_sampled < N:
            nxt = contract_neighborhood(sampled)
            if nxt is None:
                break
            sampled.add(nxt)
            num_sampled += 1

        return fetch_nodes(sampled)

[ samp['node_label'] for samp in vertex_sampling_w_contraction(20)]


['Image',
 'Family',
 'Image',
 'Bird',
 'Genus',
 'Fact',
 'Bird',
 'Fact',
 'Bird',
 'Bird',
 'Fact',
 'Image',
 'Bird',
 'Family',
 'Bird',
 'Image',
 'Bird',
 'Order',
 'Genus',
 'Image']

In [3]:
import json
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

client = OpenAI()


def serialize_node(node: dict[str, str]) -> dict:
	if node['node_label'] == 'Bird':
		content = [{
			'type': 'input_text',
			'text': json.dumps({
				'node_label': node['node_label'],
				'name': node['name'],
				'order': node['order'],
				'family': node['family'],
				'genus': node['genus'],
				'species': node['species']
			}) 
		}]
	elif node['node_label'] == 'Fact':
		content = [{
			'type': 'input_text',
			'text': json.dumps({
				'node_label': node['node_label'],
				'bird_name': node['bird_name'],
				'fact_title': node['title'],
				'content': node['text']
			})
		}]
	elif node['node_label'] == 'Image':
		content = [
			{
				'type': 'input_image',
				'image_url': node['url']
			},
			{
				'type': 'input_text',
				'text': json.dumps({
					'node_label': node['node_label'],
					'bird_name': node['bird_name'],
					**({'title': node['title']} if node['title'] else {})
				})
			}
		]
	elif node['node_label'] in ['Genus', 'Family', 'Order']:
		content = [{
			'type': 'input_text',
			'text': json.dumps({
				'node_label': node['node_label'],
				'name': node['name']	
			})
		}]

	return {
		'role': 'user',
		'content': content
	}
				

def synthesize_qas_from_neighborhood(
    num_qas: int, 
    num_in_neighborhood: int,
	model='gpt-4.1',
	temperature=0.5
) -> list[dict[str, str]]:

	neighborhood_prompt = f'''
You are a question & answer generator. Your job is to accept a series of TEXT and IMAGE inputs, and use them to construct a series of Q&A pairs.
All inputs represent facts, images, or properties of birds, and are represented as JSON objects. Each input is derived from a graph, where this data is stored.
Each image input will additionally be followed by a JSON object as a caption, containing the metadata information for that image in the database. Use BOTH the image and its metadata when creating the Q&A pairs.
Each input will have a "node_label" key, which describes what the node represents.

Follow these steps when generating your Q&A pairs:
1. First, mentally analyze each input separately and extract its key ideas. For images, describe as many details about each image as possible.
2. Then, synthesize all of the ideas into {num_qas} questions that could be answered using ONLY the inputs as reference.
-- DO NOT ask any questions that are too vague, and could have a different answer if new information was available. e.g. "Which bird is blue?" would be INCORRECT.
-- Every question MUST incorporate information from AS MANY inputs as possible. DO NOT just regurgitate information from one node.
-- Each question should be unique from each other, and EVERY input should be represented at some point.
3. Then, using the inputs as reference, answer each question.
-- Each answer should be as brief as possible. Do not embellish. Only cover the relevant points. 
4. Respond ONLY as JSON with the shape: {{ "qas": [{{ "question": "...", "answer": "..." }}, {{ "question": "...", "answer": "..." }}, ...] }}
-- Make sure you output {num_qas} Q&A pairs, following the structured output format exactly, in valid JSON. Do not provide any explanations.

MAKE SURE THAT THE QUESTIONS DO NOT DIRECTLY REFERENCE THE TEXT OR CONTENT. Here are some examples of INCORRECT questions:
-- "...In the image shown..."
-- "...For the birds provided..."
-- "...In the reference text..."
-- "...as evidenced by what is listed..."
THE QUESTIONS SHOULD BE ABLE TO BE ASKED WITHOUT ANY KNOWLEDGE OF THE REFERENCE INFORMATION.
'''

	samples = vertex_sampling_w_contraction(num_in_neighborhood)
	serialized_samples = [ serialize_node(node) for node in samples ]

	resp = client.responses.create(
		model=model,
		instructions=neighborhood_prompt,
		input=serialized_samples,
		text={
      		'format': {
				'type': 'json_schema',
				'name': 'qa_array',
				'strict': True,
				'schema': {
					'type': 'object',
					'properties': {
						'qas': {
							'type': 'array',
							'minItems': num_qas,
							'maxItems': num_qas,
							'items': {
								'type': 'object',
								'properties': {
									'question': {'type': 'string'},
									'answer': {'type': 'string'}
								},
								'required': ['question', 'answer'],
								'additionalProperties': False
							}
						}
					},
					'required': ['qas'],
					'additionalProperties': False	
				}
			}
        }
	)

	return json.loads(resp.output_text)['qas'], serialized_samples


sample_vsc_qas, sample_vsc_ground_truth = synthesize_qas_from_neighborhood(
	num_qas=10,
	num_in_neighborhood=30,
	model='gpt-4.1'
)

for ind, ground_truth in enumerate(sample_vsc_ground_truth):
	print(f'NODE {ind}: ', ground_truth)

for ind, qa in enumerate(sample_vsc_qas):
	print(f'QA {ind}: ', qa)

NODE 0:  {'role': 'user', 'content': [{'type': 'input_text', 'text': '{"node_label": "Genus", "name": "Ptychoramphus"}'}]}
NODE 1:  {'role': 'user', 'content': [{'type': 'input_text', 'text': '{"node_label": "Bird", "name": "Peruvian Tern", "order": "Charadriiformes", "family": "Laridae", "genus": "Sternula", "species": "Lorata"}'}]}
NODE 2:  {'role': 'user', 'content': [{'type': 'input_text', 'text': '{"node_label": "Order", "name": "Charadriiformes"}'}]}
NODE 3:  {'role': 'user', 'content': [{'type': 'input_text', 'text': '{"node_label": "Fact", "bird_name": "Hudsonian Godwit", "fact_title": "Introduction", "content": "The Hudsonian Godwit (Limosa haemastica), known in the past as the Ring-tailed Marlin or Goose-bird, is the least well known of the world\'s four godwit species. With the majority of its population restricted to only a few remote sites during much of the year, it was long regarded as one of the rarest birds on the continent, ranked with the likes of the Kirtland\'s War

In [4]:
import time
import openai
import threading
from tqdm.notebook import tqdm

num_total_rows = 3000
num_workers = 4
num_qas_per_query = 10

if 'vsc_qa_rows' in locals():
    print(f'{len(vsc_qa_rows)} rows already parsed')
    print(f'parsing {num_total_rows - len(vsc_qa_rows)}')
else:
    vsc_qa_rows = []

lock = threading.Lock()

def qa_vsc_gen_worker(thread_id):
	for _ in tqdm(range(int((num_total_rows - len(vsc_qa_rows)) / (num_workers * num_qas_per_query)))):
		retries = 3
		done = False
		while retries > 0 and not done:
			try:
				vsc_qas, vsc_ground_truth = synthesize_qas_from_neighborhood(
					num_qas=num_qas_per_query,
					num_in_neighborhood=30,
					model='gpt-4.1'
				)
				with lock:
					for vsc_qa in vsc_qas:
						vsc_qa_rows.append(vsc_qa | {
							'ground_truth': vsc_ground_truth,
							'qa_type': 'vsc_neighborhood'
						})
				done = True
			except openai.RateLimitError:
				print(f'Hit rate limit, retrying...({retries} left)')
				time.sleep(10)
				retries -= 1

threads = [
	threading.Thread(target=qa_vsc_gen_worker, args=(tid, ))
	for tid in range(num_workers)
]

for t in threads:
    t.start()
for t in threads:
    t.join()

len(vsc_qa_rows)

  0%|          | 0/75 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?it/s]

Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(2 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(2 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(2 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(3 left)
Hit rate limit, retrying...(

3000

In [6]:
import pandas as pd

vsc_df = pd.DataFrame(vsc_qa_rows)

eval_qas_with_hops = pd.read_csv('../data/eval_qas_with_hops.csv', index_col=0)
print(f'OLD EVAL QAS LENGTH: {len(eval_qas_with_hops)}')
eval_qas = pd.concat([eval_qas_with_hops, vsc_df], ignore_index=True)
print(f'NEW QAS LENGTH WITH NEW DATA: {len(eval_qas)}')
eval_qas.to_csv('../data/eval_qas_with_vsc.csv')

OLD EVAL QAS LENGTH: 3336
NEW QAS LENGTH WITH NEW DATA: 6336


In [43]:
eval_qas = pd.read_csv('../data/eval_qas_with_vsc.csv', index_col=0)
eval_qas.qa_type.value_counts()

qa_type
vsc_neighborhood    3000
3_hop                885
2_hop                885
4_hop                885
single_fact          681
Name: count, dtype: int64

In [49]:
import ast
import os
from neo4j import GraphDatabase
from dotenv import load_dotenv

load_dotenv()

graph = GraphDatabase.driver(
    os.environ['NEO4J_URI'],
    auth=(os.environ['NEO4J_USER'], os.environ['NEO4J_PWD'])
)
database = os.environ['NEO4J_DATABASE']


def get_fact_node_id(bird_name: str, title: str) -> str:
	with graph.session(database=database) as session:
		node_result = session.run('''
			MATCH (f:Fact {
				bird_name: $bird_name,
				title: $title	
			})
			RETURN elementId(f) AS id
		''', bird_name=bird_name, title=title).single()
		return node_result['id']


for ind, row in eval_qas.iterrows():
	if row.qa_type == 'single_fact':
		ground_truth = ast.literal_eval(row.ground_truth)
		node_id = get_fact_node_id(ground_truth['bird_name'], ground_truth['title'])
		eval_qas.at[ind, 'node_ids'] = [node_id]

eval_qas.head(5)


Unnamed: 0,question,answer,qa_type,ground_truth,node_ids
0,What distinguishes the Boat-billed Heron from ...,The Boat-billed Heron is overall much whiter i...,single_fact,"{'bird_name': 'Boat-billed Heron', 'title': 'I...",[4:a554891f-4e4e-45d4-beca-f7796c51940f:197847]
1,How can the Plain Prinia be distinguished from...,"By its smaller size, less bulky appearance, na...",single_fact,"{'bird_name': 'Plain Prinia', 'title': 'Identi...",[4:a554891f-4e4e-45d4-beca-f7796c51940f:457779]
2,What is a distinguishing feature of Monteiro's...,It has dark grey eyes (not yellow) and a more ...,single_fact,"{'bird_name': ""Monteiro's Bushshrike"", 'title'...",[4:a554891f-4e4e-45d4-beca-f7796c51940f:416657]
3,What are the key identification features of th...,"Large-headed, dark-olive flatbill with white e...",single_fact,"{'bird_name': 'Western Olivaceous Flatbill', '...",[4:a554891f-4e4e-45d4-beca-f7796c51940f:363358]
4,What are some distinguishing features of the E...,The European Honey-buzzard differs from true b...,single_fact,"{'bird_name': 'European Honey-buzzard', 'title...",[4:a554891f-4e4e-45d4-beca-f7796c51940f:209969]


In [51]:
for ind, row in eval_qas.iterrows():
    if row.qa_type in ['2_hop', '3_hop', '4_hop']:
        ground_truth = ast.literal_eval(row.ground_truth)
        node_ids = [
			get_fact_node_id(f['bird_name'], f['title'])
			for f in ground_truth
		]
        eval_qas.at[ind, 'node_ids'] = node_ids

eval_qas[eval_qas.qa_type.isin(['2_hop', '3_hop', '4_hop'])].head(5)

Unnamed: 0,question,answer,qa_type,ground_truth,node_ids
681,Which species was observed harassing Common Ga...,Giant Coot,2_hop,"[{'bird_name': 'Giant Coot', 'title': 'Agonist...",[4:a554891f-4e4e-45d4-beca-f7796c51940f:115510...
682,What type of gestures does the Sula Lorikeet t...,Unritualized gestures,2_hop,"[{'bird_name': 'Giant Coot', 'title': 'Agonist...",[4:a554891f-4e4e-45d4-beca-f7796c51940f:115510...
683,Which bird was found to have a smaller reperto...,Sula Lorikeet,2_hop,"[{'bird_name': 'Giant Coot', 'title': 'Agonist...",[4:a554891f-4e4e-45d4-beca-f7796c51940f:115510...
684,Which bird species has been observed attacking...,Giant Coot,2_hop,"[{'bird_name': 'Giant Coot', 'title': 'Agonist...",[4:a554891f-4e4e-45d4-beca-f7796c51940f:115510...
685,Which bird species defends a 30 meter area aro...,Bicolored Hawk,2_hop,"[{'bird_name': 'Giant Coot', 'title': 'Agonist...",[4:a554891f-4e4e-45d4-beca-f7796c51940f:115510...


In [66]:
import ast

ast.literal_eval(eval_qas[eval_qas.qa_type == 'vsc_neighborhood'].ground_truth.iloc[0])

[{'role': 'user',
  'content': [{'type': 'input_text',
    'text': '{"node_label": "Genus", "name": "Clanga"}'}]},
 {'role': 'user',
  'content': [{'type': 'input_image',
    'image_url': 'https://cdn.download.ams.birds.cornell.edu/api/v1/asset/630362912/480'},
   {'type': 'input_text',
    'text': '{"node_label": "Image", "bird_name": "Red-thighed Sparrowhawk"}'}]},
 {'role': 'user',
  'content': [{'type': 'input_text',
    'text': '{"node_label": "Family", "name": "Accipitridae"}'}]},
 {'role': 'user',
  'content': [{'type': 'input_image',
    'image_url': 'https://cdn.download.ams.birds.cornell.edu/api/v1/asset/121313251/640'},
   {'type': 'input_text',
    'text': '{"node_label": "Image", "bird_name": "Black-and-white Hawk-Eagle", "title": "Juvenile in flight, dorsal view. The upperwing is brownish, and the wing coverts, including secondary and primary coverts, have pale tips, typical of Juvenile Plumage. The subterminal band is relatively narrow and indistinct. Note also the unifo

In [7]:
eval_qas[(eval_qas.qa_type == 'vsc_neighborhood') & (pd.isna(eval_qas.node_ids))]

Unnamed: 0,question,answer,qa_type,ground_truth,node_ids
3336,Which genus includes a bird that is very small...,The genus Tachyspiza includes the Red-thighed ...,vsc_neighborhood,"[{'role': 'user', 'content': [{'type': 'input_...",
3337,Which bird shows no known aerial sexual displa...,The Bicolored Hawk shows no known aerial sexua...,vsc_neighborhood,"[{'role': 'user', 'content': [{'type': 'input_...",
3338,Which genus includes two harrier species found...,The genus Circus includes the Swamp Harrier an...,vsc_neighborhood,"[{'role': 'user', 'content': [{'type': 'input_...",
3339,Which bird is described as having black upperp...,The Black-and-white Hawk-Eagle has black upper...,vsc_neighborhood,"[{'role': 'user', 'content': [{'type': 'input_...",
3340,Which bird exhibits a modified basic strategy ...,The Bicolored Hawk exhibits a modified basic s...,vsc_neighborhood,"[{'role': 'user', 'content': [{'type': 'input_...",
...,...,...,...,...,...
6194,What breeding habitats are preferred by the Li...,The Little Ringed Plover prefers breeding in s...,vsc_neighborhood,"[{'role': 'user', 'content': [{'type': 'input_...",
6195,How do the adult breeding and juvenile appeara...,Adult breeding Black-tailed Gulls have white h...,vsc_neighborhood,"[{'role': 'user', 'content': [{'type': 'input_...",
6196,"Which bird species, known for a series of deep...",Greater Coucal.,vsc_neighborhood,"[{'role': 'user', 'content': [{'type': 'input_...",
6197,What bird in the genus Taccocua is generally s...,Sirkeer Malkoha.,vsc_neighborhood,"[{'role': 'user', 'content': [{'type': 'input_...",


In [8]:
import pandas as pd
eval_qas = pd.read_csv('../data/eval_qas_with_node_ids.csv', index_col=0)
len(eval_qas)

for ind, row in eval_qas[eval_qas.qa_type == '4_hop'].iterrows():
	print(f'ROW {ind} ------------------')
	print(f'Q: ', row.question)
	print(f'A: ', row.answer)
	print(f'G: ', row.ground_truth)

# in the provided facts... etc
# ROW 699 :: Which bird has a bluish gray bill and which has a bright yellow bill in adulthood?
# ROW 711 :: Which bird is described as having behavior similar to another species, and which one is described with details about its perching and foraging habits?
# 3 hop not taking all references into consideration

# personas

ROW 2451 ------------------
Q:  Which bird is reported to defend a 30 meter area around its nest from both raptors and large passerines, as well as primates?
A:  Bicolored Hawk
G:  [{'bird_name': 'Giant Coot', 'title': 'Agonistic Behavior', 'text': 'In Fuerte Baquedano, Northern Chile, immatures of this species harassed Common Gallinule juveniles, attacking them with their feet. Sometimes the adults also intimidated Common Gallinule (2).'}, {'bird_name': 'Sula Lorikeet', 'title': 'Agonistic Behavior', 'text': 'In a study of agonistic behavior in captive lorikeets using mirror-image stimulation, it was found that the "Yellow-and-green Lorikeet" (then considered conspecific with the Yellow-cheeked Lorikeet, and race not specified) had a smaller repertoire of distinct agonistic gestures than most other species, and it tended to use unritualized gestures more than most other species (22).'}, {'bird_name': 'Bicolored Hawk', 'title': 'Agonistic Behavior', 'text': 'Both sexes defend area of 3

In [None]:
from transformers import pipeline
import numpy as np

# Load a small, specialized SQuAD 2.0 model (approx 500MB)
# This model is designed to return NO answer if the text doesn't support it.
qa_pipeline = pipeline(
    "question-answering",
    model="deepset/roberta-base-squad2",
    tokenizer="deepset/roberta-base-squad2"
)

In [1]:
import os
from neo4j import GraphDatabase
from dotenv import load_dotenv
from openai import OpenAI
import json

load_dotenv()

graph = GraphDatabase.driver(
    os.environ['NEO4J_URI'],
    auth=(os.environ['NEO4J_USER'], os.environ['NEO4J_PWD'])
)
database = os.environ['NEO4J_DATABASE']

client = OpenAI()

def find_is_source_related(
	source: dict,
	question: str,
	answer: str,
	model='gpt-4.1',
) -> bool:

	neighborhood_prompt = '''
You are a Q&A source evaluator. Your job is to take a question and answer pair, as well as a source in TEXT or IMAGE form, and return whether or not that source is related to or supports the Q&A pair.
You will take 3 inputs in this order:

- Input 1: the question
- Input 2: the answer
- Input 3: the source. The source will either be a json object with a node_label key which describes what type of source it is and its associated attributes, or an image input followed by its caption. The caption for an image input will also be in json object format.

If the source pertains to the question, and supports the answer, return {"value": true}. Return {"value": false} if the source is irrelevant or does not support the answer. Do not explain.
'''

	resp = client.responses.create(
		model=model,
		instructions=neighborhood_prompt,
		input=[
			{ 
    			'role': 'user',
				'content': [{
					'type': 'input_text',
					'text': question
				}]
    		},
			{
				'role': 'user',
				'content': [{
					'type': 'input_text',
					'text': answer
				}]	
			},
			source
		],
		text={
			'format': {
				'type': 'json_schema',
				'name': 'yes_or_no',
				'strict': True,
				'schema': {
					'type': 'object',
					'properties': {
						'value': {
							'type': 'boolean'
						}
					},
					'required': ['value'],
					'additionalProperties': False
				}
			}
		},
	)

	return json.loads(resp.output_text)


def find_node_id(input_json: dict) -> str:
	with graph.session(database=database) as session:
		content = input_json['content']
		if len(content) > 1:
			img_url = content[0]['image_url']
			node_result = session.run('''
				MATCH (i:Image {
					url: $url	
				})
				RETURN elementId(i) AS id
            ''', url=img_url).single()
			return node_result['id']
		else: 
			payload = json.loads(content[0]['text'])
			if payload['node_label'] == 'Fact':
				return get_fact_node_id(payload['bird_name'], payload['fact_title'])
			else:
				node_result = session.run(f'''
					MATCH (x:{payload['node_label']} {{
						name: $name
					}})
					RETURN elementId(x) AS id
				''', name=payload['name']).single()
				return node_result['id']


def get_fact_node_id(bird_name: str, title: str) -> str:
	with graph.session(database=database) as session:
		node_result = session.run('''
			MATCH (f:Fact {
				bird_name: $bird_name,
				title: $title	
			})
			RETURN elementId(f) AS id
		''', bird_name=bird_name, title=title).single()
		return node_result['id']

'''
sample_vsc_row = eval_qas[eval_qas.qa_type == 'vsc_neighborhood'].iloc[0]
sample_ground_truth = ast.literal_eval(sample_vsc_row.ground_truth)
print(f'Q: {sample_vsc_row.question}')
print(f'A: {sample_vsc_row.answer}')

for g in sample_ground_truth:
	print('GROUND_TRUTH: ', g)
	is_related = find_is_source_related(g, sample_vsc_row.question, sample_vsc_row.answer)
	print('IS RELATED: ', is_related)
	node_id = find_node_id(g)
	print('NODE ID: ', node_id)
'''

"\nsample_vsc_row = eval_qas[eval_qas.qa_type == 'vsc_neighborhood'].iloc[0]\nsample_ground_truth = ast.literal_eval(sample_vsc_row.ground_truth)\nprint(f'Q: {sample_vsc_row.question}')\nprint(f'A: {sample_vsc_row.answer}')\n\nfor g in sample_ground_truth:\n\tprint('GROUND_TRUTH: ', g)\n\tis_related = find_is_source_related(g, sample_vsc_row.question, sample_vsc_row.answer)\n\tprint('IS RELATED: ', is_related)\n\tnode_id = find_node_id(g)\n\tprint('NODE ID: ', node_id)\n"

In [4]:
import ast
import openai
import threading
import time
from datetime import datetime

num_workers = 12

vsc_qa_indexes = list(eval_qas[
	(eval_qas.qa_type == 'vsc_neighborhood') &
	(pd.isna(eval_qas.node_ids))
].index)
total_length = len(vsc_qa_indexes)

parsed_vsc_qas = {}

lock = threading.Lock()


def qa_val_vsc_worker(thread_id):

	while (len(vsc_qa_indexes) > 0):

		with lock:
			print(f'{datetime.now()} :: WORKER {thread_id} :: PARSING {(total_length-len(vsc_qa_indexes))+1}/{total_length}')
			ind = vsc_qa_indexes.pop()

		row = eval_qas.iloc[ind]
		ground_truth = ast.literal_eval(row.ground_truth)

		nodes = []
		correct_gt = []

		for g in ground_truth:

			retries = 5
			done = False

			while retries > 0 and not done:
				try:
					is_related = find_is_source_related(g, row.question, row.answer)
					if is_related['value']:
						node_id = find_node_id(g)
						with lock:
							correct_gt.append(g)
							nodes.append(node_id)
					done = True
				except openai.RateLimitError:
					print(f'{datetime.now()} :: WORKER {thread_id} :: Rate Limit Hit ({retries} retries left)')
					time.sleep(15)
					retries -= 1
				except openai.InternalServerError:
					print(f'{datetime.now()} :: WORKER {thread_id} :: Internal Server Error ({retries} retries left)')
					time.sleep(15)
					retries -= 1

			if not done:
				raise Exception(f'{datetime.now()} :: WORKER {thread_id} :: MAX RETRIES HIT')
		
		with lock:
			eval_qas.at[ind, 'ground_truth'] = correct_gt
			eval_qas.at[ind, 'node_ids'] = nodes
	
	print(f'WORKER {thread_id} FINISHED')


threads = [
	threading.Thread(target=qa_val_vsc_worker, args=(tid, ))
	for tid in range(num_workers)
]

for t in threads:
    t.start()
for t in threads:
    t.join()


2025-11-22 16:19:13.267942 :: WORKER 0 :: PARSING 1/2984
2025-11-22 16:19:13.268943 :: WORKER 1 :: PARSING 2/2984
2025-11-22 16:19:13.269689 :: WORKER 2 :: PARSING 3/2984
2025-11-22 16:19:13.270383 :: WORKER 3 :: PARSING 4/2984
2025-11-22 16:19:13.270952 :: WORKER 4 :: PARSING 5/2984
2025-11-22 16:19:13.271524 :: WORKER 5 :: PARSING 6/2984
2025-11-22 16:19:13.272244 :: WORKER 6 :: PARSING 7/2984
2025-11-22 16:19:13.272881 :: WORKER 7 :: PARSING 8/2984
2025-11-22 16:19:13.273413 :: WORKER 8 :: PARSING 9/2984
2025-11-22 16:19:13.274044 :: WORKER 9 :: PARSING 10/2984
2025-11-22 16:19:13.274599 :: WORKER 10 :: PARSING 11/2984
2025-11-22 16:19:13.275450 :: WORKER 11 :: PARSING 12/2984
2025-11-22 16:19:41.292913 :: WORKER 1 :: Rate Limit Hit (5 retries left)
2025-11-22 16:19:56.198001 :: WORKER 7 :: Rate Limit Hit (5 retries left)
2025-11-22 16:20:02.914196 :: WORKER 4 :: Rate Limit Hit (5 retries left)
2025-11-22 16:20:06.023386 :: WORKER 10 :: Rate Limit Hit (5 retries left)
2025-11-22 16:

  warn(


2025-11-22 16:59:09.563601 :: WORKER 0 :: Rate Limit Hit (5 retries left)
2025-11-22 16:59:12.870868 :: WORKER 2 :: PARSING 104/2984
2025-11-22 16:59:16.029747 :: WORKER 4 :: Rate Limit Hit (5 retries left)
2025-11-22 16:59:20.115837 :: WORKER 10 :: PARSING 105/2984
2025-11-22 16:59:27.497038 :: WORKER 11 :: PARSING 106/2984
2025-11-22 16:59:29.701207 :: WORKER 2 :: Rate Limit Hit (5 retries left)
2025-11-22 16:59:33.575245 :: WORKER 1 :: Rate Limit Hit (5 retries left)
2025-11-22 16:59:41.006455 :: WORKER 10 :: Rate Limit Hit (5 retries left)
2025-11-22 16:59:41.409069 :: WORKER 0 :: Rate Limit Hit (4 retries left)
2025-11-22 16:59:46.611546 :: WORKER 11 :: Rate Limit Hit (5 retries left)
2025-11-22 16:59:55.202221 :: WORKER 7 :: Rate Limit Hit (5 retries left)
2025-11-22 17:00:00.598316 :: WORKER 9 :: Rate Limit Hit (5 retries left)
2025-11-22 17:00:09.368458 :: WORKER 3 :: PARSING 107/2984
2025-11-22 17:00:13.624339 :: WORKER 1 :: Rate Limit Hit (5 retries left)
2025-11-22 17:00:15.

Exception in thread Thread-12 (qa_val_vsc_worker):
Traceback (most recent call last):
  File "/home/overlord/anaconda3/envs/bwe_312/lib/python3.12/threading.py", line 1075, in _bootstrap_inner
    self.run()
  File "/home/overlord/anaconda3/envs/bwe_312/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 772, in run_closure
    _threading_Thread_run(self)
  File "/home/overlord/anaconda3/envs/bwe_312/lib/python3.12/threading.py", line 1012, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_889948/1188436977.py", line 58, in qa_val_vsc_worker
Exception: 2025-11-22 17:14:15.101137 :: WORKER 8 :: MAX RETRIES HIT


2025-11-22 17:14:16.074106 :: WORKER 9 :: Rate Limit Hit (1 retries left)
2025-11-22 17:14:17.094041 :: WORKER 3 :: Rate Limit Hit (1 retries left)
2025-11-22 17:14:17.716704 :: WORKER 2 :: Rate Limit Hit (1 retries left)
2025-11-22 17:14:17.874334 :: WORKER 11 :: Rate Limit Hit (1 retries left)
2025-11-22 17:14:18.545022 :: WORKER 6 :: Rate Limit Hit (1 retries left)


Exception in thread Thread-4 (qa_val_vsc_worker):
Traceback (most recent call last):
  File "/home/overlord/anaconda3/envs/bwe_312/lib/python3.12/threading.py", line 1075, in _bootstrap_inner
    self.run()
  File "/home/overlord/anaconda3/envs/bwe_312/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 772, in run_closure
    _threading_Thread_run(self)
  File "/home/overlord/anaconda3/envs/bwe_312/lib/python3.12/threading.py", line 1012, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_889948/1188436977.py", line 58, in qa_val_vsc_worker
Exception: 2025-11-22 17:14:20.989457 :: WORKER 0 :: MAX RETRIES HIT


2025-11-22 17:14:22.773567 :: WORKER 10 :: Rate Limit Hit (1 retries left)
2025-11-22 17:14:26.188971 :: WORKER 5 :: Rate Limit Hit (1 retries left)
2025-11-22 17:14:27.253309 :: WORKER 1 :: Rate Limit Hit (1 retries left)
2025-11-22 17:14:27.396580 :: WORKER 4 :: Rate Limit Hit (1 retries left)


Exception in thread Thread-11 (qa_val_vsc_worker):
Traceback (most recent call last):
  File "/home/overlord/anaconda3/envs/bwe_312/lib/python3.12/threading.py", line 1075, in _bootstrap_inner
    self.run()
  File "/home/overlord/anaconda3/envs/bwe_312/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 772, in run_closure
    _threading_Thread_run(self)
  File "/home/overlord/anaconda3/envs/bwe_312/lib/python3.12/threading.py", line 1012, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_889948/1188436977.py", line 58, in qa_val_vsc_worker
Exception: 2025-11-22 17:14:29.033656 :: WORKER 7 :: MAX RETRIES HIT
Exception in thread Thread-13 (qa_val_vsc_worker):
Traceback (most recent call last):
  File "/home/overlord/anaconda3/envs/bwe_312/lib/python3.12/threading.py", line 1075, in _bootstrap_inner
    self.run()
  File "/home/overlord/anaconda3/envs/bwe_312/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 772, in run_closure
    _threading_Thre

In [8]:
eval_qas.to_csv('../data/eval_qas_with_node_ids.csv')

In [None]:
# need to prune based on whether or not each ground truth was used for the Q&A pair
# standardize/look up ids for ground truth