In [None]:
"""
Batch Gemini Inference and CSV Aggregator

This notebook takes either a number of samples to draw (`num_to_sample`, taken from the
directory `path_to_sample_from`) or an explicit list of sample file paths (`sampled_paths`),
calls `run_gemini` on each sample, collects the outputs, and generates a single CSV file
aggregating all results.

NOTE: run_gemini.ipynb must be in the same directory as this notebook
"""

# Explicit image paths to process; None means a random sample is drawn instead.
sampled_paths = None
# Directory that is listed for .jpg/.jpeg files when sampling.
path_to_sample_from = "../data/img"
# Maximum number of files to draw; capped at the number of matching files.
num_to_sample = 10

In [None]:
import os
import random

# An explicit sampled_paths list is used untouched; a random sample is drawn
# only when none was supplied.
if sampled_paths is None:
    # Keep directory entries whose name ends in a JPEG extension (case-insensitive).
    jpeg_suffixes = ('.jpg', '.jpeg')
    all_files = list(filter(
        lambda name: name.lower().endswith(jpeg_suffixes),
        os.listdir(path_to_sample_from),
    ))

    # Never request more files than are available.
    sample_size = min(num_to_sample, len(all_files))
    sampled_files = random.sample(all_files, sample_size)

    # Prefix each sampled file name with the directory it came from.
    sampled_paths = [
        os.path.join(path_to_sample_from, file_name)
        for file_name in sampled_files
    ]

print("Will generate CSV based on sampled files:")
for path in sampled_paths:
    print(path)

In [None]:
import importlib
import herbarium_label_extractor
importlib.reload(herbarium_label_extractor)
from herbarium_label_extractor import HerbariumLabelExtractor

# Load the prompt text. The encoding is passed explicitly so the Markdown files
# decode the same way on every platform instead of following the locale default
# (assumes the prompt files are saved as UTF-8 — confirm).
with open('prompts/system_instructions_no_ocr.md', encoding='utf-8') as f:
    sys_instr = f.read()
with open('prompts/few_shot_prompt_no_ocr.md', encoding='utf-8') as f:
    few_shot = f.read()

# Build the extractor from the two prompts and a single few-shot example image.
extractor = HerbariumLabelExtractor(
    system_instructions=sys_instr,
    few_shot_prompt=few_shot,
    few_shot_image_paths=[
        '../img/IMG_2708.jpg',
    ],
    output_dir='../tmp'
)

# NOTE(review): this loop classifies one hard-coded image and only prints the
# result; it does not iterate over `sampled_paths` or collect results for a CSV.
# Confirm whether that is intentional before relying on this cell for a batch run.
for img_path in ["../img/IMG_2713.jpg"]:
    result = extractor.classify(img_path)
    print(result)
