In [None]:
# Base name (without extension) of the specimen to process. Later cells append
# ".json" and ".jpg" to it to locate the OCR response and the image, and embed
# it in the names of the files they write.
filename = "IMG_2709"

In [None]:
import os
import json

# Resolve the input locations for this specimen. Both are looked up under the
# parent of the current working directory: ../json/<filename>.json and
# ../img/<filename>.jpg.

# Load the (large) raw OCR JSON blob into `response`.
full_path = os.path.join(os.path.abspath('..'), "json", filename + ".json")
# JSON interchange files are UTF-8. State the encoding explicitly so the load
# does not depend on the platform's locale default (e.g. cp1252 on Windows),
# which can mis-decode or reject non-ASCII label text.
with open(full_path, 'r', encoding='utf-8') as f:
    response = json.load(f)

# Save the image file path to a variable; the image itself is opened by a
# later cell.
image_path = os.path.join(os.path.abspath('..'), "img", filename + ".jpg")


In [None]:
# Reduce the huge raw Google Vision API response to a much smaller, LLM-friendly
# structure that contains only the full text and a per-block grouping, then
# store it as ../tmp/herbarium_ocr_output<filename>.json.

# Step 3: Extract raw text and block-wise grouping
# NOTE(review): a response for an image with no detected text is expected to
# carry no 'fullTextAnnotation' key. Fall back to an empty annotation so the
# notebook yields an empty summary instead of raising KeyError.
full_text_annotation = response.get('fullTextAnnotation') or {}

results = {
    "text": full_text_annotation.get('text', ''),
    "blocks_summary": []
}

for page in full_text_annotation.get('pages', []):
    for block in page['blocks']:
        # Rebuild every word from its symbols; the block text is all words of
        # all paragraphs joined by single spaces.
        block_words = [
            ''.join(symbol['text'] for symbol in word['symbols'])
            for paragraph in block['paragraphs']
            for word in paragraph['words']
        ]
        results['blocks_summary'].append({
            "text": ' '.join(block_words).strip(),
            "bounding_box": block['boundingBox'],
            # Not every block carries a confidence value; record None then.
            "confidence": block.get('confidence', None)
        })

# Step 4: Print cleaned and grouped output
print("\nGrouped OCR Output for LLM:\n")
print(json.dumps(results, indent=2))
herbarium_ocr_output = os.path.join(os.path.abspath('..'), "tmp", "herbarium_ocr_output" + filename + ".json")

# Save to a JSON file for LLM input (a later cell reads this file back).
# Create the tmp directory on first use so the write cannot fail just because
# the directory does not exist yet.
os.makedirs(os.path.dirname(herbarium_ocr_output), exist_ok=True)
with open(herbarium_ocr_output, "w") as f:
    json.dump(results, f, indent=2)

print("\nSaved grouped OCR output to " + herbarium_ocr_output)


In [None]:
# Build the system instructions for the AI.
#
# The prompt is a RAW string on purpose: the embedded examples are JSON
# documents whose backslash escapes (\n, \uXXXX, \") must reach the model
# literally. In a normal string Python would decode them, leaving raw newlines
# and an unescaped double quote inside JSON string values, i.e. the model would
# be shown invalid JSON as its reference input and output.
# The example output follows the prompt's own rules: every key is present and
# absent values are null.

system_instructions = r"""
System Instructions:

**Role and Goal:**
You are an AI assistant with multimodal capabilities (text and image understanding), specialized in accurately reading and parsing information from herbarium specimen labels. Your primary goal is to extract specific predefined fields from the provided label information and return them in a structured JSON format.

**Input:**
You will be given two pieces of information for each specimen label:
1.  **OCR Text:** Text extracted from the specimen label using Optical Character Recognition. This is your primary source for text-based information.
2.  **Source Image:** The image file from which the OCR text was derived. You should use this image to:
    *   Verify information found in the OCR text.
    *   Clarify ambiguities or correct potential errors in the OCR text.
    *   Potentially find information that was missed or poorly transcribed by the OCR process.

**Output Requirements:**
1.  **Format:** Your response MUST be a single, valid JSON object.
2.  **Fields to Extract (with typical presence):** You MUST attempt to extract the following fields. The keys in the JSON object MUST EXACTLY match these names. The percentages provided indicate the approximate frequency with which each field is typically found on these labels; use this as contextual information to guide your search but always adhere to the "Handling Missing Information" rules.
    *   "verbatimScientificName" (Typical presence: ~100.00%)
    *   "verbatimScientificNameAuthorship" (Typical presence: unknown)
    *   "verbatimInstitutionName" (Typical presence: ~100.00%)
    *   "verbatimInstitutionNumber" (Typical presence: ~89.85%)
    *   "verbatimIdentifiedBy" (Typical presence: ~45.42%)
    *   "verbatimIdentifiedDate" (Typical presence: ~33.53%)
    *   "verbatimCollectedBy" (Typical presence: ~97.23%)
    *   "additionalCollectedBy" (Typical presence: ~8.94%)
    *   "recordNumber" (Typical presence: ~88.82%)
    *   "verbatimEventDate" (Typical presence: ~66.19%)
    *   "verbatimHabitat" (Typical presence: ~57.23%)
    *   "verbatimSubstrate" (Typical presence: ~29.77%)
    *   "verbatimLocality" (Typical presence: ~93.92%)
    *   "verbatimCoordinates" (Typical presence: ~16.47%)
    *   "verbatimElevation" (Typical presence: ~20.97%)
3.  **Handling Missing Information:**
    *   If the information for any of the specified fields cannot be found in either the OCR Text or the Source Image, the value for that field in the JSON object MUST be `null`.
    *   Do NOT omit any keys from the JSON object, even if their value is `null`. All listed fields must be present as keys in the output.
4.  **Accuracy and Verbatim Fields:**
    *   Extract information as accurately as possible.
    *   Prioritize information clearly visible in the **Source Image** if there's a discrepancy or ambiguity in the **OCR Text**.
    *   For fields designated as "verbatim" (e.g., "verbatimEventDate", "verbatimElevation"), extract the text exactly as it appears on the label. Use the **Source Image** to confirm or derive the verbatim text, especially if the OCR output is questionable or incomplete for these fields.
5.  **Strictness:**
    *   Do not infer or add any information that is not explicitly present in the provided OCR Text or visible in the Source Image.
    *   Your response MUST ONLY be the JSON object. Do not include any explanations, apologies, conversational text, or any other text before or after the JSON structure.

**Example of Interaction (for your understanding, do not replicate in output):**

*User will provide (conceptually, actual API format will vary):*


OCR Text:

{
  "text": "LICHENS OF CALIFORNIA\nAcarospora strigata (Nyl.) Jatta\nMono Co., UC White Mtns. Research Station\nCrooked Creek road, East of Research Station.\nN37\u00b030' 18\" W118\u00b009\u2032 11\u2033\nDolomite outcrop in Bristlecone-limber pine forest\nOn rock\nElev. 9995'\nColl: Ronald & Judith Robertson 2 July 2005\nDet: Judith Robertson\nHERBARIUM OF THE\nUC\nWWVERSITY OF CALIFORMA\nNo. 9318\n\u300c",
  "blocks_summary": [
    {
      "text": "LICHENS OF CALIFORNIA",
      "bounding_box": {
        "vertices": [
          {
            "x": 1716,
            "y": 1013
          },
          {
            "x": 3818,
            "y": 1013
          },
          {
            "x": 3818,
            "y": 1120
          },
          {
            "x": 1716,
            "y": 1120
          }
        ]
      },
      "confidence": 0.98448104
    },
    {
      "text": "Acarospora strigata ( Nyl . ) Jatta",
      "bounding_box": {
        "vertices": [
          {
            "x": 1488,
            "y": 1425
          },
          {
            "x": 3436,
            "y": 1467
          },
          {
            "x": 3433,
            "y": 1592
          },
          {
            "x": 1486,
            "y": 1550
          }
        ]
      },
      "confidence": 0.9753486
    },
    {
      "text": "Mono Co. , UC White Mtns . Research Station Crooked Creek road , East of Research Station . N37 \u00b0 30 ' 18 \" W118 \u00b0 09 \u2032 11 \u2033 Dolomite outcrop in Bristlecone - limber pine forest On rock",
      "bounding_box": {
        "vertices": [
          {
            "x": 1260,
            "y": 1725
          },
          {
            "x": 4251,
            "y": 1785
          },
          {
            "x": 4235,
            "y": 2558
          },
          {
            "x": 1245,
            "y": 2498
          }
        ]
      },
      "confidence": 0.97363615
    },
    {
      "text": "Elev . 9995 '",
      "bounding_box": {
        "vertices": [
          {
            "x": 3232,
            "y": 2602
          },
          {
            "x": 3919,
            "y": 2616
          },
          {
            "x": 3916,
            "y": 2722
          },
          {
            "x": 3229,
            "y": 2707
          }
        ]
      },
      "confidence": 0.9147444
    },
    {
      "text": "Coll : Ronald & Judith Robertson 2 July 2005 Det : Judith Robertson",
      "bounding_box": {
        "vertices": [
          {
            "x": 1208,
            "y": 2925
          },
          {
            "x": 4080,
            "y": 2993
          },
          {
            "x": 4074,
            "y": 3291
          },
          {
            "x": 1201,
            "y": 3224
          }
        ]
      },
      "confidence": 0.9750879
    },
    {
      "text": "HERBARIUM OF THE UC WWVERSITY OF CALIFORMA",
      "bounding_box": {
        "vertices": [
          {
            "x": 2229,
            "y": 3427
          },
          {
            "x": 3245,
            "y": 3446
          },
          {
            "x": 3234,
            "y": 4018
          },
          {
            "x": 2218,
            "y": 3999
          }
        ]
      },
      "confidence": 0.8199365
    },
    {
      "text": "No. 9318",
      "bounding_box": {
        "vertices": [
          {
            "x": 3453,
            "y": 3163
          },
          {
            "x": 4022,
            "y": 3174
          },
          {
            "x": 4019,
            "y": 3281
          },
          {
            "x": 3451,
            "y": 3270
          }
        ]
      },
      "confidence": 0.97180176
    },
    {
      "text": "\u300c",
      "bounding_box": {
        "vertices": [
          {
            "x": 5435,
            "y": 3653
          },
          {
            "x": 5388,
            "y": 3731
          },
          {
            "x": 5338,
            "y": 3698
          },
          {
            "x": 5386,
            "y": 3622
          }
        ]
      },
      "confidence": 0.7091904
    }
  ]
}

Source Image:
[An image file, e.g., label_image.jpg]

*You will respond with (ONLY the JSON, having used the image to correct OCR errors and understand grouping):*

```json
{
  "verbatimScientificName": "Acarospora strigata",
  "verbatimScientificNameAuthorship": "(Nyl.) Jatta",
  "verbatimInstitutionName": "Herbarium of the University of California",
  "verbatimInstitutionNumber": null,
  "verbatimIdentifiedBy": "Judith Robertson",
  "verbatimIdentifiedDate": null,
  "verbatimCollectedBy": "Ronald & Judith Robertson",
  "additionalCollectedBy": null,
  "recordNumber": "9318",
  "verbatimEventDate": "2 July 2005",
  "verbatimHabitat": "Dolomite outcrop in Bristlecone-limber pine forest On rock",
  "verbatimSubstrate": null,
  "verbatimLocality": "Mono Co., UC White Mtns. Research Station Crooked Creek road, East of Research Station.",
  "verbatimCoordinates": "N37°30' 18\" W118°09′ 11″",
  "verbatimElevation": "9995'"
}
```
"""

In [None]:
from IPython.display import Image as IPythonImage

# Assemble the two model inputs: the label image and the text prompt.

# Wrap the image file in an IPython display object built from its path.
image = IPythonImage(filename=image_path)

# Read back the grouped OCR summary saved by the earlier cell.
with open(herbarium_ocr_output, 'r') as ocr_file:
    json_data = json.load(ocr_file)

# The prompt is a short header followed by the summary re-serialized as
# compact JSON, with a newline before the header and after the JSON.
prompt = "\nRaw OCR text:\n" + json.dumps(json_data) + "\n"


In [None]:
import google.generativeai as genai
from dotenv import load_dotenv
import os

# Load variables from a .env file (if one is found) into the environment so
# the API key lookup below can see them.
load_dotenv()

# Configure and call Gemini.
# Troubleshooting:
#   "ModuleNotFoundError: No module named 'google'"  -> pip install google-generativeai
#   "ModuleNotFoundError: No module named 'dotenv'"  -> pip install python-dotenv
genai.configure(api_key=os.environ.get('GOOGLE_API_KEY'))

model_name = "gemini-2.5-pro-preview-05-06"
model = genai.GenerativeModel(model_name=model_name, system_instruction=system_instructions)

# A single request carrying both the text prompt and the label image.
# NOTE(review): this rebinds `response`, which earlier held the loaded OCR
# JSON; re-running the OCR-summary cell after this one needs the JSON-loading
# cell to be re-run first.
response = model.generate_content([prompt, image])



In [None]:
import re
import json

# Take the generated text from the first part of the first candidate.
response_text = response.candidates[0].content.parts[0].text

# The model may wrap its JSON in a Markdown code fence, either ```json ... ```
# or a bare ``` ... ```. Strip the opening and closing fence (and the
# whitespace next to them) so only the JSON text remains.
response_serializable = re.sub(
    r'^```(?:json)?\s*|\s*```$', '', response_text.strip(), flags=re.IGNORECASE
)

# Parse the text so the saved file holds a real JSON object. Dumping the text
# itself would write a JSON-encoded *string* that every consumer has to decode
# a second time.
try:
    response_payload = json.loads(response_serializable)
except json.JSONDecodeError as err:
    # Best effort: keep the raw model output rather than lose it.
    print(f"Warning: model output is not valid JSON ({err}); saving the raw text instead")
    response_payload = response_serializable

# Save to file
output_path = os.path.join(os.path.abspath('..'), "tmp", "herbarium_processed_output" + filename + ".json")

with open(output_path, 'w') as f:
    json.dump(response_payload, f, indent=2)

print(f"Saved output to {output_path}")
print(response_serializable)