In [1]:
import os
from pathlib import Path
# Change cwd to the project root (parent of 'notebooks/').
# Only step up while we are still inside 'notebooks/', so re-running this cell
# (without restarting the kernel) does not keep climbing the directory tree.
if Path.cwd().name == "notebooks":
    os.chdir(Path.cwd().parent)
Path.cwd()

PosixPath('/Users/jbrandt/code/birddog')

In [2]:
import boto3
#from botocore.exceptions import ClientError
#from botocore.exceptions import BucketAlreadyOwnedByYou, BucketAlreadyExists
import json

In [3]:
from birddog.core import Archive
from birddog.ai import _form_table_column_classifier_prompt

2025-04-22 21:37:49,165 [INFO] Using Google Cloud translation API (credentials file:/Users/jbrandt/code/birddog/google-cloud-translate-key.json)
2025-04-22 21:37:49,175 [INFO] Found credentials in environment variables.
2025-04-22 21:37:49,218 [INFO] Using AWS S3 bucket birddog-data for storage.


In [4]:
# Bedrock runtime client (us-east-1) used below to invoke the model.
client = boto3.client(
    service_name="bedrock-runtime",
    region_name="us-east-1",
)

In [5]:
# Load the archive identified by 'DAZHO'; its `header` and `children`
# attributes are read below to build the classifier prompt.
page = Archive('DAZHO')

2025-04-22 21:37:50,367 [INFO] HistoryLRU.lookup(ДАЖО/Д): cache miss
2025-04-22 21:37:50,522 [INFO] fetch_url: 1 requests in last 60s → 0.02 req/s
2025-04-22 21:37:51,218 [INFO] Retrieved from cache: DAZHO-D[2025,01,19,12:47]: page_cache/DAZHO-D/2025,01,19,12:47.json


In [6]:
# Candidate column types, each mapped to the description that is shown to the
# model in the classification prompt.
classes = dict(
    DATE="A column indicating a single date or a date range",
    DESCRIPTION="A column containing a textual description of the item",
    ID="A unique row identifier, number, or code",
)


In [7]:
# Ukrainian ('uk') text of every column header, in table order.
headers = [column['uk'] for column in page.header]
print(headers)

# Ukrainian cell text of the first `max_rows` child rows; these are passed to
# the prompt builder as sample rows.
max_rows = 3
rows = [
    [cell['text']['uk'] for cell in child]
    for child in page.children[:max_rows]
]
print(rows)

['№', 'Назва фонду', 'Крайні дати', 'Справ']
[['1', 'Волинська духовна консисторія м. Житомир, Волинської губернії', '1741–1921', ''], ['2', 'Житомирський повітовий суд м. Житомир, Волинської губернії', '1795-1872', '1280'], ['3', 'Житомирський городовий магістрат м. Житомир, Волинської губернії', '1800–1849', '201']]


In [8]:
# Build the column-classification prompt from the headers, the candidate
# classes and the sample rows, then print it for inspection.
full_prompt = _form_table_column_classifier_prompt(headers, classes, rows)
print(full_prompt)

You are given a list of table column headers in Ukranian and their indices:
0: №
1: Назва фонду
2: Крайні дати
3: Справ

Classify each header into one of the following types based on the descriptions below:
- DATE: A column indicating a single date or a date range
- DESCRIPTION: A column containing a textual description of the item
- ID: A unique row identifier, number, or code

Here are some sample rows of the table:
Row 1: ['1', 'Волинська духовна консисторія м. Житомир, Волинської губернії', '1741–1921', '']
Row 2: ['2', 'Житомирський повітовий суд м. Житомир, Волинської губернії', '1795-1872', '1280']
Row 3: ['3', 'Житомирський городовий магістрат м. Житомир, Волинської губернії', '1800–1849', '201']

Rules:
- Assign exactly one type to each header.
- Use each type at most once, except for 'Other', which can be reused.
- If a header does not clearly fit any type, classify it as 'OTHER'.

Respond with a JSON list of strings (no objects), where the i-th element is the classification 

In [10]:
def message_body(prompt, max_tokens=1000):
    """Build the request payload for a single-turn, text-only user message.

    Args:
        prompt: Text placed in the sole user message.
        max_tokens: Value sent as ``max_tokens``. Defaults to 1000, the value
            that was previously hard-coded, so existing calls are unchanged.

    Returns:
        A dict with the keys ``anthropic_version``, ``max_tokens`` and
        ``messages``; the notebook serializes it with ``json.dumps`` before
        passing it to ``invoke_model``.
    """
    return {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": max_tokens,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt,
                    }
                ],
            }
        ],
    }


In [11]:
# Send the prompt to the Claude 3 Sonnet model through the Bedrock runtime
# client; both the request and the response payload are JSON.
response = client.invoke_model(
    body=json.dumps(message_body(full_prompt)),
    modelId="anthropic.claude-3-sonnet-20240229-v1:0",
    accept="application/json",
    contentType="application/json",
)

In [12]:
# Inspect the raw invoke_model response: request metadata plus a streaming
# 'body' object that still has to be read and decoded.
print(response)

{'ResponseMetadata': {'RequestId': 'afc13671-a522-4cec-94a8-a6d71d65adfb', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Wed, 23 Apr 2025 03:38:06 GMT', 'content-type': 'application/json', 'content-length': '285', 'connection': 'keep-alive', 'x-amzn-requestid': 'afc13671-a522-4cec-94a8-a6d71d65adfb', 'x-amzn-bedrock-invocation-latency': '735', 'x-amzn-bedrock-output-token-count': '13', 'x-amzn-bedrock-input-token-count': '386'}, 'RetryAttempts': 0}, 'contentType': 'application/json', 'body': <botocore.response.StreamingBody object at 0x1076358d0>}


In [13]:
# Read the response's streaming body and decode its JSON payload.
# NOTE(review): the body is a botocore StreamingBody, which presumably can be
# consumed only once; re-running this cell without re-running the
# invoke_model cell is then expected to fail on an empty payload. Confirm.
content = json.loads(response['body'].read())

In [14]:
# Show the decoded reply as a whole.
print(content)

{'id': 'msg_bdrk_01Pji6QR5ArUYaoGE36E8efE', 'type': 'message', 'role': 'assistant', 'model': 'claude-3-sonnet-20240229', 'content': [{'type': 'text', 'text': '["ID","DESCRIPTION","DATE","ID"]'}], 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'input_tokens': 386, 'output_tokens': 13}}


In [15]:
# Print every top-level field of the decoded reply on its own line.
for key, value in content.items():
    print(key, value)

id msg_bdrk_01Pji6QR5ArUYaoGE36E8efE
type message
role assistant
model claude-3-sonnet-20240229
content [{'type': 'text', 'text': '["ID","DESCRIPTION","DATE","ID"]'}]
stop_reason end_turn
stop_sequence None
usage {'input_tokens': 386, 'output_tokens': 13}


In [16]:
# The model's answer is the text of the first entry in the 'content' list.
print(content["content"][0]['text'])

["ID","DESCRIPTION","DATE","ID"]


In [None]:
# Parse the reply text as JSON; the prompt asks for a JSON list of strings
# where the i-th element classifies the i-th header.
mapping = json.loads(content["content"][0]['text'])

In [None]:
# Display the parsed classification result.
mapping