In [12]:
from base64 import b64encode
from pathlib import Path

import os
import asyncio
import nest_asyncio
import pandas as pd
from dotenv import load_dotenv
from langchain_anthropic.chat_models import ChatAnthropic
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import JsonOutputParser
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.tracers.context import tracing_v2_enabled
from collections import Counter

from src.output_schema import Classification

In [2]:
# Must run before any cell that uses `await`: presumably this lets asyncio calls
# nest inside the notebook's already-running event loop — TODO confirm.
nest_asyncio.apply()
# Load settings from a .env file into the environment; left as the last expression
# so the cell displays its return value (the recorded output is `True`).
# NOTE(review): presumably this supplies the API keys used by the model clients — confirm.
load_dotenv()

True

In [3]:
# Directory that the dataset cells glob for page images.
DATA_FOLDER = Path("data/")
# Sampling temperature passed to both chat model clients.
TEMPERATURE = 0.3
# Parser applied to the model output; configured with the project's Classification schema.
OUTPUT_PARSER = JsonOutputParser(pydantic_object=Classification)
# System prompt used by default in prepare_messages. It is an f-string, so the
# parser's format instructions are interpolated once, when this cell runs.
# NOTE(review): the prompt tells the model to answer 'UNKNOWN' when unsure, but
# UNKNOWN is not in the bullet list below — confirm the Classification schema accepts it.
SYSTEM_MESSAGE = f"""Please classify the following image into the most appropriate category. If the image does not clearly fit any category or if you're unsure, select 'UNKNOWN'. Here are the categories to choose from:

- COVER_PAGE: The image serves as the front page or cover of a document or book.
- BLANK_PAGE: The image shows a blank page without text or significant markings.
- TEXT_PAGE: The image is predominantly text-based, similar to a book or document page.
- IMAGE_PAGE: The image is primarily a photograph or illustration without significant text.
- DIAGRAM_PAGE: The image contains diagrams, charts, or graphs, with minimal text.
- TEXT_PLUS_IMAGE_PAGE: The image includes both text and significant photographic or illustrative content.
- TEXT_PLUS_DIAGRAM_PAGE: The image combines text with diagrams, charts, or graphs.
- TABLE_PAGE: The image features tables or spreadsheets.
- TEXT_PLUS_TABLE_PAGE: The image includes both text and table(s) or spreadsheet(s).

Select the single most fitting category based on the image's content.
{OUTPUT_PARSER.get_format_instructions()}"""

In [4]:
# Chat model clients, both created with the shared TEMPERATURE setting.
# `gpt` is the model piped into OUTPUT_PARSER by the dataset cells below.
# NOTE(review): "preview" model identifiers may be retired by the provider — confirm availability.
gpt = ChatOpenAI(model="gpt-4-vision-preview", temperature=TEMPERATURE)
# `claude` is not referenced by any other cell visible in this part of the notebook.
claude = ChatAnthropic(model="claude-3-opus-20240229", temperature=TEMPERATURE)

In [5]:
def encode_image_to_base64(img_path):
    """Return the contents of the file at ``img_path`` as a base64 string.

    Args:
        img_path: Location of the image file. May be a ``pathlib.Path``, a
            ``str``, or any other ``os.PathLike`` value.

    Returns:
        The file's bytes encoded as base64 and decoded to a UTF-8 ``str``,
        or ``None`` when the file cannot be read. In the failure case the
        error is printed, so callers must check for ``None`` before using
        the result.
    """
    try:
        # Path() also accepts plain strings, so callers need not pre-wrap the path.
        raw_bytes = Path(img_path).read_bytes()
    except (OSError, TypeError, ValueError) as e:
        # OSError: missing/unreadable file; TypeError/ValueError: unusable path value.
        print(f"Error reading the image file: {e}")
        return None
    return b64encode(raw_bytes).decode("utf-8")

In [22]:
def prepare_messages(img_base64, system_message=SYSTEM_MESSAGE, mime_type=None):
    """Build the message list used to classify a single image.

    Args:
        img_base64: Base64-encoded image bytes, e.g. the value returned by
            encode_image_to_base64.
        system_message: Text of the system prompt; defaults to SYSTEM_MESSAGE.
        mime_type: Media type written into the image data URL. When ``None``
            it is inferred from the payload: JPEG data is labelled
            ``image/jpeg`` and everything else falls back to ``image/png``
            (the previously hard-coded value).

    Returns:
        A two-element list: a ``("system", system_message)`` tuple followed by
        a ``HumanMessage`` whose content is one ``image_url`` part.
    """
    if mime_type is None:
        # Base64 of the JPEG magic bytes FF D8 FF always begins with "/9j/";
        # without this check .jpg pages would be sent labelled as PNG.
        if isinstance(img_base64, str) and img_base64.startswith("/9j/"):
            mime_type = "image/jpeg"
        else:
            mime_type = "image/png"

    human_message = HumanMessage(
        content=[
            {
                "type": "image_url",
                "image_url": {"url": f"data:{mime_type};base64,{img_base64}"},
            }
        ],
    )
    messages = [("system", system_message), human_message]
    return messages

In [7]:
def gen_batches(iterable, n=99):
    """Yield consecutive slices of ``iterable`` holding at most ``n`` items each.

    ``iterable`` must support ``len()`` and slicing. The final slice is
    shorter when the length is not a multiple of ``n``.
    """
    size = len(iterable)
    for start in range(0, size, n):
        stop = start + n
        if stop > size:
            # Clamp the last slice to the end of the sequence.
            stop = size
        yield iterable[start:stop]

### 2022-denver-green

In [8]:
# Page images of this document, ordered by the integer that precedes the
# first "_" in each file name.
file_names = sorted(
    DATA_FOLDER.glob("*_2022-denver-green-code.jpg"),
    key=lambda page: int(page.stem.split("_")[0]),
)
# Base64-encode every page, then wrap each encoding in the message list for the model.
encoded_images = [encode_image_to_base64(img_path=page) for page in file_names]
prepared_messages = [prepare_messages(img_base64=encoded) for encoded in encoded_images]
# Model output is piped straight into the JSON output parser.
chain = gpt | OUTPUT_PARSER

In [9]:
results_1 = []
batch_size = 20
total = len(prepared_messages) // batch_size + 1
with tracing_v2_enabled(project_name="2022-denver-green"):
    for idx, batch in enumerate(gen_batches(prepared_messages, batch_size)):
        print(f"Batch {idx}/{total} processing...")
        results_1.extend(await chain.abatch(batch, return_exceptions=True))
        print(f"Batch {idx}/{total} processed! sleeping...")
        await asyncio.sleep(60)
        print(f"Batch {idx}/{total} done!")


Batch 0/14 processing...
Batch 0/14 processed! sleeping...
Batch 0/14 done!
Batch 1/14 processing...
Batch 1/14 processed! sleeping...
Batch 1/14 done!
Batch 2/14 processing...
Batch 2/14 processed! sleeping...
Batch 2/14 done!
Batch 3/14 processing...
Batch 3/14 processed! sleeping...
Batch 3/14 done!
Batch 4/14 processing...
Batch 4/14 processed! sleeping...
Batch 4/14 done!
Batch 5/14 processing...
Batch 5/14 processed! sleeping...
Batch 5/14 done!
Batch 6/14 processing...
Batch 6/14 processed! sleeping...
Batch 6/14 done!
Batch 7/14 processing...
Batch 7/14 processed! sleeping...
Batch 7/14 done!
Batch 8/14 processing...
Batch 8/14 processed! sleeping...
Batch 8/14 done!
Batch 9/14 processing...
Batch 9/14 processed! sleeping...
Batch 9/14 done!
Batch 10/14 processing...
Batch 10/14 processed! sleeping...
Batch 10/14 done!
Batch 11/14 processing...
Batch 11/14 processed! sleeping...
Batch 11/14 done!
Batch 12/14 processing...
Batch 12/14 processed! sleeping...
Batch 12/14 done!
Bat

In [20]:
# One row per page: the image path plus its predicted label. Results that are
# not dicts (e.g. exceptions returned by the batch call) are stored unchanged
# in the label column.
df_1 = pd.DataFrame(
    [
        (image_path, outcome["label"] if isinstance(outcome, dict) else outcome)
        for image_path, outcome in zip(file_names, results_1)
    ],
    columns=["local_image_path", "label"],
)
df_1.to_csv("2022-denver-green-code.csv", index=False)

In [22]:
# Number of pages assigned to each label.
df_1["label"].value_counts()

label
TEXT_PAGE                                                                                                                                                                                                                                                                                   143
TABLE_PAGE                                                                                                                                                                                                                                                                                   34
BLANK_PAGE                                                                                                                                                                                                                                                                                   33
TEXT_PLUS_TABLE_PAGE                                                                                                              

### 20201119Complete_Denver_Zoning_Code_updated11122020

In [8]:
# Page images of this document, ordered by the integer that precedes the
# first "_" in each file name.
file_names = sorted(
    DATA_FOLDER.glob("*_20201119Complete_Denver_Zoning_Code_updated11122020.png"),
    key=lambda page: int(page.stem.split("_")[0]),
)
# Base64-encode every page, then wrap each encoding in the message list for the model.
encoded_images = [encode_image_to_base64(img_path=page) for page in file_names]
prepared_messages = [prepare_messages(img_base64=encoded) for encoded in encoded_images]
# Model output is piped straight into the JSON output parser.
chain = gpt | OUTPUT_PARSER

In [42]:
results_2 = []
batch_size = 20
total = len(prepared_messages) // batch_size + 1
with tracing_v2_enabled(project_name="20201119Complete_Denver_Zoning_Code_updated11122020"):
    for idx, batch in enumerate(gen_batches(prepared_messages, batch_size), start=1):

        print(f"Batch {idx}/{total} processing...")

        tmp = await chain.abatch(batch, return_exceptions=True)

        count = Counter([c["label"] if isinstance(c, dict) else c for c in tmp])
        total_strings = sum(count[key] for key in count if isinstance(key, str))
        if total_strings < batch_size - 1:
            print(f"Number of errors: {batch_size - total_strings}")
            break


        results_2.extend(tmp)

        print(f"Batch {idx}/{total} processed! sleeping...")
        await asyncio.sleep(60)


        print("-" * 50)
# List name, split

Batch 0/26 processing...
Batch 0/26 processed! sleeping...
Batch 0/26 done!
Batch 1/26 processing...
Batch 1/26 processed! sleeping...
Batch 1/26 done!
Batch 2/26 processing...
Batch 2/26 processed! sleeping...
Batch 2/26 done!
Batch 3/26 processing...
Batch 3/26 processed! sleeping...
Batch 3/26 done!
Batch 4/26 processing...
Batch 4/26 processed! sleeping...
Batch 4/26 done!
Batch 5/26 processing...
Batch 5/26 processed! sleeping...
