In [None]:
from langchain_core.messages import HumanMessage
from langchain_google_vertexai import ChatVertexAI
import os
import pdf2image
import pathlib

In [None]:
def extract_filename(file_path: str) -> str:
    """Return the last component of ``file_path`` without its final extension.

    The path is interpreted with POSIX rules (``/`` is the only separator).
    Only the last suffix is removed, so ``archive.tar.gz`` becomes
    ``archive.tar``.

    Args:
        file_path: Path (or bare file name) to take the base name from.

    Returns:
        The file name with its final suffix stripped.
    """
    # ``stem`` strips only the trailing suffix. The previous
    # ``name.replace(suffix, '')`` removed every occurrence of the suffix text,
    # mangling names such as ``report.pdf.pdf`` or ``scan.png_v2.png``.
    return pathlib.PurePosixPath(file_path).stem

def convert_pdf_to_images(input_pdf_path, output_folder, dpi=500):
    """
    Render every page of a PDF to a PNG file and return the saved file paths.

    Args:
        input_pdf_path (str): The path to the input PDF file.
        output_folder (str): The folder the PNG files are written to. It is
            created (including missing parents) if it does not exist.
        dpi (int, optional): The DPI used to render the pages. Default is 500.

    Returns:
        list: Paths of the saved PNG files, in page order. Each file is named
            ``<output_folder>/<pdf name without extension>_<index>.png`` where
            ``index`` is the zero-based page position.
    """
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the folder in between.
    os.makedirs(output_folder, exist_ok=True)

    # Convert PDF to images (one image object per page).
    pdf_images = pdf2image.convert_from_path(input_pdf_path, dpi=dpi)

    # The base name does not depend on the page, so compute it once.
    file_name = extract_filename(input_pdf_path)

    # Save images to output folder and remember where each one went.
    fname_list = []
    for i, image in enumerate(pdf_images):
        fname = f'{output_folder}/{file_name}_{i}.png'
        image.save(fname, "PNG")
        fname_list.append(fname)

    return fname_list

def image_with_llm(llm, image_path, prompt):
    """Send one image together with a text prompt to a chat model.

    Args:
    llm: Chat model to query; it must provide ``invoke``.
    image_path: Location of the image. It is forwarded unchanged as the
        ``image_url`` of the message, so the accepted forms (local path,
        remote URL, ...) depend on the model integration in use.
    prompt: Instruction text sent alongside the image.

    Returns:
    The ``content`` of the model's reply.
    """

    text_message = {
        "type": "text",
        "text": prompt,
    }
    image_message = {
        "type": "image_url",
        "image_url": {"url": image_path},
    }
    # A single human turn carrying both parts; the text part comes first.
    message = HumanMessage(content=[text_message, image_message])

    # Calling the model object directly (``llm([...])``) is deprecated in
    # langchain-core; ``invoke`` is the supported entry point.
    output = llm.invoke([message])
    return output.content

def save_html(html_string, filename):
    """
    Save an HTML string to a file, encoded as UTF-8.

    An existing file is overwritten. Missing parent directories of
    ``filename`` are created first.

    Args:
    html_string: The HTML string to save.
    filename: The name of the file to save the HTML string to.
    """

    # open() does not create directories, so make sure the target folder
    # exists. A bare file name has no directory part and needs nothing.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding: the platform default may not be able to represent
    # every character found in extracted table text.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(html_string)

In [None]:
# Chat model instance used for every image request in the cells below.
llm = ChatVertexAI(model_name="gemini-pro-vision", temperature=0)

In [None]:
# Input PDF and the two output folders, all located under base_path.
base_path = "."
file_name = "example.pdf"
input_pdf_path = f"{base_path}/{file_name}"
output_pic_folder = f"{base_path}/pic_file"
output_html_folder = f"{base_path}/html"

# Render each PDF page to a PNG; `pics` holds the saved image paths.
pics = convert_pdf_to_images(input_pdf_path, output_pic_folder)


In [None]:
check_table_exist_prompt="Are there any table within this picture? return Yes or No"

# Keep only the page images for which the model answers affirmatively.
# The reply is free-form text, so accept any answer that starts with "yes"
# regardless of case or trailing text ("Yes.", "yes", "Yes, there is ...")
# instead of requiring the exact string "Yes".
result = [
    pic
    for pic in pics
    if image_with_llm(llm, pic, check_table_exist_prompt).strip().lower().startswith("yes")
]

In [None]:
extract_table_to_html_prompt="Extract data as html format, separate the column carefully."

# Nothing earlier in the script creates the HTML output folder, so make sure
# it exists before the first file is written.
os.makedirs(output_html_folder, exist_ok=True)

# One HTML file per page image that was flagged as containing a table; the
# file is named after the image. A real loop variable is used instead of `_`,
# which is conventionally a throwaway name and is also IPython's
# "last result" variable.
for pic_path in result:
    html_str = image_with_llm(llm, pic_path, extract_table_to_html_prompt)
    save_html(
        html_str,
        f"{output_html_folder}/{extract_filename(pic_path)}.html")