In [1]:
import io
import sys

from google.cloud import storage

sys.path.append("..")
from function.retriever import initialize_retriever

retriever = initialize_retriever()

metadatas = [entry.metadata for entry in retriever.vectorstore.docstore._dict.values()]
all_stored_venues = set(_["company"] for _ in metadatas)
all_stored_venues

Created new FAISS index in faiss_db


{'dummy'}

In [2]:
from pathlib import Path

from function.cloud import list_files

all_venues = set([Path(path).stem for path in list_files(filter=r"venues/")])

unprocessed_venues = all_venues - all_stored_venues
venue = list(unprocessed_venues)[1]
venue


'The Los Angeles Arboretum'

In [3]:
from dotenv import load_dotenv

from function.retriever import preprocess_document

load_dotenv(override=True)

info = preprocess_document("Majestic Garden Hotel")


searching for Majestic Garden Hotel.pdf on google cloud...
downloading Majestic Garden Hotel.pdf from google cloud...
sending Majestic Garden Hotel.pdf to Adobe...
extracting text from pdf...
2024-12-22 00:28:19 unzip file
2024-12-22 00:28:19 open json file
2024-12-22 00:28:19 extract text
generating image descriptions for Majestic Garden Hotel.pdf
Generating image descriptions...
   (1/2) /var/folders/ny/x98qxh1124dbtz5g3f6s09hh0000gn/T/tmp216si82o/figures/fileoutpart1.png
   (2/2) /var/folders/ny/x98qxh1124dbtz5g3f6s09hh0000gn/T/tmp216si82o/figures/fileoutpart0.png
skipping /var/folders/ny/x98qxh1124dbtz5g3f6s09hh0000gn/T/tmp216si82o/figures/fileoutpart0.png
uploading adobe_extracted_directory to google cloud


In [5]:
import os

from function.retriever import add_documents_to_retriever

PERSIST_DIRECTORY: str = os.getenv("DATABASE_DIR")

add_documents_to_retriever({venue: info}, retriever)
retriever.vectorstore.save_local(PERSIST_DIRECTORY)


Processed document: Vellano Estate by Wedgewodo Weddings


In [3]:
import sys
from typing import Any, Dict

import pandas as pd

sys.path.append("..")
from function.retriever import *


In [4]:
def load_venue_metadata() -> Dict[str, Dict[str, Any]]:
    """
    Load venue metadata from Excel file into a dictionary.

    Parameters
    ----------
    excel_path : str
        Path to the Excel file containing venue metadata

    Returns
    -------
    Dict[str, Dict[str, Any]]
        Dictionary with venue names as keys and their metadata as values
    """
    bucket_name = "wedding-venues-001"
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    excel_blob = bucket.blob("Wedding Venues.xlsx")
    excel_content = excel_blob.download_as_bytes()
    df = pd.read_excel(io.BytesIO(excel_content))
    # Assuming 'venue_name' is the column that matches the folder names
    metadata_dict = {}
    for _, row in df.iterrows():
        metadata_dict[row["Venue name"]] = {
            "phone": row.get("Phone", ""),
            "address": row.get("Location", ""),
            "website": row.get("Website ", ""),
        }
    return metadata_dict

In [5]:
def preprocess_document(
    venue: str, venue_metadata: Dict[str, Dict[str, Any]]
) -> dict[str, Any]:
    """
    Modified version of preprocess_document that includes venue metadata.
    """
    with (
        NamedTemporaryFile(suffix=".pdf") as temp_pdf_file,
        NamedTemporaryFile(suffix=".zip") as temp_zip_file,
        TemporaryDirectory() as temp_output_dir,
    ):
        print(f"searching for {venue}.pdf on google cloud...")
        cloud_venue_path = list_files(filter=rf"venues/{venue}/.*.pdf")[0]
        print(f"downloading {venue}.pdf from google cloud...")
        download_file(cloud_venue_path, temp_pdf_file.name)
        print(f"sending {venue}.pdf to Adobe...")
        adobeLoader(temp_pdf_file.name, temp_zip_file.name)
        print("extracting text from pdf...")
        text_content = extract_text_from_file_adobe(temp_zip_file.name, temp_output_dir)

        extracted_figure_folder = Path(temp_output_dir) / "figures"
        if not extracted_figure_folder.exists():
            print(f"no images found for {venue}.pdf")
            image_descriptions = []
        else:
            print(f"generating image descriptions for {venue}.pdf")
            image_descriptions = generate_image_descriptions(
                base_dir=extracted_figure_folder,
                venue=venue,
            )
        print("uploading adobe_extracted_directory to google cloud")
        upload_directory(temp_output_dir, f"/processed/adobe_extracted/{venue}/")

    doc_id = str(uuid.uuid4())
    # Include venue metadata in document_info
    venue_info = venue_metadata.get(venue, {})
    document_info = {
        "doc_id": doc_id,
        "text_content": text_content,
        "image_descriptions": image_descriptions,
        "metadata": venue_info,
    }

    return document_info

In [6]:
def add_documents_to_retriever(
    documents: dict[str, dict[str, Any]],
    retriever: MultiVectorRetriever,
    venue_metadata: Dict[str, Dict[str, Any]],
) -> None:
    """
    Modified version of add_documents_to_retriever that includes venue metadata.
    """
    id_key = "content_id"

    for pdf_name, doc_info in documents.items():
        # Get venue metadata
        venue_info = venue_metadata.get(pdf_name, {})
        text_docs = [
            Document(
                page_content=doc_info["text_content"],
                metadata={
                    id_key: f"{doc_info['doc_id']}_text",
                    "doc_id": doc_info["doc_id"],
                    "company": pdf_name,
                    "type": "text",
                    "website": venue_info.get("website", ""),
                    "address": venue_info.get("address", ""),
                    "phone": venue_info.get("phone", ""),
                },
            )
        ]

        # Create image documents with metadata
        image_ids = [
            f"{doc_info['doc_id']}_image_{i}"
            for i in range(len(doc_info["image_descriptions"]))
        ]
        image_docs = [
            Document(
                page_content=item["description"],
                metadata={
                    id_key: image_ids[i],
                    "doc_id": doc_info["doc_id"],
                    "company": pdf_name,
                    "type": "image",
                    "image_path": item["image_path"],
                    "website": venue_info.get("website", ""),
                    "address": venue_info.get("address", ""),
                    "phone": venue_info.get("phone", ""),
                    # Add any other metadata fields
                },
            )
            for i, item in enumerate(doc_info["image_descriptions"])
        ]

        all_docs = text_docs + image_docs
        retriever.vectorstore.add_documents(all_docs)

        original_data = [(doc.metadata[id_key], doc) for doc in all_docs]
        retriever.docstore.mset(original_data)

        print(f"Processed document: {pdf_name}")

In [7]:
venue_metadata = load_venue_metadata()

In [8]:
from dotenv import load_dotenv

load_dotenv(override=True)
info = preprocess_document(venue, venue_metadata)

searching for The Los Angeles Arboretum.pdf on google cloud...
downloading The Los Angeles Arboretum.pdf from google cloud...
sending The Los Angeles Arboretum.pdf to Adobe...
extracting text from pdf...
2024-12-22 01:47:17 unzip file
2024-12-22 01:47:17 open json file
2024-12-22 01:47:17 extract text
generating image descriptions for The Los Angeles Arboretum.pdf
Generating image descriptions...
   (1/45) /var/folders/ny/x98qxh1124dbtz5g3f6s09hh0000gn/T/tmpi_p_wchr/figures/fileoutpart41.png
skipping /var/folders/ny/x98qxh1124dbtz5g3f6s09hh0000gn/T/tmpi_p_wchr/figures/fileoutpart41.png
   (2/45) /var/folders/ny/x98qxh1124dbtz5g3f6s09hh0000gn/T/tmpi_p_wchr/figures/fileoutpart40.png
skipping /var/folders/ny/x98qxh1124dbtz5g3f6s09hh0000gn/T/tmpi_p_wchr/figures/fileoutpart40.png
   (3/45) /var/folders/ny/x98qxh1124dbtz5g3f6s09hh0000gn/T/tmpi_p_wchr/figures/fileoutpart42.png
   (4/45) /var/folders/ny/x98qxh1124dbtz5g3f6s09hh0000gn/T/tmpi_p_wchr/figures/fileoutpart43.png
skipping /var/folder

In [9]:
add_documents_to_retriever({venue: info}, retriever, venue_metadata)
retriever.vectorstore.save_local(PERSIST_DIRECTORY)

Processed document: The Los Angeles Arboretum


In [10]:
retriever.vectorstore.docstore._dict

{'73eb21c3-ac1e-4d1a-9815-a15aa5f2b7db': Document(metadata={'company': 'dummy', 'type': 'text'}, page_content='dummy document'),
 '79c19657-f554-45b2-a646-028e3595c6ce': Document(metadata={'content_id': '99235b9f-76c4-4369-8655-1a304c778067_text', 'doc_id': '99235b9f-76c4-4369-8655-1a304c778067', 'company': 'The Los Angeles Arboretum', 'type': 'text', 'website': 'https://arboretum.org/rentals/weddings/', 'address': nan, 'phone': nan}, page_content='THE LOS ANGELES COUNTY ARBORETEUM \nLOCATIONS \nThe Los Angeles County Arboretum & Botanic Garden provides naturally beautiful settings for wedding ceremonies and receptions from Mid March through November.With over 120 acres of property, there are multiple options for indoor and outdoor ceremony & reception combinations to make your wedding experience special. \nCeremony & reception combinations Starting Prices \nCeremonies available from \n5:00 p.m. – 7:00 p.m. (Set up 3 p.m. -5 p.m.) \nReceptions Available from 5:30 p.m. - 10:00PM \n(Take