SyntaxError: invalid syntax (1837277231.py, line 1)

In [None]:
from function.retriever import initialize_retriever
import shutil
import os

# Build the retriever; the loop below walks every document held in the
# docstore attached to its vector store.
retriever = initialize_retriever()

# NOTE(review): `_dict` is a private attribute of the docstore; confirm there is
# no public accessor for "all documents" before relying on it long-term.
docs = retriever.vectorstore.docstore._dict.values()
for doc in docs:
    path = doc.metadata.get("image_path")
    if path is None:
        continue  # no image_path recorded in this document's metadata
    new_path = path.replace("processed/good_figures", "test_extracted_images")
    if new_path == path:
        # The source is not under processed/good_figures, so the target would
        # be the source itself and shutil.copy would raise SameFileError.
        continue
    target_dir = os.path.dirname(new_path)
    if target_dir:  # os.makedirs("") raises FileNotFoundError
        os.makedirs(target_dir, exist_ok=True)
    shutil.copy(path, new_path)


In [4]:
from function.retriever import initialize_retriever
import shutil
import os

# Build the retriever; the loop below walks every document held in the
# docstore attached to its vector store.
retriever = initialize_retriever()

# NOTE(review): `_dict` is a private attribute of the docstore; confirm there is
# no public accessor for "all documents" before relying on it long-term.
docs = retriever.vectorstore.docstore._dict.values()
for doc in docs:
    path = doc.metadata.get("image_path")
    if path is None:
        continue  # no image_path recorded in this document's metadata
    new_path = path.replace("processed/good_figures", "test_extracted_images")
    if new_path == path:
        # The source is not under processed/good_figures, so the target would
        # be the source itself and shutil.copy would raise SameFileError.
        continue
    target_dir = os.path.dirname(new_path)
    if target_dir:  # os.makedirs("") raises FileNotFoundError
        os.makedirs(target_dir, exist_ok=True)
    shutil.copy(path, new_path)


Loaded existing FAISS index from faiss_db


In [3]:
import os
from pathlib import Path

import chromadb
from chromadb.config import Settings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# sys.path.append("..")
from function.retriever import add_documents_to_retriever, preprocess_documents
from function.secrets import secrets

# Directory handed to both chromadb.PersistentClient (as `path`) and Chroma
# (as `persist_directory`) in initialize_database().
PERSIST_DIRECTORY = "chroma_db"
# Folder that is globbed for "*.pdf" files and against which PDF names are
# resolved when documents are added.
PDF_PATH = Path("data/test_pdf/")


def initialize_database() -> Chroma:
    """Open the ``multimodal_docs`` Chroma collection stored on disk.

    The embedding model name is read from the ``embedding_model`` environment
    variable and the OpenAI key from ``secrets.OPENAI_API_KEY``. The Chroma
    client is a ``PersistentClient`` rooted at ``PERSIST_DIRECTORY`` with
    anonymized telemetry disabled and reset allowed.

    Returns:
        The ``Chroma`` vector store bound to that client and collection.
    """
    embeddings = OpenAIEmbeddings(
        model=os.environ.get("embedding_model"), api_key=secrets.OPENAI_API_KEY
    )

    client_settings = Settings(anonymized_telemetry=False, allow_reset=True)
    persistent_client = chromadb.PersistentClient(
        path=PERSIST_DIRECTORY, settings=client_settings
    )

    # The vector store reuses the persistent client created above.
    return Chroma(
        client=persistent_client,
        collection_name="multimodal_docs",
        embedding_function=embeddings,
        persist_directory=PERSIST_DIRECTORY,
    )


def initialize_retriever(vectorstore: Chroma) -> MultiVectorRetriever:
    """Wrap ``vectorstore`` in a ``MultiVectorRetriever``.

    A new ``InMemoryStore`` is created as the docstore on every call,
    ``"content_id"`` is used as the id key, and no extra search kwargs are set.

    NOTE(review): earlier cells import a zero-argument ``initialize_retriever``
    from ``function.retriever``; this definition replaces that name in the
    kernel once the cell has run.
    """
    return MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=InMemoryStore(),
        id_key="content_id",
        search_kwargs={},
    )


def add_pdfs_to_retriever(pdfs: list[str], retriever: MultiVectorRetriever):
    """Preprocess the named PDFs and add the results to ``retriever``.

    Args:
        pdfs: PDF names without the ``.pdf`` extension; each one is resolved
            relative to ``PDF_PATH``.
        retriever: Retriever handed on to ``add_documents_to_retriever``.
    """
    resolved_paths = []
    for name in pdfs:
        resolved_paths.append(PDF_PATH.joinpath(f"{name}.pdf"))
    add_documents_to_retriever(preprocess_documents(resolved_paths), retriever)


def remove_pdfs_from_retriever(
    deleted_pdfs: list[str], retriever: MultiVectorRetriever
):
    """Delete every vector-store entry whose ``company`` metadata is in ``deleted_pdfs``.

    This is destructive: matching entries are removed from
    ``retriever.vectorstore``.

    Args:
        deleted_pdfs: Company / PDF names whose entries should be removed.
            Names with no stored entries are skipped instead of raising
            ``KeyError``.
        retriever: Retriever whose ``vectorstore`` is queried and modified.
    """
    for pdf in deleted_pdfs:
        # Look up only the ids for this name; there is no need to dump the
        # whole collection or to query companies that are being kept.
        ids = retriever.vectorstore.get(where={"company": pdf})["ids"]
        if not ids:
            continue  # nothing stored under this name; skip the delete call
        retriever.vectorstore.delete(ids)


def update_retriever(retriever: MultiVectorRetriever):
    """Synchronise the vector store with the PDFs currently in ``PDF_PATH``.

    Compares the ``company`` values stored in the vector-store metadata with
    the names of the ``*.pdf`` files in ``PDF_PATH``. PDFs that are on disk but
    not stored are added; stored companies with no PDF on disk are removed.
    A summary of all four sets is printed afterwards.

    Args:
        retriever: Retriever whose ``vectorstore`` is inspected and updated.
    """
    stored_metadatas = retriever.vectorstore.get()["metadatas"]
    all_stored_companies = {meta["company"] for meta in stored_metadatas}

    # `stem` drops only the final ".pdf" suffix. The previous
    # `name.replace(".pdf", "")` also removed ".pdf" occurring inside the file
    # name, yielding a name for which add_pdfs_to_retriever finds no file.
    # NOTE(review): assumes the stored "company" value is the file name without
    # its extension; confirm against preprocess_documents.
    all_companies = {path.stem for path in PDF_PATH.glob("*.pdf")}

    new_pdfs = all_companies - all_stored_companies
    deleted_pdfs = all_stored_companies - all_companies

    add_pdfs_to_retriever(new_pdfs, retriever)
    remove_pdfs_from_retriever(deleted_pdfs, retriever)

    print(f"all pdfs in {PDF_PATH}: {all_companies}")
    print(f"all pdfs in database: {all_stored_companies}")
    print(f"new pdfs: {new_pdfs}")
    print(f"deleted pdfs: {deleted_pdfs}")


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain.chains.llm import LLMChain
  warn(
* 'allow_population_by_field_name' has been renamed to 'populate_by_name'


In [4]:
# Open the on-disk vector store, wrap it in a retriever, then bring the store
# in line with the PDFs found in PDF_PATH (adds new ones, removes missing ones).
vectorstore = initialize_database()
retriever = initialize_retriever(vectorstore)
update_retriever(retriever)

all pdfs in data/test_pdf: {'2024 Sherwood Country Club Wedding Packages', '_2024 Weddings at Lake Arrowhead~-merged', '2024 Queen Mary Ceremony Locations-merged'}
all pdfs in database: {'2024 Sherwood Country Club Wedding Packages', '_2024 Weddings at Lake Arrowhead~-merged', '2024 Queen Mary Ceremony Locations-merged'}
new pdfs: set()
deleted pdfs: set()


In [4]:
# Re-create the vector-store handle and retriever without running a sync.
vectorstore = initialize_database()
retriever = initialize_retriever(vectorstore)

In [7]:
# Show which top-level fields the vector store returns from an unfiltered get().
retriever.vectorstore.get().keys()

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'data', 'metadatas', 'included'])

In [11]:
# Collect the distinct "company" values from the vector-store metadata and
# print one per line. (A bare expression in the middle of a cell displays
# nothing, so the stray `all_stored_companies` line was dropped.)
all_stored_companies = {
    meta["company"] for meta in retriever.vectorstore.get()["metadatas"]
}
for company in all_stored_companies:
    print(f"company: {company}")

company: _2024 Weddings at Lake Arrowhead~-merged
company: 2024 Queen Mary Ceremony Locations-merged
company: 2024 Sherwood Country Club Wedding Packages


In [23]:
# Tally text entries, image entries and distinct companies in one pass over
# the vector-store metadata, then print a short report.
type_tally = {"text": 0, "image": 0}
known_companies = []
for entry in retriever.vectorstore.get()["metadatas"]:
    kind = entry["type"]
    if kind in type_tally:
        type_tally[kind] += 1
    company = entry["company"]
    if company not in known_companies:
        known_companies.append(company)

texts = type_tally["text"]
images = type_tally["image"]
companies = len(known_companies)

print("Summary of database:")
print("--------------------")
print(f"Number of companies: {companies}")
print(f"Number of texts: {texts}")
print(f"Number of image descriptions: {images}")


Summary of database:
--------------------
Number of companies: 3
Number of texts: 3
Number of image descriptions: 66


In [1]:
# System prompt text for the wedding-venue assistant. The literal keeps its
# leading newline and per-line indentation exactly as written (see the repr
# displayed by the last line of the cell).
system_prompt = """
    You are a helpful assistant for a wedding venue search system. 
    Use the provided context to answer questions about wedding venues 
    and related information. 
    The provided context contains the text from documents and the 
    description of images from the document. Consider both to answer 
    the question.
    If the information isn't in the context, say so. 
    Be concise but informative."""
system_prompt

"\n    You are a helpful assistant for a wedding venue search system. \n    Use the provided context to answer questions about wedding venues \n    and related information. \n    The provided context contains the text from documents and the \n    description of images from the document. Consider both to answer \n    the question.\n    If the information isn't in the context, say so. \n    Be concise but informative."

In [3]:
# Re-open the store and display the distinct company names it currently holds.
vectorstore = initialize_database()
retriever = initialize_retriever(vectorstore)
stored_metadatas = retriever.vectorstore.get()["metadatas"]
all_stored_companies = {meta["company"] for meta in stored_metadatas}
all_stored_companies

{'2024 Queen Mary Ceremony Locations-merged',
 '2024 Sherwood Country Club Wedding Packages',
 '_2024 Weddings at Lake Arrowhead~-merged'}

In [10]:
# Display the distinct company names currently present in the vector store.
stored_metadatas = retriever.vectorstore.get()["metadatas"]
all_stored_companies = {meta["company"] for meta in stored_metadatas}
all_stored_companies

{'2024 WEDDING PACKAGES', '_2024 Weddings at Lake Arrowhead~-merged'}