In [2]:

# from src.settings import PROJECT_ROOT_PATH
# from langchain_core.documents import Document
# from typing import List
# from tqdm import tqdm
# import pandas as pd
import os
# from langchain_text_splitters import CharacterTextSplitter, TextSplitter
# from vertexai.language_models import TextEmbeddingModel

# Google Cloud configuration for this notebook: the credentials file path plus
# the project and location that the index-creation cell reads back from
# the environment.
os.environ.update(
    {
        "GOOGLE_APPLICATION_CREDENTIALS": "prj-ilios-ai.json",
        "PROJECT_ID": "602280418311",
        "LOCATION": "us-west1",
    }
)

In [None]:
# Name of the source PDF; stored in the "file_name" column of the prepared DataFrame.
file_name = "Site Green - Emerald Garden - Cape Fear.pdf"

In [None]:
# Process the source PDF with DocAIProcessor.
# The project id and the document name are read from their single definitions
# (the PROJECT_ID environment variable and `file_name`) instead of being
# re-typed here, so the file recorded as metadata is always the one processed.
processor = DocAIProcessor(
    # Kept as a literal: this value ("us") differs from the LOCATION
    # environment variable ("us-west1").
    location="us",
    project_id=os.environ["PROJECT_ID"],
    processor_id="e977fdd46ee23308",
)
doc_sequence = processor.process_document(
    file_path=PROJECT_ROOT_PATH / f"data/documents/{file_name}"
)

In [None]:
def get_text_splitter(chunk_size: int = 600, separator: str = ".\n") -> TextSplitter:
    """Get the text splitter to be used for the VectorDB.

    Args:
        chunk_size: Value passed to ``CharacterTextSplitter`` as ``chunk_size``.
            Defaults to 600.
        separator: Separator passed to ``CharacterTextSplitter``. Defaults to
            a period followed by a newline.

    Returns:
        A ``CharacterTextSplitter`` configured with the given separator and
        chunk size, and with ``chunk_overlap`` set to half of ``chunk_size``.
    """
    return CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        # Overlap is derived from chunk_size so the two cannot drift apart.
        chunk_overlap=chunk_size // 2,
    )

def split_docs(
    file_sequence: FileSequence,
    add_tables_and_form_fields: bool = True,
    split_tables_and_form_fields: bool = False,
) -> List[Document]:
    """Split a processed file into a list of documents.

    Args:
        file_sequence: Object providing ``get_all_text()``, ``get_tables()``
            and ``get_form_fields()``.
        add_tables_and_form_fields: When True, every table and every set of
            form fields is converted to a string and appended after the text
            chunks.
        split_tables_and_form_fields: When True, those strings are passed
            through the same text splitter as the main text; otherwise each
            string becomes exactly one ``Document``. Has no effect when
            ``add_tables_and_form_fields`` is False.

    Returns:
        The text chunks first, followed by the table documents and then the
        form-field documents (when requested).
    """
    text_splitter = get_text_splitter()
    docs = text_splitter.create_documents([file_sequence.get_all_text()])

    if not add_tables_and_form_fields:
        # Nothing else to add: skip the table / form-field conversion entirely.
        return docs

    tables = [dataframe_to_string(table) for table in file_sequence.get_tables()]
    form_fields = [
        dict_to_string(fields) for fields in file_sequence.get_form_fields()
    ]

    if split_tables_and_form_fields:
        docs.extend(text_splitter.create_documents(tables))
        docs.extend(text_splitter.create_documents(form_fields))
    else:
        # Pass page_content by keyword: positional construction is not
        # accepted by every LangChain version.
        docs.extend(Document(page_content=table) for table in tables)
        docs.extend(Document(page_content=form_field) for form_field in form_fields)
    return docs


In [None]:
# Chunk the processed document using split_docs' default options.
docs = split_docs(file_sequence=doc_sequence)

In [None]:
# Display the text of the first document in `docs`.
docs[0].page_content

In [None]:
# Embedding models that have already been loaded, keyed by model name.
_EMBEDDING_MODEL_CACHE = {}


def _get_embedding_model(model_name: str):
    """Return the embedding model for ``model_name``, loading it at most once."""
    if model_name not in _EMBEDDING_MODEL_CACHE:
        _EMBEDDING_MODEL_CACHE[model_name] = TextEmbeddingModel.from_pretrained(
            model_name
        )
    return _EMBEDDING_MODEL_CACHE[model_name]


def text_embedding(text: str, model_name: str = "textembedding-gecko@001") -> list:
    """Text embedding with a Large Language Model.

    Args:
        text: The text to embed.
        model_name: Name passed to ``TextEmbeddingModel.from_pretrained``.
            Defaults to ``"textembedding-gecko@001"``.

    Returns:
        The ``values`` of the first (and only) embedding returned for ``text``.
    """
    # Reuse the loaded model: this function is called once per chunk, and
    # calling from_pretrained on every call repeats the same lookup.
    model = _get_embedding_model(model_name)
    embeddings = model.get_embeddings([text])
    return embeddings[0].values
    

In [None]:
# One row per chunk: its text, the source file name, and its embedding vector.
prepped_data = pd.DataFrame(
    {
        "text": [doc.page_content for doc in docs],
        "file_name": [file_name] * len(docs),
        "embedding": [text_embedding(doc.page_content) for doc in tqdm(docs)],
    }
)


In [None]:
# Preview the first rows of the prepared data.
prepped_data.head()

In [None]:
# Write the prepared data as JSON Lines (one record per line) under data/processed/.
prepped_data.to_json(
    PROJECT_ROOT_PATH / "data/processed/processed_data.json",
    orient="records",
    lines=True,
)

In [None]:
# Display the prepared DataFrame.
prepped_data

In [None]:
# Write the same frame as JSON Lines directly under data/ (a different path
# from the data/processed/ copy written in an earlier cell).
prepped_data.to_json(
    PROJECT_ROOT_PATH / "data/processed_data.json",
    orient="records",
    lines=True,
)

In [None]:
# Cloud Storage URI passed as `contents_delta_uri` when the index is created below.
# NOTE(review): this URI ends in a file name (chunks.jsonl); the Vertex AI docs
# describe `contents_delta_uri` as a directory path — confirm this is accepted.
blob = "gs://cloud-ai-platform-458b4ded-772b-441a-9faf-173c984099b6/chatbot/chunks.jsonl"

In [None]:
from google.cloud import aiplatform

# Initialise the Vertex AI SDK from the project / location stored in the
# environment by the first cell.
aiplatform.init(
    project=os.environ["PROJECT_ID"],
    location=os.environ["LOCATION"],
)

# Create a tree-AH index from the data at `blob`.
# NOTE(review): dimensions=768 presumably matches the length of the vectors
# returned by text_embedding — confirm.
_ = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name="chatbot-docs",
    contents_delta_uri=blob,
    dimensions=768,
    approximate_neighbors_count=10,
)

In [3]:
from src.vectordb.pg_vector.retriever import PGVectorConnector