# Here we explore the langchain retriever proposal

In [None]:
from langchain_google_vertexai import VectorSearchVectorStore
import time
from typing import List
from pathlib import Path
from src.preprocessing.doc_ai.processor import DocAIProcessor
import os
from src.settings import PROJECT_ROOT_PATH
from src.vectordb.gcp_vector_search.transform_and_load import single_text_embedding
from google.cloud import aiplatform

# Environment configuration consumed by the cells below: the Document AI
# processor coordinates, the GCP project/region, and the GCS bucket name.
os.environ.update(
    {
        "DOC_AI_LOCATION": "us",
        "DOC_AI_PROCESSOR_ID": "e977fdd46ee23308",
        "PROJECT_ID": "602280418311",
        "LOCATION": "us-west1",
        "GCS_BUCKET": "cloud-ai-platform-458b4ded-772b-441a-9faf-173c984099b6",
    }
)

# gs:// URI of the bucket configured above.
BUCKET_URI = "gs://" + os.environ["GCS_BUCKET"]
# Number of dimensions of the embedding vectors stored in the index.
# It has to match the output size of the embedding model, so it would
# probably need to change if another embedder is used.
# NOTE(review): confirm 768 matches the embedder actually used in this notebook.
DIMENSIONS = 768
# Bucket directory that holds the embedding files.
EMBEDDING_DIR = BUCKET_URI + "/test"


# Create an initial embedding record and upload it to the embedding directory

In [None]:
import json

initial_config = {
    "id": "test_id",
    "text": "test text",
    "filename": "test_filename.pdf",
    "embedding": single_text_embedding("test text"),
}

with open("data.json", "w") as f:
    json.dump(initial_config, f)

!gsutil cp data.json {EMBEDDING_DIR}/file.json

In [None]:
# Point the Vertex AI SDK at the project and region configured in the
# environment, using the bucket as its staging location.
aiplatform.init(
    staging_bucket=BUCKET_URI,
    location=os.environ["LOCATION"],
    project=os.environ["PROJECT_ID"],
)

In [None]:
# Timestamp of this run (year-month-day, hour-minute-second).
version = time.strftime("%Y%m%d-%H%M%S")

# Every resource name of this experiment shares the same suffix.
suffix = "_langchain_retriever"
chunks_filename = "chunks_working" + suffix + ".json"
index_display_name = "chatbot_docs_working" + suffix
endpoint_display_name = "chatbot_docs_endpoint_working" + suffix

# Source documents, resolved under <project root>/data/documents.
file_names: List[str] = ["Site Green - Emerald Garden - Cape Fear.pdf"]
document_paths: List[Path] = [
    PROJECT_ROOT_PATH.joinpath(f"data/documents/{name}") for name in file_names
]

# Document AI processor built from the environment configuration.
processor = DocAIProcessor(
    project_id=os.environ["PROJECT_ID"],
    location=os.environ["DOC_AI_LOCATION"],
    processor_id=os.environ["DOC_AI_PROCESSOR_ID"],
)


In [None]:
# Create a tree-AH Matching Engine index whose initial contents are read
# from the embedding directory in the bucket.
my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=index_display_name,
    dimensions=DIMENSIONS,
    distance_measure_type="DOT_PRODUCT_DISTANCE",
    approximate_neighbors_count=10,
    contents_delta_uri=EMBEDDING_DIR,
)

In [None]:
# Create an index endpoint with the public endpoint enabled ...
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    public_endpoint_enabled=True,
    display_name=endpoint_display_name,
)
# ... then deploy the index on it. The endpoint display name doubles as the
# deployed index id, and the name is rebound to the value deploy_index returns.
my_index_endpoint = my_index_endpoint.deploy_index(
    deployed_index_id=endpoint_display_name,
    index=my_index,
)

In [None]:
from tqdm import tqdm
from src.vectordb.gcp_vector_search.transform_and_load import document_chunking

# Parallel lists: texts[i] is the content of a chunk and metadata[i] records
# the file that chunk came from.
texts = []
metadata = []
for document_path, file_name in tqdm(
    zip(document_paths, file_names),
    # zip() has no length, so tell tqdm how many documents to expect.
    total=len(file_names),
    desc="Processing documents",
):
    # Run the document through the processor, then split it into chunks.
    doc_sequence = processor.process_documents(document_list=[document_path])
    docs = document_chunking(doc_sequence)

    chunk_texts = [doc.page_content for doc in docs]
    texts.extend(chunk_texts)
    # One metadata entry per chunk of THIS document. Iterating over the
    # accumulated `texts` here would add too many entries from the second
    # document onwards and misalign the two lists.
    metadata.extend({"file_name": file_name} for _ in chunk_texts)
    

In [None]:
from langchain_google_vertexai import VertexAIEmbeddings

# Embedding model handed to the vector store.
embeddings = VertexAIEmbeddings()

# Build the vector store from an existing index and index endpoint, both
# referenced by their full resource names.
# NOTE(review): these ids are hard-coded rather than taken from `my_index` /
# `my_index_endpoint` — confirm they point at the intended resources.
vector_store = VectorSearchVectorStore.from_components(
    embedding=embeddings,
    project_id=os.environ["PROJECT_ID"],
    region=os.environ["LOCATION"],
    gcs_bucket_name=os.environ["GCS_BUCKET"],
    index_id="projects/602280418311/locations/us-west1/indexes/7446402916694360064",
    endpoint_id="projects/602280418311/locations/us-west1/indexEndpoints/8201837773720649728",
)

In [None]:
# Add every chunk together with its per-chunk metadata. The keyword is
# `metadatas` (plural); a misspelled `metadata=` is swallowed by **kwargs
# and the metadata is silently dropped.
vector_store.add_texts(texts=texts, metadatas=metadata)

In [None]:
# Smoke-test the retriever: fetch the 2 chunks closest to an arbitrary query.
vector_store.similarity_search(query="rotfl", k=2)

In [None]:
# Remove the "test_id" record. `ids` expects a list of ids, so the single
# id is wrapped in a list instead of being passed as a bare string.
vector_store.delete(ids=["test_id"])