# Update index 

Use this notebook to add new entries to the existing index. Keep in mind that the new entries must match the existing index format.

In [None]:
# flake8: noqa
from langchain_google_vertexai import VectorSearchVectorStore
from langchain_google_vertexai import VertexAIEmbeddings
from typing import List
from pathlib import Path
from src.preprocessing.doc_ai.processor import DocAIProcessor
import os
from src.settings import PROJECT_ROOT_PATH
from google.cloud import aiplatform
from tqdm import tqdm
from src.vectordb.gcp_vector_search.transform_and_load import document_chunking, get_file_names

# Document AI / Vertex AI settings; the cells below read them back from
# os.environ.
os.environ.update(
    {
        "DOC_AI_LOCATION": "us",
        "DOC_AI_PROCESSOR_ID": "e977fdd46ee23308",
        "PROJECT_ID": "602280418311",
        "LOCATION": "us-west1",
        "GCS_BUCKET": "chatbot_docs",
    }
)

# The bucket name is used as-is for staging and for the vector store.
BUCKET_URI = os.environ["GCS_BUCKET"]
DIMENSIONS = 768  # NOTE(review): not referenced in the visible cells
EMBEDDING_DIR = BUCKET_URI

# Fully qualified resource names of the existing index and its endpoint.
index_id = "projects/602280418311/locations/us-west1/indexes/5658473864628273152"
endpoint_id = "projects/602280418311/locations/us-west1/indexEndpoints/2644395833545457664"

# Names of the sample files to ingest, and the full path of each one
# (same order as file_names).
file_names: List[str] = list(
    get_file_names(PROJECT_ROOT_PATH / "data/nutting_ridge_dataset/sample")
)
document_paths: List[Path] = [
    PROJECT_ROOT_PATH / f"data/nutting_ridge_dataset/sample/{name}"
    for name in file_names
]


In [None]:
# Configure the Vertex AI SDK defaults (project, region, staging bucket)
# from the settings defined in the configuration cell.
aiplatform.init(
    project=os.environ["PROJECT_ID"],
    location=os.environ["LOCATION"],
    staging_bucket=BUCKET_URI,
)

In [None]:
# Document AI processor used by the ingestion loop to process each file.
processor = DocAIProcessor(
    project_id=os.environ["PROJECT_ID"],
    location=os.environ["DOC_AI_LOCATION"],
    processor_id=os.environ["DOC_AI_PROCESSOR_ID"],
)

In [None]:
texts = []
metadata = []
for document_path, file_name in tqdm(
        zip(document_paths, file_names), desc="Processing documents"
):
    doc_sequence = processor.process_documents(document_list=[document_path])
    docs = document_chunking(doc_sequence)
    texts.extend([doc.page_content for doc in docs])
    metadata.extend([dict(filename=file_name) for _ in texts])
    

In [None]:
embeddings = VertexAIEmbeddings(location=os.environ["LOCATION"])
vector_store = VectorSearchVectorStore.from_components(
    project_id=os.environ["PROJECT_ID"],
    region=os.environ["LOCATION"],
    gcs_bucket_name=BUCKET_URI,
    index_id=index_id,
    endpoint_id=endpoint_id,
    embedding=embeddings
)
vector_store.add_texts(texts=texts, metadata=metadata)