In [12]:
import uuid
import json

from google.cloud.aiplatform import MatchingEngineIndex, MatchingEngineIndexEndpoint
from google.cloud.storage import Client
from google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint import (
    Namespace,
)

from langchain_google_vertexai.embeddings import VertexAIEmbeddings

In [13]:
# Tiny sample corpus: three short, unrelated sentences to embed.
document_texts = [
    "Lions are my favourite animals",
    "There are two apples on the table",
    "Today is raining a lot in Madrid",
]

In [41]:
# GCS client built from the ambient credentials; the upload cell reuses it.
storage_client = Client()

# Name the embedding model explicitly. Relying on the implicit default is
# deprecated: the library warns that `model_name` is becoming a required
# argument. "textembedding-gecko@001" is the default the warning reports, so
# the vectors produced are unchanged.
embedding_engine = VertexAIEmbeddings(model_name="textembedding-gecko@001")

embeddings = embedding_engine.embed(document_texts)

# One record per vector, each tagged with a freshly generated random id.
embeddings_dict = [
    {"id": str(uuid.uuid4()), "embedding": embedding}
    for embedding in embeddings
]

# JSON Lines payload: one JSON object per line.
embeddings_json = "\n".join(json.dumps(record) for record in embeddings_dict)

Model_name will become a required arg for VertexAIEmbeddings starting from Feb-01-2024. Currently the default is set to textembedding-gecko@001


In [42]:
# Upload the JSON Lines payload as `<random uuid>/embeddings.json`, so every
# run writes into its own folder of the bucket.
bucket = storage_client.get_bucket("vector_search_index_1")
blob = bucket.blob("/".join((str(uuid.uuid4()), "embeddings.json")))
blob.upload_from_string(embeddings_json)

In [43]:
# Create the index from the folder that contains the uploaded file: the URI is
# the blob's bucket plus the blob name with its last path segment removed.
# The vector size is taken from the first embedding computed above.
MatchingEngineIndex.create_tree_ah_index(
    display_name="my-index",
    contents_delta_uri="gs://" + blob.bucket.name + "/" + "/".join(blob.name.split("/")[:-1]),
    dimensions=len(embeddings[0]),
    approximate_neighbors_count=150,
)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/245830287766/locations/us-central1/indexes/1217981806645608448/operations/178866283863867392
MatchingEngineIndex created. Resource name: projects/245830287766/locations/us-central1/indexes/1217981806645608448
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/245830287766/locations/us-central1/indexes/1217981806645608448')


<google.cloud.aiplatform.matching_engine.matching_engine_index.MatchingEngineIndex object at 0x7fe6e055f2d0> 
resource name: projects/245830287766/locations/us-central1/indexes/1217981806645608448

In [4]:
# Google Cloud project id; the BigQuery client in the next cell is created for this project.
PROJECT_ID = "jzaldivar-test-project"

In [3]:
# `bigquery` is used below but was not imported anywhere in the notebook, so
# this cell failed with a NameError on a fresh kernel.
from google.cloud import bigquery

# Number of question rows to fetch.
QUESTIONS_SIZE = 1000

bq_client = bigquery.Client(project=PROJECT_ID)
QUERY_TEMPLATE = """
        SELECT distinct q.id, q.title
        FROM (SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions`
        where Score > 0 ORDER BY View_Count desc) AS q
        LIMIT {limit} ;
        """
query = QUERY_TEMPLATE.format(limit=QUESTIONS_SIZE)
query_job = bq_client.query(query)
rows = query_job.result()
df = rows.to_dataframe()

# examine the data
df.head()

Unnamed: 0,id,title
0,73250763,Error CS0246: The type or namespace name 'Stre...
1,73206525,Keycloak 19.0 behind nginx (https) admin conso...
2,73475664,Citing Institutional Author or Organization in...
3,73399777,Azure build failing due to Method not found: '...
4,73426773,UnboundLocalError: local variable 'raw_labels'...


In [39]:
import uuid
from abc import ABC, abstractmethod
from typing import Any, Iterable, List, Sequence, Dict, Tuple

from langchain_core.documents import Document
from langchain_core.vectorstores import VectorStore
from langchain_core.embeddings import Embeddings

from google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint import (
    Namespace
)


class VectorSearchIndexUpdater(ABC):
    """ Abstract interface for updating an Vector Search index.
    """
    
    @abstractmethod
    def update_index(
        self, 
        ids: Sequence[str], 
        embeddings: Sequence[List[float]],
        metadatas: List[Dict[str, Any]] | None = None
    ) -> None:
        """ Updates the index.

        Args:
            ids: Sequence of ids for every entity.
            embeddings: Sequence of vector representations.
            metadatas:
        """
        raise NotImplementedError()


class VectorSearchDocumentStorage(ABC):
    """ Abstract interface of a key, text storage for retrieving documents.
    """
    
    @abstractmethod
    def get_by_id(self, document_id: str) -> str | None:
        """ Gets the text of a document by its id. If not found, returns None.

        Args:
            document_id: Id of the document to get from the storage.

        Returns:
            Text of the document if found, otherwise None.
        """
        raise NotImplementedError()

    @abstractmethod
    def store_by_id(self, document_id: str, text: str):
        """ Stores a document text associated to a document_id.

        Args:
            document_id: Id of the document to be stored.
            text: Text of the document to be stored.
        """
        raise NotImplementedError()
    
    def batch_store_by_id(self, ids: List[str], texts: List[str]) -> None:
        """ Stores a list of ids and documents in batch.
        
        The default implementation only loops to the individual `store_by_id`. 
        Subclasses that have faster ways to store data via batch uploading should
        implement the proper way.

        Args:
            ids: List of ids for the text.
            texts: List of texts.
        """
        for id_, text in zip(ids, texts):
            self.store_by_id(id_, text)

    def batch_get_by_id(self, ids: List[str]) -> List[str | None]:
        """ Gets a batch of documents by id.

        The default implementation only loops `get_by_id`.
        Subclasses that have faster ways to retrieve data by batch should implement
        this method.

        Args:
            ids: List of ids for the text.

        Returns:
            List of texts. If the key id is not found for any id record returns a None
                instead.
        """
        return [self.get_by_id(id_) for id_ in ids]


class VectorSearchSearcher(ABC):
    """ Contract for objects that look up the nearest neighbours of a vector.
    """

    @abstractmethod
    def find_neighbors(
        self,
        embedding: List[float],
        k: int = 4,
        filter_: List[Namespace] | None = None,
    ) -> List[Tuple[str, float]]:
        """ Finds the neighbours of `embedding`.

        Args:
            embedding: Query vector.
            k: Number of neighbours requested. Defaults to 4.
            filter_: Optional list of `Namespace` restrictions to apply to the
                search.

        Returns:
            List of (str, float) pairs, one per neighbour.
            NOTE(review): presumably the neighbour id and its distance/score;
            confirm against a concrete implementation.

        Raises:
            NotImplementedError: Always, in this abstract base.
        """
        raise NotImplementedError


In [58]:
from google.cloud import aiplatform
from google.cloud import storage
from google.oauth2.service_account import Credentials
from google.cloud.aiplatform.matching_engine import (
    MatchingEngineIndex, 
    MatchingEngineIndexEndpoint
)

class VectorSearchSDKManager:
    """ Builds Google Cloud SDK objects (storage client, bucket, index and
    index endpoint) that all share one project id, region and credentials.
    """

    def __init__(
        self,
        *,
        project_id: str,
        region: str,
        credentials: Credentials | None = None,
        credentials_path: str | None = None,
    ) -> None:
        """ Constructor.

        If `credentials` is provided, those credentials are used. If not
        provided, `credentials_path` is used to load credentials from a
        service account file. If that is also not provided, falls back to
        default credentials.

        Args:
            project_id: Id of the project.
            region: Region of the project. E.g. 'us-central1'
            credentials: Google Cloud Credentials object.
            credentials_path: Google Cloud Credentials json file path.
        """
        self._project_id = project_id
        self._region = region

        if credentials is not None:
            self._credentials = credentials
        elif credentials_path is not None:
            self._credentials = Credentials.from_service_account_file(credentials_path)
        else:
            self._credentials = None

        # Configure the aiplatform SDK at construction time. The method is
        # called `initialize`; there is no `initialize_aiplatform` on this
        # class.
        self.initialize()

    def initialize(self) -> None:
        """ Initializes aiplatform with this manager's project, region and
        credentials.
        """
        aiplatform.init(
            project=self._project_id,
            location=self._region,
            credentials=self._credentials,
        )

    def get_gcs_client(self) -> storage.Client:
        """ Retrieves a Google Cloud Storage client.

        Returns:
            Google Cloud Storage client bound to this manager's project and
            credentials.
        """
        return storage.Client(project=self._project_id, credentials=self._credentials)

    def get_gcs_bucket(self, bucket_name: str) -> storage.Bucket:
        """ Retrieves a Google Cloud Bucket by bucket name.

        Args:
            bucket_name: Name of the bucket to be retrieved.

        Returns:
            Google Cloud Bucket.
        """
        client = self.get_gcs_client()
        return client.get_bucket(bucket_name)

    def get_index(self, index_id: str) -> MatchingEngineIndex:
        """ Retrieves a MatchingEngineIndex (VectorSearchIndex) by id.

        Args:
            index_id: Id of the index to be retrieved.

        Returns:
            MatchingEngineIndex instance.
        """
        return MatchingEngineIndex(
            index_name=index_id,
            project=self._project_id,
            location=self._region,
            credentials=self._credentials,
        )

    def get_endpoint(self, endpoint_id: str) -> MatchingEngineIndexEndpoint:
        """ Retrieves a MatchingEngineIndexEndpoint (VectorSearchIndexEndpoint) by id.

        Args:
            endpoint_id: Id of the endpoint to be retrieved.

        Returns:
            MatchingEngineIndexEndpoint instance.
        """
        return MatchingEngineIndexEndpoint(
            index_endpoint_name=endpoint_id,
            project=self._project_id,
            location=self._region,
            credentials=self._credentials,
        )


In [54]:
# The SDK helper class defined in this notebook is `VectorSearchSDKManager`;
# no `VectorSearchFactory` exists.
factory = VectorSearchSDKManager(project_id="jzaldivar-test-project", region="us-central1")

factory.get_gcs_bucket("vector_search_index_1")

<Bucket: vector_search_index_1>

In [None]:
class VectorAIVectorSearch(VectorStore):
    """ LangChain vector store assembled from three collaborators: a searcher
    that finds neighbours, a document storage that maps ids to texts, and an
    optional index updater that writes new vectors.

    NOTE(review): the class name looks like a typo of "VertexAI..."; it is
    kept unchanged so existing references keep working.
    """

    def __init__(
        self,
        searcher: VectorSearchSearcher,
        embeddings: Embeddings,
        document_storage: VectorSearchDocumentStorage,
        index_updater: VectorSearchIndexUpdater | None = None,
    ) -> None:
        """ Constructor.

        Args:
            searcher: Object used to find the neighbours of a vector.
            embeddings: Model used to embed documents and queries.
            document_storage: Storage where document texts are kept by id.
            index_updater: Object used to write new vectors to the index.
                Only needed by `add_texts`.
        """
        self._document_storage = document_storage
        self._searcher = searcher
        self._embeddings = embeddings
        self._index_updater = index_updater

    @property
    def embeddings(self) -> Embeddings:
        """ Embedding model used by this store.
        """
        return self._embeddings

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: List[dict] | None = None,
        **kwargs: Any,
    ) -> "VectorAIVectorSearch":
        """ Not supported: the store needs a searcher and a document storage,
        which cannot be derived from texts alone. Defined because the base
        class requires it.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "`from_texts` is not supported; build the store with its constructor "
            "and call `add_texts`."
        )

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: List[dict] | None = None,
        **kwargs: Any
    ) -> List[str]:
        """ Embeds the texts, writes the vectors to the index and stores the
        texts under newly generated ids.

        Args:
            texts: Texts to add.
            metadatas: Optional metadata dictionaries, forwarded to the index
                updater.
            kwargs: Unused.

        Returns:
            Ids generated for the texts, in the same order as `texts`.

        Raises:
            ValueError: If no `index_updater` was given to the constructor.
        """
        if self._index_updater is None:
            raise ValueError(
                "An `index_updater` is required in order to add texts to the index."
            )

        # `texts` may be a one-shot iterable and is needed twice (embedding
        # and storage), so materialise it first.
        texts = list(texts)
        if not texts:
            return []

        embeddings = self._embeddings.embed_documents(texts)
        ids = [str(uuid.uuid4()) for _ in embeddings]
        self._index_updater.update_index(ids, embeddings, metadatas)
        self._document_storage.batch_store_by_id(ids, texts)
        return ids

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        filter_: List[Namespace] | None = None,
        **kwargs: Any,
    ) -> List[Document]:
        """ Returns the documents closest to `query`, without their scores.

        Args:
            query: Text to search for.
            k: Number of documents requested.
            filter_: Optional list of `Namespace` restrictions.
            kwargs: Unused.

        Returns:
            List of documents.
        """
        return [
            document
            for document, _ in self.similarity_search_with_score(query, k, filter_)
        ]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        filter_: List[Namespace] | None = None,
    ) -> List[Tuple[Document, float]]:
        """ Embeds `query` and searches with the resulting vector.

        Args:
            query: Text to search for.
            k: Number of documents requested.
            filter_: Optional list of `Namespace` restrictions.

        Returns:
            List of (document, value) pairs; see
            `similarity_search_by_vector_with_score`.
        """
        embedding = self._embeddings.embed_query(query)
        return self.similarity_search_by_vector_with_score(embedding, k, filter_)

    def similarity_search_by_vector_with_score(
        self,
        embedding: List[float],
        k: int = 4,
        filter_: List[Namespace] | None = None,
    ) -> List[Tuple[Document, float]]:
        """ Finds the neighbours of `embedding` and pairs each one's stored
        text with the float reported by the searcher.

        Args:
            embedding: Query vector.
            k: Number of documents requested.
            filter_: Optional list of `Namespace` restrictions.

        Returns:
            List of (document, value) pairs in the order returned by the
            searcher. NOTE(review): the value is whatever float the searcher
            reports (presumably a distance); confirm its meaning.

        Raises:
            ValueError: If a neighbour id has no text in the document storage.
        """
        neighbors = self._searcher.find_neighbors(embedding=embedding, k=k, filter_=filter_)
        if not neighbors:
            return []

        ids = [id_ for id_, _ in neighbors]
        texts = self._document_storage.batch_get_by_id(ids)

        # An id present in the index but absent from the storage means the two
        # are out of sync; report it instead of building an empty document.
        missing = [id_ for id_, text in zip(ids, texts) if text is None]
        if missing:
            raise ValueError(
                f"Documents with ids {missing} were not found in the document storage."
            )

        return [
            (Document(page_content=text), score)
            for text, (_, score) in zip(texts, neighbors)
        ]

In [17]:
from google.cloud.storage import Client, Bucket
from google.cloud.aiplatform.matching_engine import MatchingEngineIndex
from google.cloud.aiplatform.compat.types import matching_engine_index as me_types

In [18]:
from typing import Any, Dict, List, Sequence


class StreamVectorSearchIndexManager(VectorSearchIndexUpdater):
    """ Index updater that upserts datapoints directly into the index.

    Derives from `VectorSearchIndexUpdater`, the abstract updater interface
    defined in this notebook (no `VectorSearchIndexManager` exists).

    The recorded run in this notebook failed with "StreamUpdate is not enabled
    on this Index", so the target index has to support stream updates.
    """

    def __init__(self, index: MatchingEngineIndex) -> None:
        """ Constructor.

        Args:
            index: Index that receives the datapoints.
        """
        super().__init__()
        self._me_index = index

    def update_index(
        self,
        ids: Sequence[str],
        embeddings: Sequence[List[float]],
        metadatas: Sequence[Dict[str, Any]] | None = None
    ) -> None:
        """ Upserts one datapoint per (id, embedding) pair.

        Args:
            ids: One identifier per datapoint.
            embeddings: One vector per datapoint, in the same order as `ids`.
            metadatas: Accepted for interface compatibility; currently unused.
        """
        index_data_points = [
            me_types.IndexDatapoint(datapoint_id=id_, feature_vector=embedding)
            for id_, embedding in zip(ids, embeddings)
        ]

        # Nothing to send: skip the remote call.
        if not index_data_points:
            return

        self._me_index.upsert_datapoints(datapoints=index_data_points)


class BatchVectorSearchIndexManager(VectorSearchIndexUpdater):
    """ Index updater that stages the embeddings as a file in a bucket and
    then asks the index to ingest that folder.

    Derives from `VectorSearchIndexUpdater`, the abstract updater interface
    defined in this notebook (no `VectorSearchIndexManager` exists).
    """

    def __init__(self, index: MatchingEngineIndex, staging_bucket: Bucket) -> None:
        """ Constructor.

        Args:
            index: Index to be updated.
            staging_bucket: Bucket where the embeddings file is written before
                the index update is requested.
        """
        self._me_index = index
        self._staging_bucket = staging_bucket

    def update_index(
        self,
        ids: Sequence[str],
        embeddings: Sequence[List[float]],
        metadatas: List[Dict[str, Any]] | None = None
    ) -> None:
        """ Stages the embeddings and triggers the index update.

        The embeddings are written as JSON Lines records with `id` and
        `embedding` fields to `<random uuid>/embeddings.json` in the staging
        bucket, the same layout used by the index-creation cells of this
        notebook. The index is then pointed at that folder.

        Args:
            ids: One identifier per record.
            embeddings: One vector per record, in the same order as `ids`.
            metadatas: Accepted for interface compatibility; currently unused.
        """
        # Local imports keep this cell independent of which other cells ran.
        import json
        import uuid

        records = [
            {"id": id_, "embedding": list(embedding)}
            for id_, embedding in zip(ids, embeddings)
        ]

        # Nothing to stage: skip the upload and the update request.
        if not records:
            return

        payload = "\n".join(json.dumps(record) for record in records)

        # A fresh folder per call, so the update only sees this call's file.
        staging_folder = str(uuid.uuid4())
        blob = self._staging_bucket.blob(f"{staging_folder}/embeddings.json")
        blob.upload_from_string(payload)

        self._me_index.update_embeddings(
            contents_delta_uri=f"gs://{self._staging_bucket.name}/{staging_folder}"
        )

    

In [15]:
index = MatchingEngineIndex(index_name="1217981806645608448")

# The stream updater defined in this notebook is
# `StreamVectorSearchIndexManager`, and its constructor parameter is `index`.
index_manager = StreamVectorSearchIndexManager(index=index)

# NOTE(review): the recorded run of this cell failed with
# "FailedPrecondition: 400 StreamUpdate is not enabled on this Index".
index_manager.update_index(
    ["A1", "A2"],
    [[1, 1, 1], [1, 1, 1]],
)

Upserting datapoints MatchingEngineIndex index: projects/245830287766/locations/us-central1/indexes/1217981806645608448


FailedPrecondition: 400 StreamUpdate is not enabled on this Index.

In [9]:
# Exploration: print the documentation of the IndexDatapoint type built in `update_index`.
help(me_types.IndexDatapoint)

Help on class IndexDatapoint in module google.cloud.aiplatform_v1.types.index:

class IndexDatapoint(proto.message.Message)
 |  IndexDatapoint(mapping=None, *, ignore_unknown_fields=False, **kwargs)
 |  
 |  A datapoint of Index.
 |  
 |  Attributes:
 |      datapoint_id (str):
 |          Required. Unique identifier of the datapoint.
 |      feature_vector (MutableSequence[float]):
 |          Required. Feature embedding vector. An array of numbers with
 |          the length of [NearestNeighborSearchConfig.dimensions].
 |      restricts (MutableSequence[google.cloud.aiplatform_v1.types.IndexDatapoint.Restriction]):
 |          Optional. List of Restrict of the datapoint,
 |          used to perform "restricted searches" where
 |          boolean rule are used to filter the subset of
 |          the database eligible for matching. This uses
 |          categorical tokens. See:
 |  
 |          https://cloud.google.com/vertex-ai/docs/matching-engine/filtering
 |      numeric_restricts 

In [None]:
# NOTE(review): scratch cell that was never executed (it has no execution count).
# Both calls are made without arguments, whereas other cells in this notebook
# build the index with `index_name=...` and call `upsert_datapoints(datapoints=...)`.
# Presumably both calls fail as written; confirm, then complete or delete this cell.
m = MatchingEngineIndex()
m.upsert_datapoints()

In [30]:
from typing import Iterable, Tuple, Optional
from abc import ABC, abstractmethod

from google.cloud.storage import Bucket


class VertexAIVectorSearchDocumentStorage(ABC):
    """ Provides an abstract interface for the storage
    """

    @abstractmethod
    def get_by_id(self, document_id: str) -> str | None:
        """ Gets the text of a document by its id. If not found, returns None.

        Args:
            document_id: Id of the document to get from the storage.

        Returns:
            Text of the document if found, otherwise None.
        """
        raise NotImplementedError()

    @abstractmethod
    def store_by_id(self, document_id: str, text: str):
        """ Stores a document text associated to a document_id.

        Args:
            document_id: Id of the document to be stored.
            text: Text of the document to be stored.
        """
        raise NotImplementedError()

    @abstractmethod
    def __iter__(self) -> Iterable[Tuple[str, str]]:
        """ Iterates over all document_id, document_text pairs in the storage.

        Returns:
            Iterable of pairs document_id, document_text
        """
        raise NotImplementedError()
    

class GCSVertexAIVectorSearchDocumentStorage(VertexAIVectorSearchDocumentStorage):
    """ Document storage that keeps each document as a text blob in a Google
    Cloud Storage bucket, named `<prefix>/<document_id>`.
    """

    def __init__(
        self,
        bucket: Bucket,
        prefix: Optional[str] = "documents"
    ) -> None:
        """ Constructor.

        Args:
            bucket: Bucket where the documents will be stored.
            prefix: Prefix that is prepended to all document names. When None,
                documents are stored without any prefix.
        """
        super().__init__()
        self._bucket = bucket
        self._prefix = prefix

    def get_by_id(self, document_id: str) -> str | None:
        """ Gets the text of a document by its id. If not found, returns None.

        Args:
            document_id: Id of the document to get from the storage.

        Returns:
            Text of the document if found, otherwise None.
        """
        blob_name = self._get_blob_name(document_id)
        existing_blob = self._bucket.get_blob(blob_name)

        if existing_blob is None:
            return None

        return existing_blob.download_as_text()

    def store_by_id(self, document_id: str, text: str) -> None:
        """ Stores a document text associated to a document_id.

        Args:
            document_id: Id of the document to be stored.
            text: Text of the document to be stored.
        """
        blob_name = self._get_blob_name(document_id)
        new_blob = self._bucket.blob(blob_name)
        new_blob.upload_from_string(text)

    def __iter__(self) -> Iterable[Tuple[str, str]]:
        """ Iterates over all document_id, document_text pairs in the storage.

        Returns:
            Iterable of pairs document_id, document_text
        """
        blob_prefix = self._get_blob_prefix()

        # List with the trailing "/" so that a sibling prefix such as
        # "documents2/" is not picked up by a "documents" storage.
        for blob in self._bucket.list_blobs(prefix=blob_prefix or None):
            # Strip only the storage prefix, so ids that contain "/" come back
            # exactly as they were stored.
            document_id = blob.name[len(blob_prefix):]

            # A blob named exactly like the prefix (a "folder" placeholder)
            # has no document id; skip it.
            if not document_id:
                continue

            yield document_id, blob.download_as_text()

    def _get_blob_prefix(self) -> str:
        """ Builds the string that precedes the document id in a blob name.

        Returns:
            `<prefix>/`, or an empty string when no prefix is configured.
        """
        if self._prefix is None:
            return ""
        return f"{self._prefix}/"

    def _get_blob_name(self, document_id: str) -> str:
        """ Builds a blob name using the prefix and the document_id.

        Args:
            document_id: Id of the document.

        Returns:
            Name of the blob that the document will be/is stored in
        """
        return f"{self._get_blob_prefix()}{document_id}"

In [25]:
from typing import Any, List, Optional, Iterable, Type, Tuple
from langchain_core.documents import Document
from langchain_core.vectorstores import VectorStore, Embeddings
from langchain_google_vertexai.embeddings import TextEmbeddingModel

from google.cloud.storage import Bucket
from google.cloud.aiplatform import MatchingEngineIndex, MatchingEngineIndexEndpoint


class VertexVectorSearch(VectorStore):
    """ Work-in-progress LangChain vector store built on a Matching Engine
    index, its endpoint and a document storage.

    Only construction and the staging-bucket check are implemented. The
    remaining operations raise `NotImplementedError` rather than silently
    returning None in place of the value their signature promises.
    """

    def __init__(
        self,
        embbedings: Embeddings,
        matching_engine_index: MatchingEngineIndex,
        matching_engine_index_endpoint: MatchingEngineIndexEndpoint,
        document_storage: VertexAIVectorSearchDocumentStorage,
        *,
        staging_gcs_bucket: Optional[Bucket] = None
    ) -> None:
        """ Constructor.

        Args:
            embbedings: Embedding model. The parameter name is misspelled but
                kept so existing keyword callers keep working.
            matching_engine_index: Index object.
            matching_engine_index_endpoint: Index endpoint object.
            document_storage: Storage for the document texts.
            staging_gcs_bucket: Bucket required before documents can be added
                (see `_check_staging_bucket_is_defined`).
        """
        super().__init__()
        self._embeddings = embbedings
        self._matching_engine_index = matching_engine_index
        self._matching_engine_index_endpoint = matching_engine_index_endpoint
        self._document_storage = document_storage
        self._staging_gcs_bucket = staging_gcs_bucket

    @property
    def embeddings(self) -> Embeddings:
        """ Embedding model used by this store, under the correctly spelled
        name.
        """
        return self._embeddings

    @property
    def embbedings(self) -> Embeddings:
        """ Misspelled alias of `embeddings`, kept for backward compatibility.
        """
        return self._embeddings

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "VertexVectorSearch":
        """ Not implemented yet.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("`from_texts` is not implemented yet.")

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Only the precondition check is implemented so far.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            kwargs: vectorstore specific parameters

        Returns:
            List of ids from adding the texts into the vectorstore.

        Raises:
            ValueError: If no staging bucket was configured.
            NotImplementedError: Always, once the precondition holds.
        """
        self._check_staging_bucket_is_defined()
        raise NotImplementedError("`add_texts` is not implemented yet.")

    def similarity_search(self, query: str, k: int = 4, **kwargs: Any) -> List[Document]:
        """ Not implemented yet.

        NOTE(review): this used to delegate to the base class method, which is
        presumably abstract and produces no results; confirm.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("`similarity_search` is not implemented yet.")

    def _check_staging_bucket_is_defined(self) -> None:
        """ Ensures a staging bucket was given to the constructor.

        Raises:
            ValueError: If `staging_gcs_bucket` is None.
        """
        if self._staging_gcs_bucket is None:
            raise ValueError(
                "In order to add documents to the index it is necessary to have "
                "a `staging_gcs_bucket` defined"
            )



In [26]:
from google.cloud.storage import Client

client = Client(project="jzaldivar-test-project")
bucket = client.get_bucket("vector_search_index_1")

# The GCS-backed storage defined in this notebook is
# `GCSVertexAIVectorSearchDocumentStorage`; no `GcsVertexDocumentStorage` exists.
document_storage = GCSVertexAIVectorSearchDocumentStorage(bucket=bucket)

document_storage.store_by_id("foo", text="This is a sample text")
document_storage.store_by_id("bar", text="This is a sample text 2")
document_storage.store_by_id("foobar", text="This is a sample text 3")

document_storage.get_by_id("foobar")

'This is a sample text 3'

In [27]:
# Print every stored document as "<id> <text>".
for doc_id, text in iter(document_storage):
    print(f"{doc_id} {text}")

bar This is a sample text 2
foo This is a sample text
foobar This is a sample text 3


In [21]:
# Print the name of every blob whose name starts with "documents".
for blob in iter(bucket.list_blobs(prefix="documents")):
    print(f"{blob.name}")

documents/foobar


In [11]:


# Manual upload of one document text to `<prefix>/<doc_id>`.
prefix, doc_id, doc_text = "documents", "foobar", "Hello world"

blob_name = "/".join((prefix, doc_id))
blob = bucket.blob(blob_name)
blob.upload_from_string(doc_text)

In [74]:
from google.cloud.datastore import Client

# Datastore client for the "Test" namespace. `Client` here is the datastore
# client imported at the top of this cell.
client = Client(
    project="jzaldivar-test-project",
    namespace="Test",
    database=None,
)

# Inside one transaction, write an entity of kind "document_id" named "bar"
# with a single "text" property.
with client.transaction():
    key = client.key("document_id", "bar")
    entity = client.entity(key=key)
    entity["text"] = "hello"
    client.put(entity)

In [75]:
# Read back with the same kind the entity was written under ("document_id").
# A key of kind "Id" addresses a different entity.
key = client.key("document_id", "bar")
result = client.get(key)

In [95]:
import uuid
import json

from google.cloud.storage import Client

from langchain_google_vertexai.embeddings import VertexAIEmbeddings


# Bucket handle obtained through a client built from the ambient credentials.
bucket = Client().get_bucket("vector_search_index_1")

# Embedding model with its version pinned explicitly.
embedding_obj = VertexAIEmbeddings(model_name="textembedding-gecko@001")

# Sample sentences to embed.
texts = [
    "Lions are my favourite animals",
    "There are two apples on the table",
    "Today is raining a lot in Madrid",
]

def stage_embbedings(id: List[str], texts: List[str]) -> str:
    """ Embeds the texts and serialises them as JSON Lines records.

    Each record has the fields `id` and `embedding`. Nothing is uploaded
    here; the caller decides where to write the returned payload.

    Args:
        id: One identifier per text. The name shadows the builtin `id` but is
            kept so keyword callers keep working.
        texts: Texts to embed, in the same order as `id`.

    Returns:
        The records as a string with one JSON object per line.

    Raises:
        ValueError: If `id` and `texts` do not have the same length.
    """
    if len(id) != len(texts):
        raise ValueError(
            f"`id` and `texts` must have the same length, got {len(id)} "
            f"ids and {len(texts)} texts."
        )

    embedding_list = embedding_obj.embed(texts)

    # Pair each vector with the id supplied by the caller.
    records = [
        {"id": record_id, "embedding": embedding}
        for record_id, embedding in zip(id, embedding_list)
    ]
    return "\n".join(json.dumps(record) for record in records)








In [96]:
# Display the bucket handle to confirm which bucket is in use.
bucket

<Bucket: vector_search_index_1>

{"id": "dc8338bb-bc78-4780-848c-345c555853ab", "embedding": [-0.03900467976927757, -0.015698261559009552, 0.018636668100953102, 0.04263034090399742, 0.015758180990815163, -0.055620964616537094, 0.01671185903251171, -0.0037077313754707575, 0.01562347449362278, -0.05356467142701149, -0.012113465927541256, 0.014641289599239826, 0.01885615848004818, -0.04123213142156601, 0.04255804792046547, 0.006277742329984903, -0.0601072683930397, -0.04209969937801361, 0.03318195044994354, 0.006834126077592373, -0.09027043730020523, -0.03146388754248619, 0.007097113877534866, -0.0018196215387433767, -0.005514012183994055, -0.09699542820453644, 0.022786062210798264, 0.029084108769893646, 0.003934985026717186, -0.016087468713521957, -0.0317002609372139, 0.011909456923604012, -0.05993838235735893, -0.023226061835885048, -0.03873502090573311, -0.011611388996243477, -0.03835229575634003, 0.04567045718431473, -0.004400783684104681, -0.061212096363306046, 0.018138084560632706, -0.0059463707730174065, 0.0058783

In [77]:
# NOTE(review): incomplete call. Only `display_name` is supplied, whereas the
# earlier index-creation cell also passes `contents_delta_uri`, `dimensions` and
# `approximate_neighbors_count`. The recorded output ('hello') does not look like
# the result of this call, so it is presumably stale. TODO: complete or delete.
MatchingEngineIndex.create_tree_ah_index(
    display_name="auto_created",
    
)

'hello'

In [None]:
# NOTE(review): never-executed scratch line. Elsewhere in this notebook
# `upload_from_string` is called on a blob, with the payload as its argument,
# not on the bucket. Presumably this fails as written; confirm and remove.
bucket.upload_from_string()

In [1]:
from langchain_google_vertexai.vectorstores import MatchingEngine