# Vertex AI Search - Generic Search Filter Demo

A demo of how to filter Vertex AI Search results with metadata, following the example [here](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/search/search_filters_metadata.ipynb).

## Objective

This notebook shows how to use [filters and metadata](https://cloud.google.com/generative-ai-app-builder/docs/filter-search-metadata) in search requests to [Vertex AI Search](https://cloud.google.com/generative-ai-app-builder/docs/introduction).

This works with unstructured apps that contain metadata. You can use metadata fields to restrict your search to a specific set of documents.

Services used in the notebook:

*   ✅ Vertex AI Search for document search and retrieval


## Installs 


In [None]:
# tuples of (import name, install name, min_version)
packages = [
    ('google.cloud.aiplatform', 'google-cloud-aiplatform'),
    ('google.cloud.storage', 'google-cloud-storage'),
    ('google.cloud.discoveryengine','google-cloud-discoveryengine')
]

import importlib
install = False
for package in packages:
    if not importlib.util.find_spec(package[0]):
        print(f'installing package {package[1]}')
        install = True
        !pip install {package[1]} -U -q --user
    elif len(package) == 3:
        if importlib.metadata.version(package[0]) < package[2]:
            print(f'updating package {package[1]}')
            install = True
            !pip install {package[1]} -U -q --user

### Restart Kernel (If Installs Occurred)

After the kernel restarts, resume running the notebook from the cell that follows this one.


In [None]:
if install:
    # Packages were installed or updated above: restart the kernel so the
    # new versions are the ones that get imported.
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

## Setup
inputs:

In [None]:
# Read the active project ID from the local gcloud configuration.
# The shell capture yields the command's output lines; the first line is used as the ID.
project = !gcloud config get-value project
PROJECT_ID = project[0]
# print(PROJECT_ID)

In [None]:
from google.cloud import storage

import json

from google.cloud import discoveryengine_v1alpha as discoveryengine
from google.api_core.client_options import ClientOptions

In [None]:
# PROJECT_ID = '' # set above
REGION = 'us-central1'  # location used when creating the GCS buckets
EXPERIMENT = 'search-alpha-pdfs'  # combined with SERIES to name buckets and the data store
SERIES = "generative-ai"  # combined with EXPERIMENT to name buckets and the data store

In [None]:
# Bucket that receives the sample PDF documents.
BUCKET = f"{PROJECT_ID}-{SERIES}-{EXPERIMENT}"
BUCKET_URI = f"gs://{BUCKET}"

# Bucket that receives the metadata (JSON Lines) file describing those documents.
BUCKET_2 = f"{PROJECT_ID}-{SERIES}-{EXPERIMENT}-meta"
BUCKET_URI_2 = f"gs://{BUCKET_2}"

In [None]:
LOCATION = "global"  # Replace with your data store location
# The datastore name can only contain lowercase letters, numbers, and hyphens
# DATASTORE_NAME = EXPERIMENT
# Display name of the data store; also passed to create_engine as the engine's display name.
DATASTORE_NAME = f"{SERIES}-{EXPERIMENT}"
# ID under which the data store is created, imported into, and searched.
DATASTORE_ID = f"{DATASTORE_NAME}-id"

In [None]:
# sanity check: echo the configuration values defined in the cells above
print("REGION:             ", REGION)
print("EXPERIMENT:         ", EXPERIMENT)
print("SERIES:             ", SERIES)
print("BUCKET_URI          ",  BUCKET_URI)
print("BUCKET_URI_2        ",  BUCKET_URI_2)
print("LOCATION:           ", LOCATION)
print("DATASTORE_NAME:     ", DATASTORE_NAME)
print("DATASTORE_ID:       ", DATASTORE_ID)

## Create GCS Buckets

One bucket for data and one bucket for metadata 

In [None]:
# Cloud Storage client bound to the project; used by create_buckets below.
gcs = storage.Client(project = PROJECT_ID)

In [None]:
def create_buckets(bucket_names):
    """Create each named bucket unless it already exists, and print it.

    Uses the module-level ``gcs`` storage client. Missing buckets are created
    in project ``PROJECT_ID`` at location ``REGION``.

    Args:
        bucket_names: An iterable of bucket names to create.
    """
    for bucket_name in bucket_names:
        # Look the bucket up once and reuse the result instead of issuing a
        # second lookup request just to print it.
        bucket = gcs.lookup_bucket(bucket_name)
        if bucket is None:
            print(f"Bucket {bucket_name} does not exist, creating it now...")
            bucket = gcs.create_bucket(
                gcs.bucket(bucket_name), project=PROJECT_ID, location=REGION
            )
        else:
            print(f"Bucket {bucket_name} already exists:")
        print(bucket)

In [None]:
# Create the data bucket and the metadata bucket (no-op for buckets that already exist).
bucket_names = [BUCKET, BUCKET_2]
create_buckets(bucket_names)

## Ingest sample data

In [None]:
# Copy two sample PDFs from the public samples bucket into the data bucket.
# TODO: copy all PDFs instead, using the command below.
# ! gsutil -m cp gs://cloud-samples-data/gen-app-builder/search/alphabet-investor-pdfs/* $BUCKET_URI
! gsutil cp gs://cloud-samples-data/gen-app-builder/search/alphabet-investor-pdfs/20040630_google_10Q.pdf $BUCKET_URI
! gsutil cp gs://cloud-samples-data/gen-app-builder/search/alphabet-investor-pdfs/20040930_google_10Q.pdf $BUCKET_URI

## Create metadata file


In [None]:
# One entry per document: an ID, the metadata fields used for filtering
# ("structData"), and a pointer to the PDF in the data bucket ("content").
# TODO - UNSURE CORRECT GCS PATH
metadata = [
    {
        "id": "1",
        "structData": {
            "title": "Document1",
            "category": ["PersonaA"],
            "name": "document_1",
        },
        "content": {
            "mimeType": "application/pdf",
            "uri": f"{BUCKET_URI}/20040630_google_10Q.pdf",
        },
    },
    {
        "id": "2",
        "structData": {
            "title": "Document2",
            "category": ["PersonaA", "PersonaB"],
            "name": "document_2",
        },
        "content": {
            "mimeType": "application/pdf",
            "uri": f"{BUCKET_URI}/20040930_google_10Q.pdf",
        },
    },
]
print(metadata)

### Write metadata  to a local file

In [None]:
metadata_filename = "metadata.json"

# Serialize as JSON Lines: one document object per line.
with open(metadata_filename, "w") as out:
    out.writelines(json.dumps(entry) + "\n" for entry in metadata)

### Upload metadata to cloud storage

In [None]:
# Copy the local metadata file into the metadata bucket under the same file name.
!gsutil cp $metadata_filename $BUCKET_URI_2/$metadata_filename

## Create Datastore

https://cloud.google.com/generative-ai-app-builder/docs/reference/rest/v1alpha/projects.locations.collections.dataStores#DataStore



In [None]:
def create_data_store(
    project_id: str, location: str, data_store_name: str, data_store_id: str
):
    """Create a generic search data store in the project's default collection.

    Args:
        project_id: Google Cloud project that owns the data store.
        location: Data store location; any value other than "global" selects
            the location-specific API endpoint.
        data_store_name: Display name for the new data store.
        data_store_id: ID to assign to the new data store.

    Returns:
        The result of the create operation if it completes within 90 seconds,
        otherwise ``None`` (a message is printed instead).
    """
    # Function-scope import keeps this cell self-contained.
    import concurrent.futures

    # Create a client
    client_options = (
        ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
        if location != "global"
        else None
    )
    client = discoveryengine.DataStoreServiceClient(client_options=client_options)

    # Initialize request argument(s)
    data_store = discoveryengine.DataStore(
        display_name=data_store_name,
        industry_vertical="GENERIC",
        solution_types=["SOLUTION_TYPE_SEARCH"],
        content_config="CONTENT_REQUIRED",
    )

    request = discoveryengine.CreateDataStoreRequest(
        parent=discoveryengine.DataStoreServiceClient.collection_path(
            project_id, location, "default_collection"
        ),
        data_store=data_store,
        data_store_id=data_store_id,
    )

    # Make the request
    operation = client.create_data_store(request=request)

    # The data store can take a while to instantiate. Waiting too long is not
    # fatal, so a timeout is reported rather than halting execution. Other
    # failures are reported with their cause instead of being mislabelled as
    # a slow operation, and KeyboardInterrupt is no longer swallowed.
    try:
        return operation.result(timeout=90)
    except concurrent.futures.TimeoutError:
        print("long-running operation")
    except Exception as err:
        print(f"data store creation did not complete: {err!r}")
    return None

In [None]:
# Create the data store that the documents are imported into below.
create_data_store(PROJECT_ID, LOCATION, DATASTORE_NAME, DATASTORE_ID)

## Import documents to datastore

https://cloud.google.com/generative-ai-app-builder/docs/reference/rest/v1alpha/projects.locations.collections.dataStores.branches.documents/import - correct ???



In [None]:
def import_documents(
    project_id: str,
    location: str,
    data_store_id: str,
    gcs_uri: str,
):
    """Import the documents described by the files under ``gcs_uri``.

    Every object matching ``{gcs_uri}/*`` is read using the "document" schema
    (one JSON Document per line) and imported incrementally into the data
    store's default branch. Blocks until the import operation finishes, then
    prints the success/failure counts and any error samples.

    Args:
        project_id: Google Cloud project that owns the data store.
        location: Data store location; any value other than "global" selects
            the location-specific API endpoint.
        data_store_id: ID of the data store to import into.
        gcs_uri: ``gs://`` URI of the bucket or folder holding the metadata files.

    Returns:
        The name of the long-running import operation.
    """
    # Create a client
    client_options = (
        ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
        if location != "global"
        else None
    )
    client = discoveryengine.DocumentServiceClient(client_options=client_options)

    # The full resource name of the search engine branch.
    # e.g. projects/{project}/locations/{location}/dataStores/{data_store_id}/branches/{branch}
    parent = client.branch_path(
        project=project_id,
        location=location,
        data_store=data_store_id,
        branch="default_branch",
    )

    # Strip a trailing slash so the wildcard pattern never contains "//".
    source_documents = [f"{gcs_uri.rstrip('/')}/*"]

    request = discoveryengine.ImportDocumentsRequest(
        parent=parent,
        gcs_source=discoveryengine.GcsSource(
            input_uris=source_documents,
            # "document" (default): one JSON Document per line. Each document
            # must have a valid Document.id.
            # "content": unstructured data (e.g. PDF, HTML). Each file matched
            # by inputUris becomes a document, with the ID set to the first
            # 128 bits of SHA256(URI) encoded as a hex string.
            # https://cloud.google.com/generative-ai-app-builder/docs/reference/rest/v1alpha/GcsSource
            data_schema="document",
        ),
        # Options: `FULL`, `INCREMENTAL`
        reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,
    )

    # Make the request and wait for it to finish
    operation = client.import_documents(request=request)
    response = operation.result()

    # Once the operation is complete, surface what was (and was not) imported;
    # individual documents can fail even when the operation itself succeeds.
    metadata = discoveryengine.ImportDocumentsMetadata(operation.metadata)
    print(
        f"Import finished: {metadata.success_count} succeeded, "
        f"{metadata.failure_count} failed"
    )
    for error in response.error_samples:
        print(f"Import error sample: {error}")

    return operation.operation.name

In [None]:
# Import using the metadata bucket: its JSON Lines file points at the PDFs in the data bucket.
import_documents(PROJECT_ID, LOCATION, DATASTORE_ID, BUCKET_URI_2)

## Create Search Engine / App


Official GCP GenAI GitHub Repo

* https://github.com/GoogleCloudPlatform/generative-ai/blob/main/search/create_datastore_and_search.ipynb

Python SDK

* https://cloud.google.com/generative-ai-app-builder/docs/libraries#client-libraries-usage-python

REST API 



* v1_alpha
    * https://cloud.google.com/generative-ai-app-builder/docs/reference/rest/v1alpha/projects.locations.collections.engines/create
    * https://cloud.google.com/generative-ai-app-builder/docs/reference/rest/v1alpha/projects.locations.collections.engines#Engine

In [None]:
def create_engine(
    project_id: str, location: str, data_store_name: str, data_store_id: str
):
    """Create an enterprise-tier search engine (app) backed by one data store.

    The engine uses ``data_store_name`` as its display name and engine ID and
    is created in the project's default collection with the LLM add-on
    enabled. Waits up to 90 seconds for the create operation; a timeout or
    API error propagates to the caller.

    Args:
        project_id: Google Cloud project that owns the engine.
        location: Engine location; any value other than "global" selects the
            location-specific API endpoint.
        data_store_name: Display name (and ID) for the new engine.
        data_store_id: ID of the data store the engine searches.
    """
    # Only non-global locations need an explicit regional endpoint.
    if location == "global":
        endpoint_options = None
    else:
        endpoint_options = ClientOptions(
            api_endpoint=f"{location}-discoveryengine.googleapis.com"
        )
    client = discoveryengine.EngineServiceClient(client_options=endpoint_options)

    engine = discoveryengine.Engine(
        display_name=data_store_name,
        solution_type="SOLUTION_TYPE_SEARCH",
        industry_vertical="GENERIC",
        data_store_ids=[data_store_id],
        search_engine_config=discoveryengine.Engine.SearchEngineConfig(
            search_tier="SEARCH_TIER_ENTERPRISE",
            search_add_ons=["SEARCH_ADD_ON_LLM"],
        ),
    )

    collection = discoveryengine.DataStoreServiceClient.collection_path(
        project_id, location, "default_collection"
    )
    request = discoveryengine.CreateEngineRequest(
        parent=collection,
        engine=engine,
        engine_id=engine.display_name,
    )

    # Submit the request and block until it completes or 90 seconds elapse.
    client.create_engine(request=request).result(timeout=90)
    

In [None]:
# Create the search app on top of the data store; the engine reuses DATASTORE_NAME as its ID.
create_engine(PROJECT_ID, LOCATION, DATASTORE_NAME, DATASTORE_ID)

## Query documents

### REST API examples

The filter `name: ANY("document_2")` restricts the query to documents whose `name` metadata field is `document_2`.

In [None]:
%%bash -s "$PROJECT_ID" "$LOCATION" "$DATASTORE_ID"

# Positional arguments supplied from the notebook variables on the magic line above.
project_id=$1
location=$2
data_store_id=$3

# POST a search request with a metadata filter to the data store's
# default_search serving config (v1beta REST endpoint), authenticating
# with a gcloud access token.
curl -X POST -H "Authorization: Bearer $(gcloud auth print-access-token)" \
-H "Content-Type: application/json" \
"https://discoveryengine.googleapis.com/v1beta/projects/$project_id/locations/$location/collections/default_collection/dataStores/$data_store_id/servingConfigs/default_search:search" \
-d '{
"query": "Who is the current CEO of Alphabet?",
"filter": "name: ANY(\"document_2\")"
}'

### Helper function

In [None]:
def search_data_store(
    project_id: str,
    location: str,
    data_store_id: str,
    search_query: str,
    filter_str: str,
) -> discoveryengine.SearchResponse:
    """Run a filtered search against a data store's default serving config.

    The request asks for snippets, extractive answers and segments, and a
    cited summary, with automatic query expansion and spell correction.

    Args:
        project_id: Google Cloud project that owns the data store.
        location: Data store location; any value other than "global" selects
            the location-specific API endpoint.
        data_store_id: ID of the data store to search.
        search_query: The user's query text.
        filter_str: Filter expression restricting which documents are searched.

    Returns:
        Whatever ``SearchServiceClient.search`` returns for the request.
    """
    # For more information on endpoints, refer to:
    # https://cloud.google.com/generative-ai-app-builder/docs/locations#specify_a_multi-region_for_your_data_store
    if location == "global":
        endpoint_options = None
    else:
        endpoint_options = ClientOptions(
            api_endpoint=f"{location}-discoveryengine.googleapis.com"
        )
    client = discoveryengine.SearchServiceClient(client_options=endpoint_options)

    # The full resource name of the search engine serving config
    # e.g. projects/{project_id}/locations/{location}/dataStores/{data_store_id}/servingConfigs/{serving_config_id}
    serving_config = client.serving_config_path(
        project=project_id,
        location=location,
        data_store=data_store_id,
        serving_config="default_config",
    )

    # Refer to the `ContentSearchSpec` reference for all supported fields:
    # https://cloud.google.com/python/docs/reference/discoveryengine/latest/google.cloud.discoveryengine_v1.types.SearchRequest.ContentSearchSpec
    spec_types = discoveryengine.SearchRequest.ContentSearchSpec

    # Snippets: https://cloud.google.com/generative-ai-app-builder/docs/snippets
    snippets = spec_types.SnippetSpec(return_snippet=True)
    extractive = spec_types.ExtractiveContentSpec(
        max_extractive_answer_count=5,
        max_extractive_segment_count=1,
    )
    # Summaries: https://cloud.google.com/generative-ai-app-builder/docs/get-search-summaries
    summary = spec_types.SummarySpec(
        summary_result_count=5,
        include_citations=True,
        ignore_adversarial_query=False,
        ignore_non_summary_seeking_query=False,
    )
    content_search_spec = spec_types(
        snippet_spec=snippets,
        extractive_content_spec=extractive,
        summary_spec=summary,
    )

    expansion = discoveryengine.SearchRequest.QueryExpansionSpec(
        condition=discoveryengine.SearchRequest.QueryExpansionSpec.Condition.AUTO,
    )
    spelling = discoveryengine.SearchRequest.SpellCorrectionSpec(
        mode=discoveryengine.SearchRequest.SpellCorrectionSpec.Mode.AUTO
    )

    # Refer to the `SearchRequest` reference for all supported fields:
    # https://cloud.google.com/python/docs/reference/discoveryengine/latest/google.cloud.discoveryengine_v1.types.SearchRequest
    request = discoveryengine.SearchRequest(
        serving_config=serving_config,
        query=search_query,
        filter=filter_str,
        page_size=5,
        content_search_spec=content_search_spec,
        query_expansion_spec=expansion,
        spell_correction_spec=spelling,
    )

    return client.search(request)

### Example 1 - simple

In [None]:
search_query = "Who is the CEO of Alphabet?"
filter_str = 'name: ANY("document_1")'

# Search only the documents whose `name` metadata field is "document_1".
results = search_data_store(
    project_id=PROJECT_ID,
    location=LOCATION,
    data_store_id=DATASTORE_ID,
    search_query=search_query,
    filter_str=filter_str,
)

print(f"\nQuestion: '{search_query}'\n\n")
print("Summary" + 40 * "-")
print(results.summary.summary_text)

# Uncomment to inspect the raw response:
# print("Raw Results" + "-" * 40)
# print(results)

### Example 2 - more complex

In [None]:
search_query = "Who is the CEO of Alphabet?"
filter_str = 'name: ANY("document_1") AND category: ANY("PersonaA")'

# Combine two metadata conditions: both `name` and `category` must match.
results = search_data_store(
    project_id=PROJECT_ID,
    location=LOCATION,
    data_store_id=DATASTORE_ID,
    search_query=search_query,
    filter_str=filter_str,
)

print(f"\nQuestion: '{search_query}'\n\n")
print("Summary" + 40 * "-")
print(results.summary.summary_text)

# Uncomment to inspect the raw response:
# print("Raw Results" + "-" * 40)
# print(results)