In [1]:
import json
from typing import Optional, Sequence, Union

from google.api_core.client_options import ClientOptions
from google.api_core.operation import Operation
from google.cloud import discoveryengine_v1beta as discoveryengine
from google.protobuf.json_format import MessageToDict

In [2]:
# Notebook-wide settings shared by the cells below.
project_id = "<project>"
location = "us"  # also selects the regional API endpoint used by the helper functions
collection = "default_collection"

# Identifiers of the data store and of the search app (engine) created from it.
data_store_id = "google-devfest-llmops"
engine_id = "google-devfest-llmops-app"

# Cloud Storage URI of the JSONL metadata file that gets imported into the data store.
metadata_jsonl_gcs_location = "gs://<bucket>/google_devtest/google_devfest_metadata.jsonl"

In [8]:
def create_datastore(
    project_id: str,
    location: str,
    data_store_id: str,
    collection_id: str = "default_collection",
    starting_schema: Optional[discoveryengine.Schema] = None,
    document_processing_config: Optional[discoveryengine.DocumentProcessingConfig] = None,
) -> "Operation":
    """Request creation of a generic search data store that requires document content.

    Args:
        project_id: Google Cloud project that owns the data store.
        location: Data store location, e.g. ``"us"`` or ``"global"``.
        data_store_id: ID of the new data store; also used as its display name.
        collection_id: Collection under which the data store is created.
        starting_schema: Optional schema to create the data store with.
        document_processing_config: Optional parsing/chunking configuration.

    Returns:
        Whatever ``DataStoreServiceClient.create_data_store`` returns (a
        long-running operation). This function does not wait for completion.
    """
    # Any location other than "global" is addressed through a
    # location-prefixed endpoint; "global" uses the client's default.
    client_options = (
        ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
        if location != "global"
        else None
    )
    client = discoveryengine.DataStoreServiceClient(client_options=client_options)

    # Build the collection path once; the data store name is derived from it.
    parent = f"projects/{project_id}/locations/{location}/collections/{collection_id}"

    request = discoveryengine.CreateDataStoreRequest(
        parent=parent,
        data_store=discoveryengine.DataStore(
            name=f"{parent}/dataStores/{data_store_id}",
            display_name=data_store_id,
            industry_vertical=discoveryengine.IndustryVertical.GENERIC,
            solution_types=[discoveryengine.SolutionType.SOLUTION_TYPE_SEARCH],
            content_config=discoveryengine.DataStore.ContentConfig.CONTENT_REQUIRED,
            document_processing_config=document_processing_config,
            starting_schema=starting_schema,
        ),
        data_store_id=data_store_id,
    )
    return client.create_data_store(request=request)

### Generate the schema definition; `file_name_2` is mapped to the `title` key property via `keyPropertyMapping`

In [11]:
# JSON Schema describing the document metadata, serialized straight to the
# JSON text that `discoveryengine.Schema(json_schema=...)` is given below.
schema_definition = json.dumps({
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "type": "object",
    "properties": {
        "file_name": {
            "indexable": True, "type": "string", "retrievable": True, "searchable": True,
        },
        # `file_name_2` is the field mapped to the `title` key property.
        "file_name_2": {
            "type": "string", "keyPropertyMapping": "title", "retrievable": True,
        },
        "last_modified_unix_time": {
            "indexable": True, "type": "number", "dynamicFacetable": True, "retrievable": True,
        },
        # The remaining attributes are all strings that share identical settings.
        **{
            field_name: {
                "indexable": True,
                "type": "string",
                "searchable": True,
                "dynamicFacetable": True,
                "retrievable": True,
            }
            for field_name in (
                "file_type",
                "etl_created",
                "last_modified_time",
                "etl_updated",
                "doc_type",
                "deleted",
            )
        },
    },
})

# Wrap the serialized JSON Schema in the Schema message that is later passed
# to `create_datastore` as `starting_schema`.
schema = discoveryengine.Schema(name="default_schema", json_schema=schema_definition)

### Define the parser and chunking configuration for the data store

In [9]:
# Parsing and chunking configuration handed to `create_datastore`.
# `name` is the documentProcessingConfig resource path of the data store,
# assembled from the config-cell variables.
document_processing_config = discoveryengine.DocumentProcessingConfig(
    name=f"projects/{project_id}/locations/{location}/collections/{collection}/dataStores/{data_store_id}/documentProcessingConfig",
    default_parsing_config= discoveryengine.DocumentProcessingConfig.ParsingConfig(
        # Empty mapping: no layout-parser options are set here.
        # NOTE(review): presumably this selects the layout parser with its
        # default options — confirm against the API reference.
        layout_parsing_config = {
        },
        # Alternative parser, left disabled:
        # digital_parsing_config = {
        # }
    ),
    # The chunking settings are passed as a single positional mapping.
    chunking_config=discoveryengine.DocumentProcessingConfig.ChunkingConfig(
        {
            "layout_based_chunking_config": {
                "chunk_size": 500,
                "include_ancestor_headings": True
            }
        }
    )
)

### Create datastore

In [12]:
# Use the `collection` config variable rather than a repeated literal so the
# data store is created under the same collection that the
# `document_processing_config` resource name was built with.
response = create_datastore(
    project_id=project_id,
    location=location,
    data_store_id=data_store_id,
    collection_id=collection,
    starting_schema=schema,
    document_processing_config=document_processing_config,
)

Reference: https://cloud.google.com/generative-ai-app-builder/docs/reference/rest/v1beta/projects.locations.collections.dataStores#DataStore

In [None]:
# curl -X POST \
# -H "Authorization: Bearer $(gcloud auth print-access-token)" \
# -H "Content-Type: application/json" \
# -H "X-Goog-User-Project: <project>" \
# "https://us-discoveryengine.googleapis.com/v1/projects/<project>/locations/us/collections/default_collection/dataStores?dataStoreId=searchbkm-dev-datastore-v8-layout-tableunderstanding" \
# -d '{
#   "name": "projects/<project>/locations/us/collections/default_collection/dataStores/searchbkm-dev-datastore-v8-layout-tableunderstanding",
#   "displayName": "searchbkm-dev-datastore-v8-layout-tableunderstanding",
#   "industryVertical": "GENERIC",
#   "solutionTypes": ["SOLUTION_TYPE_SEARCH"],
#   "contentConfig": "CONTENT_REQUIRED",
#   "document_processing_config": {
#     "name": "projects/<project>/locations/us/collections/default_collection/dataStores/searchbkm-dev-datastore-v8-layout-tableunderstanding/documentProcessingConfig",
#     "defaultParsingConfig":{
#       "layoutParsingConfig": {
#         "enableImageAnnotation": true,
#         "enableTableAnnotation": true
#       }
#     },
#     "chunkingConfig": {
#       "layoutBasedChunkingConfig": {
#             "chunkSize": 500,
#             "includeAncestorHeadings": true
#         }
#     }
#   },
#   "startingSchema": {
#     "name": "default_schema",
#     "jsonSchema": "{\"$schema\": \"https://json-schema.org/draft/2020-12/schema\", \"type\": \"object\", \"properties\": {\"file_name\": {\"indexable\": true, \"type\": \"string\", \"retrievable\": true, \"searchable\": true}, \"file_name_pso\": {\"type\": \"string\", \"keyPropertyMapping\": \"title\", \"retrievable\": true}, \"url\": {\"indexable\": true, \"type\": \"string\", \"searchable\": true, \"dynamicFacetable\": true, \"retrievable\": true}, \"last_modified_unix_time\": {\"indexable\": true, \"type\": \"number\", \"dynamicFacetable\": true, \"retrievable\": true}, \"filesize_mb\": {\"indexable\": true, \"type\": \"string\", \"searchable\": true, \"dynamicFacetable\": true, \"retrievable\": true}, \"server_redirected_preview_url\": {\"indexable\": true, \"type\": \"string\", \"searchable\": true, \"dynamicFacetable\": true, \"retrievable\": true}, \"doc_id\": {\"indexable\": true, \"type\": \"string\", \"searchable\": true, \"dynamicFacetable\": true, \"retrievable\": true}, \"language\": {\"indexable\": true, \"type\": \"string\", \"searchable\": true, \"dynamicFacetable\": true, \"retrievable\": true}, \"site_name\": {\"indexable\": true, \"type\": \"string\", \"searchable\": true, \"dynamicFacetable\": true, \"retrievable\": true}, \"file_type\": {\"indexable\": true, \"type\": \"string\", \"searchable\": true, \"dynamicFacetable\": true, \"retrievable\": true}, \"etl_created\": {\"indexable\": true, \"type\": \"string\", \"searchable\": true, \"dynamicFacetable\": true, \"retrievable\": true}, \"last_modified_time\": {\"indexable\": true, \"type\": \"string\", \"searchable\": true, \"dynamicFacetable\": true, \"retrievable\": true}, \"server_redirected_url\": {\"indexable\": true, \"type\": \"string\", \"searchable\": true, \"dynamicFacetable\": true, \"retrievable\": true}, \"pillar\": {\"indexable\": true, \"type\": \"string\", \"searchable\": true, \"dynamicFacetable\": true, \"retrievable\": true}, \"etl_updated\": {\"indexable\": true, \"type\": 
\"string\", \"searchable\": true, \"dynamicFacetable\": true, \"retrievable\": true}, \"author\": {\"indexable\": true, \"type\": \"string\", \"searchable\": true, \"dynamicFacetable\": true, \"retrievable\": true}, \"doc_type\": {\"indexable\": true, \"type\": \"string\", \"searchable\": true, \"dynamicFacetable\": true, \"retrievable\": true}, \"deleted\": {\"indexable\": true, \"type\": \"string\", \"searchable\": true, \"dynamicFacetable\": true, \"retrievable\": true}}}"
#   }
# }'

In [13]:
# Show the `done` field of the operation message held by `response`.
# NOTE(review): presumably this attribute access does not re-poll the
# service — confirm, or call `response.result()` to wait for completion.
response.operation.done

True

In [21]:
def import_documents(
    project_id: str,
    location: str,
    data_store_id: str,
    gcs_uri: Optional[str] = None,
    bigquery_dataset: Optional[str] = None,
    bigquery_table: Optional[str] = None,
) -> "Operation":
    """Start an incremental document import into a data store's default branch.

    Documents come from Cloud Storage when ``gcs_uri`` is given, otherwise from
    the BigQuery table ``bigquery_dataset.bigquery_table`` in ``project_id``.
    If both sources are supplied, ``gcs_uri`` takes precedence.

    Args:
        project_id: Google Cloud project that owns the data store.
        location: Data store location, e.g. ``"us"`` or ``"global"``.
        data_store_id: ID of the target data store.
        gcs_uri: Cloud Storage URI to import (sent with ``data_schema="document"``).
        bigquery_dataset: BigQuery dataset ID (used when ``gcs_uri`` is not given).
        bigquery_table: BigQuery table ID (used when ``gcs_uri`` is not given).

    Returns:
        Whatever ``DocumentServiceClient.import_documents`` returns (a
        long-running operation). This function does not wait for completion.

    Raises:
        ValueError: If neither ``gcs_uri`` nor both BigQuery identifiers are given.
    """
    # Fail fast instead of sending a BigQuery request with missing identifiers.
    if not gcs_uri and not (bigquery_dataset and bigquery_table):
        raise ValueError(
            "Provide either `gcs_uri` or both `bigquery_dataset` and `bigquery_table`."
        )

    # For more information, refer to:
    # https://cloud.google.com/generative-ai-app-builder/docs/locations#specify_a_multi-region_for_your_data_store
    client_options = (
        ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
        if location != "global"
        else None
    )
    client = discoveryengine.DocumentServiceClient(client_options=client_options)

    # Full resource name of the data store branch that receives the documents.
    parent = client.branch_path(
        project=project_id,
        location=location,
        data_store=data_store_id,
        branch="default_branch",
    )

    # Options: `FULL`, `INCREMENTAL`
    reconciliation_mode = (
        discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL
    )

    if gcs_uri:
        request = discoveryengine.ImportDocumentsRequest(
            parent=parent,
            gcs_source=discoveryengine.GcsSource(
                input_uris=[gcs_uri], data_schema="document"
            ),
            reconciliation_mode=reconciliation_mode,
        )
    else:
        request = discoveryengine.ImportDocumentsRequest(
            parent=parent,
            bigquery_source=discoveryengine.BigQuerySource(
                project_id=project_id,
                dataset_id=bigquery_dataset,
                table_id=bigquery_table,
                data_schema="custom",
            ),
            reconciliation_mode=reconciliation_mode,
        )

    # Make the request; the caller decides whether to wait on the operation.
    return client.import_documents(request=request)

In [22]:
# Kick off the import of the metadata JSONL from Cloud Storage into the data store.
import_documents_operation = import_documents(
    project_id, location, data_store_id, gcs_uri=metadata_jsonl_gcs_location
)

In [18]:
# Show the name of the import operation started above.
import_documents_operation.operation.name

'projects/329324204059/locations/us/collections/default_collection/dataStores/google-devfest-llmops/branches/0/operations/import-documents-544618397676509343'

### Create app

In [19]:
def create_engine(
        project: str,
        location: str,
        collection: str,
        data_store_ids: Union[str, Sequence[str]],
        display_name: str,
        engine_id: str
) -> None:
    """Create an enterprise-tier search engine (app) with the LLM add-on and wait for it.

    Blocks until the create operation finishes, then prints the resulting engine.

    Args:
        project: Google Cloud project that owns the engine.
        location: Engine location, e.g. ``"us"`` or ``"global"``.
        collection: Collection under which the engine is created.
        data_store_ids: ID(s) of the data store(s) to attach. A single ID may
            be passed as a plain string.
        display_name: Display name of the engine.
        engine_id: ID of the new engine.
    """
    # The engine field holds a list of IDs (the notebook's call site passes a
    # list), so wrap a single ID rather than assigning a bare string.
    if isinstance(data_store_ids, str):
        data_store_ids = [data_store_ids]

    # Any location other than "global" is addressed through a
    # location-prefixed endpoint; "global" uses the client's default.
    client_options = (
        ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
        if location != "global"
        else None
    )

    # Create a client
    client = discoveryengine.EngineServiceClient(client_options=client_options)

    # Initialize request argument(s)
    engine = discoveryengine.Engine(
        search_engine_config=discoveryengine.Engine.SearchEngineConfig(
            search_tier="SEARCH_TIER_ENTERPRISE",
            search_add_ons=["SEARCH_ADD_ON_LLM"]
        )
    )
    engine.display_name = display_name
    engine.solution_type = "SOLUTION_TYPE_SEARCH"
    engine.data_store_ids = list(data_store_ids)
    request = discoveryengine.CreateEngineRequest(
        parent=f"projects/{project}/locations/{location}/collections/{collection}",
        engine=engine,
        engine_id=engine_id,
    )

    # Make the request
    operation = client.create_engine(request=request)

    print("Waiting for operation to complete...")

    # Block until the long-running operation has finished.
    response = operation.result()

    # Handle the response
    print(response)

In [20]:
# Use the `collection` config variable rather than a repeated literal so the
# engine is created under the collection configured at the top of the notebook.
create_engine(
    project=project_id,
    location=location,
    collection=collection,
    data_store_ids=[data_store_id],
    display_name=engine_id,
    engine_id=engine_id,
)

Waiting for operation to complete...
name: "projects/329324204059/locations/us/collections/default_collection/engines/google-devfest-llmops-app"
display_name: "google-devfest-llmops-app"
data_store_ids: "google-devfest-llmops"
solution_type: SOLUTION_TYPE_SEARCH
search_engine_config {
  search_tier: SEARCH_TIER_ENTERPRISE
  search_add_ons: SEARCH_ADD_ON_LLM
}

