## AI Search Engine Creation


In [6]:
# ! pip install google-api-core
# ! pip install google-cloud-discoveryengine

In [8]:
import os
from dotenv import load_dotenv

# Load settings from the `.env` file into the process environment.
# Put 'PROJECT_ID' in that file (or assign your project id to PROJECT_ID directly).
# NOTE(review): load_dotenv() is called with no arguments -- confirm whether values
# already present in the environment are meant to win over the file.
load_dotenv()

# None when the variable is not defined.
PROJECT_ID = os.environ.get("PROJECT_ID")

In [9]:
from google.cloud import discoveryengine_v1alpha as discoveryengine
from google.api_core.client_options import ClientOptions

# Location handed to the helper functions in this notebook; they only build a
# "{location}-discoveryengine.googleapis.com" endpoint when it is not "global".
LOCATION = "global"

### Create DataStore

DataStore는 AI Search Engine이 Crawling을 수행하여, 인덱스를 구성하는 대상이 된다. 

GCS / URL 형태로 제공할 수 있다. 

In [10]:
# Data store settings, each looked up in the environment (None when unset).
DATA_STORE_ID = os.environ.get("DATA_STORE_ID")
DATA_STORE_NAME = os.environ.get("DATA_STORE_NAME")
# Format: `gs://bucket/directory/object.json` or `gs://bucket/directory/*.json`
DATA_GCS_URI = os.environ.get("DATA_STORE_GCS_URI")


In [11]:
def create_data_store(
    project_id: str, location: str, data_store_name: str, data_store_id: str
):
    """Request creation of a data store and wait up to 90 seconds for it.

    Args:
        project_id: Project segment of the parent collection path.
        location: Location segment of the parent collection path. Any value
            other than "global" also selects the
            "{location}-discoveryengine.googleapis.com" API endpoint.
        data_store_name: Display name set on the new data store.
        data_store_id: ID sent as ``data_store_id`` in the create request.

    Returns:
        None. If the operation has not finished within the timeout,
        "long-running operation" is printed and the function returns anyway.

    Raises:
        Any exception raised by the create call or by waiting on the
        operation, except the wait timeout, propagates to the caller.
    """
    # Local import: only needed to recognise the timeout raised while waiting.
    import concurrent.futures

    # Create a client (default endpoint for "global", regional one otherwise)
    client_options = (
        ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
        if location != "global"
        else None
    )
    client = discoveryengine.DataStoreServiceClient(client_options=client_options)

    # Initialize request argument(s)
    data_store = discoveryengine.DataStore(
        display_name=data_store_name,
        industry_vertical="GENERIC",
        content_config="CONTENT_REQUIRED",
    )

    request = discoveryengine.CreateDataStoreRequest(
        parent=discoveryengine.DataStoreServiceClient.collection_path(
            project_id, location, "default_collection"
        ),
        data_store=data_store,
        data_store_id=data_store_id,
    )

    # Make the request
    operation = client.create_data_store(request=request)

    # The data store can take a while to instantiate. Only the wait timeout is
    # tolerated so execution is not halted by a slow operation; genuine
    # failures are no longer masked as "long-running operation".
    try:
        operation.result(timeout=90)
    except concurrent.futures.TimeoutError:
        print("long-running operation")

In [21]:
from typing import Optional

from google.api_core.client_options import ClientOptions
from google.cloud import discoveryengine

def import_documents_sample(
    project_id: str,
    location: str,
    data_store_id: str,
    gcs_uri: Optional[str] = None,
    bigquery_dataset: Optional[str] = None,
    bigquery_table: Optional[str] = None,
) -> str:
    """Import documents into a data store's default branch and wait for completion.

    The source is Cloud Storage when ``gcs_uri`` is given; otherwise it is the
    BigQuery table identified by ``bigquery_dataset`` and ``bigquery_table``.
    Documents are imported with the INCREMENTAL reconciliation mode.

    Args:
        project_id: Project segment of the branch path; also used as the
            BigQuery project for a BigQuery import.
        location: Location segment of the branch path. Any value other than
            "global" also selects the
            "{location}-discoveryengine.googleapis.com" API endpoint.
        data_store_id: Data store segment of the branch path.
        gcs_uri: Cloud Storage URI sent as the only entry of ``input_uris``.
        bigquery_dataset: BigQuery dataset ID, used when ``gcs_uri`` is not set.
        bigquery_table: BigQuery table ID, used when ``gcs_uri`` is not set.

    Returns:
        The name of the long-running import operation.

    Raises:
        ValueError: If ``gcs_uri`` is not given and ``bigquery_dataset`` or
            ``bigquery_table`` is missing.
    """
    # Fail fast instead of sending a BigQuery source with empty identifiers.
    if not gcs_uri and not (bigquery_dataset and bigquery_table):
        raise ValueError(
            "Provide either gcs_uri or both bigquery_dataset and bigquery_table."
        )

    client_options = (
        ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
        if location != "global"
        else None
    )

    # Create a client
    client = discoveryengine.DocumentServiceClient(client_options=client_options)

    # The full resource name of the search engine branch.
    # e.g. projects/{project}/locations/{location}/dataStores/{data_store_id}/branches/{branch}
    parent = client.branch_path(
        project=project_id,
        location=location,
        data_store=data_store_id,
        branch="default_branch",
    )

    # Exactly one source field is set on the request.
    if gcs_uri:
        source = {
            "gcs_source": discoveryengine.GcsSource(
                input_uris=[gcs_uri], data_schema="content"
            )
        }
    else:
        source = {
            "bigquery_source": discoveryengine.BigQuerySource(
                project_id=project_id,
                dataset_id=bigquery_dataset,
                table_id=bigquery_table,
                data_schema="custom",
            )
        }

    request = discoveryengine.ImportDocumentsRequest(
        parent=parent,
        # Options: `FULL`, `INCREMENTAL`
        reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,
        **source,
    )

    # Make the request
    operation = client.import_documents(request=request)

    print(f"Waiting for operation to complete: {operation.operation.name}")
    response = operation.result()

    # Once the operation is complete,
    # get information from operation metadata
    metadata = discoveryengine.ImportDocumentsMetadata(operation.metadata)

    # Handle the response
    print(response)
    print(metadata)

    return operation.operation.name


In [13]:
# Create the data store described by the configuration values above.
create_data_store(
    project_id=PROJECT_ID,
    location=LOCATION,
    data_store_name=DATA_STORE_NAME,
    data_store_id=DATA_STORE_ID,
)

long-running operation


In [22]:
# Import from Cloud Storage; a literal bucket wildcard is passed here
# instead of DATA_GCS_URI.
import_documents_sample(
    project_id=PROJECT_ID,
    location=LOCATION,
    data_store_id=DATA_STORE_ID,
    gcs_uri='gs://search-and-conversation-example-files/*',  # DATA_GCS_URI
)


Waiting for operation to complete: projects/547505032058/locations/global/collections/default_collection/dataStores/pdf-viewer-test/branches/0/operations/import-documents-17816815103130670727
error_config {
  gcs_prefix: "gs://547505032058_asia_northeast3_import_content/errors17816815103130671210"
}

create_time {
  seconds: 1709678609
  nanos: 66130000
}
update_time {
  seconds: 1709678611
  nanos: 644274000
}
success_count: 1



'projects/547505032058/locations/global/collections/default_collection/dataStores/pdf-viewer-test/branches/0/operations/import-documents-17816815103130670727'