# Vertex AI Search - Make DataStore and import documents

### Install the Vertex AI Search package

In [1]:
%pip install --upgrade --quiet google-cloud-discoveryengine

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from google.cloud import storage
from dotenv import load_dotenv, set_key
from google.oauth2 import service_account
import google.oauth2.credentials
from googleapiclient import discovery
from google.auth import default

# --- Environment ------------------------------------------------------------
# Read variables from the local .env file; the notebook refuses to continue
# unless GOOGLE_API_KEY is set to a non-empty value.
env_path = '.env'
load_dotenv(env_path)
api_key = os.environ.get("GOOGLE_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("Failed to load API key. Please set GOOGLE_API_KEY in the .env file.")

# --- Project settings -------------------------------------------------------
KEY_PATH = "./pablo-test-425702-22d29fa73af8.json"  # service-account key file
PROJECT_ID = "pablo-test-425702"
GS_LOCATION = "asia-northeast3"

# --- Credentials ------------------------------------------------------------
# Service-account credentials with the cloud-platform scope. The Discovery
# Engine clients created later in this notebook read this global.
credentials = service_account.Credentials.from_service_account_file(
    KEY_PATH,
    scopes=['https://www.googleapis.com/auth/cloud-platform'],
)

# Project used for the project-number lookup: GOOGLE_CLOUD_PROJECT when set,
# otherwise a literal default that matches PROJECT_ID.
project_id = os.environ.get('GOOGLE_CLOUD_PROJECT', 'pablo-test-425702')

# Look up the project number
def get_project_number(project_id, credentials=None):
    """Return the project number of ``project_id``.

    Calls ``projects.get`` on the Cloud Resource Manager v1 API.

    Args:
        project_id: ID of the project to look up.
        credentials: Optional credentials for the API client. When omitted,
            Application Default Credentials are used (the original
            behaviour), so existing one-argument calls are unaffected.

    Returns:
        The ``projectNumber`` field of the API response.
    """
    if credentials is None:
        credentials, _ = default()

    # The discovery client holds HTTP connections; leaving the ``with`` block
    # closes them instead of leaking them for the life of the kernel.
    with discovery.build('cloudresourcemanager', 'v1', credentials=credentials) as service:
        response = service.projects().get(projectId=project_id).execute()
    return response['projectNumber']

# Resolve the project number once; create_data_store reads PROJECT_NUMBER
# as a global when it builds the document-processing-config resource name.
PROJECT_NUMBER = get_project_number(project_id)
print(PROJECT_NUMBER)



151473909705


## Create a data store

#### Data store creation


In [3]:
from google.cloud.discoveryengine_v1 import (
    CreateDataStoreRequest,
    CreateEngineRequest,
    DataStore,
    DataStoreServiceClient,
    Engine,
    EngineServiceClient,
    IndustryVertical,
    SolutionType,
    DocumentProcessingConfig,
)
from google.protobuf.field_mask_pb2 import FieldMask

def create_data_store(project_id: str, region: str, data_store_id: str, chunk_size: int = 499):
    """Create a generic search data store with layout-based chunking.

    Args:
        project_id: Project that will own the data store.
        region: Data store location, e.g. ``"global"``. Any other value is
            sent to the ``{region}-discoveryengine.googleapis.com`` endpoint.
        data_store_id: ID of the new data store; also used as its display name.
        chunk_size: Token limit per chunk, between 100 and 500 inclusive.
            Defaults to 499, the value this notebook used originally.

    Returns:
        The result of the create operation (the function waits for it).

    Raises:
        ValueError: If ``chunk_size`` is outside the 100-500 range.
    """
    # Reject a bad chunk size locally, before any client is built.
    if not 100 <= chunk_size <= 500:
        raise ValueError(f"chunk_size must be between 100 and 500, got {chunk_size}")

    # Imported here because the notebook imports ClientOptions only in a later cell.
    from google.api_core.client_options import ClientOptions

    # Non-global locations are served from a location-specific endpoint;
    # "global" uses the client's default endpoint.
    client_options = (
        ClientOptions(api_endpoint=f"{region}-discoveryengine.googleapis.com")
        if region != "global"
        else None
    )
    client = DataStoreServiceClient(credentials=credentials, client_options=client_options)

    # Documented parent format for CreateDataStoreRequest includes the collection.
    parent = f"projects/{project_id}/locations/{region}/collections/default_collection"

    # NOTE(review): this name uses the notebook-level PROJECT_NUMBER rather than
    # the project_id argument - confirm whether the API needs it on create.
    name = f"projects/{PROJECT_NUMBER}/locations/{region}/collections/default_collection/dataStores/{data_store_id}/documentProcessingConfig"

    config = DocumentProcessingConfig(
        name=name,
        chunking_config=DocumentProcessingConfig.ChunkingConfig(
            layout_based_chunking_config=DocumentProcessingConfig.ChunkingConfig.LayoutBasedChunkingConfig(
                chunk_size=chunk_size,  # must be between 100 and 500
                include_ancestor_headings=True,
            )
        ),
    )

    data_store = DataStore(
        display_name=data_store_id,
        industry_vertical=IndustryVertical.GENERIC,
        solution_types=[SolutionType.SOLUTION_TYPE_SEARCH],
        content_config=DataStore.ContentConfig.CONTENT_REQUIRED,
        document_processing_config=config,
    )

    request = CreateDataStoreRequest(
        parent=parent,
        data_store=data_store,
        data_store_id=data_store_id,
    )

    # create_data_store returns a long-running operation; wait for its result.
    operation = client.create_data_store(request=request)
    return operation.result()

In [4]:
import uuid

# Location shared by the data store creation and the document import below.
DATA_STORE_LOCATION = "global"
# A fresh random ID per run: re-running this cell creates another data store.
DATA_STORE_ID = str(uuid.uuid4())
data_store_response = create_data_store(
    project_id=PROJECT_ID,
    region=DATA_STORE_LOCATION,
    data_store_id=DATA_STORE_ID,
)
print(f"Created DataStore: {data_store_response}")

Created DataStore: name: "projects/151473909705/locations/global/collections/default_collection/dataStores/7240b480-194d-4c75-99fb-777d8deb76c3"
display_name: "7240b480-194d-4c75-99fb-777d8deb76c3"
industry_vertical: GENERIC
solution_types: SOLUTION_TYPE_SEARCH
default_schema_id: "default_schema"
content_config: CONTENT_REQUIRED
document_processing_config {
  name: "projects/151473909705/locations/global/collections/default_collection/dataStores/7240b480-194d-4c75-99fb-777d8deb76c3/documentProcessingConfig"
  chunking_config {
    layout_based_chunking_config {
      chunk_size: 499
      include_ancestor_headings: true
    }
  }
  default_parsing_config {
    digital_parsing_config {
    }
  }
}



## Import documents

In [5]:
from typing import Any

from typing import Optional
from google.api_core.client_options import ClientOptions
from google.cloud import discoveryengine


def import_documents(
    project_id: str,
    location: str,
    data_store_id: str,
    gcs_uri: str,
    mode: str,
) -> Any:
    """Start importing documents from Cloud Storage into a data store.

    The import targets the data store's ``default_branch`` and uses the
    ``"content"`` data schema. The function does not wait for the import to
    finish; it returns as soon as the operation has been started.

    Args:
        project_id: Project that owns the data store.
        location: Data store location, e.g. ``"global"``. Any other value is
            sent to the ``{location}-discoveryengine.googleapis.com`` endpoint.
        data_store_id: ID of the target data store.
        gcs_uri: Cloud Storage URI (wildcards allowed) of the files to import.
        mode: Reconciliation mode, exactly ``"FULL"`` or ``"INCREMENTAL"``.

    Returns:
        The name of the started import operation, or one of the strings
        ``"ReconciliationMode Error"`` / ``"GCS URI Error"`` when the
        corresponding argument is invalid (a message is printed as well).
    """
    # Validate the arguments first so that no client is built for a call
    # that cannot succeed. Checking the mode by name also avoids relying on
    # the truthiness of an enum value.
    if mode not in ("FULL", "INCREMENTAL"):
        print("Wrong ReconciliationMode, Select either FULL or INCREMENTAL")
        return "ReconciliationMode Error"

    if not gcs_uri:
        print("Add the GCS URI to add contents")
        return "GCS URI Error"

    modes = {
        "FULL": discoveryengine.ImportDocumentsRequest.ReconciliationMode.FULL,
        "INCREMENTAL": discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,
    }
    reconciliation_mode = modes[mode]

    # Non-global locations are served from a location-specific endpoint;
    # "global" uses the client's default endpoint.
    client_options = (
        ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
        if location != "global"
        else None
    )
    client = discoveryengine.DocumentServiceClient(
        client_options=client_options, credentials=credentials
    )

    parent = client.branch_path(
        project=project_id,
        location=location,
        data_store=data_store_id,
        branch="default_branch",
    )

    request = discoveryengine.ImportDocumentsRequest(
        parent=parent,
        gcs_source=discoveryengine.GcsSource(input_uris=[gcs_uri], data_schema="content"),
        reconciliation_mode=reconciliation_mode,
    )

    # Fire and forget: hand back the operation name rather than blocking on
    # operation.result().
    operation = client.import_documents(request=request)
    return operation.operation.name




In [6]:
# Cloud Storage pattern for the PDFs to import.
GCS_URI = "gs://example_test_bucket_rag/ls-stock-report/*.pdf"
mode = "FULL"

# Rebuild the service-account credentials; import_documents reads this
# global when it creates its client.
credentials = service_account.Credentials.from_service_account_file(
    KEY_PATH,
    scopes=['https://www.googleapis.com/auth/cloud-platform'],
)

outcome = import_documents(
    project_id=PROJECT_ID,
    location=DATA_STORE_LOCATION,
    data_store_id=DATA_STORE_ID,
    gcs_uri=GCS_URI,
    mode=mode,
)

print(outcome)


projects/151473909705/locations/global/collections/default_collection/dataStores/7240b480-194d-4c75-99fb-777d8deb76c3/branches/0/operations/import-documents-9934464801994979099
