In [1]:
import json
from dotenv import load_dotenv
from google.cloud import discoveryengine_v1alpha as discoveryengine
from util.agent_builder import AgentBuilderUtil
import os
import logging
from google.protobuf.json_format import MessageToDict

# Show INFO-level log records in the notebook output (for example the
# "Waiting for operation to complete..." line logged while a later cell runs).
logging.basicConfig(level=logging.INFO) 

In [2]:
# Load settings from a local .env file; the next cell reads them from os.environ.
load_dotenv()

True

In [3]:
# Deployment settings, read from the process environment (filled by load_dotenv()).
PROJECT_ID = os.environ.get("PROJECT_ID")
LOCATION = os.environ.get("LOCATION")
COLLECTION = os.environ.get("COLLECTION")
DATA_STORE_ID = os.environ.get("DATA_STORE_ID")
METADATA_JSONL_GCS_LOCATION = os.environ.get("METADATA_JSONL_GCS_LOCATION")
APP_ENGINE_DISPLAY_NAME = os.environ.get("APP_ENGINE_DISPLAY_NAME")
APP_ENGINE_ID = os.environ.get("APP_ENGINE_ID")

# An unset variable silently becomes None and would only fail later inside an
# API call, so name the missing ones right away. The values themselves are
# left untouched so existing behaviour is preserved.
_unset_settings = [
    setting_name
    for setting_name in (
        "PROJECT_ID",
        "LOCATION",
        "COLLECTION",
        "DATA_STORE_ID",
        "METADATA_JSONL_GCS_LOCATION",
        "APP_ENGINE_DISPLAY_NAME",
        "APP_ENGINE_ID",
    )
    if not os.environ.get(setting_name)
]
if _unset_settings:
    logging.warning(
        "Missing environment variables: %s", ", ".join(_unset_settings)
    )

In [4]:
# Project helper used by every following cell; it is bound to one
# project / location / collection taken from the environment settings.
agent_builder_util = AgentBuilderUtil(
    project_id=PROJECT_ID,
    location=LOCATION,
    collection_id=COLLECTION
)

### Define Schema

In [5]:
# Attribute sets shared by several string metadata fields.
searchable_text = {
    "type": "string",
    "retrievable": True,
    "indexable": True,
    "searchable": True,
}
filter_only_text = {**searchable_text, "searchable": False}

schema_properties = {
    "file_name": dict(searchable_text),
    # Copy of the file name that is mapped to the "title" key property.
    # It declares neither "indexable" nor "searchable"; a later cell shows
    # that a filter on this field is rejected with a 400.
    "file_name_2": {
        "type": "string",
        "retrievable": False,
        "keyPropertyMapping": "title",
    },
    # Values in the imported documents are timestamp strings such as
    # "2023-02-03T07:00:00Z".
    "created_unix_time": {
        "type": "datetime",
        "retrievable": True,
        "indexable": True,
        "searchable": False,
    },
    "file_type": dict(filter_only_text),
    "doc_type": dict(filter_only_text),
    "deleted": dict(filter_only_text),
    "company": dict(searchable_text),
}

# The Schema message takes the JSON schema as a serialized string.
schema_definition = json.dumps(
    {
        "$schema": "https://json-schema.org/draft/2020-12/schema",
        "type": "object",
        "properties": schema_properties,
    }
)

schema = discoveryengine.Schema(
    name="devfest_tpe_2024_demo_schema",
    json_schema=schema_definition,
)

### Choose Parser

In [6]:
# Build the processing config for the data store: layout parser plus
# layout-based chunking (chunk_size 500, ancestor headings included), as
# echoed in the displayed config in the next cell.
document_processing_config = agent_builder_util.generate_document_processing_config(
    data_store_id=DATA_STORE_ID,
    parsing_config_type="layout_parsing_config",
    enable_chunking=True,
    chunk_size=500,
    include_ancestor_headings=True
)

In [7]:
# Display the generated config to confirm the parser and chunking options.
document_processing_config

name: "projects/<project>/locations/us/collections/default_collection/dataStores/devfest_demo/documentProcessingConfig"
chunking_config {
  layout_based_chunking_config {
    chunk_size: 500
    include_ancestor_headings: true
  }
}
default_parsing_config {
  layout_parsing_config {
  }
}

### Create datastore

In [8]:
# Create the data store with the custom schema and processing config.
# The returned handle is kept so the following cells can call .result()
# and inspect .operation on it.
result = agent_builder_util.create_datastore(
    data_store_id=DATA_STORE_ID,
    starting_schema=schema,
    document_processing_config=document_processing_config
)

In [9]:
# Display the outcome of the create call (the created data store).
# NOTE(review): presumably blocks until the operation finishes — confirm.
result.result()

name: "projects/<project_number>/locations/us/collections/default_collection/dataStores/devfest_demo"
display_name: "devfest_demo"
industry_vertical: GENERIC
solution_types: SOLUTION_TYPE_SEARCH
content_config: CONTENT_REQUIRED
default_schema_id: "default_schema"
document_processing_config {
  name: "projects/<project>/locations/us/collections/default_collection/dataStores/devfest_demo/documentProcessingConfig"
  chunking_config {
    layout_based_chunking_config {
      chunk_size: 500
      include_ancestor_headings: true
    }
  }
  default_parsing_config {
    layout_parsing_config {
    }
  }
}

In [10]:
# Check the done flag on the underlying operation.
result.operation.done

True

In [11]:
# Start importing documents into the data store from the GCS location held
# in METADATA_JSONL_GCS_LOCATION (presumably a JSONL metadata file — confirm).
import_documents_operation = agent_builder_util.import_documents(
    data_store_id=DATA_STORE_ID,
    gcs_uri=METADATA_JSONL_GCS_LOCATION,
)

In [13]:
# Display the import outcome; the response carries the GCS prefix where
# import errors are written.
import_documents_operation.result()

error_config {
  gcs_prefix: "gs://<project_number>_us_west1_import_document/errors12615782969927776618"
}

### List all documents in the datastore

In [14]:
# List the documents in the data store. The returned pager is reused by later
# cells, which index into documents.documents.
documents = agent_builder_util.list_documents(
    data_store_id=DATA_STORE_ID
)
documents

ListDocumentsPager<documents {
  name: "projects/<project_number>/locations/us/collections/default_collection/dataStores/devfest_demo/branches/0/documents/doc-1"
  id: "doc-1"
  schema_id: "default_schema"
  struct_data {
    fields {
      key: "file_type"
      value {
        string_value: "pdf"
      }
    }
    fields {
      key: "file_name"
      value {
        string_value: "20230203-alphabet-10K.pdf"
      }
    }
    fields {
      key: "file_name_2"
      value {
        string_value: "20230203-alphabet-10K.pdf"
      }
    }
    fields {
      key: "doc_type"
      value {
        string_value: "public"
      }
    }
    fields {
      key: "deleted"
      value {
        string_value: "no"
      }
    }
    fields {
      key: "created_unix_time"
      value {
        string_value: "2023-02-03T07:00:00Z"
      }
    }
    fields {
      key: "company"
      value {
        string_value: "google"
      }
    }
  }
  parent_document_id: "doc-1"
  content {
    mime_type: "a

### Get PARSED_DOCUMENT 

In [15]:
# Fetch the parser output for the third listed document (doc-3, the Tesla PDF).
document_data = agent_builder_util.get_processed_document(
    document_path=documents.documents[2].name,
    processed_document_type="PARSED_DOCUMENT"
)

In [16]:
# json_data is a JSON string; decode it to inspect the parsed block tree.
json.loads(document_data.json_data)

{'title': 'Tesla-Master-Plan-Part-3',
 'uri': 'gs://<bucket>/abehsu/devfest/files/Tesla-Master-Plan-Part-3.pdf',
 'name': 'projects/<project_number>/locations/us/collections/default_collection/dataStores/devfest_demo/branches/0/documents/doc-3',
 'blocks': [{'blockId': '1',
   'text': 'Master Plan Part 3',
   'type': 'HEADING_1',
   'pageSpan': {'pageStart': 1, 'pageEnd': 7},
   'parentBlockId': '0',
   'childrenBlockIds': ['22', '23', '2', '3', '4', '5', '6']},
  {'blockId': '22',
   'text': 'Sustainable Energy for All of Earth',
   'type': 'PARAGRAPH',
   'pageSpan': {'pageStart': 1, 'pageEnd': 1},
   'parentBlockId': '1'},
  {'blockId': '23',
   'text': 'Master Plan Part 3 - Sustainable Energy for All of Earth',
   'type': 'PARAGRAPH',
   'pageSpan': {'pageStart': 1, 'pageEnd': 1},
   'parentBlockId': '1'},
  {'blockId': '2',
   'text': 'Table of Contents',
   'type': 'HEADING_2',
   'pageSpan': {'pageStart': 2, 'pageEnd': 2},
   'parentBlockId': '1',
   'childrenBlockIds': ['99']},

### Get CHUNKED_DOCUMENT 

In [17]:
# Same document (doc-3), this time requesting the chunked representation.
document_chunk_data = agent_builder_util.get_processed_document(
    document_path=documents.documents[2].name,
    processed_document_type="CHUNKED_DOCUMENT"
)

In [18]:
# Decode the JSON payload: document metadata plus the list of chunks.
json.loads(document_chunk_data.json_data)

{'documentMetadata': {'uri': 'gs://<bucket>/abehsu/devfest/files/Tesla-Master-Plan-Part-3.pdf',
  'title': 'Tesla-Master-Plan-Part-3'},
 'chunks': [{'name': 'projects/<project_number>/locations/us/collections/default_collection/dataStores/devfest_demo/branches/0/documents/doc-3/chunks/c1',
   'id': 'c1',
   'content': '# Master Plan Part 3\n\nSustainable Energy for All of Earth Master Plan Part 3 - Sustainable Energy for All of Earth\n\n## Table of Contents\n\n_START_OF_TABLE_\nTABLE_IN_MARKDOWN:\n|-|-|\n| Executive Summary | 03 |\n| The Current Energy Economy is Wasteful | 04 |\n| The Plan to Eliminate Fossil Fuels | 05 |\n| 1. Repower the Existing Grid with Renewables | 05 |\n| 2. Switch to Electric Vehicles | 05 |\n| 3. Switch to Heat Pumps in Residential, Business & Industry | 07 |\n| 4. Electrify High Temperature Heat Delivery and Hydrogen | 09 |\n| 5. Sustainably Fuel Planes & Boats | 12 |\n| 6. Manufacture the Sustainable Energy Economy | 12 |\n| Modeling The Fully Sustainable E

### Option 2: List chunks and get chunk info

In [19]:
# Full resource name of doc-3, used as the parent path when listing chunks.
documents.documents[2].name

'projects/<project_number>/locations/us/collections/default_collection/dataStores/devfest_demo/branches/0/documents/doc-3'

In [20]:
# List the chunks that belong to doc-3 and display the returned pager.
document_chunks = agent_builder_util.list_chunks(
    document_path=documents.documents[2].name,
)
document_chunks

ListChunksPager<chunks {
  name: "projects/<project_number>/locations/us/collections/default_collection/dataStores/devfest_demo/branches/0/documents/doc-3/chunks/c1"
  id: "c1"
  content: "# Master Plan Part 3\n\nSustainable Energy for All of Earth Master Plan Part 3 - Sustainable Energy for All of Earth\n\n## Table of Contents\n\n_START_OF_TABLE_\nTABLE_IN_MARKDOWN:\n|-|-|\n| Executive Summary | 03 |\n| The Current Energy Economy is Wasteful | 04 |\n| The Plan to Eliminate Fossil Fuels | 05 |\n| 1. Repower the Existing Grid with Renewables | 05 |\n| 2. Switch to Electric Vehicles | 05 |\n| 3. Switch to Heat Pumps in Residential, Business & Industry | 07 |\n| 4. Electrify High Temperature Heat Delivery and Hydrogen | 09 |\n| 5. Sustainably Fuel Planes & Boats | 12 |\n| 6. Manufacture the Sustainable Energy Economy | 12 |\n| Modeling The Fully Sustainable Energy Economy | 13 |\n| • Energy Storage Technologies Evaluated | 18 |\n| Generation Technologies Evaluated • | 19 |\n| Model Result

In [21]:
# Resource name of the first listed chunk (c1).
document_chunks.chunks[0].name

'projects/<project_number>/locations/us/collections/default_collection/dataStores/devfest_demo/branches/0/documents/doc-3/chunks/c1'

In [22]:
# Fetch a single chunk by its resource name.
# NOTE: this rebinds document_chunk_data, which previously held the
# CHUNKED_DOCUMENT payload from the earlier cell.
document_chunk_data = agent_builder_util.get_chunk(
    chunk_path=document_chunks.chunks[0].name
)
document_chunk_data

name: "projects/<project_number>/locations/us/collections/default_collection/dataStores/devfest_demo/branches/0/documents/doc-3/chunks/c1"
id: "c1"
content: "# Master Plan Part 3\n\nSustainable Energy for All of Earth Master Plan Part 3 - Sustainable Energy for All of Earth\n\n## Table of Contents\n\n_START_OF_TABLE_\nTABLE_IN_MARKDOWN:\n|-|-|\n| Executive Summary | 03 |\n| The Current Energy Economy is Wasteful | 04 |\n| The Plan to Eliminate Fossil Fuels | 05 |\n| 1. Repower the Existing Grid with Renewables | 05 |\n| 2. Switch to Electric Vehicles | 05 |\n| 3. Switch to Heat Pumps in Residential, Business & Industry | 07 |\n| 4. Electrify High Temperature Heat Delivery and Hydrogen | 09 |\n| 5. Sustainably Fuel Planes & Boats | 12 |\n| 6. Manufacture the Sustainable Energy Economy | 12 |\n| Modeling The Fully Sustainable Energy Economy | 13 |\n| • Energy Storage Technologies Evaluated | 18 |\n| Generation Technologies Evaluated • | 19 |\n| Model Results | 20 |\n| US Only Model Resul

### Get Document info

In [23]:
# Look up the second listed document (doc-2). The Document object itself is
# passed here, not its name.
agent_builder_util.get_document(documents.documents[1])

name: "projects/<project_number>/locations/us/collections/default_collection/dataStores/devfest_demo/branches/0/documents/doc-2"
id: "doc-2"
schema_id: "default_schema"
struct_data {
  fields {
    key: "file_type"
    value {
      string_value: "pdf"
    }
  }
  fields {
    key: "file_name"
    value {
      string_value: "626777_Emmitsburg-BSDL_Rev0p6.pdf"
    }
  }
  fields {
    key: "file_name_2"
    value {
      string_value: "626777_Emmitsburg-BSDL_Rev0p6.pdf"
    }
  }
  fields {
    key: "doc_type"
    value {
      string_value: "public"
    }
  }
  fields {
    key: "deleted"
    value {
      string_value: "no"
    }
  }
  fields {
    key: "created_unix_time"
    value {
      string_value: "2021-01-01T06:00:00Z"
    }
  }
  fields {
    key: "company"
    value {
      string_value: "intel"
    }
  }
}
parent_document_id: "doc-2"
content {
  mime_type: "application/pdf"
  uri: "gs://<bucket>/abehsu/devfest/files/626777_Emmitsburg-BSDL_Rev0p6.pdf"
}
index_time {
  secon

### Health check

In [24]:
# Check indexing for doc-2; the helper returns a timestamp string.
# NOTE(review): presumably the document's index time — confirm in the helper.
agent_builder_util.check_index_status(
    document=documents.documents[1]
)

'2024-11-23T03:07:06.077301Z'

### Create Search app

In [25]:
# Create the search app (engine) on top of the data store and display it.
agent_builder_util.create_engine(
    data_store_ids=[DATA_STORE_ID],
    display_name=APP_ENGINE_DISPLAY_NAME,
    engine_id=APP_ENGINE_ID
)

INFO:root:Waiting for operation to complete...


name: "projects/<project_number>/locations/us/collections/default_collection/engines/devfest_demo_app"
display_name: "devfest_demo_app"
data_store_ids: "devfest_demo"
solution_type: SOLUTION_TYPE_SEARCH
search_engine_config {
  search_tier: SEARCH_TIER_ENTERPRISE
  search_add_ons: SEARCH_ADD_ON_LLM
}

### Search

In [29]:
# Question reused by the search cells that follow.
query="When does Alphabet plan to get to net zero?"

Extractive segment  
Note: when the chunking config is enabled, Google does not allow us to use extractive answers.

In [30]:
# Build a search request with no explicit search_result_mode (the displayed
# request shows DOCUMENTS): up to 3 documents per page, each with up to
# 5 scored extractive segments.
request = agent_builder_util.generate_search_request(
    data_store_id=DATA_STORE_ID,
    query=query,
    max_extractive_segment_count=5,
    return_extractive_segment_score=True,
    page_size=3
)
request


serving_config: "projects/<project>/locations/us/collections/default_collection/dataStores/devfest_demo/servingConfigs/default_serving_config"
query: "When does Alphabet plan to get to net zero?"
page_size: 3
spell_correction_spec {
  mode: SUGGESTION_ONLY
}
content_search_spec {
  extractive_content_spec {
    max_extractive_segment_count: 5
    return_extractive_segment_score: true
  }
  search_result_mode: DOCUMENTS
}

In [31]:
# Run the search, then turn the underlying protobuf message (`_pb`) into a
# plain dict so it can be displayed and indexed with ordinary keys.
search_pager = agent_builder_util.search_client.search(request=request)
results = MessageToDict(search_pager._pb)
results


{'results': [{'id': 'doc-1',
   'document': {'name': 'projects/<project_number>/locations/us/collections/default_collection/dataStores/devfest_demo/branches/0/documents/doc-1',
    'id': 'doc-1',
    'structData': {'company': 'google',
     'file_type': 'pdf',
     'file_name': '20230203-alphabet-10K.pdf',
     'doc_type': 'public',
     'created_unix_time': '2023-02-03T07:00:00Z',
     'deleted': 'no'},
    'derivedStructData': {'extractive_segments': [{'content': "# Ongoing Commitment to Sustainability\n\nWe believe that every business has the opportunity and obligation to protect our planet. Sustainability is one of our core values at Google, and we strive to build sustainability into everything we do. We have been a leader on sustainability and climate change since Google's founding more than 20 years ago. These are some of our key achievements over the past two decades: • In 2007, we became the first major company to be carbon neutral for our operations. • In 2017, we became the f

In [32]:
# Print the id and file name of every document hit.
# MessageToDict leaves out empty repeated fields, so a response without hits
# has no "results" key at all (the filtered search further down returns such
# a dict); .get() keeps this cell from raising KeyError in that case.
# The loop variable is deliberately not called `result`, which would overwrite
# the create-datastore operation handle bound earlier in the notebook.
for search_hit in results.get("results", []):
    print(search_hit["id"], search_hit["document"]["structData"]["file_name"])

doc-1 20230203-alphabet-10K.pdf
doc-3 Tesla-Master-Plan-Part-3.pdf


When we change the search mode to CHUNKS, the extractive_segment will not be returned.

In [33]:
# Same query, but ask for CHUNKS instead of whole documents.
# The extractive-segment arguments are still passed and still appear in the
# displayed request, although chunk results carry no extractive segments.
request = agent_builder_util.generate_search_request(
    data_store_id=DATA_STORE_ID,
    query=query,
    max_extractive_segment_count=5,
    return_extractive_segment_score=True,
    search_result_mode="CHUNKS",
    # Optional arguments, left disabled:
    # num_next_chunks=1,
    # num_previous_chunks=1,
    page_size=10 # number of chunks to return
)
request

serving_config: "projects/<project>/locations/us/collections/default_collection/dataStores/devfest_demo/servingConfigs/default_serving_config"
query: "When does Alphabet plan to get to net zero?"
page_size: 10
spell_correction_spec {
  mode: SUGGESTION_ONLY
}
content_search_spec {
  extractive_content_spec {
    max_extractive_segment_count: 5
    return_extractive_segment_score: true
  }
  search_result_mode: CHUNKS
  chunk_spec {
  }
}

In [34]:
# Run the chunk-mode search, then turn the underlying protobuf message (`_pb`)
# into a plain dict for display.
search_pager = agent_builder_util.search_client.search(request=request)
results = MessageToDict(search_pager._pb)
results


{'results': [{'modelScores': {'relevance_score': {'values': [0.7]}},
   'chunk': {'name': 'projects/<project_number>/locations/us/collections/default_collection/dataStores/devfest_demo/branches/0/documents/doc-1/chunks/c12',
    'id': 'c12',
    'content': "# Ongoing Commitment to Sustainability\n\nWe believe that every business has the opportunity and obligation to protect our planet. Sustainability is one of our core values at Google, and we strive to build sustainability into everything we do. We have been a leader on sustainability and climate change since Google's founding more than 20 years ago. These are some of our key achievements over the past two decades: • In 2007, we became the first major company to be carbon neutral for our operations. • In 2017, we became the first major company to match 100% of our annual electricity use with renewable energy, which we have achieved for five consecutive years. In 2020, we issued $5.75 billion in sustainability bonds—the largest sustain

In [35]:
# Print the id of every returned chunk.
# - `chunk_id` rather than `id`: at notebook scope, assigning to `id` would
#   shadow the builtin for every cell that runs afterwards.
# - .get(): MessageToDict omits the "results" key when there are no hits.
# - `chunk_hit` rather than `result`: avoids overwriting the create-datastore
#   operation handle bound earlier in the notebook.
for chunk_hit in results.get("results", []):
    chunk_id = chunk_hit["chunk"]["id"]
    print(f"chunk id: {chunk_id}")

chunk id: c12
chunk id: c137
chunk id: c6
chunk id: c39
chunk id: c40
chunk id: c7
chunk id: c5
chunk id: c90
chunk id: c144
chunk id: c146


### Apply filter on search

In [36]:
# Filter on file_name with a value that omits the ".pdf" extension.
# The response to this request (next cell) contains no "results" entry.
request = agent_builder_util.generate_search_request(
    data_store_id=DATA_STORE_ID,
    query=query,
    max_extractive_segment_count=5,
    return_extractive_segment_score=True,
    page_size=3,
    filter="file_name: ANY(\"20230203-alphabet-10K\")"
)
request


serving_config: "projects/<project>/locations/us/collections/default_collection/dataStores/devfest_demo/servingConfigs/default_serving_config"
query: "When does Alphabet plan to get to net zero?"
page_size: 3
filter: "file_name: ANY(\"20230203-alphabet-10K\")"
spell_correction_spec {
  mode: SUGGESTION_ONLY
}
content_search_spec {
  extractive_content_spec {
    max_extractive_segment_count: 5
    return_extractive_segment_score: true
  }
  search_result_mode: DOCUMENTS
}

In [37]:
# Run the filtered search, then turn the underlying protobuf message (`_pb`)
# into a plain dict for display.
search_pager = agent_builder_util.search_client.search(request=request)
results = MessageToDict(search_pager._pb)
results

{'attributionToken': '-gHw-QoMCP-whboGEIbc0r0CEiQ2NzNmMWQzNy0wMDAwLTI4ZDItYjc0NC1kNDNhMmNjNzg0ZmYiB0dFTkVSSUMquAGZ3qgv24-aItSynRXk7Ygtzua1L6OJsy2DspoitJKuMOuCsS3LmrQw-fazLcH4vDDE-LwwxPzLMJzd2DDHxrEw_PazLZbeqC-jgJciwvCeFeiCsS2Q97IwkKS0MJnd2DDn7Ygtq8SKLY2ktDDExrEw0ea1L5vWty3ej5oioImzLcH8yzCAspoijr6dFar4sy23kq4wlJLFMK34sy2OkckwtreMLcXL8xec7Z0tzpq0MK7Eii2Y1rctMAE',
 'guidedSearchResult': {},
 'summary': {},
 'queryExpansionInfo': {}}

In [38]:
# Same filter, now with the complete file name including ".pdf";
# the search in the next cell returns doc-1.
request = agent_builder_util.generate_search_request(
    data_store_id=DATA_STORE_ID,
    query=query,
    max_extractive_segment_count=5,
    return_extractive_segment_score=True,
    page_size=3,
    filter="file_name: ANY(\"20230203-alphabet-10K.pdf\")"
)
request


serving_config: "projects/<project>/locations/us/collections/default_collection/dataStores/devfest_demo/servingConfigs/default_serving_config"
query: "When does Alphabet plan to get to net zero?"
page_size: 3
filter: "file_name: ANY(\"20230203-alphabet-10K.pdf\")"
spell_correction_spec {
  mode: SUGGESTION_ONLY
}
content_search_spec {
  extractive_content_spec {
    max_extractive_segment_count: 5
    return_extractive_segment_score: true
  }
  search_result_mode: DOCUMENTS
}

In [39]:
# Run the filtered search and print the id and file name of every hit.
search_pager = agent_builder_util.search_client.search(request=request)
results = MessageToDict(search_pager._pb)
# MessageToDict omits the "results" key when the filter matches nothing (as in
# the previous filtered search), so fall back to an empty list rather than
# raising KeyError. The loop variable is not called `result`, to avoid
# overwriting the create-datastore operation handle bound earlier.
for search_hit in results.get("results", []):
    print(search_hit["id"], search_hit["document"]["structData"]["file_name"])

doc-1 20230203-alphabet-10K.pdf


In [40]:
# Filtering on file_name_2 is rejected by the service with a 400.
# NOTE(review): presumably because the schema does not declare file_name_2
# as indexable (file_name, which is indexable, filters fine) — confirm.
request = agent_builder_util.generate_search_request(
    data_store_id=DATA_STORE_ID,
    query=query,
    max_extractive_segment_count=5,
    return_extractive_segment_score=True,
    page_size=3,
    filter="file_name_2: ANY(\"20230203-alphabet-10K.pdf\")"
)

# The failure is the point of this cell, so report it instead of letting the
# traceback abort a "Restart & Run All". A broad except is used because the
# notebook does not import the API exception classes; the error type is
# printed so nothing is hidden.
try:
    filtered_pager = agent_builder_util.search_client.search(request=request)
except Exception as search_error:  # expected: InvalidArgument (400)
    filtered_pager = None
    print(f"{type(search_error).__name__}: {search_error}")

# Shown only if the search unexpectedly succeeds (None displays nothing).
filtered_pager

InvalidArgument: 400 Request contains an invalid argument.

In [41]:
# Spell-correction demo: send a query containing a typo and compare it with the
# corrected_query field the service returns alongside the response.
origin_query = "What is binY"

request = agent_builder_util.generate_search_request(
    query=origin_query,
    data_store_id=DATA_STORE_ID,
    page_size=3,
    max_extractive_segment_count=5,
    return_extractive_segment_score=True,
)
result = agent_builder_util.search_client.search(request=request)

print(f"origin query:{origin_query}")
print(f"corrected query:{result.corrected_query}")

origin query:What is binY
corrected query:What is bonY
