In [1]:
#Source: https://ai.google.dev/docs/semantic_retriever

# Authentication (OAuth using service accounts) 

In [2]:
# Location of the service-account JSON key used for OAuth.
# Set the SERVICE_ACCOUNT_FILE environment variable to point at your own key
# file; when it is unset, the original hard-coded path is used as a fallback.
import os

from google.oauth2 import service_account

service_account_file_name = os.environ.get(
    'SERVICE_ACCOUNT_FILE', '/Users/jeana/rag_gemini_service.json')

# Load the service-account credentials from the key file.
credentials = service_account.Credentials.from_service_account_file(service_account_file_name)

# Derive credentials carrying the two scopes requested for this notebook.
scoped_credentials = credentials.with_scopes(
    ['https://www.googleapis.com/auth/cloud-platform', 'https://www.googleapis.com/auth/generative-language.retriever'])

In [3]:
import google.ai.generativelanguage as glm
# Build the three API clients in one pass; each is constructed with the same
# scoped service-account credentials, in the order generative -> retriever -> permission.
generative_service_client, retriever_service_client, permission_service_client = (
    client_class(credentials=scoped_credentials)
    for client_class in (
        glm.GenerativeServiceClient,
        glm.RetrieverServiceClient,
        glm.PermissionServiceClient,
    )
)

# Create Corpus

In [4]:
# Describe the corpus to create; only a display name is supplied here.
example_corpus = glm.Corpus(display_name="Google for Developers Blog")

# Wrap the corpus in a request message and send it through the retriever client.
create_corpus_request = glm.CreateCorpusRequest(corpus=example_corpus)
create_corpus_response = retriever_service_client.create_corpus(
    request=create_corpus_request)

# Remember the returned resource name; later cells pass it as `parent` / `name`.
corpus_resource_name = create_corpus_response.name
print(create_corpus_response)

name: "corpora/google-for-developers-blog-66ccd95rx1mt"
display_name: "Google for Developers Blog"
create_time {
  seconds: 1702979206
  nanos: 361433000
}
update_time {
  seconds: 1702979206
  nanos: 361433000
}



# Get Corpus

In [5]:
# Look the corpus up again by the resource name saved when it was created.
get_corpus_request = glm.GetCorpusRequest(name=corpus_resource_name)
get_corpus_response = retriever_service_client.get_corpus(
    request=get_corpus_request)

# Show the corpus returned by the service.
print(get_corpus_response)

name: "corpora/google-for-developers-blog-66ccd95rx1mt"
display_name: "Google for Developers Blog"
create_time {
  seconds: 1702979206
  nanos: 361433000
}
update_time {
  seconds: 1702979206
  nanos: 361433000
}



# Create Document

In [6]:
# Create a document with a custom display name.
example_document = glm.Document(display_name="Introducing Project IDX, An Experiment to Improve Full-stack, Multiplatform App Development")

# Attach metadata recording the URL of the source blog post.
# Metadata also supports numeric values not specified here
document_metadata = [
    glm.CustomMetadata(key="url", string_value="https://developers.googleblog.com/2023/08/introducing-project-idx-experiment-to-improve-full-stack-multiplatform-app-development.html")]
example_document.custom_metadata.extend(document_metadata)

# Make the CreateDocument request.
# corpus_resource_name is the variable set in the cell that creates the corpus.
# The request gets its own name so it no longer overwrites `create_corpus_request`.
create_document_request = glm.CreateDocumentRequest(parent=corpus_resource_name, document=example_document)
create_document_response = retriever_service_client.create_document(create_document_request)

# Set the `document_resource_name` for subsequent sections.
document_resource_name = create_document_response.name
print(create_document_response)

name: "corpora/google-for-developers-blog-66ccd95rx1mt/documents/introducing-project-idx-an--59760ilslad2"
display_name: "Introducing Project IDX, An Experiment to Improve Full-stack, Multiplatform App Development"
custom_metadata {
  string_value: "https://developers.googleblog.com/2023/08/introducing-project-idx-experiment-to-improve-full-stack-multiplatform-app-development.html"
  key: "url"
}
update_time {
  seconds: 1702979208
  nanos: 396300000
}
create_time {
  seconds: 1702979208
  nanos: 396300000
}



# Get created document

In [7]:
# Fetch the document back using the resource name saved when it was created.
get_document_request = glm.GetDocumentRequest(name=document_resource_name)
get_document_response = retriever_service_client.get_document(
    request=get_document_request)

# Show the document returned by the service.
print(get_document_response)

name: "corpora/google-for-developers-blog-66ccd95rx1mt/documents/introducing-project-idx-an--59760ilslad2"
display_name: "Introducing Project IDX, An Experiment to Improve Full-stack, Multiplatform App Development"
custom_metadata {
  string_value: "https://developers.googleblog.com/2023/08/introducing-project-idx-experiment-to-improve-full-stack-multiplatform-app-development.html"
  key: "url"
}
update_time {
  seconds: 1702979208
  nanos: 396300000
}
create_time {
  seconds: 1702979208
  nanos: 396300000
}



# Ingest and Chunk a document

In [8]:
from google_labs_html_chunker.html_chunker import HtmlChunker

from urllib.request import urlopen

# Download the blog post and decode the response bytes as UTF-8 text.
with urlopen("https://developers.googleblog.com/2023/08/introducing-project-idx-experiment-to-improve-full-stack-multiplatform-app-development.html") as f:
    html = f.read().decode("utf-8")

# Split the HTML into passages with HtmlChunker. This cell only builds the
# Chunk objects locally; the following cell uploads them to the Semantic Retriever API.
chunker = HtmlChunker(
    html_tags_to_exclude={"noscript", "script", "style"},
    greedily_aggregate_sibling_nodes=True,
    max_words_per_aggregate_passage=200,
)
passages = chunker.chunk(html)
print(passages)


# Wrap every passage in a `Chunk` and attach the same three optional metadata
# entries to each one: tags, the chunking strategy used, and the publish date.
chunks = []
for passage in passages:
    chunk = glm.Chunk(data={'string_value': passage})
    chunk.custom_metadata.extend([
        glm.CustomMetadata(
            key="tags",
            string_list_value=glm.StringList(
                values=["Google For Developers", "Project IDX", "Blog", "Announcement"])),
        glm.CustomMetadata(
            key="chunking_strategy",
            string_value="greedily_aggregate_sibling_nodes"),
        glm.CustomMetadata(
            key="publish_date",
            numeric_value=20230808),
    ])
    chunks.append(chunk)
print(chunks)

['Introducing Project IDX, An Experiment to Improve Full-stack, Multiplatform App Development - Google for Developers', 'menu search Back to Google for Developers Latest Get Inspired Announcements Events Resources search Tags Latest Get Inspired Announcements Events Resources Back to Google for Developers', 'Introducing Project IDX, An Experiment to Improve Full-stack, Multiplatform App Development', 'August 08, 2023', 'Link copied to clipboard', 'Posted by Bre Arder, UX Research Lead, Kirupa Chinnathambi, Product Lead, Ashwin Raghav Mohan Ganesh, Engineering Lead, Erin Kidwell, Director of Engineering, and Roman Nurik, Design Lead', "These days, getting an app from zero to production – especially one that works well across mobile, web, and desktop platforms – can feel like building a Rube Goldberg machine.  You’ve got to navigate an endless sea of complexity, duct-taping together a tech stack that'll help you bootstrap, compile, test, deploy, and monitor your apps. While Google’s been

In [9]:
# Option 1: Use HtmlChunker in the section above.
# `chunks` is the variable set from the section above.
# Build one CreateChunkRequest per chunk, all parented to the created document.
create_chunk_requests = [
    glm.CreateChunkRequest(parent=document_resource_name, chunk=chunk)
    for chunk in chunks
]

# Per the Semantic Retriever API reference, a single BatchCreateChunks call
# accepts at most 100 chunks, so upload in slices of that size. With 100 or
# fewer chunks this sends exactly one request, as before.
MAX_CHUNKS_PER_BATCH = 100
for batch_start in range(0, len(create_chunk_requests), MAX_CHUNKS_PER_BATCH):
    request = glm.BatchCreateChunksRequest(
        parent=document_resource_name,
        requests=create_chunk_requests[batch_start:batch_start + MAX_CHUNKS_PER_BATCH])
    response = retriever_service_client.batch_create_chunks(request)
    print(response)

chunks {
  name: "corpora/google-for-developers-blog-66ccd95rx1mt/documents/introducing-project-idx-an--59760ilslad2/chunks/ix5dkga8d28v"
  data {
    string_value: "Introducing Project IDX, An Experiment to Improve Full-stack, Multiplatform App Development - Google for Developers"
  }
  custom_metadata {
    string_list_value {
      values: "Google For Developers"
      values: "Project IDX"
      values: "Blog"
      values: "Announcement"
    }
    key: "tags"
  }
  custom_metadata {
    string_value: "greedily_aggregate_sibling_nodes"
    key: "chunking_strategy"
  }
  custom_metadata {
    numeric_value: 20230808
    key: "publish_date"
  }
  state: STATE_PENDING_PROCESSING
}
chunks {
  name: "corpora/google-for-developers-blog-66ccd95rx1mt/documents/introducing-project-idx-an--59760ilslad2/chunks/xrhdreimwtq4"
  data {
    string_value: "menu search Back to Google for Developers Latest Get Inspired Announcements Events Resources search Tags Latest Get Inspired Announcements Even

# List chunks and get state

In [10]:
# List the chunks stored under the document and report each one's state.
request = glm.ListChunksRequest(parent=document_resource_name)
list_chunks_response = retriever_service_client.list_chunks(request)
# Iterate the returned pager itself so that every page of results is visited;
# its `.chunks` attribute only exposes the chunks of the first page.
# The loop variable is deliberately not called `chunks`, so the list of Chunk
# objects built in the chunking cell is not overwritten.
for index, listed_chunk in enumerate(list_chunks_response):
    print(f'\nChunk # {index + 1}')
    print(f'Resource Name: {listed_chunk.name}')
    # Only ACTIVE chunks can be queried.
    print(f'State: {glm.Chunk.State(listed_chunk.state).name}')


Chunk # 1
Resource Name: corpora/google-for-developers-blog-66ccd95rx1mt/documents/introducing-project-idx-an--59760ilslad2/chunks/370j4xvvsb9f
State: STATE_PENDING_PROCESSING

Chunk # 2
Resource Name: corpora/google-for-developers-blog-66ccd95rx1mt/documents/introducing-project-idx-an--59760ilslad2/chunks/3greskf5m272
State: STATE_PENDING_PROCESSING

Chunk # 3
Resource Name: corpora/google-for-developers-blog-66ccd95rx1mt/documents/introducing-project-idx-an--59760ilslad2/chunks/3nutrg0fntr6
State: STATE_PENDING_PROCESSING

Chunk # 4
Resource Name: corpora/google-for-developers-blog-66ccd95rx1mt/documents/introducing-project-idx-an--59760ilslad2/chunks/3yxvhrogyhc6
State: STATE_PENDING_PROCESSING

Chunk # 5
Resource Name: corpora/google-for-developers-blog-66ccd95rx1mt/documents/introducing-project-idx-an--59760ilslad2/chunks/83e2zxquot9y
State: STATE_PENDING_PROCESSING

Chunk # 6
Resource Name: corpora/google-for-developers-blog-66ccd95rx1mt/documents/introducing-project-idx-an--597

# Query the Corpus

In [11]:
# Semantic search: ask the corpus for the passages most relevant to a question.
user_query = "What is the purpose of Project IDX?"
results_count = 5

# Metadata filter on the chunk-level `tags` entry: the condition uses the
# INCLUDES operator with the value 'Google For Developers'.
# (Only a chunk filter is built here; no document-level filter is added.)
chunk_metadata_filter = glm.MetadataFilter(
    key='chunk.custom_metadata.tags',
    conditions=[
        glm.Condition(
            string_value='Google For Developers',
            operation=glm.Condition.Operator.INCLUDES,
        ),
    ],
)

# Build and send the query against the corpus created earlier
# (`corpus_resource_name` comes from the cell that creates the corpus).
request = glm.QueryCorpusRequest(
    name=corpus_resource_name,
    query=user_query,
    results_count=results_count,
    metadata_filters=[chunk_metadata_filter],
)
query_corpus_response = retriever_service_client.query_corpus(request)
print(query_corpus_response)

relevant_chunks {
  chunk_relevance_score: 0.746129632
  chunk {
    name: "corpora/google-for-developers-blog-66ccd95rx1mt/documents/introducing-project-idx-an--59760ilslad2/chunks/3yxvhrogyhc6"
    data {
      string_value: "Project IDX is a browser-based development experience built on Google Cloud and powered by Codey , a foundational AI model trained on code and built on PaLM 2. It’s designed to make it easier to build, manage and deploy full-stack web and multiplatform applications, with popular frameworks and languages. Project IDX is also built on Code OSS , so it should feel familiar no matter what you’re building. A big part of why we’re sharing Project IDX today is we’d love to hear from the broader developer community on what could help you work even faster. In the meantime, here’s a preview of what’s possible today with Project IDX."
    }
    custom_metadata {
      string_list_value {
        values: "Google For Developers"
        values: "Project IDX"
        values: 

# Attributed Question Answering

In [22]:
# Question to answer, the answer style to request, and the model to use.
user_query = "What is the purpose of Project IDX?"
answer_style = "ABSTRACTIVE" # Or VERBOSE, EXTRACTIVE
MODEL_NAME = "models/aqa"

# Reference: https://ai.google.dev/models/gemini#aqa
# Available base models:
#   'models/chat-bison-001', 'models/text-bison-001', 'models/embedding-gecko-001',
#   'models/gemini-pro', 'models/gemini-pro-vision', 'models/embedding-001', 'models/aqa'

# The same Content message serves both as the retrieval query and as the
# conversation content of the request.
content = glm.Content(parts=[glm.Part(text=user_query)])

# Use the corpus created earlier as the grounding source
# (`corpus_resource_name` comes from the cell that creates the corpus).
retriever_config = glm.SemanticRetrieverConfig(source=corpus_resource_name, query=content)

# Build the GenerateAnswer request and send it through the generative client.
req = glm.GenerateAnswerRequest(
    model=MODEL_NAME,
    contents=[content],
    semantic_retriever=retriever_config,
    answer_style=answer_style,
)
aqa_response = generative_service_client.generate_answer(req)
print(aqa_response)

answer {
  content {
    parts {
      text: "Project IDX is a browser-based development experience built on Google Cloud and powered by Codey, a foundational AI model trained on code and built on PaLM 2. It’s designed to make it easier to build, manage and deploy full-stack web and multiplatform applications, with popular frameworks and languages."
    }
  }
  finish_reason: STOP
  grounding_attributions {
    source_id {
      semantic_retriever_chunk {
        source: "corpora/google-for-developers-blog-66ccd95rx1mt"
        chunk: "corpora/google-for-developers-blog-66ccd95rx1mt/documents/introducing-project-idx-an--59760ilslad2/chunks/3yxvhrogyhc6"
      }
    }
    content {
      parts {
        text: "Project IDX is a browser-based development experience built on Google Cloud and powered by Codey , a foundational AI model trained on code and built on PaLM 2. It’s designed to make it easier to build, manage and deploy full-stack web and multiplatform applications, with popular f

In [13]:
# Display only the answer text: the `text` of the first part of the answer's content.
# `aqa_response` is set by the Attributed Question Answering cell above.
aqa_response.answer.content.parts[0].text

'Project IDX is a browser-based development experience built on Google Cloud and powered by Codey, a foundational AI model trained on code and built on PaLM 2. It’s designed to make it easier to build, manage and deploy full-stack web and multiplatform applications, with popular frameworks and languages.'

### Ingesting another document

In [15]:
# # Create a document with a custom display name.
# example_document = glm.Document(display_name="How it’s Made: Interacting with Gemini through multimodal prompting")

# # Add document metadata.
# # Metadata also supports numeric values not specified here
# document_metadata = [
#     glm.CustomMetadata(key="url", string_value="https://developers.googleblog.com/2023/12/how-its-made-gemini-multimodal-prompting.html")]
# example_document.custom_metadata.extend(document_metadata)

# # Make the CreateDocument request
# # corpus_resource_name is a variable set in the "Create a corpus" section.
# create_document_request = glm.CreateDocumentRequest(parent=corpus_resource_name, document=example_document)
# create_document_response = retriever_service_client.create_document(create_document_request)

# # Set the `document_resource_name` for subsequent sections.
# document_resource_name = create_document_response.name
# print(create_document_response)

# # Chunks - add another webpage from Google for Developers
# with(urlopen("https://developers.googleblog.com/2023/12/how-its-made-gemini-multimodal-prompting.html")) as f:
#   html = f.read().decode("utf-8")

# # Chunk the file using HtmlChunker
# chunker = HtmlChunker(
#     max_words_per_aggregate_passage=100,
#     greedily_aggregate_sibling_nodes=False,
# )
# passages = chunker.chunk(html)

# # Create `Chunk` entities.
# chunks = []
# for passage in passages:
#     chunk = glm.Chunk(data={'string_value': passage})
#     chunk.custom_metadata.append(glm.CustomMetadata(key="tags",
#                                                     string_list_value=glm.StringList(
#                                                         values=["Google For Developers", "Gemini API", "Blog", "Announcement"])))
#     chunk.custom_metadata.append(glm.CustomMetadata(key="chunking_strategy",
#                                                     string_value="no_aggregate_sibling_nodes"))
#     chunk.custom_metadata.append(glm.CustomMetadata(key = "publish_date",
#                                                     numeric_value = 20231206))
#     chunks.append(chunk)

# # Make the request
# create_chunk_requests = []
# for chunk in chunks:
#   create_chunk_requests.append(glm.CreateChunkRequest(parent=document_resource_name, chunk=chunk))
# request = glm.BatchCreateChunksRequest(parent=document_resource_name, requests=create_chunk_requests)
# response = retriever_service_client.batch_create_chunks(request)
# print(response)

### Using Llama Index HTML loader (experimental)

In [16]:
# # source : https://llamahub.ai/l/web-simple_web
# from llama_index import SummaryIndex
# from IPython.display import Markdown, display
# import os
# from llama_index import download_loader

# SimpleWebPageReader = download_loader("SimpleWebPageReader")

# loader = SimpleWebPageReader(html_to_text = True)
# documents = loader.load_data(urls=['https://developers.googleblog.com/2023/08/introducing-project-idx-experiment-to-improve-full-stack-multiplatform-app-development.html'])

# from llama_index.node_parser import SentenceSplitter
# parser = SentenceSplitter()
# nodes = parser.get_nodes_from_documents(documents)
# # NOTE: the parsed text still contains some unnecessary characters, so HtmlChunker is the better option for now.