# Unstructured search and summarization

## Inputs

In [1]:
PROJECT_ID = "gcloud-create-customer"
REGION = "us-central1"
DATASET = "documents"

In [2]:
QUERY = "What claims can I make about supplements vs. beauty products on the label in the US?"

## Setup

In [3]:
# !pip install -q -U google-cloud-aiplatform==1.36.4 google-cloud-bigquery==3.13.0

In [4]:
import subprocess
import vertexai
from google.cloud import bigquery
from vertexai.language_models import ChatModel, TextGenerationModel, CodeGenerationModel

In [5]:
import sys

if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user()

In [6]:
vertexai.init(project=PROJECT_ID, location=REGION)

model_text = TextGenerationModel.from_pretrained("text-bison@latest")
model_chat = ChatModel.from_pretrained("chat-bison@latest")

parameters = {
    "candidate_count": 1,
    "max_output_tokens": 2048,
    "temperature": 0.2,
    "top_p": 0.8,
    "top_k": 40
}

## Step 1: Inspect dataset

In [7]:
from google.cloud import bigquery

client = bigquery.Client()
tables = client.list_tables(DATASET)
output = ""

for item in tables:
    path = DATASET + "." + item.table_id
    table = client.get_table(path)
    output += "Got table '{}.{}.{}'.".format(table.project, table.dataset_id, table.table_id)
    output += "Table schema: {}".format(table.schema)
    output += "Table description: {}".format(table.description)
    output += "Table has {} rows".format(table.num_rows)
    output += "\n"

response = model_text.predict(
    """Summarize the output of these responses from the google-cloud-bigquery library. In a
    subsequent step, a large language model will generate Google SQL queries based on the
    information that you summarize here. Include column names as part of your summary.
    If a table has no data, then you can just give a very short summary. Try to generate a summary that
    is about 100 characters per table.

    Information about the dataset is as follows:
    {output}
    """.format(output=output),
    **parameters
)

output = response.text
print(output)

 | Table | Description |
| ----------- | ----------- |
| gcloud-create-customer.documents.object_table | 0 rows |
| gcloud-create-customer.documents.object_table_sm | 0 rows |
| gcloud-create-customer.documents.process_document_output | 1 row |
| gcloud-create-customer.documents.sample_docs | 101 rows |
| gcloud-create-customer.documents.sample_jsons | 8 rows |


## Step 2: Start session

In [8]:
chat = model_chat.start_chat(
    context="""
            You are an expert claims substantiator that works for a retail company that
            sells beauty products, nutrition products, and durables.
            
            During multiple conversation turns, you'll gather and explore information
            from different data sources to answer the user's question or input.
            
            After gathering some initial information, I will ask you to generate GoogleSQL
            query and Python code, then I'll run the query, then I'll tell you the result
            (if it ran successfully) or error (if it failed). And I'll repeat this process
            until you determine that you have enough information to give a final answer.

            The user input is:
            {query}

            Information about the dataset is as follows:
            {output}""".format(query=QUERY, output=output),
)

In [9]:
response = chat.send_message(
    """
    Given the user query, create 5 high-level questions that you would ask to find the
    answer to the user query in a BigQuery dataset. In a subsequent step, we will
    generate SQL from the questions that you write.
    """, **parameters)
output = response.text
print(output)

 1. What is the difference between supplements and beauty products?
2. What are the regulations for labeling supplements and beauty products in the US?
3. What are the common claims that are made about supplements and beauty products?
4. What evidence is there to support these claims?
5. What are the risks associated with taking supplements and using beauty products?



In [10]:
response = chat.send_message(
    """
    Generate a GoogleSQL query to BigQuery so that we can query the
    database and gather information to help answer the user's question.
    """, **parameters)
generated_sql = response.text
generated_sql = generated_sql.splitlines()
filtered_output = [line for line in generated_sql if '`' not in line]
generated_sql = '\n'.join(filtered_output)
print(generated_sql)

SELECT
  product_type,
  claim,
  COUNT(*) AS num_claims
FROM
  gcloud-create-customer.documents.sample_jsons
WHERE
  product_type IN ('supplement', 'beauty')
  AND claim IS NOT NULL
GROUP BY
  product_type,
  claim
ORDER BY
  num_claims DESC
LIMIT
  10;


In [17]:
generated_python

'\nfrom google.cloud import bigquery\nclient = bigquery.Client()\nquery_job = client.query("""\nSELECT\n  product_type,\n  claim,\n  COUNT(*) AS num_claims\nFROM\n  gcloud\n""")\nresults = query_job.result()\nfor row in results:\n    print(row.product_type, row.claim)\n'

In [13]:
for i in range(10):
    generated_python = """
from google.cloud import bigquery
client = bigquery.Client()
query_job = client.query(\"\"\"
{generated_sql}
\"\"\")
results = query_job.result()
for row in results:
    print(row.product_type, row.claim)
""".format(generated_sql=generated_sql)
    print(generated_sql)
    command = subprocess.run(["python", "-c", generated_python], capture_output=True, text=True)
    try:
        command.check_returncode()
        # print("SUCCESS: Code execution successful")
        result = command.stdout
        # print(result)
        break
    except Exception as err:
        result = command.stderr
        # print(result)
        # print("ERROR: Code execution failed")
        prompt = """
        The previous SQL query that you generated failed to run.

        Please fix the SQL query so that it doesn't throw an error.
        
        This is what the previous command returned: {result}

        Previous SQL query: {generated_sql}
        """.format(result=result, generated_sql=generated_sql)

        print("-------------------------------")
        print(prompt)
        
        response = chat.send_message(prompt, **parameters)
        
        generated_sql = response.text
        generated_sql = generated_sql.splitlines()
        filtered_output = [line for line in generated_sql if '`' not in line]
        generated_sql = '\n'.join(filtered_output)
        # print(generated_sql)
        print("################################")

SELECT
  product_type,
  claim,
  COUNT(*) AS num_claims
FROM
  gcloud-create-customer.documents.sample_jsons
WHERE
  product_type IN ('supplement', 'beauty')
  AND claim IS NOT NULL
GROUP BY
  product_type,
  claim
ORDER BY
  num_claims DESC
LIMIT
  10;
-------------------------------

        The previous SQL query that you generated failed to run.

        Please fix the SQL query so that it doesn't throw an error.
        
        This is what the previous command returned: Traceback (most recent call last):
  File "<string>", line 22, in <module>
  File "/Users/koverholt/miniconda3/lib/python3.11/site-packages/google/cloud/bigquery/job/query.py", line 1580, in result
    do_get_result()
  File "/Users/koverholt/miniconda3/lib/python3.11/site-packages/google/api_core/retry.py", line 366, in retry_wrapped_func
    return retry_target(
           ^^^^^^^^^^^^^
  File "/Users/koverholt/miniconda3/lib/python3.11/site-packages/google/api_core/retry.py", line 204, in retry_target
    ret