# Preload

In [None]:
# !pip install google-cloud-bigquery --upgrade
# !pip install google-cloud-aiplatform --upgrade

## Import and Define

In [None]:
import io
import json
import vertexai
import pandas as pd
from typing import Dict
from google.cloud import bigquery, storage
from vertexai.preview.generative_models import grounding, Tool
from vertexai.generative_models import GenerativeModel, Part, SafetySetting, GenerationConfig

In [None]:
project_id = "vtxdemos"
region = "us-east1"
bucket_name_pickles = "etsy-demo"
bigquery_dataset_id = "demos_us"
table_id = "etsy_10k"

In [None]:
storage_client = storage.Client(project=project_id)
bucket = storage_client.bucket(bucket_name_pickles)

In [None]:
bq_client = bigquery.Client(project=project_id)
sql_query = f"SELECT * FROM `{project_id}.{bigquery_dataset_id}.{table_id}`"
df = bq_client.query(sql_query).to_dataframe()

In [None]:
# prompt: from df drop the columns min_price_usd max_price_usd  and pct_discount also drop the rows with nulls on price_usd

df = df.drop(['min_price_usd', 'max_price_usd', 'pct_discount', 'variations'], axis=1)
df = df.dropna(subset=['price_usd']).fillna("None")

## Utility Functions

In [None]:
def df_load_from_gcs(blob_name: str):
  data=bucket.blob(blob_name).download_as_bytes()
  return pd.read_pickle(io.BytesIO(data))

In [None]:
safety_settings = [
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        threshold=SafetySetting.HarmBlockThreshold.BLOCK_NONE
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        threshold=SafetySetting.HarmBlockThreshold.BLOCK_NONE
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        threshold=SafetySetting.HarmBlockThreshold.BLOCK_NONE
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
        threshold=SafetySetting.HarmBlockThreshold.BLOCK_NONE
    ),
]

In [None]:
from concurrent.futures import ThreadPoolExecutor

model_content = GenerativeModel(
    "gemini-1.5-flash-001",
    system_instruction="""
    From the following information create a smoothly description representing all the data you find in the context
    by not losing data.

    keep all the data.

    Output as plain text.
    """
)

def create_content(prompt: str):
  try: re = model_content.generate_content(
        [prompt],safety_settings=safety_settings
    ).text
  except: re = None
  return re

In [None]:
import pickle
from google.cloud import storage

def upload_to_gcs(bucket_name, blob_name, dataframe):
  """Uploads a pandas DataFrame to Google Cloud Storage as a pickle file."""
  storage_client = storage.Client(project=project_id)
  bucket = storage_client.bucket(bucket_name)
  blob = bucket.blob(blob_name)

  with open(blob_name.split("/")[-1],'wb') as f:
    pickle.dump(dataframe, f)

  blob.upload_from_filename(blob_name.split("/")[-1])

  print(f"DataFrame uploaded to gs://{bucket_name}/{blob_name}")

In [None]:
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

import json

def process_row(row):
  """
  Processes a single row, adds the 'content' column, and includes rate limiting.
  Converts all values to strings before JSON encoding.
  """
  try:
    row_dict = row._asdict()

    for key, value in row_dict.items():
        row_dict[key] = str(value)

    prompt = json.dumps(row_dict)
    row_dict['content'] = create_content(prompt)
    time.sleep(60/200)
    return row_dict
  except Exception as e:
    print(f"Error processing row: {e}")
    row_dict = row._asdict()
    row_dict['content'] = None
    return row_dict

rows_processed = 0
total_rows = len(df)

results_list = []

with ThreadPoolExecutor(max_workers=6) as executor:
  for result in executor.map(process_row, df.loc[:, ~df.columns.isin(["image_url", "listing_id_1", "listing_id", "taxonomy_id"])].itertuples(index=False)):
    results_list.append(result)  # Add the result to the list
    rows_processed += 1
    if rows_processed % 6 == 0:  # Print every 6 rows
      remaining_rows = total_rows - rows_processed
      print(f"Processed {rows_processed} rows. Remaining: {remaining_rows}")
results_df = pd.DataFrame(results_list)
df['content'] = results_df['content']
upload_to_gcs(bucket_name_pickles, "backup/data.pkl", df)

Processed 6 rows. Remaining: 9994
Processed 12 rows. Remaining: 9988
Processed 18 rows. Remaining: 9982
Processed 24 rows. Remaining: 9976
Processed 30 rows. Remaining: 9970
Processed 36 rows. Remaining: 9964
Processed 42 rows. Remaining: 9958
Processed 48 rows. Remaining: 9952
Processed 54 rows. Remaining: 9946
Processed 60 rows. Remaining: 9940
Processed 66 rows. Remaining: 9934
Processed 72 rows. Remaining: 9928
Processed 78 rows. Remaining: 9922
Processed 84 rows. Remaining: 9916
Processed 90 rows. Remaining: 9910
Processed 96 rows. Remaining: 9904
Processed 102 rows. Remaining: 9898
Processed 108 rows. Remaining: 9892
Processed 114 rows. Remaining: 9886
Processed 120 rows. Remaining: 9880
Processed 126 rows. Remaining: 9874
Processed 132 rows. Remaining: 9868
Processed 138 rows. Remaining: 9862
Processed 144 rows. Remaining: 9856
Processed 150 rows. Remaining: 9850
Processed 156 rows. Remaining: 9844
Processed 162 rows. Remaining: 9838
Processed 168 rows. Remaining: 9832
Processed

## Gemini Q&A

# Category 1 Preload

In [None]:
system_instructions_cat_1 = """
You are Etsy product expert.

Tasks:
- Create 4 questions that are most likely to be asked from the context itself (answerable with context only).
- Create 4 answers to these questions by looking up product information (context).

Rules:
Condense the response into a clear and concise summary.
Use bullet points whenever appropriate.
Be kind always and reply as descriptive as needed.
Do not add any special characters to the output.
Do not use markdown format. Use plain text only.

Give Questions and Answers:
"""

In [None]:
# Output Schema

generation_config = {
    "max_output_tokens": 8192,
    "temperature": 1,
    "top_p": 0.95,
    "response_mime_type": "application/json"
}


response_schema_cat1 = {
    "type": "OBJECT",
    "properties": {
        "questions_cat1": {
            "type": "ARRAY",
            "items": {
                "type": "STRING"
            }
        },
        "answers_cat1": {
            "type": "ARRAY",
            "items": {
                "type": "STRING"
            }
        }
    }
}

context_model = GenerativeModel(
    "gemini-1.5-flash-001",
    generation_config=GenerationConfig(temperature=1, response_mime_type="application/json", response_schema=response_schema_cat1),
    system_instruction=system_instructions_cat_1
)

In [None]:
re = context_model.generate_content(
    [
     f"""
     Give me Questions and their Answers from the following:

     Context:
     {str(df[df['listing_id'] == 970720611]['content'])}

     Contex Answerable Questions: {str(df[df['listing_id'] == 970720611]['questions_cat1'])}

     Output in JSON questions and answers:
     """], safety_settings=safety_settings)

In [None]:
import vertexai
import time
import json
import pandas as pd
from threading import Lock
from concurrent.futures import ThreadPoolExecutor

# Initialize Vertex AI
vertexai.init(project=project_id, location=region)

# Global variables
rows_processed = 0
total_rows = len(df)
results_list = []
lock = Lock() # Create a lock for thread safety

def process_row(row):
  global rows_processed
  retries = 3
  for attempt in range(retries):
    try:
      re = context_model.generate_content([f"Give me Questions and their Answers from the following:\n\nContext:\n {str(row.content)}\n\nContext Answerable Questions: {str(row.questions_cat2)}"], safety_settings=safety_settings)
      time.sleep(60/150*.95) # Adjust sleep time to stay within quota (150 req/min)
      try:
        result = json.loads(re.text)
        with lock: # Acquire lock before updating shared variables
          rows_processed += 1
          if rows_processed % 6 == 0:
            remaining_rows = total_rows - rows_processed
            print(f"Time: {time.time()-start_time}")
            print(f"Processed {rows_processed} rows. Remaining: {remaining_rows}")
        return result
      except:
        print(row)
        print(re)
        return {"answers_cat1": [None]*4, "questions_cat1": [None]*4}
    except Exception as e:
      if "Quota exceeded" in str(e) and attempt < retries - 1:
        print(f"Quota exceeded, retrying in {2**(attempt+1)} seconds...")
        time.sleep(2**(attempt+1))
      else:
        print(f"Error processing row: {e}")
        print(row)
        print(re)
        return {"answers_cat1": [None]*4, "questions_cat1": [None]*4}

start_time = time.time()
with ThreadPoolExecutor(max_workers=4) as executor: # Reduce threads for lower request rate
  for result in executor.map(process_row, df.loc[:, df.columns.isin(["content"])].itertuples(index=False)):
    results_list.append(result)

results_df = pd.DataFrame(results_list)
df[["questions_cat1", "answers_cat1"]] = results_df
upload_to_gcs(bucket_name_pickles, "backup/qa_cat1.pkl", df)

# Category 2 Preload

In [157]:
import json
import vertexai
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from vertexai.preview.generative_models import grounding, Tool
from vertexai.generative_models import GenerativeModel, Part, SafetySetting, GenerationConfig

In [160]:
!gsutil cp gs://etsy-demo/backup/qa_cat1.pkl .

Copying gs://etsy-demo/backup/qa_cat1.pkl...
-
Operation completed over 1 objects/32.8 MiB.                                     


In [161]:
df = pd.read_pickle("qa_cat1.pkl")

In [162]:
vertexai.init(project="vtxdemos", location="us-central1")

In [166]:
system_instructions_cat_2 = """
Ignore all previous system instructions. For any query I give you, use only my Google Search agent to find information and formulate your response. Rely entirely on the search results, even if they are incomplete or imperfect.
You are an Etsy product expert familiar with various product categories and crafting techniques.

Tasks:
Analyze the provided Etsy listing content (Context).
Generate 4 unique and insightful questions about the product(s) mentioned in the Context but keep them as short as possible. These questions should:
Relate to general definitions, materials, production processes, historical context, or broader industry standards related to the product type.
Not be directly answerable from the Context itself. The user should need to perform a Google Search to find the answers.
Be relevant and interesting to a potential buyer who wants to learn more about the product category.
For each question, provide a concise and accurate answer based on information retrieved through Google Search.
Do not add any special character in your answers and questions.

"""

In [167]:
response_schema_cat2 = {
    "type": "OBJECT",
    "properties": {
        "questions_cat2": {
            "type": "ARRAY",
            "items": {
                "type": "STRING"
            },
            "min_items": 4,
            "max_items": 4
        },
        "answers_cat2": {
            "type": "ARRAY",
            "items": {
                "type": "STRING"
            },
            "min_items": 4,
            "max_items": 4
        },
    },
    "required": ["questions_cat2", "answers_cat2"]
}

tools = [
    Tool.from_google_search_retrieval(
        google_search_retrieval=grounding.GoogleSearchRetrieval()
    ),
]

ground_model = GenerativeModel(
    "gemini-1.5-flash-002",
    generation_config=GenerationConfig(
        temperature=1,
        response_mime_type="application/json",
        response_schema=response_schema_cat2,
        max_output_tokens=4000),
    tools=tools,
    system_instruction=system_instructions_cat_2
)

In [168]:
re = ground_model.generate_content(
    [
      f"""
      Run your tasks:

      Context:
      {(str(df.iloc[0]["content"])).strip()}

      Questions cat1: {','.join(df.iloc[0]["questions_cat1"])}
      """
    ],
    )
print(json.loads(re.text))

{'answers_cat2': ["Ceramic materials vary widely in their heat resistance some types of ceramic can withstand extremely high temperatures while others are more susceptible to cracking or damage from heat exposure Different firing processes also affect durability.  Understanding the specific type of ceramic used and the firing temperature is key to assessing the demon skull's long-term fire resistance", 'The weight of 57 ounces for an 8x8x8 inch ceramic skull seems relatively heavy for a solid ceramic piece of this size.  This suggests the ceramic may be dense or it might incorporate a filler material to achieve this weight.  It could also indicate thicker walls than a typical skull of similar dimensions.  Further investigation into similar ceramic sculptures would help clarify this', 'Many cultures have traditions associating skulls with both negative and positive meanings.  Demons are figures present in various mythologies, often associated with darkness or evil.  Exploring the cultur

In [171]:
# Parallel Processing
results_list = []
job_count = 0
total_jobs = len(df)

def process_row(row):
  global job_count
  content, questions_cat1 = row
  try:
    re = ground_model.generate_content(
        [
          f"""
          Run your tasks:

          Context:
          {(str(content)).strip()}

          Questions cat1: {','.join(questions_cat1)}
          """
        ],
        )
    print(f"Completed job {job_count+1}/{total_jobs}") #Print job number and total
    job_count += 1
    return(json.loads(re.text))

  except:
    print("error")
    job_count += 1 #Increment counter even if there is an error.
    return {"answers_cat2": [None]*4, "questions_cat2": [None]*4,}


with ThreadPoolExecutor(max_workers=6) as executor: # Reduce threads for lower request rate
  # Iterate over both columns, creating tuples
  for result in executor.map(process_row, ((row.content, row.answers_cat1) for row in df.itertuples(index=False))):
    results_list.append(result)

results_df = pd.DataFrame(results_list)
# df[["questions_cat2", "answers_cat2"]] = results_df
# upload_to_gcs(bucket_name_pickles, "backup/qa_cat2_v3.pkl", df)

Completed job 1/10000
Completed job 2/10000
Completed job 3/10000
Completed job 4/10000
Completed job 5/10000
Completed job 6/10000
Completed job 7/10000
Completed job 8/10000
Completed job 9/10000
Completed job 10/10000
Completed job 11/10000
Completed job 12/10000
Completed job 13/10000
Completed job 14/10000
Completed job 15/10000
Completed job 16/10000
Completed job 17/10000
Completed job 18/10000
Completed job 19/10000
Completed job 20/10000
Completed job 21/10000
Completed job 22/10000
Completed job 23/10000
Completed job 24/10000
Completed job 25/10000
Completed job 26/10000
Completed job 27/10000
Completed job 28/10000
Completed job 29/10000
Completed job 30/10000
Completed job 31/10000
Completed job 32/10000
Completed job 33/10000
Completed job 34/10000
Completed job 35/10000
Completed job 36/10000
Completed job 37/10000
Completed job 38/10000
Completed job 39/10000
Completed job 40/10000
Completed job 41/10000
Completed job 42/10000
Completed job 43/10000
Completed job 44/100

KeyboardInterrupt: 

In [None]:
import vertexai
import time
import json
import pandas as pd
from threading import Lock
from concurrent.futures import ThreadPoolExecutor

# Initialize Vertex AI
vertexai.init(project=project_id, location=region)

# Global variables
rows_processed = 0
total_rows = len(df)
results_list = []
lock = Lock() # Create a lock for thread safety

def process_row(row):  # Now accepts a tuple (content, questions_cat2)
  global rows_processed
  content, questions_cat1 = row  # Unpack the tuple
  print(questions_cat1)
  retries = 3
  for attempt in range(retries):
    try:
      re = ground_model.generate_content([f"Give me Questions and their Answers from the following:\n\nContext:\n {str(content)}\n\nContex Answerable Questions: {str(questions_cat1)}\n\n"], safety_settings=safety_settings)
      time.sleep(60/150*.95) # Adjust sleep time to stay within quota (150 req/min)
      try:
        result = json.loads(re.text)
        with lock: # Acquire lock before updating shared variables
          rows_processed += 1
          if rows_processed % 6 == 0:
            remaining_rows = total_rows - rows_processed
            print(f"Time: {time.time()-start_time}")
            print(f"Processed {rows_processed} rows. Remaining: {remaining_rows}")
        return result
      except:
        print("error")
        # print(row)
        # print(re)
        return {"answers_cat2": [None]*4, "questions_cat2": [None]*4}
    except Exception as e:
      if "Quota exceeded" in str(e) and attempt < retries - 1:
        print(f"Quota exceeded, retrying in {2**(attempt+1)} seconds...")
        time.sleep(2**(attempt+1))
      else:
        print(f"Error processing row: {e}")
        # print(row)
        # print(re)
        return {"answers_cat2": [None]*4, "questions_cat2": [None]*4}

start_time = time.time()
with ThreadPoolExecutor(max_workers=2) as executor: # Reduce threads for lower request rate
  # Iterate over both columns, creating tuples
  for result in executor.map(process_row, ((row.content, row.answers_cat1) for row in df.iloc[:10].itertuples(index=False))):
    results_list.append(result)

results_df = pd.DataFrame(results_list)
# df[["questions_cat2", "answers_cat2"]] = results_df
# upload_to_gcs(bucket_name_pickles, "backup/qa_cat2_v2.pkl", df)

['What are the dimensions of the demon skull?', 'How much does the demon skull weigh?', 'What colors is the demon skull available in?', 'Is the demon skull customizable?']
['What are the available sizes for the printable wall art?', 'What file formats are included in the digital download?', 'What is the theme and intended use of the printable wall art?', 'What is the price and how is the product delivered?']
['What material is the Tiffany Style Table Lamp made from?', 'What is the diameter of the lampshade?', 'How tall is the lamp?', 'Where can I purchase the Tiffany Style Table Lamp and what category does it belong to?']
error
['What materials is the box made of?', 'What are the dimensions of the box?', 'What type of clasp and hinges does the box have?', 'Can the box be personalized?']
error
['What is the "Handy One" pepper mill made of?', 'What makes the "Handy One" pepper mill unique?', 'What can the "Handy One" pepper mill be used for?', 'What is the price of the "Handy One" pepper

In [None]:
import vertexai
import time
import json
import pandas as pd

# Initialize Vertex AI
vertexai.init(project=project_id, location=region)

# Global variables
rows_processed = 0
total_rows = len(df)
results_list = []

def process_row(row):  # Now accepts a tuple (content, questions_cat2)
  global rows_processed
  content, questions_cat1 = row  # Unpack the tuple
  retries = 3
  for attempt in range(retries):
    try:
      re = ground_model.generate_content([f"Give me Questions and their Answers from the following:\n\nContext:\n {str(content)}\n\nContext Answerable Questions: {str(questions_cat1)}"], safety_settings=safety_settings)
      time.sleep(60/150*.95) # Adjust sleep time to stay within quota (150 req/min)
      try:
        result = json.loads(re.text)
        rows_processed += 1
        if rows_processed % 6 == 0:
          remaining_rows = total_rows - rows_processed
          print(f"Time: {time.time()-start_time}")
          print(f"Processed {rows_processed} rows. Remaining: {remaining_rows}")
        return result
      except:
        print(row)
        print(re)
        return {"answers_cat2": [None]*4, "questions_cat2": [None]*4}
    except Exception as e:
      if "Quota exceeded" in str(e) and attempt < retries - 1:
        print(f"Quota exceeded, retrying in {2**(attempt+1)} seconds...")
        time.sleep(2**(attempt+1))
      else:
        print(f"Error processing row: {e}")
        print(row)
        print(re)
        return {"answers_cat2": [None]*4, "questions_cat2": [None]*4}

start_time = time.time()

# Iterate over both columns, creating tuples and processing sequentially
for row in df.iloc[:10].itertuples(index=False):
  result = process_row((row.content, row.questions_cat1))
  results_list.append(result)

results_df = pd.DataFrame(results_list)
# df[["questions_cat2", "answers_cat2"]] = results_df
# upload_to_gcs(bucket_name_pickles, "backup/qa_cat2_v2.pkl", df)

This is a mini ceramic demon skull, available in black and gray. It is fireproof and designed to withstand the heat of your fireplace or fire pit, suitable for both gas and wood burning fires. It is almost half the size of a life-size human skull, measuring approximately 6 inches long, 4 inches tall, and 4 inches wide.  The skull is made of ceramic and is available for purchase for $16.00.  It is perfect for Halloween, Christmas, or any time you want to add a touch of spooky to your home. 

The skull is 8 inches wide, 8 inches high, and 8 inches long. It weighs 57 ounces. It is not customizable, but it is available in black or gray. It is made to order and ships directly from the maker. 

['The demon skull is 8 inches wide, 8 inches high, and 8 inches long.', 'It weighs 57 ounces.', 'It is made of ceramic and is available in black or gray.', 'The skull is not customizable but is made to order and ships directly from the maker.']
This printable wall art, inspired by Paris, is a charming

In [None]:
df= pd.read_pickle("qa_cat2.pkl")

# Category 3 Preload

- Build BigQuery Table with Embeddings (Text and MultiModal)

## Preparing Data
- Moving etsy images to cdn.

In [None]:
import os
import tqdm
import requests
from google.cloud import storage
from urllib.parse import urlparse
from requests.exceptions import MissingSchema
from concurrent.futures import ThreadPoolExecutor

In [None]:
suffix = "10k"
table_id = "etsy_10k"
bigquery_dataset_id = "demos_us"
bucket_name = "vtxdemos-fstoresearch-datasets"

### Copying Images from Etsy to Google Cloud CDN

In [None]:
public_gcs_link = []
private_gcs_link = []
public_cdn_link = []

storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)

total_images = len(df)
pbar = tqdm.tqdm(total=total_images, desc="Copying Images")

def process_image(args):
  index, row = args
  url = row["image_url"]
  if url is None:
    return None, None, None

  try:
    parsed_url = urlparse(url)
    filename = os.path.basename(parsed_url.path)
    response = requests.get(url)
    response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
    blob = bucket.blob(f"etsy-{suffix}/"+filename)
    blob.upload_from_string(response.content, content_type='image/jpeg')
    public_gcs_url = blob.public_url
    private_gcs_url = f"gs://{bucket_name}/etsy-{suffix}/{filename}"
    public_cdn_url = f"https://gcpetsy.sonrobots.net/etsy-{suffix}/{filename}"
    return public_gcs_url, private_gcs_url, public_cdn_url
  except (requests.exceptions.RequestException, MissingSchema) as e:
    print(f"Error processing URL {url} at index {index}: {e}")
    return None, None, None
  finally:
    pbar.update(1)


with ThreadPoolExecutor() as executor:
  results = list(executor.map(process_image, df.iterrows()))

for public_url, private_url, cdn_url in results:
  public_gcs_link.append(public_url)
  private_gcs_link.append(private_url)
  public_cdn_link.append(cdn_url)

df["public_gcs_link"] = public_gcs_link
df["private_gcs_link"] = private_gcs_link
df["public_cdn_link"] = public_cdn_link

pbar.close()

Copying Images:   0%|          | 0/10000 [00:41<?, ?it/s]
Copying Images:  16%|█▌        | 1580/10000 [00:57<03:19, 42.25it/s]

Error processing URL None at index 1584: Invalid URL 'None': No scheme supplied. Perhaps you meant https://None?


Copying Images:  90%|████████▉ | 8989/10000 [06:24<00:37, 27.16it/s]

Error processing URL None at index 8996: Invalid URL 'None': No scheme supplied. Perhaps you meant https://None?


Copying Images:  92%|█████████▏| 9247/10000 [06:34<00:29, 25.64it/s]

Error processing URL None at index 9255: Invalid URL 'None': No scheme supplied. Perhaps you meant https://None?


Copying Images: 100%|██████████| 10000/10000 [07:31<00:00, 22.16it/s]


In [None]:
sql_query = f"""
CREATE OR REPLACE EXTERNAL TABLE
  `demos_us.vtxdemos-etsy-biglake-images-{suffix}`
WITH CONNECTION `us.emb_connection`
OPTIONS (
  object_metadata = 'SIMPLE',
  uris = ['gs://vtxdemos-fstoresearch-datasets/etsy-{suffix}/*']
  )
"""

job = bq_client.query(sql_query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x78c5f26a3550>

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   listing_id        10000 non-null  Int64  
 1   title             10000 non-null  object 
 2   description       10000 non-null  object 
 3   price_usd         10000 non-null  float64
 4   tags              10000 non-null  object 
 5   materials         10000 non-null  object 
 6   attributes        10000 non-null  object 
 7   image_url         10000 non-null  object 
 8   listing_id_1      10000 non-null  Int64  
 9   taxonomy_id       10000 non-null  Int64  
 10  category          10000 non-null  object 
 11  content           7799 non-null   object 
 12  questions_cat1    10000 non-null  object 
 13  answers_cat1      9990 non-null   object 
 14  questions_cat2    9859 non-null   object 
 15  answers_cat2      9963 non-null   object 
 16  public_gcs_link   9997 non-null   object 

In [None]:
import pandas as pd

def process_data(data):
  """Processes data, handling None, NaN values, and single float values."""
  if isinstance(data, float) and pd.isnull(data):
    print("Single NaN value encountered. Returning an empty list.")
    return []
  elif isinstance(data, list) and all(pd.isnull(item) for item in data):
    print("All elements are None or NaN. Returning an empty list.")
    return []
  else:
    # Check for None within the list
    if isinstance(data,list) and any(pd.isnull(item) for item in data):
      print(f"Found None/NaN within the list: {data}")
    return data

# List of columns to process
columns_to_process = ['questions_cat1', 'answers_cat1', 'questions_cat2', 'answers_cat2']

# Apply the process_data function to the specified columns
for column in columns_to_process:
  df[column] = df[column].apply(process_data)

# Now load the modified DataFrame to BigQuery
job = bq_client.load_table_from_dataframe(df, f"{bigquery_dataset_id}.vtxdemos-etsy-{suffix}")
job.result()

All elements are None or NaN. Returning an empty list.
All elements are None or NaN. Returning an empty list.
All elements are None or NaN. Returning an empty list.
All elements are None or NaN. Returning an empty list.
All elements are None or NaN. Returning an empty list.
All elements are None or NaN. Returning an empty list.
All elements are None or NaN. Returning an empty list.
All elements are None or NaN. Returning an empty list.
All elements are None or NaN. Returning an empty list.
All elements are None or NaN. Returning an empty list.
All elements are None or NaN. Returning an empty list.
All elements are None or NaN. Returning an empty list.
All elements are None or NaN. Returning an empty list.
All elements are None or NaN. Returning an empty list.
All elements are None or NaN. Returning an empty list.
All elements are None or NaN. Returning an empty list.
All elements are None or NaN. Returning an empty list.
All elements are None or NaN. Returning an empty list.
All elemen

LoadJob<project=vtxdemos, location=US, id=01f09914-491b-49cb-a4ac-db580dc40a28>

### Create Embeddings

In [None]:
%%bigquery
CREATE OR REPLACE TABLE
  `demos_us.etsy-10k-embeddings` AS (
  WITH
    text_embeddings AS (
    SELECT
      *
    FROM
      ML.GENERATE_TEXT_EMBEDDING(MODEL demos_us.text_embedding_044,
        (
        SELECT
        *
        FROM
          `vtxdemos.demos_us.vtxdemos-etsy-10k`
        ORDER BY
          private_gcs_link )) ),
    image_embeddings AS (
    SELECT
      *
    FROM
      ML.GENERATE_EMBEDDING( MODEL `vtxdemos.demos_us.multimodalembedding`,
        TABLE `vtxdemos.demos_us.vtxdemos-etsy-biglake-images-10k`,
        STRUCT( TRUE AS flatten_json_output ) )
    ORDER BY
      uri)
  SELECT
    te.*,
    ie.*,
  FROM
    text_embeddings te
  INNER JOIN
    image_embeddings ie
  ON
    te.private_gcs_link = ie.uri
)

Query is running:   0%|          |

#### Create Titles and Summary

In [None]:
sql_query = f"""
CREATE OR REPLACE TABLE `demos_us.etsy-10k-embeddings-title` AS (
SELECT
  ml_generate_text_llm_result AS llm_title,
  * EXCEPT (prompt,
    ml_generate_text_status, ml_generate_text_llm_result, ml_generate_text_rai_result)
FROM
  ML.GENERATE_TEXT( MODEL `demos_us.gemini15`,
    (
    SELECT
      *,
      CONCAT("From the following text, create a short title, NO MORE than 4 words: text: ", title, " 4 words only output:") AS prompt
    FROMtx
      `demos_us.etsy-10k-embeddings`),
    STRUCT( 0.2 AS temperature,
      TRUE AS FLATTEN_JSON_OUTPUT ))
)
"""

job = bq_client.query(sql_query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x78c63c7a3a60>

In [None]:
sql_query = f"""
CREATE OR REPLACE TABLE `demos_us.etsy-10k-embeddings-title` AS (
SELECT
  ml_generate_text_llm_result AS summary,
  * EXCEPT (prompt,
    ml_generate_text_status, ml_generate_text_llm_result, ml_generate_text_rai_result)
FROM
  ML.GENERATE_TEXT( MODEL `demos_us.gemini15`,
    (
    SELECT
      *,
      CONCAT("From the following context, create a brief summary: context: ", content) AS prompt
    FROM
      `demos_us.etsy-10k-embeddings-title`
      ),
    STRUCT( 0.2 AS temperature,
      TRUE AS FLATTEN_JSON_OUTPUT ))
)
"""

job = bq_client.query(sql_query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x78c63c7a2fe0>

In [None]:
%%bigquery
CREATE OR REPLACE TABLE
  `demos_us.etsy-10k-full` AS (
  SELECT
    CAST(listing_id AS STRING) AS listing_id,
    * EXCEPT(listing_id,
      md5_hash,
      size,
      ml_generate_embedding_start_sec,
      ml_generate_embedding_end_sec,
      ml_generate_embedding_status,
      statistics
      )
  FROM
    `vtxdemos.demos_us.etsy-10k-embeddings-title`
  WHERE
    ARRAY_LENGTH(text_embedding) = 768
    AND ARRAY_LENGTH(ml_generate_embedding_result) = 1408)

Query is running:   0%|          |



### Creating Featurestore (Vector Database)

In [None]:
from vertexai.resources.preview import feature_store

In [None]:
feature_store_online_id = "feature_store_marketplace"

In [None]:
fos = feature_store.FeatureOnlineStore.create_optimized_store(
    feature_store_online_id
)

INFO:vertexai.resources.preview.feature_store.feature_online_store:Creating FeatureOnlineStore
INFO:vertexai.resources.preview.feature_store.feature_online_store:Create FeatureOnlineStore backing LRO: projects/254356041555/locations/us-east1/featureOnlineStores/feature_store_marketplace/operations/7890546378126917632
INFO:vertexai.resources.preview.feature_store.feature_online_store:FeatureOnlineStore created. Resource name: projects/254356041555/locations/us-east1/featureOnlineStores/feature_store_marketplace
INFO:vertexai.resources.preview.feature_store.feature_online_store:To use this FeatureOnlineStore in another session:
INFO:vertexai.resources.preview.feature_store.feature_online_store:feature_online_store = aiplatform.FeatureOnlineStore('projects/254356041555/locations/us-east1/featureOnlineStores/feature_store_marketplace')


#### Create FeatureViews

In [None]:
import time

start_time = time.time()
bigquery_source = feature_store.utils.FeatureViewBigQuerySource(
        uri="bq://vtxdemos.demos_us.etsy-10k-full",
        entity_id_columns=["listing_id"],
    )
index_config = feature_store.utils.IndexConfig(
    embedding_column="text_embedding",
    dimensions=768,
    algorithm_config=feature_store.utils.TreeAhConfig(),
)

fv = fos.create_feature_view(
    name="etsy_view_text1",
    source=bigquery_source,
    index_config=index_config,
)
print(time.time() - start_time)

INFO:vertexai.resources.preview.feature_store.feature_online_store:Creating FeatureView
INFO:vertexai.resources.preview.feature_store.feature_online_store:Create FeatureView backing LRO: projects/254356041555/locations/us-east1/featureOnlineStores/feature_store_marketplace/featureViews/etsy_view_text1/operations/2731497083113570304
INFO:vertexai.resources.preview.feature_store.feature_online_store:FeatureView created. Resource name: projects/254356041555/locations/us-east1/featureOnlineStores/feature_store_marketplace/featureViews/etsy_view_text1
INFO:vertexai.resources.preview.feature_store.feature_online_store:To use this FeatureView in another session:
INFO:vertexai.resources.preview.feature_store.feature_online_store:feature_view = aiplatform.FeatureView('projects/254356041555/locations/us-east1/featureOnlineStores/feature_store_marketplace/featureViews/etsy_view_text1')


1.3132998943328857


In [None]:
sync_response = fv.sync()

In [None]:
import time

start_time = time.time()
while True:
    feature_view_sync = fv.get_sync(
        sync_response.resource_name.split("/")[9]
    ).gca_resource
    if feature_view_sync.run_time.end_time.seconds > 0:
        status = "Succeed" if feature_view_sync.final_status.code == 0 else "Failed"
        print(f"Sync {status} for {feature_view_sync.name}. \n {feature_view_sync}")
        # wait a little more for the job to properly shutdown
        time.sleep(30)
        break
    else:
        print("Sync ongoing, waiting for 30 seconds.")
    time.sleep(30)
print(time.time() - start_time)

Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing

#### Indexing for Image (Multimodal) Embeddings

In [None]:
import time

start_time = time.time()
bigquery_source = feature_store.utils.FeatureViewBigQuerySource(
        uri="bq://vtxdemos.demos_us.etsy-10k-full",
        entity_id_columns=["listing_id"],
    )
index_config = feature_store.utils.IndexConfig(
    embedding_column="ml_generate_embedding_result",
    dimensions=1408,
    algorithm_config=feature_store.utils.TreeAhConfig(),
)

fv = fos.create_feature_view(
    name="etsy_view_image1",
    source=bigquery_source,
    index_config=index_config,
)
print(time.time() - start_time)

In [None]:
sync_response = fv.sync()

In [None]:
import time

start_time = time.time()
while True:
    feature_view_sync = fv.get_sync(
        sync_response.resource_name.split("/")[9]
    ).gca_resource
    if feature_view_sync.run_time.end_time.seconds > 0:
        status = "Succeed" if feature_view_sync.final_status.code == 0 else "Failed"
        print(f"Sync {status} for {feature_view_sync.name}. \n {feature_view_sync}")
        # wait a little more for the job to properly shutdown
        time.sleep(30)
        break
    else:
        print("Sync ongoing, waiting for 30 seconds.")
    time.sleep(30)
print(time.time() - start_time)

### Using Gemini + RAG (FeatureStore) to Look at Similarities

In [None]:
import json
import asyncio
from vertexai.vision_models import MultiModalEmbeddingModel, Image
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

In [None]:
text_emb_model = TextEmbeddingModel.from_pretrained("text-embedding-004")
image_emb_model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")

In [None]:
fv_multi = feature_store.FeatureView(name="projects/254356041555/locations/us-east1/featureOnlineStores/feature_store_marketplace/featureViews/etsy_view_image1")
fv_text = feature_store.FeatureView(name="projects/254356041555/locations/us-east1/featureOnlineStores/feature_store_marketplace/featureViews/etsy_view_text1")

#### Using Parallelism to Retrieve Matches from Different FeatureViews (Vector Dimensional Spaces)

In [None]:
def response_process(result, multimodal: bool):
  neighbors = result["neighbors"]

  all_extracted_data = []
  for row in neighbors:
    extracted_data = {}
    if multimodal:
      extracted_data['image_distance'] = row['distance']  # Extract distance
    else:
      extracted_data['text_distance'] = row['distance']  # Extract distance

    for feature in row['entity_key_values']['key_values']['features']:
      name = feature['name']
      if name not in ['ml_generate_embedding_result', 'text_embedding']:
        if 'value' in feature:
          for value_type, value in feature['value'].items():
            extracted_data[name] = value
        else:
          extracted_data[name] = "no values"

    all_extracted_data.append(extracted_data)

  dataframe = pd.DataFrame(all_extracted_data)

  return dataframe

def vector_search(prompt: str, image: str = None, multimodal=True):
  if multimodal:
    if image:
      image = Image.load_from_file(image)
      embeddings = image_emb_model.get_embeddings(
          contextual_text=prompt,
      ).text_embedding
    else:
      embeddings = image_emb_model.get_embeddings(
          contextual_text=prompt,
          image=image,
      ).text_embedding

    r = fv_multi.search(
        embedding_value = embeddings,
        neighbor_count = 6,
        approximate_neighbor_candidates=16,
        leaf_nodes_search_fraction=1.0,
        return_full_entity=True,  # returning entities with metadata
    ).to_dict()
    df = response_process(r, multimodal)

  else:
    texts = [prompt]
    inputs = [TextEmbeddingInput(text, "RETRIEVAL_DOCUMENT") for text in texts]
    embeddings = text_emb_model.get_embeddings(inputs)[0].values
    r = fv_text.search(
        embedding_value = embeddings,
        neighbor_count = 6,
        approximate_neighbor_candidates=16,
        leaf_nodes_search_fraction=1.0,
        return_full_entity=True,  # returning entities with metadata
    ).to_dict()

    df = response_process(r, multimodal)
  return df

async def async_vector_search(input: str):
  with ThreadPoolExecutor() as executor:
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(executor, vector_search, input)

# RAG
def parallel_vector_search(input: str, image: str = None):
  with ThreadPoolExecutor() as executor:
    start_time=time.time()
    df_1 = executor.submit(vector_search, prompt=input, multimodal=True)
    df_2 = executor.submit(vector_search, prompt=input, multimodal=False)

    df_1 = df_1.result()
    df_2 = df_2.result()

    # Rename the distance column to distinguish between text and image
    df_1 = df_1.rename(columns={'distance_to_average_review': 'text_distance'})
    df_2 = df_2.rename(columns={'distance_to_average_review': 'image_distance'})

    print(df_1.columns)
    print(df_2.columns)


    # Perform an outer join to combine results, handling cases where
    def get_values(x):
        try:
            return tuple(x["values"])
        except (TypeError, KeyError):
            # Handle errors, e.g., return an empty tuple or special value
            return ()

    df_1["questions_cat1"] = df_1["questions_cat1"].apply(get_values)
    df_1["answers_cat1"] = df_1["answers_cat1"].apply(get_values)
    df_1["questions_cat2"] = df_1["questions_cat2"].apply(get_values)
    df_1["answers_cat2"] = df_1["answers_cat2"].apply(get_values)
    df_2["questions_cat1"] = df_2["questions_cat1"].apply(get_values)
    df_2["answers_cat1"] = df_2["answers_cat1"].apply(get_values)
    df_2["questions_cat2"] = df_2["questions_cat2"].apply(get_values)
    df_2["answers_cat2"] = df_2["answers_cat2"].apply(get_values)

    # Merge the DataFrames
    combined_results = pd.merge(df_1, df_2, on=['title', 'price_usd', 'public_cdn_link', 'private_gcs_link', 'content', 'description', 'materials', 'questions_cat1', 'questions_cat2', 'answers_cat1', 'answers_cat2', 'tags', 'llm_title', "summary"], how='outer')

    # Fill missing values (in case an row has only one type of embedding)
    combined_results['text_distance'] = combined_results['text_distance'].fillna(-1000)  # Large negative value if no text embedding
    combined_results['image_distance'] = combined_results['image_distance'].fillna(-1000)  # Large negative value if no image embedding

    # Apply weights (e.g., 70% text, 30% image)
    combined_results['weighted_distance'] = (0.5 * abs(combined_results['text_distance'])) + (0.5 * abs(combined_results['image_distance']))

    ranked_df = combined_results.sort_values('weighted_distance')

    response = [
        {
            "title": row["llm_title"],
            "subtitle":  row["title"],
            "price": row["price_usd"],
            "summary":  row["summary"],
            "uri": row["public_cdn_link"],
            "private_uri": row["private_gcs_link"],
            "content": row["content"],
            "description": row["description"],
            "materials": row["materials"],
            "tags": row["tags"],
            "questions_cat1": row["questions_cat1"],
            "questions_cat2": row["questions_cat2"],
            "answers_cat1": row["answers_cat1"],
            "answers_cat2": row["answers_cat2"],
        } for index, row in ranked_df.iterrows()]

    # Wait for both futures to complete and return their results
    return response

In [None]:
query = "starbucks mermaid"
start_time = time.time()
re = parallel_vector_search(query)
print(f"Retrieval Time: {time.time()-start_time}")
for i in re:
  print(i["uri"])

Index(['image_distance', 'ml_embed_text_status', 'summary', 'llm_title',
       'title', 'description', 'price_usd', 'tags', 'materials', 'attributes',
       'image_url', 'listing_id_1', 'taxonomy_id', 'category', 'content',
       'questions_cat1', 'answers_cat1', 'questions_cat2', 'answers_cat2',
       'public_gcs_link', 'private_gcs_link', 'public_cdn_link', 'uri',
       'generation', 'content_type', 'updated'],
      dtype='object')
Index(['text_distance', 'ml_embed_text_status', 'summary', 'llm_title',
       'title', 'description', 'price_usd', 'tags', 'materials', 'attributes',
       'image_url', 'listing_id_1', 'taxonomy_id', 'category', 'content',
       'questions_cat1', 'answers_cat1', 'questions_cat2', 'answers_cat2',
       'public_gcs_link', 'private_gcs_link', 'public_cdn_link', 'uri',
       'generation', 'content_type', 'updated'],
      dtype='object')
Retrieval Time: 0.6391451358795166
https://gcpetsy.sonrobots.net/etsy-10k/il_570xN.5812467473_6nfy.jpg
https://gc

In [None]:
image = Image.load_from_file("gs://vtxdemos-fstoresearch-datasets/etsy-10k/il_570xN.4594309340_i6j3.jpg")
embeddings = image_emb_model.get_embeddings(
    image=image,
    contextual_text="mermaid",
)

In [None]:
# prompt: do a numpy dot product between embeddings.text_embedding and embeddings.image_embedding

import numpy as np
np.dot(embeddings.text_embedding, embeddings.image_embedding)

0.053123318284467994

In [None]:
r = fv_multi.search(
    embedding_value = embeddings.image_embedding,
    neighbor_count = 6,
    approximate_neighbor_candidates=16,
    leaf_nodes_search_fraction=1.0,
    return_full_entity=True,  # returning entities with metadata
).to_dict()

df = response_process(r, multimodal=True)

In [None]:
r = fv_multi.search(
    embedding_value = embeddings.text_embedding,
    neighbor_count = 6,
    approximate_neighbor_candidates=16,
    leaf_nodes_search_fraction=1.0,
    return_full_entity=True,  # returning entities with metadata
).to_dict()

df = response_process(r, multimodal=True)

## **Cateroy 3 Embeddings + FeatureStore**

In [None]:
def image_embeddings(gsc_uri: str):
  image = Image.load_from_file(gsc_uri)
  embeddings = image_emb_model.get_embeddings(
    image=image
    ).image_embedding

  r = fv_multi.search(
  embedding_value = embeddings,
  neighbor_count = 6,
  approximate_neighbor_candidates=16,
  leaf_nodes_search_fraction=1.0,
  return_full_entity=True,  # returning entities with metadata
  ).to_dict()
  df = response_process(r, multimodal=True)
  return df.iloc[1:5]

def text_embeddings(content: str):
    texts = [content]
    inputs = [TextEmbeddingInput(text, "RETRIEVAL_DOCUMENT") for text in texts]
    embeddings = text_emb_model.get_embeddings(inputs)[0].values
    r = fv_text.search(
        embedding_value = embeddings,
        neighbor_count = 6,
        approximate_neighbor_candidates=16,
        leaf_nodes_search_fraction=1.0,
        return_full_entity=True,  # returning entities with metadata
    ).to_dict()

    df = response_process(r, multimodal=False)
    return df.iloc[1:5]

In [None]:
emb_df.iloc[0]["content"]

'This personalized pink fairies drawstring bag is the perfect choice for kids who love dance, ballet, or simply need a stylish and practical bag for their games. It\'s made from high-quality polyester and features a vibrant design with 100s of options available to choose from. You can personalize it with a name or initials, making it a unique and special gift. The bag measures 44cm x 36.4cm and is equipped with a drawstring closure for easy access. It\'s ideal for carrying essentials like shoes, clothes, and toys.\n\nThe bag is made-to-order and the personalization process is straightforward. Simply choose your desired design and add the text you would like in the "Add your personalization" box during checkout. While proofs aren\'t usually provided, you can request one if you need to ensure the image is correct, especially if you\'re adding a custom image.\n\nShipping times vary depending on your location:\n\n* **UK:** 1-3 business days\n* **Europe:** 5-7 business days\n* **Rest of the

In [None]:
cat_3_schema = {
  "type": "OBJECT",
  "properties": {
    "visual_image_context": {
      "type": "OBJECT",
      "properties": {
        "questions": {
          "type": "ARRAY",
          "items": { "type": "STRING" },
          "min_items": 2,
          "max_items": 2
        },
        "answers": {
          "type": "ARRAY",
          "items": { "type": "STRING" },
          "min_items": 2,
          "max_items": 2
        }
      },
      "required": ["questions", "answers"]
    },
    "textual_context": {
      "type": "OBJECT",
      "properties": {
        "questions": {
          "type": "ARRAY",
          "items": { "type": "STRING" },
          "min_items": 2,
          "max_items": 2
        },
        "answers": {
          "type": "ARRAY",
          "items": { "type": "STRING" },
          "min_items": 2,
          "max_items": 2
        }
      },
      "required": ["questions", "answers"]
    }
  },
  "required": ["visual_image_context", "textual_context"]
}

system_instruction = '''
You are a virtual assistant designed to analyze Etsy listings and generate questions and answers that help users discover similar listings.

You will be provided with the following information:
* **visual_image_context:** Visual features of the listing images (colors, shape, style, objects, and other things that can match).
* **textual_context:** textual metadata (title, description, tags, materials, etc.).

Tasks:
1. **Identify key visual and textual characteristics:**  Based on the provided context, determine the prominent visual elements (e.g., style, color, shape) and textual themes (e.g., materials, keywords, intended use) of the listing.
2. **Questions Generator:**  Generate 4 concise and engaging questions that highlight potential areas of similarity with other listings.
    * 2 questions should focus on visual similarity (based on visual_image_context).
    * 2 questions should focus on textual similarity (based on textual_context).
3. **Answers Generator:** For each question, provide a detailed answer that explains how the question relates to the listing's characteristics and suggests what a user might find if they explore similar listings based on that aspect.

Rules:
Set your output for 2 questions and 2 answers for EACH type: visual_image_context and  textual_context.

Output Example:

{
  "visual_image_context": {
    "questions": [
      "Interested in similar cable-knit patterns?",
      "See other scarves in dark blue?"
    ],
    "answers": [
      "This scarf features a classic cable-knit pattern. Explore similar listings to discover other items with this beautiful and textured design.",
      "You've selected 'dark blue', This scarf is a rich dark blue color..."
    ]
  },
  "textual_context": {
  "questions": [
    "Explore more handmade wool items?",
    "Find more winter accessories?"
    ],
    "answers": [
      "This scarf is made from 100% Merino wool and is hand-knitted. Discover other beautifully crafted items made from warm and cozy wool.",
      "This scarf is perfect for keeping warm during the winter months...."
    ]
  }
}

'''

In [None]:
rec_model = GenerativeModel(
    "gemini-1.5-flash-001",
    system_instruction=system_instruction,
    generation_config=GenerationConfig(response_mime_type="application/json", response_schema=cat_3_schema, max_output_tokens=4000),
    safety_settings=safety_settings,
)

errors = []
documents = []
text_rec = []
image_rec = []

for n, (index, row) in enumerate(emb_df.iterrows()):
   if n<10:
    print(f"Analizing Listing Number: {n}")
    img_df = image_embeddings(row["private_gcs_link"]).iloc[0:2]
    img_link = img_df["private_gcs_link"].tolist()
    txt_df = text_embeddings(row["content"]).iloc[0:2]
    txt_content = txt_df["content"].tolist()
    print(len(img_df))

    # if n == 0:
    #   print(row["content"])
    #   print("private")
    #   print(row["public_cdn_link"])
    #   print("public")
    #   print(img_df["public_cdn_link"].iloc[0])
    #   print(txt_df["public_cdn_link"].iloc[0])

    links = [Part.from_uri(mime_type="image/jpeg", uri=link) for link in img_link]

    text_rec.append({"title": txt_df["title"].to_list(), "image_uri": txt_df["public_cdn_link"].to_list()})
    image_rec.append({"title": img_df["title"].tolist(), "image_uri": img_df["public_cdn_link"].to_list()})

    prompt = f"""
    Generate questions and answers that would help a user discover similar Etsy listings.

    textual_context:
    Listing text 1, {txt_content[0]}
    Listing text 2, {txt_content[1]}

    visual_image_context:
    """

    try:
      responses = rec_model.generate_content(
        [
            prompt,
            "Listing image 1:\n",
            links[0],
            "Listing image 2:\n",
            links[1],

            'Output format: [{"content_type": <String>, "questions": <Array 2 items max>, "answers": <Array 2 items max>}, {...}, ...]'

        ],
        )
      try:
        json.loads(responses.text)
        documents.append(responses.text)
      except:
        print("error class 2")
        print(responses.text)
        errors.append({"class_2": {f"{n}": responses.text}})
        documents.append("error")
    except:
      print("error class 1")
      print(responses)
      errors.append({"class_1": {f"{n}"}})
      documents.append("error")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Analizing Listing Number: 5298
2
Analizing Listing Number: 5299
2
Analizing Listing Number: 5300
2
Analizing Listing Number: 5301
2
Analizing Listing Number: 5302
2
Analizing Listing Number: 5303
2
Analizing Listing Number: 5304
2
Analizing Listing Number: 5305
2
Analizing Listing Number: 5306
2
Analizing Listing Number: 5307
2
Analizing Listing Number: 5308
2
Analizing Listing Number: 5309
2
Analizing Listing Number: 5310
2
Analizing Listing Number: 5311
2
Analizing Listing Number: 5312
2
Analizing Listing Number: 5313
2
Analizing Listing Number: 5314
2
Analizing Listing Number: 5315
2
Analizing Listing Number: 5316
2
Analizing Listing Number: 5317
2
Analizing Listing Number: 5318
2
Analizing Listing Number: 5319
2
Analizing Listing Number: 5320
2
Analizing Listing Number: 5321
2
Analizing Listing Number: 5322
2
Analizing Listing Number: 5323
2
Analizing Listing Number: 5324
2
Analizing Listing Number: 5325
2
Analizing L

In [None]:
list_id = []

for index, row in emb_df.iterrows():
  list_id.append(row["listing_id"])


def modify_data_as_arrays(data, key1, key2):
  """
  Modifies the data to store questions and answers as arrays of 2 strings.
  """
  dummy_list = []
  for document in documents:
    if document == "error":
      dummy_list.append(["None Info", "None Info"])
    else:
      try:
        dummy_list.append(json.loads(document)[key1][key2])
      except (KeyError, IndexError):
        dummy_list.append(["None Info", "None Info"])
  return dummy_list

def modify_rec_data(data, key):
  """
  Modifies the data from text_rec and image_rec to keep the list of titles/image_uris.
  """
  dummy_list = []
  for item in data:
    try:
      dummy_list.append(item[key])
    except KeyError:  # Handle cases where the key doesn't exist
      dummy_list.append([])  # Append an empty list if the key is missing
  return dummy_list


# Assuming 'documents' is your list of JSON strings
out_df1 = pd.DataFrame({"textual_questions": modify_data_as_arrays(documents, "textual_context", "questions")})
out_df2 = pd.DataFrame({"textual_answers": modify_data_as_arrays(documents, "textual_context", "answers")})
out_df3 = pd.DataFrame({"visual_questions": modify_data_as_arrays(documents, "visual_image_context", "questions")})
out_df4 = pd.DataFrame({"visual_answers": modify_data_as_arrays(documents, "visual_image_context", "answers")})

out_df5 = pd.DataFrame({"textual_tile": modify_rec_data(text_rec, "title")})
out_df6 = pd.DataFrame({"textual_image_uri": modify_rec_data(text_rec, "image_uri")})
out_df7 = pd.DataFrame({"visual_tile": modify_rec_data(image_rec, "title")})
out_df8 = pd.DataFrame({"visual_image_uri": modify_rec_data(image_rec, "image_uri")})
listing_id = pd.DataFrame({"listing_id": list_id})

final_df = pd.concat([listing_id, out_df1, out_df2, out_df3, out_df4, out_df5, out_df6, out_df7, out_df8], axis=1)
#new_df.reset_index(drop=True)

with open('last_df.pkl', 'wb') as f:
  pickle.dump(new_df, f)

upload_to_gcs(bucket_name_pickles, "backup/cat3_full_df.pkl", new_df)

DataFrame uploaded to gs://etsy-demo/backup/cat3_full_df.pkl


In [None]:
text_rec[0]

{'title': ['Personalised Gift - Dancer Gift - Gift for Her - Personalised Cushion - Ballet Gift- Ballerina Gift',
  'Baseball Tooth Fairy Pillow, Sport Pillow, Personalized Pillow.'],
 'image_uri': ['https://gcpetsy.sonrobots.net/etsy-10k/il_570xN.3555562530_oz4c.jpg',
  'https://gcpetsy.sonrobots.net/etsy-10k/il_570xN.1720165448_l14p.jpg']}

In [None]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7798 entries, 0 to 7797
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   listing_id         7798 non-null   object
 1   textual_questions  7798 non-null   object
 2   textual_answers    7798 non-null   object
 3   visual_questions   7798 non-null   object
 4   visual_answers     7798 non-null   object
 5   textual_tile       7798 non-null   object
 6   textual_image_uri  7798 non-null   object
 7   visual_tile        7798 non-null   object
 8   visual_image_uri   7798 non-null   object
dtypes: object(9)
memory usage: 548.4+ KB


In [None]:
%%bigquery last_df
SELECT * FROM `vtxdemos.demos_us.etsy-10k-full`

Query is running:   0%|          |

Downloading:   0%|          |

In [None]:
merged_df = pd.merge(last_df, final_df, on='listing_id')

In [None]:
merged_df['questions_cat1'] = merged_df['questions_cat1'].apply(lambda x: json.dumps(x.tolist()) if isinstance(x, np.ndarray) else json.dumps(x))
merged_df['answers_cat1'] = merged_df['answers_cat1'].apply(lambda x: json.dumps(x.tolist()) if isinstance(x, np.ndarray) else json.dumps(x))
merged_df['questions_cat2'] = merged_df['questions_cat2'].apply(lambda x: json.dumps(x.tolist()) if isinstance(x, np.ndarray) else json.dumps(x))
merged_df['answers_cat2'] = merged_df['answers_cat2'].apply(lambda x: json.dumps(x.tolist()) if isinstance(x, np.ndarray) else json.dumps(x))
merged_df['textual_questions'] = merged_df['textual_questions'].apply(json.dumps)
merged_df['textual_answers'] = merged_df['textual_answers'].apply(json.dumps)
merged_df['visual_questions'] = merged_df['visual_questions'].apply(json.dumps)
merged_df['visual_answers'] = merged_df['visual_answers'].apply(json.dumps)
merged_df['textual_tile'] = merged_df['textual_tile'].apply(json.dumps)
merged_df['textual_image_uri'] = merged_df['textual_image_uri'].apply(json.dumps)
merged_df['visual_tile'] = merged_df['visual_tile'].apply(json.dumps)
merged_df['visual_image_uri'] = merged_df['visual_image_uri'].apply(json.dumps)

In [None]:
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",
)

job = bq_client.load_table_from_dataframe(
    merged_df, "vtxdemos.demos_us.etsy-10k-full", job_config=job_config
)  # Make an API request.
job.result()



LoadJob<project=vtxdemos, location=US, id=cb0f4694-43fe-4ed0-a8ee-0149e43d728f>

## FeatureView (FeatureStore) Resync

In [None]:
from vertexai.resources.preview import feature_store

In [189]:
fv_multi = feature_store.FeatureView(name="projects/254356041555/locations/us-east1/featureOnlineStores/feature_store_marketplace/featureViews/etsy_view_image1")
fv_text = feature_store.FeatureView(name="projects/254356041555/locations/us-east1/featureOnlineStores/feature_store_marketplace/featureViews/etsy_view_text1")

In [190]:
fv_multi.sync()


FailedPrecondition: 400 Previous on-demand sync is still running for FeatureView projects/254356041555/locations/us-east1/featureOnlineStores/feature_store_marketplace/featureViews/etsy_view_image1

In [180]:
fv_text.sync()
fv_multi.sync()

NotFound: 404 The FeatureOnlineStore does not exist.

In [None]:
merged_df["visual_image_uri"].iloc[0]

'["https://gcpetsy.sonrobots.net/etsy-10k/il_570xN.4912345367_pohg.jpg", "https://gcpetsy.sonrobots.net/etsy-10k/il_570xN.5486920589_r46c.jpg"]'

In [None]:
merged_df["content"].iloc[0]

'This personalized pink fairies drawstring bag is the perfect choice for kids who love dance, ballet, or simply need a stylish and practical bag for their games. It\'s made from high-quality polyester and features a vibrant design with 100s of options available to choose from. You can personalize it with a name or initials, making it a unique and special gift. The bag measures 44cm x 36.4cm and is equipped with a drawstring closure for easy access. It\'s ideal for carrying essentials like shoes, clothes, and toys.\n\nThe bag is made-to-order and the personalization process is straightforward. Simply choose your desired design and add the text you would like in the "Add your personalization" box during checkout. While proofs aren\'t usually provided, you can request one if you need to ensure the image is correct, especially if you\'re adding a custom image.\n\nShipping times vary depending on your location:\n\n* **UK:** 1-3 business days\n* **Europe:** 5-7 business days\n* **Rest of the

In [None]:
merged_df.to_pickle("merged_df_latest.pkl")

merged_df = merged_df.set_index('listing_id')
df = df.set_index('listing_id')

merged_df['questions_cat2'] = df['questions_cat2']
merged_df = merged_df.reset_index()

In [None]:
merged_df = pd.read_pickle("merged_df_latest.pkl")

In [None]:
# Assuming 'merged_df' and 'df' are your pandas DataFrames
df= pd.read_pickle("qa_cat2.pkl")
merged_df = pd.read_pickle("merged_df_latest.pkl")

## Redefining Questions Category 2

In [None]:
import pandas as pd
import numpy as np

# Assuming 'merged_df' and 'df' are your pandas DataFrames

# Ensure 'listing_id' is treated as a string in both DataFrames
merged_df['listing_id'] = merged_df['listing_id'].astype(str)
df['listing_id'] = df['listing_id'].astype(str)

# Set 'listing_id' as the index for both DataFrames
merged_df = merged_df.set_index('listing_id')
df = df.set_index('listing_id')

# Check for mismatched 'listing_id' values (optional but helpful)
merged_listings = set(merged_df.index)
df_listings = set(df.index)
missing_listings = merged_listings - df_listings
if missing_listings:
    print(f"Warning: 'listing_id' values in merged_df not found in df: {missing_listings}")

# Update 'questions_cat2' in merged_df, handling potential mismatches
for listing_id in merged_df.index:
    if listing_id in df.index:
        # Use .at for setting individual cell values
        merged_df.at[listing_id, 'questions_cat2'] = df.loc[listing_id, 'questions_cat2']
    else:
        # Handle cases where listing_id is not found in df
        merged_df.at[listing_id, 'questions_cat2'] = np.nan  # Or your preferred handling

# Reset the index if needed
merged_df = merged_df.reset_index()

print(merged_df)  # Verify the changes

      listing_id                                     text_embedding  \
0     1009746447  [-0.04755060747265816, 0.004479154013097286, -...   
1      593335300  [-0.021823517978191376, -0.028675295412540436,...   
2     1576625231  [-0.035083573311567307, 0.011393672786653042, ...   
3      185665551  [-0.020763065665960312, 0.0351705439388752, 0....   
4      851146379  [-0.04432620480656624, 0.016268471255898476, -...   
...          ...                                                ...   
7793  1119465913  [-0.0016812696121633053, -0.031237153336405754...   
7794   779847462  [-0.016010234132409096, 0.01628362014889717, 0...   
7795  1432483456  [0.010183659382164478, -0.006544398609548807, ...   
7796  1400526732  [-0.02605430595576763, 0.04074634239077568, 0....   
7797  1208509489  [0.013872112147510052, -0.002015542471781373, ...   

     ml_embed_text_status                                            summary  \
0                          This personalized pink fairies drawstrin

### Reading From Category 2 Preload - Preprocessing

In [172]:
%%bigquery latest_df
SELECT * FROM `vtxdemos.demos_us.etsy-10k-full`

Query is running:   0%|          |

Downloading:   0%|          |

In [200]:
df = df_load_from_gcs("backup/qa_cat2_v3.pkl")

In [174]:
df['listing_id'] = df['listing_id'].astype(str)
df['questions_cat2'] = df['questions_cat2'].apply(lambda x: json.dumps(x))
df['answers_cat2'] = df['answers_cat2'].apply(lambda x: json.dumps(x))

# Create a mapping dictionary from df
mapping_dict = dict(zip(df['listing_id'], df['questions_cat2']))

# Replace questions_cat2 in latest_df
latest_df['questions_cat2'] = latest_df['listing_id'].map(mapping_dict)

In [196]:
latest_df[latest_df["public_cdn_link"] == "https://gcpetsy.sonrobots.net/etsy-10k/il_570xN.5790992863_2bx8.jpg"]["listing_id"]

Unnamed: 0,listing_id
7454,1623249966


In [208]:
df[df["listing_id"] == 1623249966]["questions_cat2"].iloc[0]

["The tumbler is part of Starbucks' 2023 Holiday Collection.  While this specific tumbler isn't customizable, other Starbucks tumblers often allow for some degree of personalization, such as adding a name or initial.  More information on customization options may be found on Starbucks' website or other related retailer sites. ",
 'The dimensions are approximately 9" L x 6" W x 4" H, and the tumbler weighs approximately 15.9 oz.  These dimensions are typical for a 24 oz tumbler, but slight variations can occur due to manufacturing processes.',
 "The exact material composition is not specified in the product description. However, given its metallic appearance and likely use for hot and cold beverages, it is probably made from a double-walled stainless steel or a similar material that offers good insulation.  Further details on the materials used might be found through searching Starbucks' product details online.",
 'The listing indicates that the tumbler is available for immediate purcha

In [209]:
df["questions_cat2"].iloc[0]

['Ceramic is a widely used material for decorative and functional items, but its durability under intense, prolonged heat exposure varies greatly depending on the type of clay and firing process.  High-quality ceramic can tolerate very high temperatures, but cheaper versions may crack or break.',
 'The production of ceramic skulls involves several steps: creating a mold, mixing and pouring the ceramic slip, allowing it to dry, firing in a kiln at high temperatures, and potentially glazing and refiring. The specific methods and time frames vary between makers.',
 'While demon skulls might seem modern, depictions of skulls have featured in art, religious practices, and folklore for centuries across cultures. The aesthetics and use of these objects have changed over time, reflecting different cultural and historical trends.',
 'Many handcrafted ceramic items, while not necessarily fully customizable, might allow for some variations in color or design upon request. Direct communication wit

In [206]:
df[df["listing_id"] == 1623249966]["answers_cat2"].iloc[0]

['What other Starbucks Holiday collections have featured similar designs or color schemes?',
 'What are the typical manufacturing processes involved in creating double-walled stainless steel tumblers?',
 'How does the SB Disco Metallic Silver Mirror Grid design compare to other popular Starbucks tumbler designs in terms of its aesthetic appeal?',
 'What are the sustainability factors to consider with stainless-steel drinkware, including production, lifespan, and recycling?']

In [176]:
import pandas as pd
import json

def transform_dataframe_column(df, column_name="answers_cat2"):
    """Transforms a Pandas DataFrame column containing JSON-like strings (with single quotes)
       into properly formatted JSON strings (with double quotes) within the DataFrame.
       Handles errors gracefully.
    """
    def transform_single_string(input_string):
        try:
            # Attempt to parse the input string.  Using ast.literal_eval() is safer than eval()
            import ast
            data = ast.literal_eval(input_string)

            if isinstance(data, list) and all(isinstance(item, str) for item in data):
                return json.dumps(data, indent=2) #No need for dictionary, keeping existing format
            else:
                return json.dumps(None) #Handle cases that are not valid list of strings

        except (SyntaxError, ValueError) as e:  # Handle parsing errors
            return json.dumps(None)  # or handle the error as you prefer

    # Apply the transformation function to the specified column
    df[column_name] = df[column_name].apply(transform_single_string)
    return df
latest_df = transform_dataframe_column(latest_df)
latest_df = transform_dataframe_column(latest_df, column_name="questions_cat2")

In [177]:
# Loading Modified Fields into BQ
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",
)

job = bq_client.load_table_from_dataframe(latest_df, "vtxdemos.demos_us.etsy-10k-full", job_config=job_config)
job.result()

LoadJob<project=vtxdemos, location=US, id=ed83d3e0-cba9-4e2e-81f3-ef1a840643ca>

In [182]:
for index, row in latest_df.iterrows():
    try:
      json.loads(row['answers_cat2'])
    except:
      print("error perro")

In [179]:
json.loads(latest_df['answers_cat2'].iloc[0])

['What are the different fabrics used to make drawstring bags and their respective advantages and disadvantages?',
 'What are the typical dimensions of drawstring bags available on the market?',
 "How can shipping time and cost be influenced by factors other than the buyer's location?",
 'What are the typical personalization options for drawstring bags, and how do sellers handle customer requests?']

In [None]:
%%bigquery testing
SELECT * FROM `vtxdemos.demos_us.etsy-10k-full`

Query is running:   0%|          |

Downloading:   0%|          |