# Libraries



In [2]:
import json
import base64
import vertexai
import numpy as np
import pandas as pd
from google.cloud import storage
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
from vertexai.generative_models import GenerativeModel, Part, FinishReason
import vertexai.preview.generative_models as generative_models

# Variables

In [3]:
project_id = "vtxdemos"
model_id = "gemini-1.5-pro-001"
emb_model_id = "text-embedding-004"
bucket_id = "vtxdemos-vsearch-datasets"
bucket_folder = "profile_synthetic_data"

# Synthetic Data

Because we are creating synthetic data, this could take long time.

In [4]:
generation_config = {
    "max_output_tokens": 8192,
    "temperature": 2,
    "top_p": 0.95,
    "response_mime_type": "application/json"
}

safety_settings = {
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
}

model = GenerativeModel(
    model_id,
)
prompt = """
Your mission is to create false user profiles with the following schema:
['id', 'name', 'first_name', 'last_name', 'email', 'job_title', 'company', 'location', 'experiences', 'education', 'skills', 'languages', 'created_at', 'updated_at']

<rules>
The output should be a JSON format where the key name is the same as the schema.
Create 20 rows with key, value pairs.
Use random synthetic data for the value.
</rules>


"""

response = model.generate_content(
    [prompt],
    generation_config=generation_config,
    safety_settings=safety_settings,
)

print(response.text)

{"id": ["ecc9114b-57dd-4e7c-ace3-894318572777", "95dfb50d-067e-4a07-8b22-089a7e0c8c1a", "90831563-a33e-405e-b169-9c124b35d588", "40c2a29c-eeef-4e8e-80e8-8a0325569788", "e35e0a79-7c83-4f88-ad14-a26634881574", "a238d254-981e-4f39-826c-4f49d6939c35", "a7952882-477d-4b50-ba62-a2f75978c323", "0c619d41-b42c-4a69-906f-726820982243", "97f18a23-4e8b-431d-9a43-6c266c75c82a", "3d5c00c7-3b9c-401e-a646-895800a2e763", "51d76952-e49a-470f-a24f-84a42c55962a", "34a67618-a70b-4947-988b-c8e948113391", "722396f1-0c37-4862-b052-8000d293a731", "0a9c09f9-a809-4993-8c4c-87459a27980b", "66a0c983-2c58-4a56-bd82-a5a921b98311", "2c07f12b-17eb-4455-903a-e3a5c1130060", "d10570a2-0a73-4611-b76e-a7807a90d1b8", "e26c1f2c-7839-4617-9895-9d125286e765", "9ab3a82b-fac8-4364-8b8d-08f1d9320f37", "fa09685d-6b51-4388-a40d-5c81004d7625"], "name": ["Michael Harris", "Ashley Brown", "Christopher Smith", "Jessica Davis", "David Garcia", "Amanda Rodriguez", "Matthew Wilson", "Jennifer Martin", "Joshua Taylor", "Elizabeth Thomas", 

In [5]:
output = json.loads(response.text)

# From Structured Data to Natural Language



> This is for when you have real large dataset, if is a demo and the data is synthetic this step can be skipped.



In [6]:
vertexai.init(project=project_id, location="us-central1")
model = GenerativeModel(
    model_id,
    system_instruction=[
    """
    Without missing any word/details transform the following dictionary as a 500 token paragraph (chunks) separated by breaklines.
    """
    ]
)
emb_model = TextEmbeddingModel.from_pretrained(emb_model_id)

In [144]:
generation_config = {
    "max_output_tokens": 8192,
    "temperature": 2,
    "top_p": 0.95,
}

def generate_structure(dictionary: str):
  prompt = f"""
  <dictionary>
  {dictionary}
  <dictionary>

  """

  responses = model.generate_content(
      [prompt],
      generation_config=generation_config,
      stream=True,
  )
  gemini_response = []
  try:
    for response in responses:
      print(response.text)
      gemini_response.append(response.text)
  except:
    print(responses)
    return "error"

  return "".join(gemini_response)


In [137]:
_dataset = []
# Iterate through each "row" (index)
for i in range(len(output['id'])):
    row_data = {}
    # Iterate through each key (column)
    for key in output:
        row_data[key] = output[key][i]

    # Now row_data contains the key-value pairs for the current row
    _dataset.append(row_data)

In [145]:
for n, profile in enumerate(_dataset):
  _re = generate_structure(str(profile))
  if _re == "error":
    break
  _re_for_emb = _re.split("\n\n")
  inputs = [TextEmbeddingInput(text, "RETRIEVAL_DOCUMENT") for text in _re_for_emb]
  embeddings = emb_model.get_embeddings(inputs)
  _dataset[n]["description"] = _re
  _dataset[n]["embeddings"] = embeddings[0].values

Benjamin
 Harris is a Software Engineer at Google, based in New York City. His journey
 includes a Software Engineer Intern role at Microsoft in 2019. 


Benjamin holds a Master of Science in Computer Science from Stanford University, which he earned in 2019, and a Bachelor of Science in Computer Science
 from the University of California, Berkeley, obtained in 2017. His skillset boasts proficiency in Python, Java, C++, and SQL. While
 he primarily communicates in English, Benjamin's expertise shines through in his technical abilities.

His professional identity is further encapsulated in his record, bearing the unique identifier 'e8c5d4a4-3fc7-41
bd-92d5-878947a43224'. 

This information, accurate as of its last update on 2023-11-16T10:2
1:34.567Z, provides a glimpse into Benjamin's qualifications. Notably, a set of embeddings, represented by a lengthy string of numerical values, further enriches his profile. This numerical fingerprint, capturing the essence of his skills and experien

# Store In Google Cloud Storage



In [147]:
client = storage.Client()
bucket = client.get_bucket(bucket_id)
bucket.blob(f"{bucket_folder}/dataset.json").upload_from_string(json.dumps(_dataset), content_type="application/json")