In [None]:
import json
import pandas as pd
from google.cloud import firestore
from google.cloud.firestore_v1.base_vector_query import DistanceMeasure
from google.cloud.firestore_v1.vector import Vector
import vertexai
from vertexai.generative_models import GenerativeModel, Part
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
import vertexai.preview.generative_models as generative_models
from suzano.data_connections.google_storage import GoogleStorageConnection

: 

In [None]:
# Initialise the Vertex AI SDK for the GCP project this notebook works against.
vertexai.init(
    project="sz-academia-digital-feat",
)

### Upload de Arquivo no Storage

In [None]:
# Cloud Storage connection helper bound to the project; used below to upload
# the file and to list the bucket's blobs.
db_storage = GoogleStorageConnection(project_id_str="sz-academia-digital-feat")

In [None]:
# Path of the file to process. The upload cell uses this same value as both
# the local source path and the destination object name in the bucket.
# NOTE(review): the path has no extension although the file is later sent to
# the model as application/pdf — confirm the path is correct.
file_path = "pdf/Prompt_Produto"

In [None]:
# Upload the local file to the bucket, keeping the same relative path as the
# object name.
db_storage.upload_file(
    "storage-qs-chatbot-feat",
    destination_path_str=file_path,
    source_path_str=file_path,
)

In [None]:
# Locate the uploaded object and build its gs:// URI.
# `prefix=` narrows the listing server-side instead of iterating over every
# blob in the bucket; the exact-name filter is kept because a prefix can also
# match longer object names.
file = [
    blob
    for blob in db_storage.storage_client.list_blobs(
        "storage-qs-chatbot-feat", prefix=file_path
    )
    if blob.name == file_path
]
if not file:
    # Fail with a clear message instead of an IndexError on file[0].
    raise FileNotFoundError(
        f"Object '{file_path}' not found in bucket 'storage-qs-chatbot-feat'; "
        "run the upload cell first."
    )
# Build the URI from the bucket and object names directly rather than slicing
# the generation suffix off `blob.id`.
file_uri = f"gs://{file[0].bucket.name}/{file[0].name}"

In [None]:
# Display the resolved gs:// URI so it can be checked before it is sent to the model.
file_uri

### Catalogar arquivo

In [None]:
def generate(file_uri):
    """Ask Gemini to catalogue a PDF stored in Cloud Storage, page by page.

    Args:
        file_uri: URI of the document; it is attached to the request as an
            ``application/pdf`` part.

    Returns:
        The raw response text. JSON output is requested through
        ``response_mime_type`` and the system instruction asks for a list of
        ``{"page": ..., "content": ...}`` objects, so the caller is expected
        to parse the result with ``json.loads``.
    """
    model = GenerativeModel(
        "gemini-1.5-pro-001",
        system_instruction="Formate a resposta com um único JSON, acessível via json.loads python, como no exemplo: '[{\"page\": \"__PAGINA__\",  \"content\": \"__CONTEUDO_PAGINA__\"}]'",
    )

    # The prompt previously opened with four quote characters, which sent a
    # stray leading '"' to the model; it is now a regular triple-quoted string.
    prompt = """Catalogue o documento, separe por página, e em cada item traga a página e conteúdo. Em caso de imagens ou tabelas sumarize a informação como texto."""

    # Every harm category shares the same blocking threshold.
    harm_category = generative_models.HarmCategory
    block_threshold = generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
    safety_settings = {
        harm_category.HARM_CATEGORY_HATE_SPEECH: block_threshold,
        harm_category.HARM_CATEGORY_DANGEROUS_CONTENT: block_threshold,
        harm_category.HARM_CATEGORY_SEXUALLY_EXPLICIT: block_threshold,
        harm_category.HARM_CATEGORY_HARASSMENT: block_threshold,
    }

    # NOTE(review): max_output_tokens caps the size of the returned JSON; a long
    # document could presumably be cut off mid-JSON — confirm for large PDFs.
    response = model.generate_content(
        [
            Part.from_uri(file_uri, mime_type="application/pdf"),
            prompt,
        ],
        generation_config={
            "max_output_tokens": 8192,
            "temperature": 1,
            "top_p": 1,
            "response_mime_type": "application/json",
        },
        safety_settings=safety_settings,
        stream=False,
    )

    return response.text

In [None]:
# Run the cataloguing request for the uploaded file; `response` holds the raw
# text returned by generate().
response = generate(file_uri)

In [None]:
# Parse the model output into Python objects; json.loads raises
# json.JSONDecodeError if the text is not valid JSON.
catalog = json.loads(response)

In [None]:
# Inspect the parsed catalogue.
catalog

### Calculando Vector Embedding

In [None]:
# Name of the text-embedding model; the model loaded from it is used for both
# the catalogued content and the search query.
model_name = "textembedding-gecko@003"

In [None]:
# Load the embedding model selected in `model_name`.
model = TextEmbeddingModel.from_pretrained(model_name)

In [None]:
# Tabulate the catalogue entries and tag every row with the URI of the source
# file they were extracted from.
df_loc = pd.DataFrame(catalog).assign(file_uri=file_uri)

In [None]:
# Inspect the catalogue as a DataFrame, including the added file_uri column.
df_loc

In [None]:
# Embed the catalogued content. These texts are the corpus that is stored and
# searched, so they use the RETRIEVAL_DOCUMENT task type; RETRIEVAL_QUERY is
# meant for the search question, which is embedded separately further down.
inputs = [
    TextEmbeddingInput(content, "RETRIEVAL_DOCUMENT")
    for content in df_loc["content"]
]
embeddings = model.get_embeddings(inputs)
# Key the vectors by the DataFrame's own index labels so the later
# `df_loc.index.map(embed_db)` lookup lines up even if the index is not the
# default 0..n-1 range.
embed_db = {
    idx: Vector(embedding.values)
    for idx, embedding in zip(df_loc.index, embeddings)
}

In [None]:
# Attach each row's vector by looking up its index label in embed_db.
# This adds the column to df_loc in place.
df_loc["embeddings"] = df_loc.index.map(embed_db)

In [None]:
# Inspect the frame again, now including the embeddings column.
df_loc

### Adicionando ao Firestore

In [None]:
# Firestore client for the project and a handle to the collection that stores
# the catalogued rows together with their embeddings.
firestore_client = firestore.Client(project="sz-academia-digital-feat")
collection = firestore_client.collection("qs-chatbot")

In [None]:
# Write one Firestore document per row of df_loc.
# NOTE(review): add() is called unconditionally, so re-running this cell
# presumably inserts the same rows again — confirm before re-running.
for doc in df_loc.to_dict("records"):
    collection.add(doc)


### Busca Vetorizada

In [None]:
# Natural-language question used to try out the vector search.
query = "Como uma solução digital pode reduzir custos operacionais e melhorar a eficiência?"


In [None]:
# Embed the search question with the query task type. Note that `inputs` and
# `embeddings` are rebound here and no longer refer to the catalogue values.
inputs = [TextEmbeddingInput(query, "RETRIEVAL_QUERY")]
embeddings = model.get_embeddings(inputs)
embed_ask = [Vector(item.values) for item in embeddings]

In [None]:
# Display the question's embedding vector (embed_ask holds a single entry,
# since only one query was embedded).
embed_ask[0]

In [None]:
# Build a nearest-neighbour query over the stored `embeddings` field, asking
# for the 5 closest documents by Euclidean distance. Note that `response` is
# rebound here and no longer holds the model's text output.
response = collection.find_nearest(
    vector_field="embeddings",
    query_vector=embed_ask[0],
    distance_measure=DistanceMeasure.EUCLIDEAN,
    limit=5,
)

In [None]:
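# One-time setup: creates the Firestore vector index (dimension 768, flat) on
# the `embeddings` field of the `qs-chatbot` collection, which the vector
# search relies on. Uncomment and run once per project.
#!gcloud alpha firestore indexes composite create --project=sz-academia-digital-feat --collection-group=qs-chatbot --query-scope=COLLECTION --field-config=vector-config='{"dimension":"768","flat": "{}"}',field-path=embeddings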

In [None]:
# Fetch the results of the vector query and convert each returned item to a
# plain dict.
docs = [item.to_dict() for item in response.get()]