In [None]:
!pip install --upgrade google-cloud-aiplatform



In [None]:
!pip install google-cloud-firestore==2.16.0



In [None]:
# Authenticate the current user through the Colab auth helper before any of the
# Google Cloud calls made further down.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime.
from google.colab import auth
auth.authenticate_user()

In [None]:
import vertexai

# Initialise the Vertex AI SDK with the project and region used by this notebook.
# NOTE(review): the project id is hard-coded here and again as PROJECT_ID in a
# later cell — keep the two values in sync.
vertexai.init(project="devhack-3f0c2", location="us-central1")

In [None]:
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

In [None]:
# Embedding configuration reused by the get_embeddings calls in this notebook.
model_name = "text-embedding-004"
task = "SEMANTIC_SIMILARITY"
dimensionality: int = 768

model = TextEmbeddingModel.from_pretrained(model_name)

# Forward output_dimensionality only when a truthy size is configured;
# otherwise pass no extra keyword arguments at all.
if dimensionality:
    kwargs = {"output_dimensionality": dimensionality}
else:
    kwargs = {}

**Embeddings Task Type** | **Description**
------- | --------
`RETRIEVAL_QUERY` | Indicates the text is a search query.
`RETRIEVAL_DOCUMENT` | Indicates the text is a document for search.
`SEMANTIC_SIMILARITY` | Specifies the text is used for measuring semantic similarity (STS).
`CLASSIFICATION` | Indicates the text embedding is used for classification tasks.
`CLUSTERING` | Indicates the text embedding is used for clustering tasks.
`QUESTION_ANSWERING` | Indicates the query embedding is used for answering questions. Use `RETRIEVAL_DOCUMENT` for the document side.
`FACT_VERIFICATION` | Indicates the query embedding is used for fact verification.


In [None]:
# Sample sentences to embed; the individual names are kept because later
# cells refer to text_1 .. text_4 directly.
texts = [
    "Hello world",
    "Goodbye",
    "I like to run",
    "soccer",
]
text_1, text_2, text_3, text_4 = texts

In [None]:
# Wrap each sample sentence in a TextEmbeddingInput tagged with the configured
# task type; the four names are bound in the same order as the source texts.
embedding_text_1, embedding_text_2, embedding_text_3, embedding_text_4 = (
    TextEmbeddingInput(sentence, task)
    for sentence in (text_1, text_2, text_3, text_4)
)

In [None]:
# Embed all four inputs with a single get_embeddings call; `kwargs` carries
# the optional output_dimensionality set in the configuration cell.
embeddings = model.get_embeddings(
    [
        embedding_text_1,
        embedding_text_2,
        embedding_text_3,
        embedding_text_4,
    ],
    **kwargs,
)

In [None]:
# Print only the first 50 characters of each vector's string form so the
# output stays readable.
for embedding in embeddings:
    preview = str(embedding.values)[:50]
    print(preview, '... TRIMMED ...')

[-0.02412703074514866, 0.009477811865508556, -0.06 ... TRIMMED ...
[0.0016563811805099249, 0.03174314647912979, -0.01 ... TRIMMED ...
[-0.05893269553780556, -0.004683346021920443, -0.0 ... TRIMMED ...
[-0.05271419882774353, 0.010071152821183205, -0.00 ... TRIMMED ...


#### Similarity

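- Calculate the cosine similarity between two sentences: a number between -1 and 1, where a higher value means the sentences are more similar (the examples below all fall between 0 and 1).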
- Try out your own sentences and check if the similarity calculations match your intuition.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pull the raw float vectors out of the embedding objects.
embeddings_values = [embedding.values for embedding in embeddings]

# Compare selected sentence pairs (indices into `texts`). cosine_similarity
# expects 2-D inputs, hence the single-row wrapping lists.
for left, right in ((0, 1), (0, 2), (1, 2), (2, 3)):
    print(cosine_similarity([embeddings_values[left]], [embeddings_values[right]]))

[[0.57065815]]
[[0.4558137]]
[[0.48923647]]
[[0.53230819]]


#### Vector Database

In [None]:
# @markdown Please fill in the value below with your Google Cloud project ID and then run the cell.

PROJECT_ID = "devhack-3f0c2"  # @param {type:"string"}

# Point the gcloud CLI at this project (updates the core/project property).
# NOTE(review): Python client libraries may not pick this setting up — pass
# project=PROJECT_ID to clients explicitly; confirm against the Firestore cell.
!gcloud config set project {PROJECT_ID}

Updated property [core/project].


In [None]:
from google.cloud.firestore_v1.vector import Vector
from google.cloud import firestore_v1

In [None]:
# Connect to the "embeddings" Firestore database of the project selected above.
# The project is passed explicitly rather than inferred from the environment:
# the commit recorded in this notebook failed with RESOURCE_PROJECT_INVALID,
# which is consistent with the client resolving the wrong project on its own.
db = firestore_v1.Client(project=PROJECT_ID, database="embeddings")

# Write batch and target collection used by the ingestion cell below.
batch_firestore = db.batch()
content_text_collection = db.collection('TextEmbeddings')

In [None]:
# Start from a fresh batch on every run of this cell. Reusing the batch created
# in an earlier cell means a re-run after a failed commit would queue the same
# writes a second time (each under a new auto-generated document id).
batch_firestore = db.batch()

# zip() silently truncates to the shorter input, so fail loudly if the texts
# and their embeddings have drifted apart (e.g. one cell re-run without the other).
assert len(texts) == len(embeddings_values), "texts and embeddings_values differ in length"

# Queue one document per sentence: the raw text plus its embedding stored as a
# Firestore Vector so it can be searched with find_nearest.
for text, embedding in zip(texts, embeddings_values):
    ref_doc = content_text_collection.document()
    batch_firestore.set(ref_doc, {
        'text': text,
        'embeddings': Vector(embedding),
    })

# Send all queued writes in a single commit.
batch_firestore.commit()

InvalidArgument: 400 Invalid resource field value in the request. [reason: "RESOURCE_PROJECT_INVALID"
domain: "googleapis.com"
metadata {
  key: "method"
  value: "google.firestore.v1.Firestore.Commit"
}
metadata {
  key: "service"
  value: "firestore.googleapis.com"
}
]

In [None]:
from google.cloud.firestore_v1.base_vector_query import DistanceMeasure

In [None]:
# Embed the query sentence with the same model, task type and dimensionality
# settings used for the stored documents.
text_query = "I'm offended"
query_input = TextEmbeddingInput(text_query, task)
query_embeddings = model.get_embeddings([query_input], **kwargs)
text_query_embedding = query_embeddings[0]

# Build (but do not yet run) a nearest-neighbour query over the stored
# "embeddings" field, ranked by cosine distance and capped at 50 results.
vector_query = content_text_collection.find_nearest(
    vector_field="embeddings",
    query_vector=Vector(text_query_embedding.values),
    distance_measure=DistanceMeasure.COSINE,
    limit=50,
)

In [None]:
# Run the nearest-neighbour query and show the stored text of each match.
# A bare expression inside a for loop is never displayed by the notebook, so
# collect the texts and end the cell with the list so that it is rendered.
docs = vector_query.stream()
matched_texts = [doc.to_dict()["text"] for doc in docs]
matched_texts