In [2]:
# install vertex ai sdk
!pip install google-cloud-aiplatform

Collecting google-cloud-aiplatform
  Downloading google_cloud_aiplatform-1.52.0-py2.py3-none-any.whl (5.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
Collecting docstring-parser<1
  Downloading docstring_parser-0.16-py3-none-any.whl (36 kB)
Collecting google-cloud-storage<3.0.0dev,>=1.32.0
  Using cached google_cloud_storage-2.16.0-py2.py3-none-any.whl (125 kB)
Collecting google-cloud-bigquery!=3.20.0,<4.0.0dev,>=1.15.0
  Downloading google_cloud_bigquery-3.23.1-py2.py3-none-any.whl (237 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m237.3/237.3 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
Collecting shapely<3.0.0dev
  Downloading shapely-2.0.4-cp310-cp310-macosx_11_0_arm64.whl (1.3 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[?25hColl

In [17]:
from vertexai.preview.language_models import TextEmbeddingModel

def google_embed(query: str):
    """Embed a single string with the multilingual preview embedding model.

    Args:
        query: The text to embed. It is sent as a one-element batch.

    Returns:
        The ``values`` attribute of the first (and only requested) embedding
        returned by ``get_embeddings``.

    Note:
        ``TextEmbeddingModel.from_pretrained`` is called on every invocation.
    """
    model = TextEmbeddingModel.from_pretrained(
        "text-multilingual-embedding-preview-0409"
    )
    return model.get_embeddings([query])[0].values

# Smoke-test google_embed on one English sentence and show only the first five
# components (the original note here says the vector has 768 dimensions).
sample_sentence = "This is a text I want to embed"
embeddings = google_embed(sample_sentence)
print(embeddings[:5])

[0.0029814322479069233, 0.021727746352553368, 0.058624569326639175, 0.010330887511372566, 0.09533035755157471]


In [14]:
import numpy as np
def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors.

    Args:
        a: First vector (any sequence or array accepted by ``np.dot``).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. If either vector has zero norm the
        quotient is undefined (0/0), so ``0.0`` is returned instead of letting
        NumPy produce ``nan`` together with a RuntimeWarning.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # A zero vector has no direction; report "no similarity" rather than nan.
        return 0.0
    return np.dot(a, b) / denominator

In [15]:
# Two short Traditional Chinese sentences to compare with google_embed.
text1 = "今天天氣很好"
text2 = "我很開心"

# Embed both sentences in order (text1 first, then text2).
embeddings1, embeddings2 = (google_embed(sentence) for sentence in (text1, text2))

similarity = cosine_similarity(embeddings1, embeddings2)
print(f"Similarity between '{text1}' and '{text2}': {similarity:.4f}")



Similarity between '今天天氣很好' and '我很開心': 0.7538


In [24]:
# Configuration for the embed_text calls in the cells below.
# MODEL is passed as `model_name`; it is rebound to a different model in a later cell.
MODEL = "text-embedding-004"
# TASK is passed as `task_type` to TextEmbeddingInput.
TASK = "RETRIEVAL_DOCUMENT"
# TEXT is the sample sentence embedded by the first embed_text call.
TEXT = "This is a test"
# TITLE is passed as `title` to TextEmbeddingInput for every call.
TITLE = "Test"
# OUTPUT_DIMENSIONALITY is forwarded to get_embeddings as `output_dimensionality`.
OUTPUT_DIMENSIONALITY = 768


from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel


def embed_text(
    model_name: str,
    task_type: str,
    text: str,
    title: str = "",
    output_dimensionality=None,
) -> list:
    """Embed one piece of text with the named Vertex AI text embedding model.

    Args:
        model_name: Identifier handed to ``TextEmbeddingModel.from_pretrained``.
        task_type: Forwarded to ``TextEmbeddingInput`` as ``task_type``.
        text: The text to embed.
        title: Forwarded to ``TextEmbeddingInput`` as ``title``.
        output_dimensionality: Forwarded to ``get_embeddings`` only when it is
            truthy; ``None`` (or ``0``) means the keyword is not sent at all.

    Returns:
        The ``values`` attribute of the first embedding in the response.
    """
    embedder = TextEmbeddingModel.from_pretrained(model_name)
    request = TextEmbeddingInput(text=text, title=title, task_type=task_type)

    # Only send the dimensionality keyword when the caller actually asked for one.
    optional_args = {}
    if output_dimensionality:
        optional_args["output_dimensionality"] = output_dimensionality

    response = embedder.get_embeddings([request], **optional_args)
    return response[0].values


# Embed the sample TEXT using the configuration constants defined above.
# len(embedding) is expected to equal OUTPUT_DIMENSIONALITY.
embedding = embed_text(
    MODEL,
    TASK,
    TEXT,
    title=TITLE,
    output_dimensionality=OUTPUT_DIMENSIONALITY,
)

# Show only the first five components to keep the output short.
print(embedding[:5])

[0.010671528056263924, -0.0069691999815404415, -0.07482928782701492, -0.022868942469358444, 0.024032343178987503]


In [26]:
# Compare two Traditional Chinese sentences using the currently configured MODEL.
TEXT1 = "今天天氣很好"
TEXT2 = "我很開心"

# Same embed_text call for both sentences, evaluated in order (TEXT1, then TEXT2).
embedding1, embedding2 = (
    embed_text(
        model_name=MODEL,
        task_type=TASK,
        text=sentence,
        title=TITLE,
        output_dimensionality=OUTPUT_DIMENSIONALITY,
    )
    for sentence in (TEXT1, TEXT2)
)

print(embedding1[:5])
print(embedding2[:5])

# NOTE(review): the recorded output of this cell shows identical leading values
# for both sentences and a similarity of 1.0000 — presumably this model does not
# distinguish these Chinese inputs; confirm before relying on the score.
similarity = cosine_similarity(embedding1, embedding2)
print(f"Similarity between '{TEXT1}' and '{TEXT2}': {similarity:.4f}")

[0.008668964728713036, 0.04154808074235916, -0.029312308877706528, -0.0027766868006438017, 0.044972509145736694]
[0.008668964728713036, 0.04154808074235916, -0.029312308877706528, -0.0027766868006438017, 0.044972509145736694]
Similarity between '今天天氣很好' and '我很開心': 1.0000


In [27]:
# Rebind MODEL to the multilingual model and embed the same two sentences again.
# Any cell run after this one sees the new MODEL value.
MODEL = "text-multilingual-embedding-002"

# Same embed_text call for both sentences, evaluated in order (TEXT1, then TEXT2).
embedding1, embedding2 = (
    embed_text(
        model_name=MODEL,
        task_type=TASK,
        text=sentence,
        title=TITLE,
        output_dimensionality=OUTPUT_DIMENSIONALITY,
    )
    for sentence in (TEXT1, TEXT2)
)

print(embedding1[:5])
print(embedding2[:5])


[0.012177991680800915, 0.00998682714998722, 0.015519461594522, 0.0241247545927763, 0.04559287056326866]
[0.0038432718720287085, -0.021820055320858955, 0.036357633769512177, 0.02820320427417755, -0.0013248010072857141]
