# Image and Text Embeddings

Image formats
* BMP
* GIF
* JPG
* PNG

Supported embedding dimensions: 128, 256, 512, or 1408.

In [1]:
# Authenticate the current Colab user.
# NOTE(review): presumably this supplies the Google Cloud credentials used by
# the Vertex AI calls in later cells — confirm.
from google.colab import auth
auth.authenticate_user()

In [2]:
# --- Imports ---------------------------------------------------------------
from google.colab import userdata

import vertexai
from vertexai.vision_models import Image, MultiModalEmbeddingModel

# --- Configuration ---------------------------------------------------------
# The project ID is looked up in Colab's user data under the "PROJECT_ID" key.
PROJECT_ID = userdata.get('PROJECT_ID')

# Initialise the Vertex AI SDK for this project in the us-central1 region.
vertexai.init(
    project=PROJECT_ID,
    location="us-central1",
)

# Multimodal embedding model handle; the image/text cell below calls it.
model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")

In [3]:
# Load the input image from its Cloud Storage URI.
image_uri = "gs://questionsanswersproject/animales_mexico.jpeg"
image = Image.load_from_file(image_uri)

# Request an image embedding and a text embedding in a single call.
# dimension=512 is one of the sizes listed in the notes above
# (128, 256, 512 or 1408).
embeddings = model.get_embeddings(
    image=image, contextual_text="mexican animals", dimension=512
)

# Show both vectors, one per line.
print(
    f"Image Embedding: {embeddings.image_embedding}",
    f"Text Embedding: {embeddings.text_embedding}",
    sep="\n",
)

Image Embedding: [-0.00906615332, -0.026660189, 0.0177577455, -0.0684581175, 0.0157205872, -6.51983792e-05, 0.0069708135, 0.000589538773, 0.00787322409, 0.0461782888, -0.0357222036, 0.00813075341, 0.0151426774, 0.0062549049, 0.0740562081, -0.0845985785, -0.00439098198, -0.0286715906, -0.039608784, -0.0102925235, 0.027891634, -0.000169094186, 0.00649160147, 0.0111922715, -0.00061687862, 0.0547429621, -0.0227080751, 0.0118896505, -0.0343653969, 0.0598911978, -0.0535242409, -0.0251580775, 0.0023842596, 0.0579477288, 0.0258852206, 0.0293682758, -0.0165539235, 0.00545407692, 0.0354089476, 0.0343107432, 0.0398815945, -0.00735370675, -0.0590971969, -0.0406040289, -0.0161748808, -0.00456254091, -0.0140348226, -0.0738905147, -0.00127584336, -0.0567362159, -0.0144241145, 0.00450462475, 0.0497974865, 0.00287993369, 0.0397356, 0.028190732, -0.00482698763, -0.0404779427, -0.0516312085, -0.0333454497, -0.0218718, -0.0106221605, -0.000645358406, -0.00560258748, 0.0710481405, -0.0156695452, 0.03492475

# Video Embeddings

Video formats
* AVI
* FLV
* MKV
* MOV
* MP4
* MPEG
* MPG
* WEBM
* WMV

Audio: not supported. The model doesn't consider audio content when generating video embeddings.

* Essential
  * Maximum number of embeddings per minute: 4
  * Video embedding interval: intervalSec >= 15

* Standard
  * Maximum number of embeddings per minute: 8
  * Video embedding interval: 8 <= intervalSec < 15

* Plus
  * Maximum number of embeddings per minute: 15
  * Video embedding interval: 4 <= intervalSec < 8

In [5]:
# Video embedding example.
#
# Everything this cell uses is imported here — including `vertexai`, which is
# called directly below — so the cell does not rely on imports made by earlier
# cells.
from google.colab import userdata

import vertexai
from vertexai.vision_models import MultiModalEmbeddingModel, Video
from vertexai.vision_models import VideoSegmentConfig

# Read the project ID from Colab's user data and initialise the SDK.
PROJECT_ID = userdata.get('PROJECT_ID')
vertexai.init(project=PROJECT_ID, location="us-central1")

model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")
video_path = "gs://questionsanswersproject/simpsons-first-episode.mp4"
contextual_text = "Simpsons family"
video = Video.load_from_file(video_path)

# Alternative segment configurations (leave exactly one assignment active).
#
# Only the part of the video up to second 10, with the default interval:
#video_segment_config = VideoSegmentConfig(end_offset_sec=10)
#
# Seconds 0 to 60 with the default interval
# (NOTE(review): believed to be 16 seconds — confirm against the SDK docs):
#video_segment_config = VideoSegmentConfig(start_offset_sec=0, end_offset_sec=60)

# Active configuration: one embedding per 5-second segment, from second 0 to
# second 60. interval_sec=5 falls in the 4 <= intervalSec < 8 range, which the
# notes above list under "Plus".
video_segment_config = VideoSegmentConfig(start_offset_sec=0, end_offset_sec=60, interval_sec=5)

# A single request returns the per-segment video embeddings together with the
# embedding of the contextual text.
embeddings = model.get_embeddings(
    video=video,
    video_segment_config=video_segment_config,
    contextual_text=contextual_text,
)

print("Video Embeddings:")

# Each entry carries the segment's start/end offsets and its embedding vector.
for video_embedding in embeddings.video_embeddings:
    print(
        f"Video Segment: {video_embedding.start_offset_sec} - {video_embedding.end_offset_sec}"
    )
    print(f"Embedding: {video_embedding.embedding}")

print(f"Text Embedding: {embeddings.text_embedding}")

Video Embeddings:
Video Segment: 5.0 - 10.0
Embedding: [-0.0281904824, -0.00103538297, -0.00567395939, -0.000782240939, -0.00375733804, -0.0164730176, -0.0140912356, 0.028216036, 0.00898380857, -0.0437635891, 0.00266870228, 0.0177605972, -0.024227716, 0.0355940647, -0.00998157263, -0.00829108153, -0.00861947797, 0.0193075035, -0.00782340765, -0.00767928036, -0.0559651144, -0.0370159261, 0.0510191806, -0.00349211716, 0.0686786696, 0.00183385552, 0.0211514886, 0.0182263032, -0.00859910809, 0.0217800625, -0.0543844961, 0.00345794484, 0.000223071984, 0.0259282067, 0.0184031092, 0.00194475334, 0.00562185142, -0.0563977286, 0.00812179781, -0.00869052205, 0.0166167095, -0.0203913562, -0.0180305392, -0.0194864739, -0.0126415631, -0.00392427621, -0.0744796842, -0.0160887092, 0.0467492528, -0.0141873173, 0.00143617264, 0.0134440567, -0.0103925159, -0.00477868691, 0.0119493548, 0.00327313831, -0.046562586, 0.0310524479, -0.00389022031, -0.0069864043, -0.0454641618, -0.00920596626, -0.0327155367, 