In [None]:
#set project info
PROJECT = !gcloud config get-value project
PROJECT_ID = PROJECT[0]
REGION ='us-cental1'


In [None]:
import urllib.request
url = "http://metadata.google.internal/computeMetadata/v1/project/project-id"
req = urllib.request.Request(url)
req.add_header("Metadata-Flavor", "Google")
project_id = urllib.request.urlopen(req).read().decode()

In [None]:
from google.cloud import storage


def list_blobs(bucket_name):
    """Lists all the blobs in the bucket."""
    # bucket_name = "your-bucket-name"
    paths=[]
    names=[]
    storage_client = storage.Client()

    # Note: Client.list_blobs requires at least package version 1.17.0.
    blobs = storage_client.list_blobs(bucket_name,match_glob=["**/*.png"])
 
    # Note: The call returns a response only when the iterator is consumed.
    for blob in blobs:    
        paths.append('gs://' + blob.id[:-(len(str(blob.generation)) + 1)])
        names.append(blob.name)
    return paths,names
  
    


In [None]:
image_paths,image_names=list_blobs('raw_nine_files')

### Define function to detect explicit images

enable clound vision api before running this bit

### Defining encoding functions
Create an EmbeddingPredictionClient which encapsulates the logic to call the embedding API.

In [None]:
import base64
import time
import typing
import copy
from typing import List, Optional

from google.cloud import aiplatform
from google.protobuf import struct_pb2

#libraries to generate image summaries
from vertexai.vision_models import MultiModalEmbeddingModel
from vertexai.language_models import TextEmbeddingModel
from vertexai.vision_models import Image as vision_model_Image
from vertexai.vision_models import MultiModalEmbeddingModel
from vertexai.preview.generative_models import (
    Content,
    GenerationConfig,
    GenerationResponse,
    GenerativeModel,
    Image,
    Part,
    HarmBlockThreshold,
    HarmCategory,
)

text_embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko@latest")
#multimodal_embedding_model = MultiModalEmbeddingModel.from_pretrained(
 #   "multimodalembedding@001"
#)


class EmbeddingResponse(typing.NamedTuple):
    text_embedding: typing.Sequence[float]
    image_embedding: typing.Sequence[float]


import requests
def get_public_url_from_gcs(gcs_uri: str) -> str:
    return gcs_uri.replace("gs://", "https://storage.googleapis.com/").replace(
        " ", "%20"
    )

def load_image_bytes(image_uri: str) -> bytes:
    """Load image bytes from a remote or local URI."""
    image_bytes = None
    image_uri= get_public_url_from_gcs(image_uri)
    if image_uri.startswith("http://") or image_uri.startswith("https://"):
        response = requests.get(image_uri, stream=True)
        if response.status_code == 200:
            image_bytes = response.content
    else:
        image_bytes = open(image_uri, "rb").read()
    return image_bytes


class EmbeddingPredictionClient:
    """Wrapper around Prediction Service Client."""

    def __init__(
        self,
        project: str,
        location: str = "us-central1",
        api_regional_endpoint: str = "us-central1-aiplatform.googleapis.com",
    ):
        client_options = {"api_endpoint": api_regional_endpoint}
        # Initialize client that will be used to create and send requests.
        # This client only needs to be created once, and can be reused for multiple requests.
        self.client = aiplatform.gapic.PredictionServiceClient(
            client_options=client_options
        )
        self.location = location
        self.project = project

    def get_embedding(self, text: str = None, image_file: str = None):
        if not text and not image_file:
            raise ValueError("At least one of text or image_file must be specified.")

        # Load image file
        image_bytes = None
        if image_file:
            if not (image_file.startswith("gs://")): 
                image_bytes = load_image_bytes(image_file)

        instance = struct_pb2.Struct()
        if text:
            instance.fields["text"].string_value = text

        if image_bytes:
            encoded_content = base64.b64encode(image_bytes).decode("utf-8")
            image_struct = instance.fields["image"].struct_value
            image_struct.fields["bytesBase64Encoded"].string_value = encoded_content

        if image_file:
            if (image_file.startswith("gs://")): 
                  instance["image"] = {
                        "gcsUri": image_file  # pylint: disable=protected-access
                    }       
        
        instances = [instance]
        
        endpoint = (
           f"projects/{self.project}/locations/{self.location}"
           "/publishers/google/models/multimodalembedding@001"
        )
        response = self.client.predict(endpoint=endpoint, instances=instances)
        text_embedding = None
        if text:
            text_emb_value = response.predictions[0]["textEmbedding"]
            text_embedding = [v for v in text_emb_value]

        image_embedding = None
        if image_bytes or image_file:
            image_emb_value = response.predictions[0]["imageEmbedding"]
            image_embedding = [v for v in image_emb_value]

        return EmbeddingResponse(
            text_embedding=text_embedding, image_embedding=image_embedding
        )

    def get_image_summarycontent(self, text: str = None, image_file: str = None):
        
        generative_multimodal_model= GenerativeModel("gemini-pro-vision")
        
        image_description_prompt="""You are an assistant tasked with summarizing images for retrieval. \
        These summaries will be embedded and used to retrieve the raw image. \
        Give a concise summary of the image that is well optimized for retrieval.\
        If there is a famous person like politician, celebrity or athlete, indicate their name and describe what they are famous for.\
        If you are not sure about any info, please do not make it up."""
        
        generation_config= GenerationConfig(temperature=0.2, max_output_tokens=2048) 
        
        safety_settings=  {
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
        }
        stream=True
        
        # Load the saved image as a Gemini Image Object
        #image_for_gemini= Image.load_from_file(image_file)
        image_for_gemini = Part.from_uri(image_file, "image/jpeg")

        model_input=[image_description_prompt, image_for_gemini]
        
        response = generative_multimodal_model.generate_content(
        model_input,
        generation_config=generation_config,
        stream=stream,
        safety_settings=safety_settings, )
        
        
        response_list = []

        for chunk in response:
            try:
                response_list.append(chunk.text)
            except Exception as e:
                print(
                    "Exception occurred while calling gemini. Something is wrong. Lower the safety thresholds [safety_settings: BLOCK_NONE ] if not already done. -----",
                    e,
                )
                response_list.append("Exception occurred")
                continue
        response = "".join(response_list)
 
        return response

    def get_summarycontent_embedding_from_text_embedding_model(self, text: str, return_array: Optional[bool] = False,) -> list:
        """
        Generates a numerical text embedding from a provided text input using a text embedding model.

        Args:
            text: The input text string to be embedded.
            return_array: If True, returns the embedding as a NumPy array.
                          If False, returns the embedding as a list. (Default: False)

        Returns:
            list or numpy.ndarray: A 768-dimensional vector representation of the input text.
                                   The format (list or NumPy array) depends on the
                                   value of the 'return_array' parameter.
        """

        #the given text is maximum 2048 token. If more, it has to be chunked.
        embeddings = text_embedding_model.get_embeddings([text])
        text_embedding = [embedding.values for embedding in embeddings][0]

        if return_array:
            text_embedding = np.fromiter(text_embedding, dtype=float)

        # returns 768 dimensional array
        return EmbeddingResponse(
            text_embedding=text_embedding, image_embedding=None
        )
    

### Create helper functions to process data in batches
Datasets can be large, so it's recommended to load a batch of data at a time into memory using a generator.

In [None]:
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, Generator, List

from tqdm.auto import tqdm

#Number of API calls per second
API_IMAGES_PER_SECOND = 2

def generate_batches(
    inputs: List[str], batch_size: int
) -> Generator[List[str], None, None]:
    """
    Generator function that takes a list of strings and a batch size, and yields batches of the specified size.
    """

    for i in range(0, len(inputs), batch_size):
        yield inputs[i : i + batch_size]



def encode_to_embeddings_chunked(
    process_function: Callable[[List[str]], List[Optional[List[float]]]],
    items: List[str],
    batch_size: int = 1,
) -> List[Optional[List[float]]]:
    """
    Function that encodes a list of strings into embeddings using a process function.
    It takes a list of strings and returns a list of optional lists of floats.
    The data is processed in chunks to prevent out-of-memory errors.
    """

    embeddings_list: List[Optional[List[float]]] = []

    # Prepare the batches using a generator
    batches = generate_batches(items, batch_size)

    seconds_per_job = batch_size / API_IMAGES_PER_SECOND

    with ThreadPoolExecutor() as executor:
        futures = []
        for batch in tqdm(batches, total=len(items) // batch_size, position=0):
            futures.append(executor.submit(process_function, batch))
            time.sleep(seconds_per_job)

        for future in futures:
            embeddings_list.extend(future.result())
    return embeddings_list

### Create functions that wrap embedding functions in try-except and retry logic
This particular embedding model can only process 1 image at a time, so inputs are validated to be equal to a length of 1.

In [None]:
import copy
from typing import List, Optional

import numpy as np
import requests
from tenacity import retry, stop_after_attempt

client = EmbeddingPredictionClient(project=PROJECT_ID)


# Use a retry handler in case of failure
@retry(reraise=True, stop=stop_after_attempt(3))
def encode_texts_to_embeddings_with_retry(text: List[str]) -> List[List[float]]:
    assert len(text) == 1

    try:
        return [client.get_embedding(text=text[0], image_file=None).text_embedding]
    except Exception:
        raise RuntimeError("Error getting embedding.")


def encode_texts_to_embeddings(text: List[str]) -> List[Optional[List[float]]]:
    try:
        return encode_texts_to_embeddings_with_retry(text=text)
    except Exception:
        return [None for _ in range(len(text))]


@retry(reraise=True, stop=stop_after_attempt(3))
def encode_images_to_embeddings_with_retry(image_uris: List[str]) -> List[List[float]]:
    assert len(image_uris) == 1

    try:
        return [
            client.get_embedding(text=None, image_file=image_uris[0]).image_embedding
        ]
    except Exception as ex:
        print(ex)
        raise RuntimeError("Error getting embedding for image.")


def encode_images_to_embeddings(image_uris: List[str]) -> List[Optional[List[float]]]:
    try:
        return encode_images_to_embeddings_with_retry(image_uris=image_uris)
    except Exception as ex:
        print(ex)
        return [None for _ in range(len(image_uris))]
    

@retry(reraise=True, stop=stop_after_attempt(3))
def encode_images_to_summarycontent_with_retry(image_uris: List[str]) -> List[List[float]]:
    assert len(image_uris) == 1

    try:
        return [
            client.get_image_summarycontent(text=None, image_file=image_uris[0])
        ]
    except Exception as ex:
        print(ex)
        raise RuntimeError("Error getting summaries.")


def encode_images_to_summarycontent(image_uris: List[str]) -> List[Optional[List[float]]]:
    try:
        return encode_images_to_summarycontent_with_retry(image_uris=image_uris)
    except Exception as ex:
        print(ex)
        return [None for _ in range(len(image_uris))]
    
    
# Use a retry handler in case of failure
@retry(reraise=True, stop=stop_after_attempt(3))
def encode_summarycontent_to_embeddings_with_retry(text: List[str]) -> List[List[float]]:
    assert len(text) == 1

    try:
        return [client.get_summarycontent_embedding_from_text_embedding_model(text=text[0]).text_embedding]
    except Exception:
        raise RuntimeError("Error getting embedding for summary content.")


def encode_summarycontent_to_embeddings(text: List[str]) -> List[Optional[List[float]]]:
    try:
        return encode_summarycontent_to_embeddings_with_retry(text=text)
    except Exception:
        return [None for _ in range(len(text))]
    

### Create and save the embeddings in JSONL format
The data must be formatted in JSONL format, which means each embedding dictionary is written as an individual JSON object on its own line.

See more information in the docs at Input data format and structure.

Run the following code in the next available cells, to create a temporary file to store embeddings in JSON format.

In [80]:
import tempfile, shutil

# Create temporary file to write embeddings to
embeddings_file = tempfile.NamedTemporaryFile(suffix=".json", delete=False)

# Create temporary file to write summaries to
summaries_file = tempfile.NamedTemporaryFile(suffix=".json", delete=False)


In [91]:
with open(embeddings_file.name, "a") as ef:
     ef.writelines('this is only a test22222')

### embedding file


In [None]:
import json

BATCH_SIZE = 1# this can be changed

with open(embeddings_file.name, "a") as ef, open(summaries_file.name, "a") as sf:
     for i in tqdm(range(0, len(image_names), BATCH_SIZE)):#len(image_names)
        image_names_chunk = image_names[i : i + BATCH_SIZE]
        image_paths_chunk = image_paths[i : i + BATCH_SIZE]
        embeddings=[]
        image_summaries=[]
        #comment to prevent extra costs
        #********************************
        embeddings = encode_to_embeddings_chunked(
            process_function=encode_images_to_embeddings, items=image_paths_chunk
        )
        #embeddings=[[1,2],[1,2]]
        #********************************

        #comment to prevent extra costs
        #********************************
        summaries = encode_to_embeddings_chunked(
            process_function=encode_images_to_summarycontent, items=image_paths_chunk
           )
        #summaries=['1'+str(i),'2'+str(i)]
        #********************************
        #summaries=[' The image shows three people: Joe Biden, a young girl, and Hunter Biden. Joe Biden is smiling and wearing a dark suit. The young girl is smiling and wearing a white dress. Hunter Biden is smiling and wearing a dark suit. The background is a photo of the White House.','this is test']
   
        #comment to prevent extra costs
        #********************************
        summaries_embeddings = encode_to_embeddings_chunked(
             process_function=encode_summarycontent_to_embeddings, items=summaries
        )
        #summaries_embeddings=[[1,2],[1,2,3]]
        
        #********************************
        
        #print(embeddings)
        #print('-----')
        print(summaries)
        #print('-----')
        #print(summaries_embeddings)
        # Append to file
        embeddings_formatted = [
            json.dumps(
                {
                    "id": str(id),
                    "embedding": [str(value) for value in embedding],
                }
            )
            + "\n"
            for id, embedding in zip(image_names_chunk, embeddings)
            if embedding is not None
        ]
        ef.writelines(embeddings_formatted)
        
        
        summaries_formatted = [
            json.dumps(
                {
                    "id": str(id),
                    "image path": image_path,
                    "summary":  summary,
                    "summary embedding": [str(value) for value in summaries_embedding],
                    "image embedding": [str(value) for value in embedding],
                }
            )
            + "\n"
            for id, summary,summaries_embedding,embedding,image_path in zip(image_names_chunk, summaries,summaries_embeddings,embeddings,image_paths_chunk)
            if summaries is not None
        ]
        sf.writelines(summaries_formatted)
        


### Create bucket and push embeddings into the bucket

In [None]:

#set bucket info to create a bucket
BUCKET_URI = f"gs://artifacts-{PROJECT_ID}-unique"  # @param {type:"string"}
! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}

In [86]:
UNIQUE_FOLDER_NAME = "multimodal_embeddings"
BUCKET_URI = f"gs://artifacts-nine-quality-test-embeddings"
#embeddings
EMBEDDINGS_INITIAL_URI = f"{BUCKET_URI}/{UNIQUE_FOLDER_NAME}/"
! gsutil cp {embeddings_file.name} {EMBEDDINGS_INITIAL_URI}

Copying file:///var/tmp/tmp90h66ik0.json [Content-Type=application/json]...
/ [1 files][   19.0 B/   19.0 B]                                                
Operation completed over 1 objects/19.0 B.                                       


In [101]:
temp=embeddings_file
client = storage.Client()
BUCKET_URI = f"artifacts-nine-quality-test-embeddings"
#embeddings
EMBEDDINGS_INITIAL_URI = f"{UNIQUE_FOLDER_NAME}/"

if 1==1:
        # Extract name to the temp file
        temp_file = "".join([str(temp.name)])
        # Save image to temp file            
        # Uploading the temp image file to the bucket
        dest_filename = f"{UNIQUE_FOLDER_NAME}/embeddingtest.json"
        dest_bucket_name =  f"artifacts-nine-quality-test-embeddings"
        dest_bucket = client.get_bucket(BUCKET_URI)
        dest_blob = dest_bucket.blob(dest_filename)
        dest_blob.upload_from_filename(temp_file)

In [None]:
#embeddings
EMBEDDINGS_INITIAL_URI = f"{BUCKET_URI}/{UNIQUE_FOLDER_NAME}/"
! gsutil cp {embeddings_file.name} {EMBEDDINGS_INITIAL_URI}

#summaries
EMBEDDINGS_INITIAL_URI = f"{BUCKET_URI}/{UNIQUE_FOLDER_NAME}/"
! gsutil cp {summaries_file.name} {EMBEDDINGS_INITIAL_URI}

In [None]:
#save the temp files in persistent disk
import tempfile, shutil
 
file_name = embeddings_file.name
embeddings_file.close()
shutil.copy(file_name, 'embeddings_file.json')

file_name = summaries_file.name
summaries_file.close()
shutil.copy(file_name, 'summaries_file.json')
 