In [None]:
!pip install --upgrade google-cloud-aiplatform

### Helper Functions

In [38]:
import base64
import os
import cv2
import json
import random

# Seed the global RNG so the random image ordering picked by
# image_caption_generator (via random.randint) is repeatable when the
# notebook is run top to bottom from a fresh kernel.
random.seed(10)
def load_dataset(datapath):
    """Read the JSON file at ``datapath`` and return its parsed content."""
    with open(datapath, mode='r') as handle:
        return json.load(handle)
                
def concate_images(image_1, image_2, output_path='/kaggle/working/temp.jpg'):
    """Join two images side by side and save the result as one file.

    Args:
        image_1: Path of the image placed on the left.
        image_2: Path of the image placed on the right.
        output_path: Destination of the combined image. Defaults to the
            notebook's scratch file, which is overwritten on every call.

    Returns:
        ``output_path`` on success, or ``None`` when either input image
        cannot be read or the combined image cannot be written.
    """
    img1 = cv2.imread(image_1)
    img2 = cv2.imread(image_2)
    # cv2.imread reports a missing/unreadable file by returning None instead
    # of raising; bail out here rather than failing on `.shape` below.
    if img1 is None or img2 is None:
        return None

    if img1.shape[0] != img2.shape[0]:  # hconcat needs equal heights
        target_height = max(img1.shape[0], img2.shape[0])
        # Only the height is changed; each width is kept, so the shorter
        # image ends up stretched vertically.
        img1 = cv2.resize(img1, (img1.shape[1], target_height))
        img2 = cv2.resize(img2, (img2.shape[1], target_height))

    im_h = cv2.hconcat([img1, img2])
    # cv2.imwrite returns False when the file could not be saved.
    if not cv2.imwrite(output_path, im_h):
        return None
    return output_path
    
def encode_image(image):
    """Return the base64 encoding (as ``bytes``) of the file at path ``image``."""
    with open(image, "rb") as binary_file:
        raw_bytes = binary_file.read()
    return base64.b64encode(raw_bytes)

def image_caption_generator(concatenated=False):
    """Yield one evaluation example per record of the validation set.

    For every record the positive and negative image are placed in a random
    order (one ``random.randint`` draw per record) and the label names the
    position of the positive image.

    Args:
        concatenated: When true, the two images are merged side by side with
            ``concate_images`` and a single image path is yielded; otherwise
            the two image paths are yielded separately.

    Yields:
        * ``concatenated`` true:
          ``(image_path, caption, label, directory, pos_idx, neg_idx)`` with
          ``label`` being ``"Left"`` or ``"Right"``. Records for which
          ``concate_images`` returns ``None`` are skipped.
        * ``concatenated`` false: ``(img1, img2, caption, label)`` with
          ``label`` being ``"Image 1"`` or ``"Image 2"``.
    """
    dataset = load_dataset('/kaggle/input/imagecode-simple/valid_simple.json')
    imagepath = '/kaggle/input/imagecode-simple/image-sets/image-sets'
    for record in dataset:
        # Relies on every record holding exactly these four values, in this order.
        directory, pos_idx, neg_idx, caption = record.values()
        set_dir = f'{imagepath}/{directory}'
        # Order the files by the integer that follows the first three
        # characters of the name (before the first '.').
        file_list = sorted(os.listdir(set_dir), key=lambda x: int(x.split('.')[0][3:]))
        pos_img = f'{set_dir}/{file_list[pos_idx]}'
        neg_img = f'{set_dir}/{file_list[neg_idx]}'

        # A draw of 1 puts the positive image first.
        positive_first = random.randint(0, 1) == 1

        if concatenated:
            # Keep the merged path in its own variable: assigning it to
            # `concatenated` would overwrite the mode flag, and a None result
            # would silently switch later records to the two-image branch.
            if positive_first:
                merged_path = concate_images(pos_img, neg_img)
                label = "Left"
            else:
                merged_path = concate_images(neg_img, pos_img)
                label = "Right"
            if merged_path is None:
                continue

            yield merged_path, caption, label, directory, pos_idx, neg_idx
        else:
            if positive_first:
                img1, img2, label = pos_img, neg_img, "Image 1"
            else:
                img1, img2, label = neg_img, pos_img, "Image 2"
            yield img1, img2, caption, label

        


## Gemini

In [None]:
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part, FinishReason, Image
import vertexai.preview.generative_models as generative_models
from kaggle_secrets import UserSecretsClient
# Fetch the Google Cloud credential through the Kaggle secrets client.
user_secrets = UserSecretsClient()
user_credential = user_secrets.get_gcloud_credential()
# NOTE(review): presumably this call is what makes the credential visible to
# the Google Cloud client libraries used below (vertexai) — confirm.
user_secrets.set_tensorflow_credential(user_credential)

# Sampling options passed to every model.generate_content call.
generation_config = dict(
    max_output_tokens=2048,
    temperature=0.1,
    top_p=1,
    top_k=32,
)

# Every listed harm category gets the same blocking threshold.
safety_settings = {
    harm_category: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
    for harm_category in (
        generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
}

def generate(img1, img2, caption):
    """Ask the Gemini vision model which of two images matches ``caption``.

    Args:
        img1: File path of the image presented as "Image 1".
        img2: File path of the image presented as "Image 2".
        caption: Caption text inserted into the prompt.

    Returns:
        The text of the model's reply, or ``None`` when reading ``.text``
        from the reply raises ``ValueError`` or ``AttributeError``.
    """
    vertexai.init(project="imagecodechallenge", location="us-central1")
    vision_model = GenerativeModel("gemini-1.0-pro-vision-001")
    first_image = Image.load_from_file(img1)
    second_image = Image.load_from_file(img2)

    question = f"Please tell me which of these images is described by the caption: {caption}. End your answer with: The answer is Image [1 or 2]"
    prompt_parts = ["Image 1:", first_image, "Image 2:", second_image, question]

    reply = vision_model.generate_content(
        prompt_parts,
        generation_config=generation_config,
        safety_settings=safety_settings,
        stream=False,
    )
    try:
        return reply.text
    except (ValueError, AttributeError):
        # The reply carries no readable text; treat the call as failed.
        return None

# Query the model once per image pair and keep every reply that came back.
results = []

for img1, img2, caption, label in image_caption_generator(concatenated=False):
    response = generate(img1, img2, caption)
    if not response:
        print("Call failed")
        continue
    # "answer" keeps only the last nine characters of the reply, where the
    # prompt asks the model to state its final choice.
    results.append({
        "img1": img1,
        "img2": img2,
        "caption": caption,
        "label": label,
        "answer": response[-9:],
        "response": response,
    })





In [9]:
# Persist the collected replies as indented JSON.
with open('/kaggle/working/ZeroShotCoTGeminiResults_0-415Valid.json', mode='w') as outfile:
    json.dump(results, outfile, indent=4)

In [35]:
import re
# Reload the saved replies and report the fraction answered correctly.
with open('/kaggle/working/ZeroShotCoTGeminiResults_0-415Valid.json', mode='r') as infile:
    data = json.load(infile)

answer_pattern = re.compile(r"The answer is (Image [12])")
total = 0
correct = 0
for record in data:
    total += 1
    # First try the stored tail of the reply.
    if record['label'] in record['answer']:
        correct += 1
        continue
    # Otherwise look for the requested closing sentence anywhere in the reply.
    match = answer_pattern.search(record['response'])
    if match and record['label'] in match.group(0):
        correct += 1

print(correct/total)

0.6642335766423357
