In [1]:
import torch
import clip
from PIL import Image
import json

In [2]:
# read the caption

with open("./dataset1/caption.json", "r") as f:
    captions = json.load(f)

print(captions[0])
caption_list = [item["caption"] for item in captions]
print(caption_list[0])

{'caption': 'A woman strides purposefully along the sidewalk, her determined footsteps echoing against the pavement, another person trailing behind her, their connection palpable in the shared rhythm of their journey, a silent companion in the hustle and bustle of city life.'}
A woman strides purposefully along the sidewalk, her determined footsteps echoing against the pavement, another person trailing behind her, their connection palpable in the shared rhythm of their journey, a silent companion in the hustle and bustle of city life.


In [3]:
# caption_list

In [12]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

# Load the CLIP model
model, preprocess = clip.load("ViT-B/32", device=device)

# captions_tensor = clip.tokenize(caption_list) .to(device)
# use ground truth keywords
# with open("./dataset1/caption_with_keywords_and_image.json") as f:
#     caption_with_keywords_and_image = json.load(f)
#     keywords_list = [
#         caption_with_keywords_and_image[i]["keywords"]
#         for i in range(len(caption_with_keywords_and_image))
#     ]

# use our keywords extractor
# from keywords_extract import keyWords_extractor
# keywords_list = keyWords_extractor(caption_list)

# just cut-off to 50 words
keywords_list = [caption[:100] for caption in caption_list]

captions_tensor = clip.tokenize(keywords_list).to(device)

images_path = "./dataset1/images/"

import os

# Load the images and preprocess them
images_files = os.listdir(images_path)

images = [Image.open(images_path + f).convert("RGB") for f in os.listdir(images_path)]

images_preprocessed = torch.stack([preprocess(img) for img in images]).to(device)

# Calculate the similarity between the images and the captions

with torch.no_grad():
    image_features = model.encode_image(images_preprocessed)
    text_features = model.encode_text(captions_tensor)
    logits_per_image, logits_per_text = model(images_preprocessed, captions_tensor)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

Using cpu device


In [9]:
print("Probs: ")
print(probs)

Probs: 
[[7.9535075e-06 6.8679224e-06 4.8940943e-05 ... 1.0980217e-03
  1.3712856e-03 9.8176561e-06]
 [1.4260755e-01 3.5847243e-04 1.6039809e-03 ... 3.8078886e-02
  7.4589049e-04 5.8586800e-05]
 [6.4762658e-04 1.2082123e-03 2.2343642e-03 ... 2.3080202e-05
  3.6907075e-03 6.3022613e-05]
 ...
 [5.8202096e-04 4.3607345e-03 2.5659683e-04 ... 1.4793727e-03
  8.3014145e-03 2.1034850e-05]
 [2.7330357e-04 6.7435336e-05 5.7825902e-05 ... 7.6040911e-04
  5.4055099e-06 1.2202881e-06]
 [2.2583558e-06 1.7289838e-06 1.7826102e-05 ... 1.3346554e-08
  2.8493348e-06 3.0785362e-08]]


In [13]:
import numpy as np

images_files = os.listdir(images_path)


# select the best image for each caption
best_image_indices = np.argmax(probs, axis=0)
# print image paths for the best images
best_image_paths = [images_files[i] for i in best_image_indices]
for i in range(len(best_image_paths)):
    print(f"Caption: {keywords_list[i]}")
    print(f"Best image: {best_image_paths[i]}")
    print("-")

Caption: A woman strides purposefully along the sidewalk, her determined footsteps echoing against the paveme
Best image: 4726926550.jpg
-
Caption: A woman wearing glasses smiles with satisfaction as she reviews the images on the back of her camera
Best image: 339734472.jpg
-
Caption: Clad in a crisp blue apron and a neatly tied headscarf, the man stands with determination behind the
Best image: 345412253.jpg
-
Caption: In a busy city teeming with cars stuck in traffic along the road, a lone man pauses to observe the c
Best image: 4935046155.jpg
-
Caption: A man wearing a crisp white hat sits regally atop a horse-drawn cart, the clip-clop of hooves echoin
Best image: 431664339.jpg
-
Caption: Two wanderers and their faithful dog find respite at a historic location, their weary bodies finding
Best image: 3610189629.jpg
-
Caption: With gentle precision, the man in the white jacket adjusts the cuff of his patient's sleeve before c
Best image: 2066986243.jpg
-
Caption: A man donning a vibra

In [14]:
# save to result.json
result = []
for i in range(len(best_image_paths)):
    result.append(
        {
            "caption": caption_list[i],
            "keywords": keywords_list[i],
            "image": best_image_paths[i],
        }
    )

with open("result_cutoff_100.json", "w") as f:
    json.dump(result, f, indent=4)

In [19]:
print(probs.shape)
len(images)
len(keywords_list)
best_image_indices.shape

(300, 60)


(300,)

In [4]:
from keywords_extract import keyWords_extractor

res = keyWords_extractor(caption_list)

In [7]:
# len(res)
res

['footsteps, purposefully, trailing, bustle, companion',
 'creative, curve, eye, frame, fulfillment',
 'hammer, purpose, bringing, channeling, comforting',
 'busy, cars, contemplative, crowds, ebb',
 'bygone, clip, clop, cobbled, era',
 'bodies, existence, faithful, historic, location',
 'cuff, adjusts, arm, blood, carefully',
 'burgers, come, donning, drawing, eager',
 'animated, beneath, cadence, jumpsuits, sway',
 'adrenaline, aircraft, board, fueled, helicopter',
 'aerial, breathtaking, checkered, open, performs',
 'center, culture, elegantly, mesmerizing, perform',
 'boxing, defensive, delivers, gear, impact',
 'devices, floating, gleefully, infectious, pair',
 'trees, tranquil, wilderness, embrace, sense',
 'adversity, bundled, chill, earmuffs, forward',
 'background, dimly, extinguisher, figures, foreground',
 'air, breathless, cheering, defying, fans',
 'figure, animal, astride, countryside, equestrian',
 'friendship, banter, barriers, blooming, blossoms',
 'carve, exhilarating