In [16]:
import numpy as np
import pandas as pd
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, CLIPImageProcessor

In [17]:
# Zero-shot time-of-day classification of ONE image using the huggingface model
# https://huggingface.co/openai/clip-vit-large-patch14

model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

# prepare images
# Sample pictures: keep exactly one assignment to `im` active. Every alternative
# assigns the same variable, so switching samples is a one-line (un)comment.
#im = './1k-compressed/-bDYrrVAZnc.jpg'  # night
#im = './1k-compressed/-fjrjyS18BQ.jpg'  # dusk
im = './1k-compressed/-N_UwPdUs7E.jpg'  # dawn
#im = './1k-compressed/7kCNXfo35aU.jpg'  # day
#im = './1k-compressed/4_Bc9CSm70A.jpg'  # dusk or dawn

# Open the file inside a context manager so its handle is released right away;
# copy() loads the pixel data, so `image` stays usable after the file is closed.
with Image.open(im) as img_file:
    image = img_file.copy()

# prepare text prompts
labels = ['dawn', 'day', 'dusk', 'night']  # the target labels
text_prompts = ['An image taken at ' + label for label in labels]
print(text_prompts)

inputs = processor(text=text_prompts, images=image, return_tensors="pt", padding=True)

# Inference only: disable autograd so no gradient graph is recorded.
with torch.no_grad():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities

# Pick the most probable prompt for the (single) image. The explicit axis keeps
# this correct per row instead of relying on argmax over the flattened array.
choices = np.argmax(probs.detach().numpy(), axis=1)[0]
print(f'This image is classified as taken at {labels[choices]}')

['An image taken at dawn', 'An image taken at day', 'An image taken at dusk', 'An image taken at night']
This image is classified as taken at dawn


In [18]:
# 10 images using huggingface model https://huggingface.co/openai/clip-vit-large-patch14
# NOTE: this cell uses `model` and `processor`, which it does not define itself;
# they must already exist in the kernel.

# classify 1k-compressed images
photos = pd.read_csv('photos-1k.csv')


def load_image(path):
    """Return an in-memory copy of the image stored at `path`.

    The file is opened in a context manager so its handle is closed before
    returning; copy() loads the pixel data, so the result does not depend on
    the closed file.
    """
    with Image.open(path) as img_file:
        return img_file.copy()


# process images in batch: only the first 10 photo ids are classified here
images = [load_image(f'./1k-compressed/{img_id}.jpg') for img_id in photos['photo_id'][:10]]

# prepare text prompts
labels = ['dawn', 'day', 'dusk', 'night']  # the target labels
text_prompts = ['An image taken at ' + label for label in labels]
print(text_prompts)

inputs = processor(text=text_prompts, images=images, return_tensors="pt", padding=True)

# Inference only: disable autograd so no gradient graph is kept for the batch.
with torch.no_grad():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities


['An image taken at dawn', 'An image taken at day', 'An image taken at dusk', 'An image taken at night']


In [19]:
# Tabulate the probabilities, using the text prompts as column headers.
df = pd.DataFrame(data=probs.detach().numpy(), columns=text_prompts)

# Render the table with a colour gradient and two-decimal formatting - the
# darker colored cell is the prediction. axis=None shades against the whole
# table rather than per row or per column.
# NOTE(review): in pandas, `low`/`high` appear to pad the colour range as a
# fraction of the data range rather than act as value bounds - confirm that
# high=0.91 is intended.
(
    df.style
    .background_gradient(axis=None, low=0, high=0.91)
    .format(precision=2)
)

Unnamed: 0,An image taken at dawn,An image taken at day,An image taken at dusk,An image taken at night
0,0.21,0.54,0.24,0.01
1,0.28,0.57,0.12,0.03
2,0.51,0.37,0.09,0.02
3,0.46,0.44,0.1,0.0
4,0.36,0.1,0.48,0.06
5,0.47,0.32,0.21,0.0
6,0.56,0.03,0.39,0.03
7,0.81,0.02,0.16,0.01
8,0.38,0.16,0.44,0.02
9,0.48,0.09,0.43,0.0


In [20]:
# Index of the largest probability in each row, i.e. one choice per image.
choices = np.argmax(probs.detach().numpy(), axis=1)  # choose the largest prob


def get_labels(indices):
    """Map class indices to their label strings.

    Uses NumPy fancy indexing on `labels` instead of np.vectorize, so the
    lookup is a single array operation and also works for an empty index
    array.

    Args:
        indices: integer index or array of indices into `labels`.

    Returns:
        The label(s) at `indices`, as NumPy strings.
    """
    return np.asarray(labels)[indices]


predictions = get_labels(choices)  # an array
predictions

array(['day', 'day', 'dawn', 'dawn', 'dusk', 'dawn', 'dawn', 'dawn',
       'dusk', 'dawn'], dtype='<U4')