Process the photos using OpenAI's CLIP neural network. For each image we get a feature vector containing 512 float numbers, which we will store in a file. These feature vectors will be used later to compare them to the text feature vectors.

This step will be significantly faster if you have a GPU with CUDA support.

In [1]:
from pathlib import Path

# Set the path to the photos
dataset_version = "lite"  # Use "lite" or "full"
photos_path = Path("unsplash-dataset") / dataset_version / "photos"

# List all JPGs in the folder. The list is sorted because glob() makes no promise
# about ordering, and the batch processing further down identifies a batch only by
# its position in this list - the order must be the same every time this is run.
photos_files = sorted(photos_path.glob("*.jpg"))

# Print some statistics
print(f"Photos found: {len(photos_files)}")

Photos found: 24949


In [8]:
# Load The CLIP net
import clips
import torch
from PIL import Image

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the ViT-B/32 model together with its matching image preprocessing function
# NOTE(review): OpenAI's CLIP package is usually imported as `clip` - confirm that
# the `clips` module used here is the intended one.
model, preprocess = clips.load("ViT-B/32", device=device)

# function that computes the feature vector for a batch of images
def compute_clip_features(photos_batch):
    """Compute normalized CLIP feature vectors for a batch of photo files.

    Args:
        photos_batch: Sequence of image files (anything ``Image.open`` accepts)
            to encode. Must contain at least one entry.

    Returns:
        A numpy array with one row per photo, holding the output of
        ``model.encode_image`` divided by its norm along the last axis.
    """
    photos_preprocessed = []
    for photo_file in photos_batch:
        # Open each photo in a context manager so its file handle is released as
        # soon as the photo has been preprocessed, instead of staying open until
        # garbage collection.
        with Image.open(photo_file) as photo:
            photos_preprocessed.append(preprocess(photo))

    # Combine the preprocessed photos into one batch on the target device
    photos_tensor = torch.stack(photos_preprocessed).to(device)

    with torch.no_grad():
        # Encode the photos batch to compute the feature vectors and normalize them
        photos_features = model.encode_image(photos_tensor)
        photos_features /= photos_features.norm(dim=-1, keepdim=True)

    # Transfer the feature vectors back to the CPU and convert them to numpy arrays
    return photos_features.cpu().numpy()

100%|███████████████████████████████████████| 338M/338M [00:29<00:00, 12.2MiB/s]


In [10]:
# Process all photos
import math
import numpy as np
import pandas as pd

# Define the batch size so that it fits on your GPU. You can also do the processing on the CPU, but it will be slower
batch_size = 16

# Path where the feature vectors will be saved
features_path = Path("unsplash-dataset") / dataset_version / "features"

# Make sure the output folder exists - otherwise saving any batch below would fail
features_path.mkdir(parents=True, exist_ok=True)

# Compute how many batches are needed
batches = math.ceil(len(photos_files) / batch_size)

# Process each batch
for i in range(batches):
    print(f"Processing batch {i+1}/{batches}...")

    batch_ids_path = features_path / f"{i:010d}.csv"
    batch_features_path = features_path / f"{i:010d}.npy"

    # Only do the processing if the batch wasn't processed yet
    if batch_features_path.exists():
        continue

    try:
        # Select the photos for the current batch
        batch_files = photos_files[i*batch_size : (i+1)*batch_size]

        # Compute the features
        batch_features = compute_clip_features(batch_files)

        # Save the photo IDs to a CSV file
        photo_ids = [photo_file.name.split(".")[0] for photo_file in batch_files]
        photo_ids_data = pd.DataFrame(photo_ids, columns=['photo_id'])
        photo_ids_data.to_csv(batch_ids_path, index=False)

        # Save the features last: the .npy file is what marks a batch as done,
        # so it must only appear once the matching CSV has been written
        np.save(batch_features_path, batch_features)
    except Exception as error:
        # Catch problems with the processing to make the process more robust.
        # Only Exception is caught, so Ctrl-C (KeyboardInterrupt) still stops the run.
        print(f'Problem with batch {i}: {error}')



Processing batch 1/1560...
Processing batch 2/1560...
Processing batch 3/1560...
Processing batch 4/1560...
Processing batch 5/1560...
Processing batch 6/1560...
Processing batch 7/1560...
Processing batch 8/1560...
Processing batch 9/1560...
Processing batch 10/1560...
Processing batch 11/1560...
Processing batch 12/1560...
Processing batch 13/1560...
Processing batch 14/1560...
Processing batch 15/1560...
Processing batch 16/1560...
Processing batch 17/1560...
Processing batch 18/1560...
Processing batch 19/1560...
Processing batch 20/1560...
Processing batch 21/1560...
Processing batch 22/1560...
Processing batch 23/1560...
Processing batch 24/1560...
Processing batch 25/1560...
Processing batch 26/1560...
Processing batch 27/1560...
Processing batch 28/1560...
Processing batch 29/1560...
Processing batch 30/1560...
Processing batch 31/1560...
Processing batch 32/1560...
Processing batch 33/1560...
Processing batch 34/1560...
Processing batch 35/1560...
Processing batch 36/1560...
P

In [11]:
# Merge the features and the photo IDs. The resulting files are features.npy and photo_ids.csv. Feel free to delete the intermediate results.
import numpy as np
import pandas as pd

# Only pick up the per-batch files, whose names are zero-padded numbers. A plain "*.npy"
# would also match a features.npy written by an earlier run of this cell and merge it
# into itself (the same applies to photo_ids.csv).
batch_features_files = sorted(features_path.glob("[0-9]*.npy"))

# Load all numpy files
features_list = [np.load(features_file) for features_file in batch_features_files]

# Concatenate the features and store in a merged file
features = np.concatenate(features_list)
np.save(features_path / "features.npy", features)

# Load the photo IDs of exactly the same batches, in the same order, so that row N of
# photo_ids.csv always describes row N of features.npy. A batch whose CSV is missing
# raises an error here instead of silently shifting the IDs.
photo_ids = pd.concat([pd.read_csv(features_file.with_suffix(".csv")) for features_file in batch_features_files])
photo_ids.to_csv(features_path / "photo_ids.csv", index=False)