In [32]:
# for file/folder operations (system calls)
import os, shutil, glob
# to download dataset and extract it
import zipfile, requests
# to open/show images
from PIL import Image
# to show stacked images
import numpy as np
# for models
from transformers import CLIPProcessor, TFCLIPModel
# for vector db
from qdrant_client import QdrantClient
from qdrant_client.http import models
# for progress bar
from tqdm import tqdm
# for easier manupilating embeddings/ids
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [36]:
# Zalando Viton-HD dataset
data_url = "https://www.dropbox.com/s/10bfat0kg4si1bu/zalando-hd-resized.zip"

In [13]:
# if the zip dataset not exist on disk
if not os.path.exists("../data/raw/zalando-hd-resized.zip"):
    # go to dropbox
    response = requests.get(data_url, stream=True)
    # save zip to disk
    with open("../data/raw/zalando-hd-resized.zip", 'wb') as buff:
        for chunk in response.iter_content(128):
            buff.write(chunk)

In [14]:
# extract data
with zipfile.ZipFile(file="../data/raw/zalando-hd-resized.zip", mode="r") as buff:
    buff.extractall(path="../data/raw/zalando-hd-resized")

* We are only interested with cloth data.

In [21]:
# copy contents of cloth folder to raw
shutil.copytree(
    src="../data/raw/zalando-hd-resized/train/cloth/",
    dst="../data/raw/",
    dirs_exist_ok=True)

'../data/raw/'

In [22]:
# delete unneccesary folder/zip 
shutil.rmtree(path="../data/raw/zalando-hd-resized")
os.remove(path="../data/raw/zalando-hd-resized.zip")

In [20]:
# show some images
all_images = glob.glob(pathname="../data/raw/*.jpg")

# read 5 images from disk
images = [Image.open(fp=img, mode="r") for img in all_images[:5]]

# stack resized images (to be more conveniet to show)
images = np.hstack([image.resize(size=(128,256)) for image in images])

# show images
images = Image.fromarray(images)
images.show()

In [21]:
# define clip data processor
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# define clip model
model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")

All model checkpoint layers were used when initializing TFCLIPModel.

All the layers of TFCLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCLIPModel for predictions without further training.



* lets try to embed one image and see results


In [7]:
# load one image from disk
image = Image.open(fp=all_images[0], mode="r")

# process one image
processed_image = processor(
    images=image,
    text=None,
    return_tensors="tf"
)["pixel_values"]

print("input image shape: ", processed_image.shape)

# embed image
image_embedding = model.get_image_features(processed_image)

print("image embeddings shape: ", image_embedding.shape)
print(image_embedding)

input image shape:  (1, 3, 224, 224)
image embeddings shape:  (1, 512)
tf.Tensor(
[[ 9.05680805e-02  3.27430889e-02  2.98265144e-02  3.44645500e-01
   1.63984448e-01  4.14204538e-01 -8.47207606e-02  2.71041065e-01
   8.74725163e-01  1.24552451e-01  1.37790427e-01  5.85126579e-02
  -1.57511845e-01 -1.94499381e-02 -1.74491592e-02  1.73226237e-01
  -4.67269570e-02  1.56918913e-03  5.43844402e-02 -8.73053819e-02
  -5.11746764e-01 -8.17486495e-02  3.86942416e-01 -3.77125479e-02
  -8.99185091e-02  3.09481561e-01 -2.91582614e-01 -3.56023937e-01
  -2.10167974e-01 -7.08732724e-01  2.32437253e-03  1.57950222e-02
  -4.41687047e-01 -1.20023586e-01 -4.26208675e-02  1.16930656e-01
  -2.67171800e-01  2.11749911e-01  5.43847606e-02  1.82864380e+00
   3.54630172e-01  1.00964427e-01 -2.05303475e-01 -3.62512618e-01
   1.09685026e-01 -1.58175969e+00  8.66881907e-02  5.97256720e-02
   1.75857514e-01 -2.68373072e-01  7.62155831e-01 -1.60100520e-01
  -1.66634709e-01 -3.17256659e-01 -6.55568242e-01  5.0549626

* Let's embed dummy text

In [17]:
input_search = "red tshirt"

# tokenize input text
processed_text = processor(
    text=input_search,
    images=None,
    return_tensors="tf",
    padding=True,
    truncation=True
)

print(processed_text["input_ids"][0].numpy())

# embed tokens
text_embeddings = model.get_text_features(**processed_text)

print(text_embeddings.numpy().shape)

# get embeddings
# text_embedding = tf.reduce_mean(text_embeddings.last_hidden_state, axis=1).numpy().tolist()

# text_embedding

[49406   736 14907 49407]
(1, 512)


* let's embed all images

In [79]:
images_embeddings = list()

# loop over all images in folder
for idx, img in tqdm(
    iterable=enumerate(all_images[:200]), # take only 200 images for the sake of simpilicity
    desc="embedding all images",
    total=len(all_images[:200]), 

):
    # load image from disk
    image = Image.open(
        fp=img,
        mode="r"
    )

    # process image image
    processed_image = processor(
    images=image,
    text=None,
    return_tensors="tf"
    )["pixel_values"]

    # embed image (latent vectors)
    image_embedding = np.squeeze(model.get_image_features(processed_image).numpy()) # convert from eager tensor to numpy

    # append to list
    images_embeddings.append({"id":idx, "embedding":image_embedding, "dir":{"dir":img}})

embedding all images: 100%|██████████| 200/200 [00:40<00:00,  4.89it/s]


In [80]:
# convert result as dataframe
df = pd.DataFrame.from_dict(images_embeddings)
df

Unnamed: 0,id,embedding,dir
0,0,"[0.09056808, 0.03274309, 0.029826514, 0.344645...",{'dir': '../data/raw/02532_00.jpg'}
1,1,"[0.052449428, 0.19420756, 0.124062635, 0.01250...",{'dir': '../data/raw/04718_00.jpg'}
2,2,"[0.09634906, -0.25596958, -0.27667934, 0.22772...",{'dir': '../data/raw/05638_00.jpg'}
3,3,"[0.18320513, 0.13055275, 0.051832553, 0.377668...",{'dir': '../data/raw/12824_00.jpg'}
4,4,"[0.0447726, 0.042324282, 0.1539196, -0.1050425...",{'dir': '../data/raw/00906_00.jpg'}
...,...,...,...
195,195,"[-0.090003386, -0.07768246, 0.07326353, 0.3402...",{'dir': '../data/raw/08903_00.jpg'}
196,196,"[0.06347422, 0.29872894, -0.08681728, 0.093796...",{'dir': '../data/raw/07235_00.jpg'}
197,197,"[-0.0038751531, 0.090236224, 0.024033051, 0.08...",{'dir': '../data/raw/02948_00.jpg'}
198,198,"[0.13124095, 0.33271495, -0.0060832947, -0.082...",{'dir': '../data/raw/06997_00.jpg'}


* Now we have all of our image embeddings and metadata/payload (dir).
* Let's create vector database so we can search by text and image.

In [81]:
# connect to qdrant
vector_db_client = QdrantClient(
    host="localhost",
    port=6333
)

vector_db_client

<qdrant_client.qdrant_client.QdrantClient at 0x7fb32db22990>

In [82]:
# create a collection to save embeddings
vector_db_client.recreate_collection(
    collection_name="images_embeddings",
    vectors_config=models.VectorParams(
        size=512, # size of embeddings
        distance=models.Distance.COSINE # similarity criteria
    )
)

True

In [84]:
# insert/update (if exist) embeddings to Qdrant
vector_db_client.upsert(
    collection_name="images_embeddings",
    points=models.Batch(
        ids=df["id"],
        vectors=df["embedding"],
        payloads=df["dir"]
    )
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [86]:
# sanity check to make sure that data inserted to collection
vector_db_client.scroll(
    collection_name="images_embeddings",
    limit=1,
    with_payload=True,
    with_vectors=True
)

([Record(id=0, payload={'dir': '../data/raw/02532_00.jpg'}, vector=[0.008358832, 0.0030219697, 0.0027527893, 0.03180849, 0.015134676, 0.03822833, -0.007819164, 0.02501529, 0.080731325, 0.011495365, 0.012717142, 0.00540033, -0.014537298, -0.0017951002, -0.0016104416, 0.015987633, -0.004312588, 0.00014482574, 0.005019323, -0.008057707, -0.04723083, -0.007544858, 0.035712216, -0.003480618, -0.008298881, 0.028563093, -0.02691114, -0.032858647, -0.01939711, -0.06541133, 0.00021452416, 0.0014577757, -0.040764783, -0.011077381, -0.003933623, 0.010791924, -0.024658184, 0.019543111, 0.0050193523, 0.16877168, 0.03273001, 0.009318346, -0.018948149, -0.03345751, 0.0101232, -0.14598592, 0.0080007445, 0.005512283, 0.01623048, -0.024769053, 0.07034192, -0.014776215, -0.015379278, -0.029280685, -0.060504597, 0.04665395, 0.00489241, -0.00425574, -0.00027224957, 0.0510562, 0.00507128, -0.03241646, -0.010778048, -0.0010659652, -0.01672149, 0.013339686, 0.01579125, -0.06801225, -0.030280223, -0.012843705,

* Now let's try to seach by image

In [89]:
# dummy image from dataset
image_query = Image.open(
    fp="../data/raw/14415_00.jpg",
    mode="r"
)

# preprocess image
processed_image = processor(
    images=image_query,
    text=None,
    return_tensors="tf"
)["pixel_values"]

# image embedding
image_embedding = np.squeeze(model.get_image_features(processed_image).numpy())

# search for image
result = vector_db_client.search(
    collection_name="images_embeddings",
    query_vector=image_embedding,
    limit=5
)

* Let's show results

In [105]:
# get the directorys of each image
dirs = [dir.payload["dir"] for dir in result]

# the query image 
dirs.append("../data/raw/14415_00.jpg")

# read images from disk
images = [Image.open(fp=img, mode="r") for img in dirs]

# stack resized images (to be more conveniet to show) 
images = np.hstack([image.resize(size=(128,256)) for image in images])

# show images
images = Image.fromarray(images)
images.show(title="the right most image is the query image")

* Now let's use clip model to search for red t-shirt

In [112]:
text_query = "red t-shirt"

# tokenize input text
processed_text = processor(
    text=input_search,
    images=None,
    return_tensors="tf",
    padding=True,
    truncation=True
)

# embed tokens
text_embedding = model.get_text_features(**processed_text)

text_embedding = np.squeeze(text_embedding.numpy())

# search for image
result = vector_db_client.search(
    collection_name="images_embeddings",
    query_vector=text_embedding,
    limit=5
)

* Let's show the result

In [113]:
# get the directorys of each image
dirs = [dir.payload["dir"] for dir in result]

# read images from disk
images = [Image.open(fp=img, mode="r") for img in dirs]

# stack resized images (to be more conveniet to show) 
images = np.hstack([image.resize(size=(128,256)) for image in images])

# show images
images = Image.fromarray(images)
images.show()

* Okay that was amazing :D
* Now lets build streamlit app