In [30]:
pip install torch torchvision transformers psycopg2


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [31]:
import os
import torch
import torchvision.transforms as transforms
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import psycopg2
# Fix the seed of every PyTorch random number generator used by the notebook:
# the default generator, the current CUDA device, and all CUDA devices.
random_seed = 42
for seed_fn in (torch.manual_seed,
                torch.cuda.manual_seed,
                torch.cuda.manual_seed_all):
    seed_fn(random_seed)


In [32]:
# Pick the compute backend in order of preference: CUDA, then Apple's MPS,
# then plain CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [33]:
# Display the backend string selected in the previous cell.
device

'mps'

In [34]:
# Load the pretrained CLIP checkpoint: the processor prepares inputs, and the
# model (moved to the selected device) produces the embeddings.
model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name)
model = model.to(device)
processor = CLIPProcessor.from_pretrained(model_name)

In [35]:
# Open one sample image from the dataset and show it in the system viewer.
# The dataset folder can be overridden with the IMAGES_DIR environment
# variable; when it is unset, the original local path is used unchanged.
images_dir = os.environ.get(
    "IMAGES_DIR",
    '/Users/ayuranjan/Desktop/Masters/Courses/2nd Quarter /Neural Computation/archive/Images',
)
file_path = os.path.join(images_dir, '667626_18933d713e.jpg')
img = Image.open(file_path)
img.show()

In [36]:
# Preprocess the sample image into PyTorch tensors ("pt"), then move the
# resulting batch to the selected device.
image = processor(text=None, images=img, return_tensors="pt")
image = image.to(device)


In [37]:
# Inspect the container type returned by the processor call.
type(image)

transformers.tokenization_utils_base.BatchEncoding

In [38]:
# Embed the sample image. This is inference only, so run it under no_grad to
# avoid building an autograd graph for the forward pass.
with torch.no_grad():
    image_feature = model.get_image_features(**image)

In [39]:
# Check the shape of the embedding produced for the single sample image.
image_feature.shape

torch.Size([1, 512])

In [41]:
# Embed every image in the dataset folder; the result maps file name to a
# NumPy embedding vector.
# The folder can be overridden with the IMAGES_DIR environment variable; when
# it is unset, the original local path is used unchanged.
folder_path = os.environ.get(
    "IMAGES_DIR",
    '/Users/ayuranjan/Desktop/Masters/Courses/2nd Quarter /Neural Computation/archive/Images',
)
image_embeddings = {}
for filename in os.listdir(folder_path):
    img_path = os.path.join(folder_path, filename)
    try:
        # The context manager closes the file once preprocessing has read the
        # pixels, so handles are not left open across thousands of images.
        with Image.open(img_path) as img:
            image = processor(text = None, images = img,
                              return_tensors ="pt").to(device)
    except OSError as error:
        # Sub-directories, hidden files such as .DS_Store and unreadable
        # images all raise OSError subclasses. Report and skip them instead
        # of aborting the whole run.
        print(f"Skipping {filename!r}: {error}")
        continue
    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        embedding = model.get_image_features(**image)
    # Drop the batch dimension and copy to host memory as a NumPy array.
    image_embeddings[filename] = embedding.squeeze().cpu().numpy()

In [42]:
# Number of images for which an embedding was stored.
len(image_embeddings)

8091

In [None]:
# Confirm the stored value for one known file name is a NumPy array.
type(image_embeddings['667626_18933d713e.jpg'])

numpy.ndarray

In [45]:
# Take the first (file name, embedding) pair of the dictionary as a sample
# for the similarity queries below.
first_key, first_value = next(iter(image_embeddings.items()))

In [44]:
# Display the sample embedding vector.
first_value

array([-6.55570403e-02, -2.01625675e-01, -8.37388635e-03,  3.21593285e-01,
       -3.01299602e-01,  3.84506285e-01,  3.42996180e-01, -1.82476789e-01,
        6.15324199e-01, -1.68234557e-01, -3.41989875e-01,  3.19208026e-01,
        4.30820957e-02,  3.60704102e-02, -1.61157995e-02, -2.94192910e-01,
        3.09876382e-01,  1.05087310e-02, -1.75546035e-01,  3.47654298e-02,
       -1.23044515e+00, -2.39298612e-01, -3.57094288e-01,  1.14223763e-01,
       -5.75476766e-01, -2.05325752e-01,  1.03103325e-01,  1.36139110e-01,
       -3.18704069e-01, -3.64342272e-01,  4.35933471e-02,  4.80959304e-02,
        1.25669420e-01, -4.57241088e-02,  2.99188673e-01, -5.48362732e-03,
        7.11456090e-02,  2.84528047e-01, -2.24127769e-01,  9.83273804e-01,
        2.04462871e-01, -1.71565294e-01,  1.76569030e-01,  1.99220970e-01,
        3.24219823e-01,  1.44480228e-01,  3.93840909e-01,  3.68289798e-01,
       -2.69859850e-01, -1.28556907e-01, -8.79612416e-02,  2.72774786e-01,
        7.53060579e-02,  

In [47]:
# Display the file name that the sample embedding belongs to.
first_key 

'2387197355_237f6f41ee.jpg'

In [None]:

# Schema setup for the embedding store. Both statements use IF NOT EXISTS,
# so re-running this cell is harmless.
# Prerequisites:
#   * psycopg2 is installed
#   * a PostgreSQL database named "embeddings" already exists
#     (create it with: CREATE DATABASE embeddings;)
# The connection settings below (user, password, host, port, database) match
# the author's machine; edit them to match your own installation.


import psycopg2
from psycopg2 import Error

# Bind both names before the try block so the finally clause cannot raise a
# NameError (and hide the real error) when connect() itself fails.
connection = None
cursor = None
try:
    # Connect to the "embeddings" database.
    connection = psycopg2.connect(user = "ayuranjan",
                                    password = "",
                                    host = "localhost",
                                    port = "5432",
                                    database = "embeddings")
    cursor = connection.cursor()
    # Enable the "vector" extension that provides the vector column type.
    cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
    print("Extension created successfully")
    connection.commit()
    # One row per image; the 512-wide vector column matches the CLIP
    # embedding size.
    cursor.execute("CREATE TABLE IF NOT EXISTS clip (image_name VARCHAR(255) PRIMARY KEY, embedding vector(512))")
    # TODO: an analogous "dino" table was planned but is disabled because the
    # embedding dimension is not known yet:
    ## cursor.execute("CREATE TABLE IF NOT EXISTS dino (image_name VARCHAR(255) PRIMARY KEY, embedding vector(512))")
    print("Table created successfully")
    connection.commit()

except (Exception, psycopg2.Error) as error :
    # Note: this handler sees every failure in the block above, not only
    # connection errors.
    print ("Error while connecting to PostgreSQL", error)
finally:
    # Close only what was actually opened.
    if cursor is not None:
        cursor.close()
    if connection is not None:
        connection.close()
        print("PostgreSQL connection is closed")


Extension created successfully
Table created successfully
PostgreSQL connection is closed


In [18]:
# Insert (or refresh) every image embedding in the clip table.

# Bind both names before the try block so the finally clause cannot raise a
# NameError (and hide the real error) when connect() itself fails.
connection = None
cursor = None
try:
    connection = psycopg2.connect(user = "ayuranjan",
                                    password = "",
                                    host = "localhost",
                                    port = "5432",
                                    database = "embeddings")
    cursor = connection.cursor()
    # Upsert: image_name is the primary key, so a plain INSERT would fail with
    # a unique violation the second time this cell is run.
    upsert_sql = (
        "INSERT INTO clip (image_name, embedding) VALUES (%s, %s::vector) "
        "ON CONFLICT (image_name) DO UPDATE SET embedding = EXCLUDED.embedding"
    )
    for key, value in image_embeddings.items():
        # psycopg2 sends the Python list as a SQL array; ::vector casts it.
        cursor.execute(upsert_sql, (key, value.tolist()))
    connection.commit()
    # The printed number is the total row count of the table after the commit.
    cursor.execute("SELECT COUNT(*) FROM clip")
    record = cursor.fetchone()
    print(str(record[0]) + " Record inserted successfully into table")

except (Exception, psycopg2.Error) as error :
    # Note: this handler sees every failure in the block above, not only
    # connection errors.
    print ("Error while connecting to PostgreSQL", error)
finally:
    # Close only what was actually opened.
    if cursor is not None:
        cursor.close()
    if connection is not None:
        connection.close()
        print("PostgreSQL connection is closed")

            


Error while connecting to PostgreSQL can only concatenate tuple (not "str") to tuple
PostgreSQL connection is closed


In [55]:
# Query the clip table for the 10 stored images closest to the sample
# embedding. `<->` is pgvector's Euclidean (L2) distance operator, so rows
# come back nearest-first.

# Bind both names before the try block so the finally clause cannot raise a
# NameError (and hide the real error) when connect() itself fails.
connection = None
cursor = None
try:
    connection = psycopg2.connect(user = "ayuranjan",
                                    password = "",
                                    host = "localhost",
                                    port = "5432",
                                    database = "embeddings")
    cursor = connection.cursor()
    # The embedding is sent as a SQL array and cast to vector for comparison.
    cursor.execute("SELECT image_name FROM clip ORDER BY embedding <-> %s::vector  LIMIT 10;", (first_value.tolist(),))
    record = cursor.fetchall()
    print("Top 10 similar images are : ")
    for row in record:
        print(row[0])

except (Exception, psycopg2.Error) as error :
    # Note: this handler sees every failure in the block above, not only
    # connection errors.
    print ("Error while connecting to PostgreSQL", error)
finally:
    # Close only what was actually opened.
    if cursor is not None:
        cursor.close()
    if connection is not None:
        connection.close()
        print("PostgreSQL connection is closed")

Top 10 similar images are : 
2387197355_237f6f41ee.jpg
3046431231_dc48851062.jpg
3353950389_1153d5e452.jpg
3017203816_5dc2a6b392.jpg
3046430047_d7b10123d0.jpg
2623939135_0cd02ffa5d.jpg
2226534154_cbcab7ba32.jpg
374103776_0de490c1b0.jpg
3016726158_4d15b83b06.jpg
3469585782_e708496552.jpg
PostgreSQL connection is closed


In [56]:
# For every embedded image, look up its nearest neighbours in the clip table
# and collect their file names in `similar_images`:
#   key   -> image_name of the query image
#   value -> list of the 10 closest image_names, nearest first
# An image whose own row is in the table will normally appear in its own list.
# NOTE(review): the original notes asked for the top 5 by cosine similarity,
# but the query uses LIMIT 10 and `<->`, which is pgvector's L2 distance
# (cosine distance is `<=>`). Confirm which was intended before relying on
# the ranking.

import psycopg2
from psycopg2 import Error
similar_images = {}

# Bind both names before the try block so the finally clause cannot raise a
# NameError (and hide the real error) when connect() itself fails.
connection = None
cursor = None
try:
    connection = psycopg2.connect(user = "ayuranjan",
                                    password = "",
                                    host = "localhost",
                                    port = "5432",
                                    database = "embeddings")
    cursor = connection.cursor()
    for key, value in image_embeddings.items():
        # One nearest-neighbour query per image, reusing the same cursor.
        cursor.execute("SELECT image_name FROM clip ORDER BY embedding <-> %s::vector  LIMIT 10;", (value.tolist(),))
        records = cursor.fetchall()
        similar_images[key] = [row[0] for row in records]

except (Exception, psycopg2.Error) as error :
    # Note: this handler sees every failure in the block above, not only
    # connection errors. Entries collected before the failure stay in
    # similar_images.
    print ("Error while connecting to PostgreSQL", error)
finally:
    # Close only what was actually opened.
    if cursor is not None:
        cursor.close()
    if connection is not None:
        connection.close()
        print("PostgreSQL connection is closed")

PostgreSQL connection is closed


In [57]:
# Preview a few entries instead of printing the neighbour list of every
# image, which floods the notebook output (one line per image).
# Raise PREVIEW_COUNT to inspect more entries.
PREVIEW_COUNT = 5
for key, value in list(similar_images.items())[:PREVIEW_COUNT]:
    print(key, value)

2387197355_237f6f41ee.jpg ['2387197355_237f6f41ee.jpg', '3046431231_dc48851062.jpg', '3353950389_1153d5e452.jpg', '3017203816_5dc2a6b392.jpg', '3046430047_d7b10123d0.jpg', '2623939135_0cd02ffa5d.jpg', '2226534154_cbcab7ba32.jpg', '374103776_0de490c1b0.jpg', '3016726158_4d15b83b06.jpg', '3469585782_e708496552.jpg']
2609847254_0ec40c1cce.jpg ['2609847254_0ec40c1cce.jpg', '2435166927_28b8130660.jpg', '1316247213_1d2c726dd5.jpg', '3534824784_7133119316.jpg', '2548777800_d7b9cf1c2b.jpg', '3595412126_4020d4643b.jpg', '824782868_a8f532f3a6.jpg', '3243588540_b418ac7eda.jpg', '3056530884_27766059bc.jpg', '1248734482_3038218f3b.jpg']
2046222127_a6f300e202.jpg ['2046222127_a6f300e202.jpg', '2162469360_ff777edc95.jpg', '3332136681_9aecf101fd.jpg', '1398873613_7e3174dd6c.jpg', '2402793046_3385554e81.jpg', '3332467180_d72f9b067d.jpg', '2831314869_5025300133.jpg', '3504881781_6a842e043b.jpg', '2394857899_76bfdf720b.jpg', '2836808985_b26e4ca09e.jpg']
2853743795_e90ebc669d.jpg ['2853743795_e90ebc669d.j