## Retrieving feature vectors from featureVectorDB

In [40]:
import numpy as np
import pandas as pd
import sqlite3 as db
import tqdm
import shutil
import os

In [2]:
import matplotlib.pyplot as plt
from PIL import Image
import warnings
warnings.filterwarnings('ignore')

In [3]:
#Connecting to db
db_filepath = './artifacts/featurevectorDB.db'

# sqlite3.connect() creates a new, empty database when the file is missing,
# which would only show up later as a confusing "no such table" error.
# Fail early with a clear message instead.
if not os.path.isfile(db_filepath):
    raise FileNotFoundError(f"Feature vector database not found: {db_filepath}")

# Shared connection and cursor used by the cells below
conn = db.connect(db_filepath)
cursor = conn.cursor()

In [4]:
def retrieve_feature_vector(img_name):
    """Look up the stored feature vector for ``img_name``.

    Runs a parameterized query against the ``ImageFeatures`` table through
    the module-level ``cursor`` and decodes the second column of the first
    matching row as a ``float32`` buffer.

    Returns:
        A NumPy ``float32`` array built from the stored bytes, or ``None``
        when no row matches ``img_name``.
    """
    row = cursor.execute(
        "SELECT * FROM ImageFeatures WHERE img_name = ?", (img_name, )
    ).fetchone()

    # No matching image: signal it with None rather than raising
    if row is None:
        return None

    # Column index 1 holds the raw bytes of the vector
    return np.frombuffer(row[1], dtype='float32')

In [5]:
# Collect the img_name of every row in ImageFeatures (one 1-tuple per row)
img_names = cursor.execute('SELECT img_name FROM ImageFeatures').fetchall()
len(img_names)

34149

In [7]:
# Map every image name to its stored feature vector. The mapping is built
# here from the names fetched above so the notebook can be re-run from top
# to bottom without a NameError on img_feature_dict.
# NOTE(review): assumes the original mapping was built this way -- confirm.
img_feature_dict = {name: retrieve_feature_vector(name) for (name,) in img_names}

# One row per image: its name and its feature vector
image_feature_df = pd.DataFrame(list(img_feature_dict.items()), columns=['image_name', 'feature_vector'])

In [62]:
# Display the feature vector of the first image as a float64 array
first_vector = retrieve_feature_vector(img_names[0][0])
first_vector.astype(np.float64)

array([0.89598829, 0.        , 0.        , ..., 0.        , 0.85478342,
       0.02093017])

## Generate similarity scores

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
def calculate_similarityscore(image_id, fv_query):
    """Return the cosine similarity between a stored image and a query vector.

    Args:
        image_id: Image name passed to ``retrieve_feature_vector``.
        fv_query: Feature vector to compare against the stored one.

    Returns:
        The single cosine-similarity score between the two vectors.

    Raises:
        ValueError: If no feature vector is stored for ``image_id``.
    """
    fv_image = retrieve_feature_vector(image_id)

    # A missing image comes back as None; report it explicitly instead of
    # letting it fail inside cosine_similarity with an unrelated message.
    if fv_image is None:
        raise ValueError(f"No feature vector found for image {image_id!r}")

    # Both vectors are wrapped as single-row inputs, so the result is a 1x1
    # matrix; flatten it and take the only entry.
    score = cosine_similarity([fv_image],[fv_query]).flatten()[0]
    return score

In [25]:
# Pairwise cosine similarity between all feature vectors: entry [i, j]
# compares row i with row j of image_feature_df
feature_vectors = image_feature_df['feature_vector'].tolist()
cosine_sim = cosine_similarity(feature_vectors, feature_vectors)

In [61]:
# For every image, collect the names of its 30 most similar images.
# Rows are accumulated in a list and turned into a DataFrame once:
# DataFrame.append copied the whole frame on every call and was removed
# in pandas 2.0.
image_names = image_feature_df['image_name'].to_numpy()
top_k = 30

rows = []
# Wrapping the sequence itself (rather than enumerate) lets tqdm show a total
for i, image_name in enumerate(tqdm.tqdm(image_names)):
    # Indices of all images, ordered from most to least similar to image i
    ranked = (-cosine_sim[i]).argsort()

    # Remove the image's own index explicitly rather than assuming it sorts
    # first; an identical vector elsewhere could otherwise take slot 0 and
    # leave the image in its own result list.
    ranked = ranked[ranked != i][:top_k]

    rows.append({'image_name': image_name, 'top_similar_images': image_names[ranked].tolist()})

result_df = pd.DataFrame(rows, columns=['image_name', 'top_similar_images'])

34149it [02:13, 256.22it/s]


In [62]:
# Recreate the SimilarityResults table from scratch.
# img_name is the PRIMARY KEY so that "INSERT OR REPLACE" really replaces an
# existing row for the same image; without a uniqueness constraint it would
# just add duplicates.
cursor.execute("DROP TABLE IF EXISTS SimilarityResults")
cursor.execute('''
                CREATE TABLE IF NOT EXISTS SimilarityResults (
                img_name TEXT PRIMARY KEY,
                similar_images TEXT
                )
                ''')

<sqlite3.Cursor at 0x147eac3b0>

In [63]:
# Store each list of similar image names as one comma-separated string
result_df['similar_images_joined'] = result_df['top_similar_images'].apply(lambda x: ','.join(x))

# Insert all rows in a single batch instead of one execute() per row
cursor.executemany(
    "INSERT OR REPLACE INTO SimilarityResults (img_name, similar_images) VALUES (?, ?)",
    zip(result_df['image_name'], result_df['similar_images_joined']),
)

# Without an explicit commit the inserted rows stay in an open transaction
# and are lost when the connection is closed.
conn.commit()

In [64]:
# Read back the stored comma-separated neighbours for query_image
similar_images = cursor.execute(
    "select similar_images from SimilarityResults where img_name =?",
    (query_image,),
).fetchall()

In [65]:
# The query selects a single column, so the first row is a 1-tuple holding
# the comma-separated names; split it back into a list
(joined_names,) = similar_images[0]
joined_names.split(",")

['n015-2018-11-21-19-11-29+0800__CAM_FRONT__1542798859112460.jpg',
 'n015-2018-11-21-19-38-26+0800__CAM_FRONT__1542800379862460.jpg',
 'n015-2018-11-21-19-38-26+0800__CAM_FRONT__1542800389412460.jpg',
 'n015-2018-11-21-19-38-26+0800__CAM_FRONT__1542800326412460.jpg',
 'n015-2018-11-21-19-38-26+0800__CAM_FRONT__1542800377412460.jpg',
 'n015-2018-11-21-19-21-35+0800__CAM_FRONT__1542799668162460.jpg',
 'n015-2018-11-21-19-11-29+0800__CAM_FRONT__1542798858162460.jpg',
 'n015-2018-11-21-19-38-26+0800__CAM_FRONT__1542800334912460.jpg',
 'n015-2018-11-21-19-38-26+0800__CAM_FRONT__1542800309912460.jpg',
 'n015-2018-11-21-19-38-26+0800__CAM_FRONT__1542800384912460.jpg',
 'n015-2018-11-21-19-38-26+0800__CAM_FRONT__1542800447412460.jpg',
 'n015-2018-11-21-19-38-26+0800__CAM_FRONT__1542800307912460.jpg',
 'n015-2018-11-14-19-09-14+0800__CAM_FRONT__1542194084912460.jpg',
 'n015-2018-11-14-19-09-14+0800__CAM_FRONT__1542194085412460.jpg',
 'n015-2018-11-14-19-09-14+0800__CAM_FRONT__1542194061912460.j

In [42]:
# Image whose nearest neighbours are looked up and copied
query_image = "n015-2018-11-21-19-38-26+0800__CAM_FRONT__1542800380362460.jpg"

# Folder the similar images are copied from
image_directory = "data/input_data/v1.0-trainval/samples/CAM_FRONT"

# Folder that is emptied and then filled with the similar images
similar_images_directory = "data/output_data/similar_images"

In [43]:
# Same path as image_directory: join() was called with a single component
image_dir = os.fspath(image_directory)

In [35]:
def empty_directory(directory_path):
    """Delete everything inside ``directory_path``, keeping the directory itself.

    Real sub-directories are removed recursively. Every other entry (regular
    files and symbolic links, including links to directories and dangling
    links) is unlinked; link targets are never touched.

    Args:
        directory_path: Path of the directory to empty.

    Raises:
        FileNotFoundError: If ``directory_path`` does not exist.
    """
    # Snapshot the entries first so nothing is deleted while the directory
    # is still being scanned.
    with os.scandir(directory_path) as scan:
        entries = list(scan)

    for entry in entries:
        # follow_symlinks=False keeps a symlink-to-directory out of rmtree,
        # which refuses to operate on symbolic links.
        if entry.is_dir(follow_symlinks=False):
            shutil.rmtree(entry.path)
        else:
            os.remove(entry.path)

In [45]:
def get_the_similar_images(query_image):
    """Copy the images most similar to ``query_image`` into the output folder.

    Looks up the ``top_similar_images`` list for ``query_image`` in
    ``result_df``, empties ``similar_images_directory`` and copies each
    listed file from ``image_directory`` into it.

    Args:
        query_image: Value of the ``image_name`` column to look up.

    Raises:
        KeyError: If ``result_df`` has no row for ``query_image``.
    """
    matches = result_df.loc[result_df['image_name'] == query_image, 'top_similar_images']

    # Look the image up before touching the output folder, so an unknown
    # name does not wipe the previous results.
    if matches.empty:
        raise KeyError(f"No similarity results stored for image {query_image!r}")

    # Use positional access: the filtered Series keeps the original row
    # label, so [0] only worked when the match was the very first row.
    similar_images = matches.iloc[0]

    # Make sure the destination is a directory; otherwise copy2 would create
    # a plain file with that name.
    os.makedirs(similar_images_directory, exist_ok=True)
    empty_directory(similar_images_directory)

    for similar_image in similar_images:
        image_path = os.path.join(image_directory, similar_image)
        shutil.copy2(image_path, similar_images_directory)

In [46]:
# Refresh the output folder with the neighbours of the chosen query image
get_the_similar_images(query_image=query_image)

In [47]:
# Write the results table to CSV without the index column
csv_path = "./data/output_data/top_similar_images.csv"
result_df.to_csv(csv_path, index=False)

In [None]:
# Store the results in the 'similar_images' table of the notebook's database.
# Uses the sqlite connection `conn` opened earlier instead of the undefined
# `your_db_connection` placeholder.
# sqlite cannot bind Python lists, so each list of names is flattened to a
# comma-separated string before writing.
export_df = pd.DataFrame({
    'image_name': result_df['image_name'],
    'top_similar_images': result_df['top_similar_images'].apply(','.join),
})
export_df.to_sql('similar_images', con=conn, if_exists='replace', index=False)