In [1]:
# mount gdrive
# Colab-only step: exposes Google Drive under /content/drive, which is where
# the label, embedding and index files loaded below are read from.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# import libraries
import os
import glob
import cv2
import numpy as np
import pickle
import time
from tqdm import tqdm
from scipy.spatial.distance import cosine
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

In [3]:
# load files
# Every artefact lives in the same Drive folder; edit DATA_DIR to relocate them.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/Information Retrieval CSE508/Project Final'

# labels of the indexed (training) images, used to judge retrieved results
train_labels = np.loadtxt(os.path.join(DATA_DIR, 'labels_train_hybrid.csv'),delimiter=',')
# check query performance on seen data
# (same file as train_labels, so copy the parsed array instead of re-reading the CSV)
test_labels_1 = train_labels.copy()
# check query performance on unseen data
test_labels_2 = np.loadtxt(os.path.join(DATA_DIR, 'labels_test_hybrid.csv'),delimiter=',')

# NOTE: pickle.load executes arbitrary code embedded in the file; only load an
# index file that you produced yourself.
with open(os.path.join(DATA_DIR, 'clustering_index_hybrid.pkl'), 'rb') as file:
  index=pickle.load(file)

# load seen embeddings
test_embeddings_1 = np.loadtxt(os.path.join(DATA_DIR, 'embeddings_train_hybrid.csv'),delimiter=',')
# load unseen embeddings
test_embeddings_2 = np.loadtxt(os.path.join(DATA_DIR, 'embeddings_test_hybrid.csv'),delimiter=',')

In [3]:
# helper functions     
def retrieveResults(query_embedding):
  '''
    Rank the images filed under the index term closest to the query.

    Each index entry is read as (term embedding, posting list) and each
    posting as (image id, image embedding). The term is picked by the
    smallest cosine distance to the flattened query; its postings are then
    ordered by the norm of (posting embedding - query), nearest first.

    query_embedding : embedding of the query image (must expose reshape())
    returns         : list of image ids, closest first
  '''
  # cosine distance from the query to every term of the global index
  term_distances = [
      cosine(query_embedding.reshape(-1), index[term][0].reshape(-1))
      for term in range(len(index))
  ]
  nearest_term = np.argmin(term_distances)

  # score each posting of the winning term against the query
  scored_postings = [
      (posting[0], np.linalg.norm(posting[1] - query_embedding))
      for posting in index[nearest_term][1]
  ]

  # sorted() is stable: equally distant postings keep their posting-list order
  ranked_postings = sorted(scored_postings, key=lambda scored: scored[1])
  return [image_id for image_id, _ in ranked_postings]


def calculateAveragePrecision(query_image_label,retrieved_image_ids):
  '''
    Calculates the average precision of a ranked retrieval result for one
    query image.

    query_image_label  : label of the query image
    retrieved_image_ids: image ids in ranked order (best first); ids are
                         1-based, so id k is looked up at train_labels[k-1]
    returns            : mean of precision@k over the ranks k whose retrieved
                         image carries the query label, or 0.0 if no
                         retrieved image matches (including an empty result)

    NOTE(review): assumes one scalar label per image, i.e. train_labels is
    1-D - confirm against the labels CSV.
  '''
  # labels of the retrieved images, in ranked order
  pred_labels = np.array([train_labels[int(idx)-1] for idx in retrieved_image_ids])
  pred = pred_labels == query_image_label

  # no relevant image retrieved (this also covers an empty result list)
  if not np.any(pred):
    return 0.0

  # precision@k for every rank in a single pass: running hit count / rank
  ranks = np.arange(1, pred.shape[0] + 1)
  precision_at_k = np.cumsum(pred) / ranks

  # average precision only over the ranks that hold a relevant image
  return np.sum(precision_at_k[pred]) / np.count_nonzero(pred)

def calculateMeanAveragePrecision(test_embeddings,test_labels):
  '''
    Mean average precision over every query row of test_embeddings.

    test_embeddings : 2-D array with one query per row; the first column is
                      skipped and the remaining columns are used as the
                      query embedding
    test_labels     : label of query i at position i
    returns         : np.mean of the per-query average precisions
  '''
  average_precisions = []
  for row in tqdm(range(test_embeddings.shape[0])):
    query_label = test_labels[row]
    # column 0 is not part of the embedding, so drop it before querying
    query_vector = test_embeddings[row][1:]
    ranked_ids = retrieveResults(query_vector)
    average_precisions.append(calculateAveragePrecision(query_label, ranked_ids))
  return np.mean(average_precisions)

In [None]:
# calculate MAP on the test datasets
# Distinct result names keep both scores available afterwards and avoid
# rebinding the builtin map() at notebook scope.
map_seen = calculateMeanAveragePrecision(test_embeddings_1,test_labels_1)
print('\nMean Average Precision for the seen dataset is : {:.4f}'.format(map_seen))
map_unseen = calculateMeanAveragePrecision(test_embeddings_2,test_labels_2)
print('\nMean Average Precision for the unseen dataset is : {:.4f}'.format(map_unseen))

100%|██████████| 49984/49984 [14:13<00:00, 58.59it/s]



Mean Average Precision for the seen dataset is : 0.9707


100%|██████████| 9984/9984 [02:54<00:00, 57.25it/s]


Mean Average Precision for the unseen dataset is : 0.9542





In [4]:
# calculate time taken to fetch a set of query images given their embeddings
# `time` comes from the imports cell. perf_counter() is a monotonic,
# high-resolution clock, which suits millisecond-scale intervals better than
# the wall-clock time.time().
duration = []
for embedding in test_embeddings_2:
  start = time.perf_counter()
  # column 0 is not part of the embedding, so drop it before querying
  results = retrieveResults(embedding[1:])
  end = time.perf_counter()
  duration.append(end-start)
print('Average time taken per query  : {:.5f} ms'.format(1000*np.mean(duration)))

Average time taken per query  : 7.83490 ms
