In [23]:
from sklearn.model_selection import train_test_split
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import cv2
import imagehash
import json
from PIL import Image
    
def hamming_distance(s1, s2):
    """Count the positions at which two equal-length sequences differ.

    Args:
        s1: First sequence (the error message suggests strings are expected,
            but anything supporting ``len`` and iteration works).
        s2: Second sequence; must have the same length as ``s1``.

    Returns:
        The number of index positions whose paired elements are unequal.

    Raises:
        ValueError: If the two inputs have different lengths.
    """
    if len(s1) == len(s2):
        # One flag per aligned pair; summing the flags counts the mismatches.
        mismatch_flags = [left != right for left, right in zip(s1, s2)]
        return sum(mismatch_flags)

    raise ValueError("Strings must be of equal length.")





In [43]:
def read_json(json_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The deserialized document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # JSON interchange is UTF-8; pin the encoding so the result does not
    # depend on the platform's locale default.
    with open(json_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def recommend_5(query_path, data_json, distance_measure="hamming", exclude_query=False):
    """Return the five entries of ``data_json`` closest to a query entry.

    Args:
        query_path: Key in ``data_json`` identifying the query item.
        data_json: Mapping from item key to its stored feature. With
            ``"hamming"`` the features are handed to ``hamming_distance``;
            with ``"l2"`` they are converted via ``np.array`` and subtracted,
            so they must be numeric and shape-compatible.
        distance_measure: Either ``"hamming"`` or ``"l2"``.
        exclude_query: If true, the query item itself is left out of the
            candidates. The default (``False``) keeps the historical
            behaviour, where the query is ranked too and, having distance 0
            to itself, normally appears among the results.

    Returns:
        A list of at most five ``(key, distance)`` tuples in ascending order
        of distance. Ties keep the iteration order of ``data_json`` because
        ``sorted`` is stable.

    Raises:
        ValueError: If ``distance_measure`` is not a supported name
            (previously this silently produced an empty result).
        KeyError: If ``query_path`` is not a key of ``data_json``.
    """
    # Resolve the metric and the query feature once, outside the ranking loop.
    if distance_measure == "hamming":
        query_feature = data_json[query_path]

        def distance_to(feature):
            return hamming_distance(query_feature, feature)
    elif distance_measure == "l2":
        query_vector = np.array(data_json[query_path])

        def distance_to(feature):
            return np.linalg.norm(query_vector - np.array(feature))
    else:
        raise ValueError(
            f"Unsupported distance_measure {distance_measure!r}; "
            "expected 'hamming' or 'l2'."
        )

    distances = {
        key: distance_to(feature)
        for key, feature in data_json.items()
        if not (exclude_query and key == query_path)
    }

    top_5_recommend = sorted(distances.items(), key=lambda item: item[1])[:5]
    return top_5_recommend

def precision_at_K(query_image_path, top_5_recommend):
    """Fraction of recommended items that share the query's category.

    A path's category is its second-to-last ``'/'``-separated component,
    i.e. the name of the directory that directly contains the file.

    Args:
        query_image_path: Path of the query item, shaped like
            ``.../<category>/<file>``.
        top_5_recommend: Sequence of entries whose first element is a path of
            the same shape (e.g. the ``(path, distance)`` tuples produced by
            ``recommend_5``). Any length is accepted, not only five.

    Returns:
        ``matches / len(top_5_recommend)`` as a float, or ``0.0`` when the
        recommendation list is empty.
    """
    if len(top_5_recommend) == 0:
        # No recommendations means no hits; also avoids dividing by zero.
        return 0.0

    query_category = query_image_path.split('/')[-2]

    # K is the number of recommendations actually supplied. The previous
    # hard-coded 5 made the comparison fail on shorter lists.
    true_positives = sum(
        entry[0].split('/')[-2] == query_category for entry in top_5_recommend
    )

    return true_positives / len(top_5_recommend)


def evaluate(data_json, distance="hamming"):
    """Average the top-5 category precision over every item used as a query.

    Each key of ``data_json`` is used as a query in turn: ``recommend_5``
    ranks the items for it and ``precision_at_K`` scores the result.

    Args:
        data_json: Mapping from item path to its stored feature.
        distance: Distance measure name forwarded to ``recommend_5``.

    Returns:
        The mean of the per-query precision values.

    Raises:
        ValueError: If ``data_json`` is empty (previously this surfaced as a
            bare ``ZeroDivisionError``).

    NOTE(review): every query is itself a key of ``data_json`` and
    ``recommend_5`` ranks all keys, so a query can show up among its own
    recommendations (distance 0) and count as a hit, which inflates the
    score. Confirm whether that is intended.
    """
    if not data_json:
        raise ValueError("data_json is empty; there is nothing to evaluate.")

    total_precision = 0
    for query_key in data_json:
        top_5 = recommend_5(query_key, data_json, distance)
        total_precision += precision_at_K(query_key, top_5)

    return total_precision / len(data_json)


In [46]:
# Score the features stored in latent_features_dhash.json using Hamming distance.
# NOTE(review): the path is absolute and machine-specific; consider deriving it
# from a configurable data directory.
data_dhash = read_json(
    '/space/hotel/hieud/mlflow_aisia/latent_features_dhash.json'
)
print(evaluate(data_json=data_dhash, distance="hamming"))

0.4416666666666689


In [45]:
# Score the features stored in latent_features_tsne.json using L2 distance.
# NOTE(review): the path is absolute and machine-specific; consider deriving it
# from a configurable data directory.
data_tsne = read_json(
    '/space/hotel/hieud/mlflow_aisia/latent_features_tsne.json'
)
print(evaluate(data_json=data_tsne, distance="l2"))

0.43049450549450896
