In [1]:
# Imports
import os

from utils.extractors import Encoder
from utils.evaluator import Evaluator

from sklearn_extra.cluster import KMedoids
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
import pandas as pd

import pickle

import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.cm as cm
%matplotlib inline

In [2]:
# Directory holding the images that are searched for near-duplicates.
IMAGESDIR = "./data/near_duplicates/"
# List the directory once so the printed count and `nb_images` always agree.
nb_images = len(os.listdir(IMAGESDIR))
print("Nombres d'images: {0}".format(nb_images))

Nombres d'images: 240


## Load computed clusters

In [3]:
# Load the precomputed cluster centroids; the context manager closes the file handle.
# NOTE: pickle can execute arbitrary code while loading -- only open trusted files.
with open("./clusters/centroids.pkl", "rb") as centroids_file:
    centroids = pickle.load(centroids_file)

In [4]:
# Load the precomputed cluster label of every image; the context manager closes the file handle.
# NOTE: pickle can execute arbitrary code while loading -- only open trusted files.
with open("./clusters/labels.pkl", "rb") as labels_file:
    labels = pickle.load(labels_file)

## Find closest cluster

#### Get closest cluster

In [5]:
def get_closest_cluster(input_path, centroids):
    """Return the index of the centroid most similar to an image.

    The image at `input_path` is encoded with ``Encoder(encoding="mobilenet")``
    and compared to every row of `centroids` by cosine similarity.

    Parameters
    ----------
    input_path : str
        Path of the query image.
    centroids : array-like with a ``shape`` attribute
        One centroid per entry along the first axis.

    Returns
    -------
    int
        Position of the best-scoring centroid. Ties keep the earliest
        centroid, and 0 is returned when no score exceeds -1.0.
    """
    query_features = Encoder(encoding="mobilenet")(input_path)
    best_score, best_index = -1.0, 0
    for idx in range(centroids.shape[0]):
        candidate = centroids[idx].reshape(1, -1)
        score = cosine_similarity(query_features, candidate)
        # Strict comparison: a later centroid only wins with a higher score.
        if score > best_score:
            best_score, best_index = score, idx
    return best_index

In [6]:
%%time
# Pick the query image and look up its closest cluster.
# `input_path` and `clust` are notebook-level names read again by later cells.
input_path = IMAGESDIR + '00076.jpg'
clust = get_closest_cluster(input_path, centroids)
print(f"The closest cluster is {clust}.")

2020-12-06 02:28:57,124: INFO Initialized: MobileNet pretrained on ImageNet dataset sliced at last conv layer and added GlobalAveragePooling
The closest cluster is 0.
Wall time: 6.5 s


In [7]:
def eval_clustering(n_clusters=5, samples_per_cluster=5):
    """Estimate how often an image's known duplicates share its cluster.

    Cluster labels are read from ``./clusters/labels.pkl`` and the ground
    truth from the ``duplicates`` column of ``./data/out.csv``. Each cell of
    that column is a whitespace-separated list of image indexes.

    For every cluster, `samples_per_cluster` member images are drawn at
    random (with replacement, so an image may be drawn twice). The score of
    an image is the fraction of its listed duplicates that carry the same
    cluster label. Scores are averaged per cluster, then across clusters.

    Parameters
    ----------
    n_clusters : int, default 5
        Clusters ``0 .. n_clusters - 1`` are evaluated.
    samples_per_cluster : int, default 5
        Number of images drawn from each cluster.

    Returns
    -------
    float
        Mean of the per-cluster accuracies.
    """
    # NOTE: pickle can execute arbitrary code while loading -- only open trusted files.
    with open("./clusters/labels.pkl", "rb") as labels_file:
        labels = pickle.load(labels_file)

    # split() without an argument tolerates repeated or trailing whitespace.
    truth = pd.read_csv("./data/out.csv").duplicates.apply(
        lambda cell: [int(token) for token in cell.split()]
    )

    total_acc = 0.0
    for clust in range(n_clusters):
        members = np.where(labels == clust)[0]
        # Membership is the same for every sampled image: build it once.
        member_set = set(members.tolist())
        ids = np.random.choice(members, samples_per_cluster)
        acc = 0.0
        for id_ in ids:
            targets = truth.iloc[id_]
            hits = sum(1 for target in targets if target in member_set)
            tmp_acc = hits / len(targets)
            acc += tmp_acc
            print(f"Cluster {clust} - image n°{id_} - Acc: {tmp_acc}")
        acc = acc / len(ids)
        print(f"Cluster {clust} - Acc: {acc}")
        total_acc += acc

    return total_acc / n_clusters

In [8]:
%%time
# Run the clustering evaluation and report the accuracy averaged over the clusters.
acc = eval_clustering()
print(f"Clustering Acc: {acc}")

Cluster 0 - image n°239 - Acc: 1.0
Cluster 0 - image n°63 - Acc: 1.0
Cluster 0 - image n°171 - Acc: 1.0
Cluster 0 - image n°21 - Acc: 1.0
Cluster 0 - image n°94 - Acc: 1.0
Cluster 0 - Acc: 1.0
Cluster 1 - image n°12 - Acc: 1.0
Cluster 1 - image n°12 - Acc: 1.0
Cluster 1 - image n°54 - Acc: 1.0
Cluster 1 - image n°116 - Acc: 1.0
Cluster 1 - image n°152 - Acc: 1.0
Cluster 1 - Acc: 1.0
Cluster 2 - image n°162 - Acc: 1.0
Cluster 2 - image n°113 - Acc: 1.0
Cluster 2 - image n°93 - Acc: 1.0
Cluster 2 - image n°149 - Acc: 1.0
Cluster 2 - image n°52 - Acc: 1.0
Cluster 2 - Acc: 1.0
Cluster 3 - image n°217 - Acc: 1.0
Cluster 3 - image n°77 - Acc: 1.0
Cluster 3 - image n°183 - Acc: 1.0
Cluster 3 - image n°206 - Acc: 1.0
Cluster 3 - image n°137 - Acc: 1.0
Cluster 3 - Acc: 1.0
Cluster 4 - image n°4 - Acc: 1.0
Cluster 4 - image n°197 - Acc: 1.0
Cluster 4 - image n°4 - Acc: 1.0
Cluster 4 - image n°67 - Acc: 1.0
Cluster 4 - image n°131 - Acc: 1.0
Cluster 4 - Acc: 1.0
Clustering Acc: 1.0
Wall time: 13 

## Find duplicates or near-duplicate images

#### Get the indexes of the images in the closest cluster

In [9]:
# Positions of the images whose label equals `clust`, the cluster found for the query image.
# `indexes` is the candidate set passed to every search function in the following cells.
indexes = np.where(labels == clust)[0]

#### Search for duplicates using MobileNet

In [10]:
def search_duplicates_mn(input_path, indexes, threshold=0.85):
    """Search candidate images for near-duplicates using MobileNet features.

    The query image is encoded with ``Encoder(encoding="mobilenet")`` and
    compared by cosine similarity with the precomputed features stored in
    ``./features/features_mn.pkl``.

    Parameters
    ----------
    input_path : str
        Path of the query image.
    indexes : iterable of int
        Positions, within the pickled feature collection, of the candidates.
    threshold : float, default 0.85
        A candidate is reported as a duplicate when its similarity is
        strictly greater than this value.

    Returns
    -------
    duplicates : list
        Candidate positions whose similarity exceeds `threshold`.
    closest : int
        Position of the most similar candidate that is NOT a duplicate, or
        -1 when no such candidate scored above -1.
    """
    encoder = Encoder(encoding="mobilenet")
    target_features = encoder(input_path)
    # NOTE: pickle can execute arbitrary code while loading -- only open trusted files.
    with open("./features/features_mn.pkl", "rb") as features_file:
        imgs_features = pickle.load(features_file)

    duplicates = []
    best_similarity, best_index = -1, -1
    for i in indexes:
        # One query row against one candidate row gives a 1x1 matrix;
        # keep the scalar so comparisons and stored values are plain numbers.
        similarity = cosine_similarity(
            target_features, imgs_features[i].reshape(1, -1)
        )[0, 0]
        if similarity > threshold:
            duplicates.append(i)
        elif best_similarity < similarity:
            best_similarity, best_index = similarity, i

    return duplicates, best_index

In [11]:
%%time
# Search the query image's cluster for duplicates using MobileNet features.
# `duplicates` and `closest` are overwritten by each of the following search cells.
duplicates, closest = search_duplicates_mn(input_path, indexes)
print("MobileNet")
print("-"*10)
print(f"Found images at {duplicates} indexes as duplicates and the next closest image at {closest}.")

2020-12-06 02:28:59,673: INFO Initialized: MobileNet pretrained on ImageNet dataset sliced at last conv layer and added GlobalAveragePooling
MobileNet
----------
Found images at [72, 75, 80, 98, 106] indexes as duplicates and the next closest image at 170.
Wall time: 2.41 s


#### Search for duplicates using hashing methods

In [12]:
def search_duplicates_hash(input_path, indexes, dirname, mode="phash", threshold=12):
    """Search candidate images for near-duplicates using an image hash.

    The query image and each candidate file are encoded with
    ``Encoder(encoding=mode)`` and compared with ``encoder.metric``. The
    result is treated as a distance: smaller means more similar.

    Parameters
    ----------
    input_path : str
        Path of the query image.
    indexes : iterable of int
        Positions of the candidates in the sorted listing of `dirname`.
    dirname : str
        Directory containing the candidate image files.
    mode : str, default "phash"
        Encoding name forwarded to ``Encoder``.
    threshold : default 12
        A candidate is reported as a duplicate when its distance is less
        than or equal to this value.

    Returns
    -------
    duplicates : list
        Candidate positions within `threshold` of the query.
    closest : int
        Position of the nearest candidate that is NOT a duplicate, or -1
        when there is none.
    """
    encoder = Encoder(encoding=mode)
    encoded_target = encoder(input_path)
    duplicates = []
    best_distance, best_index = np.inf, -1
    # os.listdir returns entries in arbitrary order; sort so that position i
    # names the same file on every platform and filesystem.
    # NOTE(review): assumes the labels were computed in sorted filename order -- confirm.
    filenames = sorted(os.listdir(dirname))
    for i in indexes:
        candidate_path = os.path.join(dirname, filenames[i])
        distance = encoder.metric(encoded_target, encoder(candidate_path))
        if distance <= threshold:
            duplicates.append(i)
        elif best_distance > distance:
            best_distance, best_index = distance, i

    return duplicates, best_index

In [13]:
%%time
# Same search over the query image's cluster, using the "phash" encoding.
duplicates, closest = search_duplicates_hash(input_path, indexes, IMAGESDIR, mode="phash")
print("PHash")
print("-"*10)
print(f"Found images at {duplicates} indexes as duplicates and the next closest image at {closest}.")

PHash
----------
Found images at [72, 75, 80, 98, 106] indexes as duplicates and the next closest image at 95.
Wall time: 1.07 s


In [14]:
%%time
# Same search over the query image's cluster, using the "ahash" encoding.
duplicates, closest = search_duplicates_hash(input_path, indexes, IMAGESDIR, mode="ahash")
print("AHash")
print("-"*10)
print(f"Found images at {duplicates} indexes as duplicates and the next closest image at {closest}.")

AHash
----------
Found images at [72, 75, 80, 98, 106] indexes as duplicates and the next closest image at 3.
Wall time: 990 ms


In [15]:
%%time
# Same search over the query image's cluster, using the "dhash" encoding.
duplicates, closest = search_duplicates_hash(input_path, indexes, IMAGESDIR, mode="dhash")
print("DHash")
print("-"*10)
print(f"Found images at {duplicates} indexes as duplicates and the next closest image at {closest}.")

DHash
----------
Found images at [72, 75, 80, 98, 106] indexes as duplicates and the next closest image at 95.
Wall time: 1 s


In [16]:
%%time
# Same search over the query image's cluster, using the "whash" encoding.
duplicates, closest = search_duplicates_hash(input_path, indexes, IMAGESDIR, mode="whash")
print("WHash")
print("-"*10)
print(f"Found images at {duplicates} indexes as duplicates and the next closest image at {closest}.")

WHash
----------
Found images at [3, 15, 64, 72, 75, 80, 95, 98, 106] indexes as duplicates and the next closest image at 58.
Wall time: 1.3 s


#### Search for duplicates using ORB descriptor

In [17]:
def search_duplicates_orb(input_path, indexes, dirname, threshold=50):
    """Search candidate images for near-duplicates using ORB descriptors.

    The query image and each candidate file are encoded with
    ``Encoder(encoding="orb")`` and compared with ``encoder.metric``. The
    result is treated as a score: larger means more similar.

    Parameters
    ----------
    input_path : str
        Path of the query image.
    indexes : iterable of int
        Positions of the candidates in the sorted listing of `dirname`.
    dirname : str
        Directory containing the candidate image files.
    threshold : default 50
        A candidate is reported as a duplicate when its score is greater
        than or equal to this value.

    Returns
    -------
    duplicates : list
        Candidate positions scoring at least `threshold`.
    closest : int
        Position of the best-scoring candidate that is NOT a duplicate, or
        -1 when no such candidate scored above 0.
    """
    encoder = Encoder(encoding="orb")
    encoded_target = encoder(input_path)
    duplicates = []
    best_score, best_index = 0, -1
    # os.listdir returns entries in arbitrary order; sort so that position i
    # names the same file on every platform and filesystem.
    # NOTE(review): assumes the labels were computed in sorted filename order -- confirm.
    filenames = sorted(os.listdir(dirname))
    for i in indexes:
        candidate_path = os.path.join(dirname, filenames[i])
        score = encoder.metric(encoded_target, encoder(candidate_path))
        if score >= threshold:
            duplicates.append(i)
        elif best_score < score:
            best_score, best_index = score, i

    return duplicates, best_index

In [18]:
%%time
# Same search over the query image's cluster, using the ORB-based encoder.
duplicates, closest = search_duplicates_orb(input_path, indexes, IMAGESDIR)
print("ORB")
print("-"*10)
print(f"Found images at {duplicates} indexes as duplicates and the next closest image at {closest}.")

ORB
----------
Found images at [36, 72, 75, 80, 98, 106] indexes as duplicates and the next closest image at 21.
Wall time: 1.97 s


In [19]:
def performance(dirname, encoding, n_clusters=5):
    """Measure duplicate-search precision, recall and F1 for one encoding.

    For each cluster ``0 .. n_clusters - 1`` one member image is drawn at
    random and the matching search function is run over that cluster's
    members. The result is scored with ``Evaluator("./data/out.csv").eval``.
    Precision and recall are averaged over the clusters, and F1 is computed
    from those two averages.

    Reads the notebook-level ``labels`` array for cluster membership.

    Parameters
    ----------
    dirname : str
        Directory containing the image files.
    encoding : str
        One of "mobilenet", "phash", "ahash", "dhash", "whash" or "orb".
    n_clusters : int, default 5
        Number of clusters to evaluate.

    Returns
    -------
    tuple
        ``(precision, recall, f1_score)``. F1 is 0.0 when precision and
        recall are both 0.

    Raises
    ------
    ValueError
        If `encoding` is not one of the supported names.
    """
    hash_modes = ("phash", "ahash", "dhash", "whash")
    # Fail early with a clear message instead of hitting an unbound local later.
    if encoding not in ("mobilenet", "orb") and encoding not in hash_modes:
        raise ValueError(f"Unknown encoding: {encoding!r}")

    # os.listdir returns entries in arbitrary order; sort once so that an
    # image index always names the same file.
    # NOTE(review): assumes the labels were computed in sorted filename order -- confirm.
    filenames = sorted(os.listdir(dirname))

    total_precision = 0.0
    total_recall = 0.0
    for clust in range(n_clusters):
        evaluator = Evaluator("./data/out.csv")
        indexes = np.where(labels == clust)[0]
        id_ = np.random.choice(indexes, 1)[0]
        input_path = os.path.join(dirname, filenames[id_])
        if encoding == "mobilenet":
            duplicates, _ = search_duplicates_mn(input_path, indexes)
        elif encoding in hash_modes:
            duplicates, _ = search_duplicates_hash(input_path, indexes, dirname, mode=encoding)
        else:
            duplicates, _ = search_duplicates_orb(input_path, indexes, dirname)
        precision, recall, _ = evaluator.eval(id_, duplicates, indexes)
        print(f"Cluster {clust} - image n°{id_} - {duplicates} as duplicates - Precision: {precision} - Recall: {recall}")
        total_precision += precision
        total_recall += recall

    total_precision = total_precision/n_clusters
    total_recall = total_recall/n_clusters
    denominator = total_precision + total_recall
    # Guard the harmonic mean against a zero denominator.
    if denominator:
        f1_score = 2*((total_precision*total_recall)/denominator)
    else:
        f1_score = 0.0
    return total_precision, total_recall, f1_score

In [20]:
%%time
# Score the MobileNet-based search; one random image per cluster, so results vary between runs.
precision, recall, f1_score = performance(IMAGESDIR, encoding="mobilenet")
print(f"MobileNet - Precision: {precision} - Recall: {recall} - F1 Score: {f1_score}")

2020-12-06 02:29:08,698: INFO Initialized: MobileNet pretrained on ImageNet dataset sliced at last conv layer and added GlobalAveragePooling
Cluster 0 - image n°64 - [15, 64] as duplicates - Precision: 1.0 - Recall: 1.0
2020-12-06 02:29:10,855: INFO Initialized: MobileNet pretrained on ImageNet dataset sliced at last conv layer and added GlobalAveragePooling
Cluster 1 - image n°230 - [230, 232] as duplicates - Precision: 1.0 - Recall: 1.0
2020-12-06 02:29:15,808: INFO Initialized: MobileNet pretrained on ImageNet dataset sliced at last conv layer and added GlobalAveragePooling
Cluster 2 - image n°162 - [162] as duplicates - Precision: 1.0 - Recall: 1.0
2020-12-06 02:29:18,014: INFO Initialized: MobileNet pretrained on ImageNet dataset sliced at last conv layer and added GlobalAveragePooling
Cluster 3 - image n°154 - [154, 195] as duplicates - Precision: 0.5 - Recall: 1.0
2020-12-06 02:29:20,559: INFO Initialized: MobileNet pretrained on ImageNet dataset sliced at last conv layer and ad

In [21]:
%%time
# Score the "phash" search; one random image per cluster, so results vary between runs.
precision, recall, f1_score = performance(IMAGESDIR, encoding="phash")
print(f"PHash - Precision: {precision} - Recall: {recall} - F1 Score: {f1_score}")

Cluster 0 - image n°104 - [104] as duplicates - Precision: 1.0 - Recall: 1.0
Cluster 1 - image n°223 - [222, 223, 226, 228] as duplicates - Precision: 1.0 - Recall: 1.0
Cluster 2 - image n°117 - [117] as duplicates - Precision: 1.0 - Recall: 1.0
Cluster 3 - image n°206 - [206, 207] as duplicates - Precision: 1.0 - Recall: 1.0
Cluster 4 - image n°234 - [233, 234] as duplicates - Precision: 0.5 - Recall: 1.0
PHash - Precision: 0.9 - Recall: 1.0 - F1 Score: 0.9473684210526316
Wall time: 1min 12s


In [22]:
%%time
# Score the "ahash" search; one random image per cluster, so results vary between runs.
precision, recall, f1_score = performance(IMAGESDIR, encoding="ahash")
print(f"AHash - Precision: {precision} - Recall: {recall} - F1 Score: {f1_score}")

Cluster 0 - image n°95 - [3, 95] as duplicates - Precision: 0.5 - Recall: 1.0
Cluster 1 - image n°12 - [12] as duplicates - Precision: 1.0 - Recall: 1.0
Cluster 2 - image n°179 - [148, 179] as duplicates - Precision: 0.5 - Recall: 1.0
Cluster 3 - image n°145 - [143, 145, 154, 195] as duplicates - Precision: 0.5 - Recall: 1.0
Cluster 4 - image n°130 - [130, 131, 132] as duplicates - Precision: 1.0 - Recall: 1.0
AHash - Precision: 0.7 - Recall: 1.0 - F1 Score: 0.8235294117647058
Wall time: 1min 3s


In [23]:
%%time
# Score the "dhash" search; one random image per cluster, so results vary between runs.
precision, recall, f1_score = performance(IMAGESDIR, encoding="dhash")
print(f"DHash - Precision: {precision} - Recall: {recall} - F1 Score: {f1_score}")

Cluster 0 - image n°239 - [239] as duplicates - Precision: 1.0 - Recall: 1.0
Cluster 1 - image n°37 - [6, 37, 46, 54] as duplicates - Precision: 0.25 - Recall: 1.0
Cluster 2 - image n°153 - [153] as duplicates - Precision: 1.0 - Recall: 1.0
Cluster 3 - image n°23 - [23, 40] as duplicates - Precision: 1.0 - Recall: 1.0
Cluster 4 - image n°234 - [233, 234] as duplicates - Precision: 0.5 - Recall: 1.0
DHash - Precision: 0.75 - Recall: 1.0 - F1 Score: 0.8571428571428571
Wall time: 1min 10s


In [24]:
%%time
# Score the "whash" search; one random image per cluster, so results vary between runs.
precision, recall, f1_score = performance(IMAGESDIR, encoding="whash")
print(f"WHash - Precision: {precision} - Recall: {recall} - F1 Score: {f1_score}")

Cluster 0 - image n°182 - [170, 171, 182] as duplicates - Precision: 0.3333333333333333 - Recall: 1.0
Cluster 1 - image n°226 - [222, 223, 226, 228, 230, 232] as duplicates - Precision: 0.6666666666666666 - Recall: 1.0
Cluster 2 - image n°196 - [166, 196] as duplicates - Precision: 0.5 - Recall: 1.0
Cluster 3 - image n°56 - [56] as duplicates - Precision: 1.0 - Recall: 1.0
Cluster 4 - image n°220 - [220] as duplicates - Precision: 1.0 - Recall: 0.25
WHash - Precision: 0.7 - Recall: 0.85 - F1 Score: 0.767741935483871
Wall time: 1min 15s


In [25]:
%%time
# Score the ORB-based search; one random image per cluster, so results vary between runs.
precision, recall, f1_score = performance(IMAGESDIR, encoding="orb")
print(f"ORB - Precision: {precision} - Recall: {recall} - F1 Score: {f1_score}")

Cluster 0 - image n°119 - [36, 119] as duplicates - Precision: 0.5 - Recall: 1.0
Cluster 1 - image n°12 - [12] as duplicates - Precision: 1.0 - Recall: 1.0
Cluster 2 - image n°203 - [158, 203] as duplicates - Precision: 0.5 - Recall: 1.0
Cluster 3 - image n°48 - [22, 48] as duplicates - Precision: 1.0 - Recall: 1.0
Cluster 4 - image n°161 - [161, 202] as duplicates - Precision: 0.5 - Recall: 1.0
ORB - Precision: 0.7 - Recall: 1.0 - F1 Score: 0.8235294117647058
Wall time: 1min 13s
