In [1]:
# Imports
import os

from core.utils.extractors import Encoder
from core.utils.evaluator import Evaluator
from core.clustering import get_closest_cluster
from core.duplicate_search import search_duplicates_mn, search_duplicates_hash, search_duplicates_orb, performance

from sklearn_extra.cluster import KMedoids
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
import pandas as pd
import time

import pickle

import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.cm as cm
%matplotlib inline

In [2]:
# Fix NumPy's global random state so the random image selection further down is repeatable.
np.random.seed(seed=0)

In [3]:
# Directory that holds the image dataset used throughout this notebook.
IMAGESDIR = "./data/near_duplicates/"
# List the directory once and reuse the count for both the variable and the message.
nb_images = len(os.listdir(IMAGESDIR))
print("Nombres d'images: {0}".format(nb_images))

Nombres d'images: 240


## Load computed clusters

In [4]:
# Load the previously saved cluster centroids.
# NOTE: pickle can execute arbitrary code while loading; only unpickle files produced by this project.
# The context manager closes the file handle as soon as loading finishes.
with open("./clusters/centroids.pkl", "rb") as centroids_file:
    centroids = pickle.load(centroids_file)

In [5]:
# Load the previously saved cluster labels (presumably one label per image — confirm).
# NOTE: pickle can execute arbitrary code while loading; only unpickle files produced by this project.
# The context manager closes the file handle as soon as loading finishes.
with open("./clusters/labels.pkl", "rb") as labels_file:
    labels = pickle.load(labels_file)

In [6]:
# Load the previously saved image features (file name suggests MobileNet features — confirm).
# NOTE: pickle can execute arbitrary code while loading; only unpickle files produced by this project.
# The context manager closes the file handle as soon as loading finishes.
with open("./features/features_mn.pkl", "rb") as features_file:
    imgs_features = pickle.load(features_file)

## Find closest cluster

#### Get closest cluster

In [7]:
%%time
input_path = IMAGESDIR + '00004.jpg'
clust = get_closest_cluster(input_path, centroids)
print(f"The closest cluster is {clust}.")

2020-12-06 14:44:53,277: INFO Initialized: MobileNet pretrained on ImageNet dataset sliced at last conv layer and added GlobalAveragePooling
The closest cluster is 0.
Wall time: 6.63 s


## Find duplicates or near-duplicate images

#### Get indexes of the images in the closest cluster

In [8]:
# Positions (along the first axis of `labels`) whose label equals the closest cluster found above.
cluster_mask = labels == clust
indexes = np.nonzero(cluster_mask)[0]

#### Search for duplicates using MobileNet

In [9]:
%%time
duplicates, closest = search_duplicates_mn(input_path, indexes, imgs_features)
print("MobileNet")
print("-"*10)
print(f"Found images at {duplicates} indexes as duplicates and the next closest image at {closest}.")

2020-12-06 14:44:55,731: INFO Initialized: MobileNet pretrained on ImageNet dataset sliced at last conv layer and added GlobalAveragePooling
MobileNet
----------
Found images at [3] indexes as duplicates and the next closest image at 10.
Wall time: 2.22 s


#### Search for duplicates using hashing methods

In [10]:
# Full path of every entry in the image directory, in the order os.listdir returns them.
# NOTE(review): os.listdir order is arbitrary; the integer indexes used in this notebook
# presumably assume the same ordering as when the features were extracted — confirm.
image_names = os.listdir(IMAGESDIR)
filenames = [os.path.join(IMAGESDIR, image_name) for image_name in image_names]

In [11]:
%%time
duplicates, closest = search_duplicates_hash(input_path, indexes, filenames, mode="phash")
print("PHash")
print("-"*10)
print(f"Found images at {duplicates} indexes as duplicates and the next closest image at {closest}.")

PHash
----------
Found images at [3] indexes as duplicates and the next closest image at 61.
Wall time: 1.01 s


In [12]:
%%time
duplicates, closest = search_duplicates_hash(input_path, indexes, filenames, mode="ahash")
print("AHash")
print("-"*10)
print(f"Found images at {duplicates} indexes as duplicates and the next closest image at {closest}.")

AHash
----------
Found images at [3, 95] indexes as duplicates and the next closest image at 72.
Wall time: 914 ms


In [13]:
%%time
duplicates, closest = search_duplicates_hash(input_path, indexes, filenames, mode="dhash")
print("DHash")
print("-"*10)
print(f"Found images at {duplicates} indexes as duplicates and the next closest image at {closest}.")

DHash
----------
Found images at [3] indexes as duplicates and the next closest image at 170.
Wall time: 917 ms


In [14]:
%%time
duplicates, closest = search_duplicates_hash(input_path, indexes, filenames, mode="whash")
print("WHash")
print("-"*10)
print(f"Found images at {duplicates} indexes as duplicates and the next closest image at {closest}.")

WHash
----------
Found images at [3, 15, 64, 72, 75, 80, 95, 98, 106] indexes as duplicates and the next closest image at 58.
Wall time: 1.23 s


#### Search for duplicates using ORB descriptor

In [15]:
%%time
duplicates, closest = search_duplicates_orb(input_path, indexes, filenames)
print("ORB")
print("-"*10)
print(f"Found images at {duplicates} indexes as duplicates and the next closest image at {closest}.")

ORB
----------
Found images at [3] indexes as duplicates and the next closest image at 80.
Wall time: 1.91 s


#### Select images to test (one random image per cluster)

In [16]:
# Member positions for each of the cluster labels 0-4, computed once and reused below.
cluster_members = [np.where(labels == cluster_id)[0] for cluster_id in range(5)]
# One randomly drawn test image per cluster (drawn in cluster order 0..4).
ids = [np.random.choice(members, 1)[0] for members in cluster_members]
# NOTE: this rebinds `indexes` from a single index array to a list of per-cluster lists.
indexes = [list(members) for members in cluster_members]
evaluator = Evaluator("./data/out.csv")

# Empty results table; each evaluation cell below adds one row to it.
perf = pd.DataFrame(columns=["model", "precision", "recall", "duration"])

In [17]:
%%time
start = time.time()
precision, recall, f1_score = performance(filenames, "mobilenet", ids, indexes, evaluator, imgs_features)
row = pd.DataFrame({
    "model": ["mobilenet"],
    "precision": [precision],
    "recall": [recall],
    "duration": [time.time() - start]
})
perf = perf.append(row, ignore_index=True)
print(f"MobileNet - Precision: {precision} - Recall: {recall} - F1 Score: {f1_score}")

2020-12-06 14:45:04,224: INFO Initialized: MobileNet pretrained on ImageNet dataset sliced at last conv layer and added GlobalAveragePooling
Cluster 0 - image n°3 - [3] as duplicates - Precision: 1.0 - Recall: 1.0
2020-12-06 14:45:06,437: INFO Initialized: MobileNet pretrained on ImageNet dataset sliced at last conv layer and added GlobalAveragePooling
Cluster 1 - image n°37 - [6, 37, 46, 54] as duplicates - Precision: 0.25 - Recall: 1.0
2020-12-06 14:45:12,179: INFO Initialized: MobileNet pretrained on ImageNet dataset sliced at last conv layer and added GlobalAveragePooling
Cluster 2 - image n°29 - [29] as duplicates - Precision: 1.0 - Recall: 1.0
2020-12-06 14:45:14,517: INFO Initialized: MobileNet pretrained on ImageNet dataset sliced at last conv layer and added GlobalAveragePooling
Cluster 3 - image n°25 - [25] as duplicates - Precision: 1.0 - Recall: 1.0
2020-12-06 14:45:17,142: INFO Initialized: MobileNet pretrained on ImageNet dataset sliced at last conv layer and added Global

In [18]:
%%time
start = time.time()
precision, recall, f1_score = performance(filenames, "phash", ids, indexes, evaluator)
row = pd.DataFrame({
    "model": ["phash"],
    "precision": [precision],
    "recall": [recall],
    "duration": [time.time() - start]
})
perf = perf.append(row, ignore_index=True)
print(f"PHash - Precision: {precision} - Recall: {recall} - F1 Score: {f1_score}")

Cluster 0 - image n°3 - [3] as duplicates - Precision: 1.0 - Recall: 1.0
Cluster 1 - image n°37 - [6, 37, 46, 54] as duplicates - Precision: 0.25 - Recall: 1.0
Cluster 2 - image n°29 - [29] as duplicates - Precision: 1.0 - Recall: 1.0
Cluster 3 - image n°25 - [25] as duplicates - Precision: 1.0 - Recall: 1.0
Cluster 4 - image n°132 - [132] as duplicates - Precision: 1.0 - Recall: 0.3333333333333333
PHash - Precision: 0.85 - Recall: 0.8666666666666666 - F1 Score: 0.8582524271844659
Wall time: 1min 11s


In [19]:
%%time
start = time.time()
precision, recall, f1_score = performance(filenames, "ahash", ids, indexes, evaluator)
row = pd.DataFrame({
    "model": ["ahash"],
    "precision": [precision],
    "recall": [recall],
    "duration": [time.time() - start]
})
perf = perf.append(row, ignore_index=True)
print(f"AHash - Precision: {precision} - Recall: {recall} - F1 Score: {f1_score}")

Cluster 0 - image n°3 - [3, 95] as duplicates - Precision: 0.5 - Recall: 1.0
Cluster 1 - image n°37 - [6, 37, 46, 54] as duplicates - Precision: 0.25 - Recall: 1.0
Cluster 2 - image n°29 - [29, 30, 85] as duplicates - Precision: 0.3333333333333333 - Recall: 1.0
Cluster 3 - image n°25 - [25] as duplicates - Precision: 1.0 - Recall: 1.0
Cluster 4 - image n°132 - [130, 132] as duplicates - Precision: 1.0 - Recall: 0.6666666666666666
AHash - Precision: 0.6166666666666666 - Recall: 0.9333333333333333 - F1 Score: 0.7426523297491039
Wall time: 1min 7s


In [20]:
%%time
start = time.time()
precision, recall, f1_score = performance(filenames, "dhash", ids, indexes, evaluator)
row = pd.DataFrame({
    "model": ["dhash"],
    "precision": [precision],
    "recall": [recall],
    "duration": [time.time() - start]
})
perf = perf.append(row, ignore_index=True)
print(f"DHash - Precision: {precision} - Recall: {recall} - F1 Score: {f1_score}")

Cluster 0 - image n°3 - [3] as duplicates - Precision: 1.0 - Recall: 1.0
Cluster 1 - image n°37 - [6, 37, 46, 54] as duplicates - Precision: 0.25 - Recall: 1.0
Cluster 2 - image n°29 - [29] as duplicates - Precision: 1.0 - Recall: 1.0
Cluster 3 - image n°25 - [25] as duplicates - Precision: 1.0 - Recall: 1.0
Cluster 4 - image n°132 - [132] as duplicates - Precision: 1.0 - Recall: 0.3333333333333333
DHash - Precision: 0.85 - Recall: 0.8666666666666666 - F1 Score: 0.8582524271844659
Wall time: 1min 8s


In [21]:
%%time
start = time.time()
precision, recall, f1_score = performance(filenames, "whash", ids, indexes, evaluator)
row = pd.DataFrame({
    "model": ["whash"],
    "precision": [precision],
    "recall": [recall],
    "duration": [time.time() - start]
})
perf = perf.append(row, ignore_index=True)
print(f"WHash - Precision: {precision} - Recall: {recall} - F1 Score: {f1_score}")

Cluster 0 - image n°3 - [3, 15, 64, 72, 75, 80, 95, 98, 106] as duplicates - Precision: 0.1111111111111111 - Recall: 1.0
Cluster 1 - image n°37 - [6, 37, 46, 54] as duplicates - Precision: 0.25 - Recall: 1.0
Cluster 2 - image n°29 - [29, 151] as duplicates - Precision: 0.5 - Recall: 1.0
Cluster 3 - image n°25 - [25] as duplicates - Precision: 1.0 - Recall: 1.0
Cluster 4 - image n°132 - [132] as duplicates - Precision: 1.0 - Recall: 0.3333333333333333
WHash - Precision: 0.5722222222222222 - Recall: 0.8666666666666666 - F1 Score: 0.6893178893178893
Wall time: 1min 16s


In [22]:
%%time
start = time.time()
precision, recall, f1_score = performance(filenames, "orb", ids, indexes, evaluator)
row = pd.DataFrame({
    "model": ["orb"],
    "precision": [precision],
    "recall": [recall],
    "duration": [time.time() - start]
})
perf = perf.append(row, ignore_index=True)
print(f"ORB - Precision: {precision} - Recall: {recall} - F1 Score: {f1_score}")

Cluster 0 - image n°3 - [3] as duplicates - Precision: 1.0 - Recall: 1.0
Cluster 1 - image n°37 - [6, 37, 46, 54] as duplicates - Precision: 0.25 - Recall: 1.0
Cluster 2 - image n°29 - [29, 59] as duplicates - Precision: 0.5 - Recall: 1.0
Cluster 3 - image n°25 - [25] as duplicates - Precision: 1.0 - Recall: 1.0
Cluster 4 - image n°132 - [132] as duplicates - Precision: 1.0 - Recall: 0.3333333333333333
ORB - Precision: 0.75 - Recall: 0.8666666666666666 - F1 Score: 0.8041237113402061
Wall time: 1min 19s


In [23]:
# Display the accumulated comparison table (one row per evaluated method).
perf

Unnamed: 0,model,precision,recall,duration
0,mobilenet,0.85,0.866667,
1,phash,0.85,0.866667,71.1675
2,ahash,0.616667,0.933333,67.7735
3,dhash,0.85,0.866667,68.305
4,whash,0.572222,0.866667,76.326
5,orb,0.75,0.866667,79.311


In [24]:
# No path is passed, so to_csv returns the CSV text (shown as the cell output) rather than writing a file.
perf.to_csv(index=False)

'model,precision,recall,duration\r\nmobilenet,0.85,0.8666666666666666,\r\nphash,0.85,0.8666666666666666,71.16751861572266\r\nahash,0.6166666666666666,0.9333333333333333,67.77351570129395\r\ndhash,0.85,0.8666666666666666,68.30499982833862\r\nwhash,0.5722222222222222,0.8666666666666666,76.32601857185364\r\norb,0.75,0.8666666666666666,79.31102466583252\r\n'