In [None]:
# This notebook uses OpenCV's calcHist to calculate histograms to find similar
# images. It produces a distance matrix using Euclidean distance of the flattened
# histograms. It demonstrates how the distribution of color (pixel values converted
# from RGB to HSV) can be used to cluster images based on pixel intensity values
# (rather than other, more complicated features).
#
# James E. Dobson
# James.E.Dobson@Dartmouth.EDU
# https://jeddobson.github.io/

from glob import glob
import cv2
from matplotlib import pyplot as plt
import numpy as np
from sklearn.metrics import euclidean_distances

In [None]:
# download and extract data
# Fetch the sample-image archive; -O saves it locally as opencv_data.tgz.
!wget 'https://www.dropbox.com/s/7iq7kg623z5f0sd/opencv_data.tgz?dl=0' -O opencv_data.tgz
# Unpack the gzip-compressed tarball into the working directory. All tar output,
# including error messages, is redirected to /dev/null, so a failed extraction
# is silent at this point.
# NOTE(review): presumably the archive provides the train/ directory that the
# later cells read from -- confirm.
!tar -zxf opencv_data.tgz > /dev/null 2>&1

In [None]:
# simple function to produce flattened (1d) representation of image histogram
def extract_histogram(image):
    """Return a flattened 3-D HSV color histogram for an image file.

    Parameters
    ----------
    image : str
        Path of the image file, read with ``cv2.imread``.

    Returns
    -------
    numpy.ndarray
        1-D array of 8 * 8 * 8 = 512 bin counts covering the joint
        hue / saturation / value distribution of the image.

    Raises
    ------
    OSError
        If ``cv2.imread`` cannot load the file (it returns ``None``).
    """
    img = cv2.imread(image)
    if img is None:
        # imread reports failure by returning None rather than raising, which
        # would otherwise surface as an obscure assertion inside cvtColor.
        raise OSError("could not read image: {!r}".format(image))
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    # calcHist takes a *list* of images. Passing the bare array makes the
    # binding treat every pixel row as a separate single-channel image, so
    # channels 0-2 would refer to the first three rows instead of H, S and V.
    # Hue spans [0, 180) for 8-bit images; saturation and value span [0, 256).
    hist = cv2.calcHist([hsv], [0, 1, 2], None, (8, 8, 8), [0, 180, 0, 256, 0, 256])
    return hist.flatten()

In [None]:
# show the reference image that the similarity search is anchored on
sample_image = "train/14374175_139680423163339_3750215527033733120_n.jpg"
# OpenCV loads pixels in BGR order; convert to RGB so matplotlib renders the
# colors correctly
img = cv2.cvtColor(cv2.imread(sample_image), cv2.COLOR_BGR2RGB)
plt.imshow(img)
plt.show()

In [None]:
# collect the path of every file under train/ whose name ends in "jpg"
filenames = glob("train/*jpg")
# position of the reference image in the file list (and therefore its row in
# any per-file data built in the same order)
sample_idx = filenames.index(sample_image)

# one flattened histogram per image, kept in the same order as `filenames`
hist_data = [extract_histogram(path) for path in filenames]

In [None]:
# calculate similarities with Euclidean distance metric
# Pairwise distances between all histograms: entry [i][j] is the distance between
# the histograms of filenames[i] and filenames[j] (hist_data follows the order of
# filenames), so a smaller value means a more similar color distribution.
euclidean_dist_matrix = euclidean_distances(hist_data)

In [None]:
# walk through the dataset from the smallest to the largest distance to the
# reference image, printing each path with its distance and showing the image
sample_distances = euclidean_dist_matrix[sample_idx]
for match_idx in np.argsort(sample_distances):
    match_path = filenames[match_idx]
    print(match_path, sample_distances[match_idx])
    # convert OpenCV's BGR channel order to RGB before handing to matplotlib
    img = cv2.cvtColor(cv2.imread(match_path), cv2.COLOR_BGR2RGB)
    plt.imshow(img)
    plt.show()