Credit: adapted from https://towardsdatascience.com/how-to-cluster-images-based-on-visual-similarity-cd6e7209fe34

In [1]:
# for files
import sys
import os

# for loading/processing the images  
from keras.preprocessing.image import load_img 
from keras.preprocessing.image import img_to_array 
from keras.applications.vgg16 import preprocess_input 

# models 
from keras.applications.vgg16 import VGG16 
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# for everything else
import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle

ModuleNotFoundError: No module named 'keras'

In [12]:
# directory (relative to the notebook's working directory) that is scanned for images
path = "clustering_images"

# Preprocess Images

In [13]:
# Collect the names of the regular files found directly inside `path`.
# Sub-directories (for example a notebook checkpoint folder) are skipped because
# they cannot be loaded as images, and the names are sorted so that the order of
# the samples -- and therefore everything derived from it -- is the same on every run.
with os.scandir(path) as entries:
    images = sorted(entry.name for entry in entries if entry.is_file())

In [15]:
# sanity check: how many entry names were collected from `path`
len(images)

527

In [18]:
# Build the stock VGG16 network, then wrap it in a new Model that takes the same
# inputs but emits the activations of the second-to-last layer instead of the
# final layer's output.
vgg = VGG16()
model = Model(inputs=vgg.inputs, outputs=vgg.layers[-2].output)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5


In [24]:
def extract_features(file, model, image_dir="clustering_images"):
    """Return the features ``model`` predicts for a single image file.

    Parameters
    ----------
    file : str
        Name of the image file, relative to ``image_dir``.
    model : object with a Keras-style ``predict`` method
        Network used as the feature extractor.
    image_dir : str, optional
        Directory that contains ``file``. Defaults to the folder that used to be
        hard-coded here, so existing two-argument calls behave as before.

    Returns
    -------
    Whatever ``model.predict`` returns for a batch holding this one image.
    """
    # load the image as a 224x224 array
    img = load_img(os.path.join(image_dir, file), target_size=(224, 224))
    # convert from 'PIL.Image.Image' to numpy array
    img = np.array(img)
    # add the leading batch axis: (num_of_samples, dim 1, dim 2, channels)
    reshaped_img = img.reshape(1, 224, 224, 3)
    # prepare image for model
    imgx = preprocess_input(reshaped_img)
    # `use_multiprocessing` is intentionally not passed: it only applies to
    # generator / Sequence inputs and is not accepted by Keras 3, where it would
    # make every call raise. verbose=0 avoids one progress bar per image.
    features = model.predict(imgx, verbose=0)
    return features

In [25]:
# maps image file name -> features returned by extract_features
data = {}
# pickle file that receives the partially filled `data` dict when an extraction fails
p = "processed_images.pkl"

In [26]:
# Extract the features of every image and store them in `data`, keyed by file name.
# A file that cannot be processed is reported and skipped instead of aborting the
# whole run; its name is kept in `failed` so the problem stays visible.
failed = []
for image in images:
    try:
        data[image] = extract_features(image, model)
    # `Exception` rather than a bare `except`, so that Ctrl-C / kernel interrupts
    # still stop the loop
    except Exception as err:
        failed.append(image)
        print(f"Skipping {image!r}: {type(err).__name__}: {err}")
        # checkpoint everything extracted so far as a pickle file (optional)
        with open(p, 'wb') as checkpoint:
            pickle.dump(data, checkpoint)

In [28]:
# file names, in the order they were inserted into `data`
filenames = np.array([name for name in data])

# the matching features, in the same order
feat = np.array([vector for vector in data.values()])

# flatten to one row of 4096 values per image
feat = feat.reshape(-1, 4096)
feat.shape

(526, 4096)

# PCA

In [34]:
# Fit a 100-component PCA on the feature matrix (fit() returns the estimator),
# then project the same matrix onto those components.
pca = PCA(n_components=100, random_state=22).fit(feat)
x = pca.transform(feat)

# report the dimensionality before and after the projection
print(f'Components before PCA: {feat.shape[1]}')
print(f'Components after PCA: {pca.n_components}')

Components before PCA: 4096
Components after PCA: 100


# KMeans Clustering

In [35]:
# Cluster the PCA-reduced features into 10 groups.
# n_init is passed explicitly: newer scikit-learn releases changed its default
# from 10 to 'auto', which would otherwise alter the clustering (and emit a
# FutureWarning on the releases in between). 10 matches the original run.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=33)
kmeans.fit(x)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=33, tol=0.0001, verbose=0)

In [37]:
# cluster id -> list of the file names assigned to that cluster
groups = {}
for file, cluster in zip(filenames, kmeans.labels_):
    # setdefault creates the empty list the first time a cluster id is seen
    groups.setdefault(cluster, []).append(file)

In [47]:
# inspect which container type holds the file names of cluster 0
type(groups[0])

list

In [52]:
!pip install ipyplot

Collecting ipyplot
  Downloading https://files.pythonhosted.org/packages/c8/2d/ba3031b6945e40ac1137ffb3931528bf4a76f36f0668b6c1879fadac070d/ipyplot-1.1.0-py3-none-any.whl
Collecting shortuuid
  Downloading https://files.pythonhosted.org/packages/25/a6/2ecc1daa6a304e7f1b216f0896b26156b78e7c38e1211e9b798b4716c53d/shortuuid-1.0.1-py3-none-any.whl
Installing collected packages: shortuuid, ipyplot
Successfully installed ipyplot-1.1.0 shortuuid-1.0.1


In [53]:
import ipyplot


        You might encounter issues while running in Google Colab environment.
        If images are not displaying properly please try setting `base_64` param to `True`.
        


In [58]:
def display_img(num_img, group_num, image_dir="clustering_images"):
    """Show the first ``num_img`` images that were assigned to one cluster.

    Parameters
    ----------
    num_img : int
        Number of images to take from the start of the cluster's list.
    group_num : hashable
        Key of the cluster in the module-level ``groups`` dictionary.
    image_dir : str, optional
        Directory the file names stored in ``groups`` are relative to.

    Raises
    ------
    KeyError
        If ``group_num`` is not a key of ``groups``.
    """
    full_group = groups[group_num]
    # `groups` stores bare file names; ipyplot needs a path it can resolve
    sample = [os.path.join(image_dir, name) for name in full_group[:num_img]]
    # size the display limit to the sample so requests above the former fixed
    # cap of 20 are not silently truncated
    ipyplot.plot_images(sample, max_images=len(sample), img_width=150)

In [59]:
display_img(5,0)