https://medium.com/@franky07724_57962/using-keras-pre-trained-models-for-feature-extraction-in-image-clustering-a142c6cdf5b1

https://scikit-learn.org/stable/modules/clustering.html

https://scikit-learn.org/stable/modules/clustering.html#silhouette-coefficient

https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#sphx-glr-auto-examples-cluster-plot-kmeans-silhouette-analysis-py

In [2]:
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
import numpy as np

# VGG16 with ImageNet weights; include_top=False requests the network without
# its top (classifier) layers, so predict() returns convolutional features.
model = VGG16(weights='imagenet', include_top=False)
model.summary()

# Smoke test on a single image: load, add a batch axis, apply the VGG16
# input preprocessing, and run it through the network.
img_path = '../Data/cats_and_dogs/dog.jpeg'
img = image.load_img(img_path, target_size=(224, 224))
img_data = preprocess_input(np.expand_dims(image.img_to_array(img), axis=0))

vgg16_feature = model.predict(img_data)

# Shape of the extracted feature tensor for this one image.
print(vgg16_feature.shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, None, None, 3)     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0         
__________

In [47]:
data_dir = "../Data/hymenoptera_data/train"

import os
def get_file_paths(data_dir):
    """Collect the paths of all files stored one level below ``data_dir``.

    ``data_dir`` is expected to hold one sub-directory per class, each of
    which contains that class's files. Top-level entries that are not
    directories (stray files) and class-folder entries that are not regular
    files (nested folders) are skipped instead of raising.

    Args:
        data_dir: Path of the directory that contains the class folders.

    Returns:
        A list of paths of the form ``data_dir/classname/filename``, ordered
        by class name and then by file name.
    """
    file_paths = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # resulting list (and anything computed from it) reproducible across runs.
    for classname in sorted(os.listdir(data_dir)):
        class_dir = os.path.join(data_dir, classname)
        if not os.path.isdir(class_dir):
            # A stray file next to the class folders would make listdir() fail.
            continue
        for filename in sorted(os.listdir(class_dir)):
            file_path = os.path.join(class_dir, filename)
            if os.path.isfile(file_path):
                file_paths.append(file_path)
    return file_paths
        

# get_file_paths(data_dir)

In [44]:
from sklearn.cluster import KMeans
# Paths of all training images found under data_dir.
file_paths = get_file_paths(data_dir)

# One flattened VGG16 feature vector per image.
vgg16_feature_list = []
for img_path in file_paths:
    img = image.load_img(img_path, target_size=(224, 224))
    # Add a leading batch axis, then apply the VGG16 input preprocessing.
    img_data = preprocess_input(np.expand_dims(image.img_to_array(img), axis=0))

    vgg16_feature = model.predict(img_data)
    vgg16_feature_np = np.array(vgg16_feature)
    vgg16_feature_list.append(vgg16_feature_np.flatten())

# Convert the list of vectors to an array and fit a 2-cluster KMeans on it.
vgg16_feature_list_np = np.array(vgg16_feature_list)
kmeans = KMeans(n_clusters=2, random_state=0).fit(vgg16_feature_list_np)

In [54]:
from keras.preprocessing.image import load_img, img_to_array
def get_features(file_paths, model=None, target_size=(224, 224)):
    """Load images and turn each one into a flat feature vector.

    Args:
        file_paths: Iterable of image file paths.
        model: Optional feature extractor exposing ``predict(batch)``. When it
            is ``None`` the raw pixel array of the resized image is used as
            the feature; otherwise the image is given a batch axis, passed
            through ``preprocess_input`` and then through ``model.predict``.
        target_size: Size passed to ``load_img`` for every image. Defaults to
            ``(224, 224)``.

    Returns:
        A list with one flattened numpy array per input path, in input order.
        An empty ``file_paths`` yields an empty list.
    """
    features = []
    for image_path in file_paths:
        img = load_img(image_path, target_size=target_size)
        img_data = img_to_array(img)
        # Compare against None explicitly: relying on truthiness would treat a
        # model object that defines __len__/__bool__ and evaluates falsy as
        # "no model" and silently fall back to raw pixels.
        if model is None:
            feature = np.asarray(img_data)
        else:
            # predict() is fed a batch, so add the leading batch axis first.
            batch = preprocess_input(np.expand_dims(img_data, axis=0))
            feature = np.asarray(model.predict(batch))

        features.append(feature.flatten())
    return features

In [46]:
from sklearn.metrics import silhouette_score

# Feature matrix built from the per-image VGG16 vectors; X is an alias for it.
X = vgg16_feature_list_np = np.array(vgg16_feature_list)

# Cluster into two groups and keep the per-sample cluster assignments.
kmeans = KMeans(n_clusters=2, random_state=42).fit(X)
labels = kmeans.labels_

# Sanity checks: number of samples and number of distinct clusters found.
print(len(X))
print(len(np.unique(labels)))

# Bare last expression so the notebook displays the silhouette score.
silhouette_score(X, labels, metric='euclidean')

245
2


0.077358685

In [50]:
def get_silhouette_score(X, k=2):
    """Fit KMeans with ``k`` clusters on ``X`` and score the clustering.

    Args:
        X: Samples to cluster (one feature vector per sample).
        k: Number of clusters passed to KMeans. Defaults to 2.

    Returns:
        The value of ``silhouette_score`` (euclidean metric) computed on ``X``
        with the cluster labels assigned by the fitted KMeans.
    """
    # Fixed random_state so repeated calls on the same X give the same labels.
    fitted = KMeans(n_clusters=k, random_state=42).fit(X)
    return silhouette_score(X, fitted.labels_, metric='euclidean')

In [56]:
# Feature extractor: VGG16 with ImageNet weights, requested without its top layers.
vgg16_model = VGG16(weights='imagenet', include_top=False)

file_paths = get_file_paths(data_dir)

# Score the 2-cluster KMeans twice on the same images: first on raw pixels
# (no extractor), then on VGG16 features. One score is printed per run.
for extractor in (None, vgg16_model):
    X = get_features(file_paths, extractor)
    print(get_silhouette_score(X))

0.16087884
0.077358685


In [None]:
"""
    Looks like VGG16 features did not improve the silhouette score:
    0.077 with VGG16 features vs. 0.161 with raw pixels (KMeans, k=2).
"""