# References

https://medium.com/@franky07724_57962/using-keras-pre-trained-models-for-feature-extraction-in-image-clustering-a142c6cdf5b1

https://scikit-learn.org/stable/modules/clustering.html

https://scikit-learn.org/stable/modules/clustering.html#silhouette-coefficient

https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#sphx-glr-auto-examples-cluster-plot-kmeans-silhouette-analysis-py

https://en.wikipedia.org/wiki/Silhouette_(clustering)

Does adding k-means clusters as features improve the performance?
https://towardsdatascience.com/how-to-create-new-features-using-clustering-4ae772387290

In [60]:
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
import numpy as np

In [61]:
# Directory whose sub-folders are listed by get_file_paths below (relative path).
data_dir = "../Data/hymenoptera_data/train"

import os
def get_file_paths(data_dir):
    """Collect the path of every entry stored one level below ``data_dir``.

    ``data_dir`` is expected to hold one sub-directory per class (for
    example ``train/bees``). Each entry found inside those sub-directories
    is returned as ``data_dir/<class>/<filename>``.

    Args:
        data_dir: Path of the directory that contains the class folders.

    Returns:
        A list of path strings, in the order ``os.listdir`` reports the
        class folders and their contents. Empty when ``data_dir`` has no
        sub-directories.
    """
    file_paths = []
    for classname in os.listdir(data_dir):
        class_dir = data_dir + "/" + classname
        # Stray files next to the class folders (e.g. ".DS_Store") are not
        # classes; listing them would raise NotADirectoryError.
        if not os.path.isdir(class_dir):
            continue
        # extend() appends in place instead of rebuilding the list per class.
        file_paths.extend(
            class_dir + "/" + filename for filename in os.listdir(class_dir)
        )
    return file_paths

# Show the first five collected paths as a quick sanity check.
get_file_paths(data_dir)[:5]

['../Data/hymenoptera_data/train/bees/2728759455_ce9bb8cd7a.jpg',
 '../Data/hymenoptera_data/train/bees/1092977343_cb42b38d62.jpg',
 '../Data/hymenoptera_data/train/bees/1807583459_4fe92b3133.jpg',
 '../Data/hymenoptera_data/train/bees/2962405283_22718d9617.jpg',
 '../Data/hymenoptera_data/train/bees/150013791_969d9a968b.jpg']

In [62]:
from keras.preprocessing.image import load_img, img_to_array
def get_features(file_paths, model=None):
    """Turn each image file into a flat feature vector.

    Every image is loaded with ``target_size=(224, 224)`` and converted to
    an array. Without a model, the flattened pixel array is the feature.
    With a model, the image gets a leading batch axis, is passed through
    ``preprocess_input`` and ``model.predict``, and the flattened
    prediction is the feature.

    Args:
        file_paths: Iterable of image file paths.
        model: Optional object exposing ``predict``. ``None`` (the default)
            selects the raw-pixel features.

    Returns:
        A list with one flattened array per input path, in input order.
        Empty when ``file_paths`` is empty.
    """
    features = []
    for image_path in file_paths:
        img = load_img(image_path, target_size=(224, 224))
        img_data = img_to_array(img)
        # Compare against None explicitly: the truthiness of a model object
        # (e.g. one defining __len__) says nothing about whether a model
        # was actually supplied.
        if model is None:
            feature = np.asarray(img_data)
        else:
            # Add the leading batch axis before preprocessing and prediction.
            batch = preprocess_input(np.expand_dims(img_data, axis=0))
            feature = np.asarray(model.predict(batch))

        # flatten() always returns a fresh 1-D copy, so no extra copy is needed.
        features.append(feature.flatten())
    return features

In [63]:
def get_silhouette_score(X, k=2):
    """Cluster ``X`` with k-means and return the silhouette score of the result.

    Args:
        X: Feature vectors, one per sample, in a form ``KMeans.fit`` accepts.
        k: Number of clusters to fit (default 2).

    Returns:
        The value returned by ``silhouette_score`` for ``X`` and the fitted
        cluster labels, using the Euclidean metric.
    """
    # Imported locally because the notebook cells shown never import these
    # names; without this the function raises NameError on a fresh kernel.
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    # A fixed random_state keeps the clustering repeatable between runs.
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
    return silhouette_score(X, kmeans.labels_, metric='euclidean')

In [65]:
"""
    Testing whether vgg16 last layer features improve 
    the silhouette_score of clusters obtained by k-means
"""


# VGG16 with ImageNet weights and without its top layers (include_top=False).
vgg16_model = VGG16(weights='imagenet', include_top=False)

file_paths = get_file_paths(data_dir)

# First pass (None) scores the raw-pixel features, second pass scores the
# VGG16 features. X is left holding the VGG16 features afterwards.
for feature_model in (None, vgg16_model):
    X = get_features(file_paths, feature_model)
    print(get_silhouette_score(X, 2))

0.16087884
0.077358685


In [None]:
"""
    Looks like vgg16 features did not improve the silhouette score
    (recorded run: 0.077 with vgg16 features vs 0.161 with raw pixels)
"""