In [2]:
# for loading/processing the images  
from keras.preprocessing.image import load_img 
from keras.preprocessing.image import img_to_array 
from keras.applications.vgg16 import preprocess_input 

# models 
from keras.applications.vgg16 import VGG16 
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# for everything else
import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle

In [3]:
path = r"E:\\University\\Term 8\\Computational Intelligence\\Project\\1\\Main-Project\\ML\\dataset"

# change the working directory to the path where the images are located
os.chdir(path)

# this list holds all the image filename
images = []

# creates a ScandirIterator aliased as files
with os.scandir(path) as files:
  # loops through each file in the directory
    for file in files:
        if file.name.endswith('.jpg'):
          # adds only the image files to the images list
            images.append(file.name)

In [4]:
print(images[:-1])

['subject01.jpg', 'subject01_1.jpg', 'subject01_10.jpg', 'subject01_2.jpg', 'subject01_3.jpg', 'subject01_4.jpg', 'subject01_5.jpg', 'subject01_6.jpg', 'subject01_7.jpg', 'subject01_8.jpg', 'subject01_9.jpg', 'subject02.jpg', 'subject02_1.jpg', 'subject02_10.jpg', 'subject02_2.jpg', 'subject02_3.jpg', 'subject02_4.jpg', 'subject02_5.jpg', 'subject02_6.jpg', 'subject02_7.jpg', 'subject02_8.jpg', 'subject02_9.jpg', 'subject03.jpg', 'subject03_1.jpg', 'subject03_10.jpg', 'subject03_2.jpg', 'subject03_3.jpg', 'subject03_4.jpg', 'subject03_5.jpg', 'subject03_6.jpg', 'subject03_7.jpg', 'subject03_8.jpg', 'subject03_9.jpg', 'subject04.jpg', 'subject04_1.jpg', 'subject04_10.jpg', 'subject04_2.jpg', 'subject04_3.jpg', 'subject04_4.jpg', 'subject04_5.jpg', 'subject04_6.jpg', 'subject04_7.jpg', 'subject04_8.jpg', 'subject04_9.jpg', 'subject05.jpg', 'subject05_1.jpg', 'subject05_10.jpg', 'subject05_2.jpg', 'subject05_3.jpg', 'subject05_4.jpg', 'subject05_5.jpg', 'subject05_6.jpg', 'subject05_7.jpg

In [5]:
# load the model first and pass as an argument
model = VGG16()
model = Model(inputs = model.inputs, outputs = model.layers[-2].output)

def extract_features(file, model):
    # load the image as a 224x224 array
    img = load_img(file, target_size=(224,224))
    # convert from 'PIL.Image.Image' to numpy array
    img = np.array(img) 
    # reshape the data for the model reshape(num_of_samples, dim 1, dim 2, channels)
    reshaped_img = img.reshape(1,224,224,3) 
    # prepare image for model
    imgx = preprocess_input(reshaped_img)
    # get the feature vector
    features = model.predict(imgx)
    return features

In [6]:
data = {}
p = r"E:\\University\\Term 8\\Computational Intelligence\\Project\\1\\Main-Project\\ML\\dataset.pkl"

# loop through each image in the dataset
for image in images:
    # try to extract the features and update the dictionary
    try:
        feat = extract_features(image,model)
        data[image] = feat
        print(image)
    # if something fails, save the extracted features as a pickle file (optional)
    except Exception as ex:
        with open(p,'wb') as file:
            pickle.dump(data,file)
            print(f"Exception: {ex}")
          
with open(p,'wb') as file:
    pickle.dump(data,file)
# get a list of the filenames
filenames = np.array(list(data.keys()))

# get a list of just the features
feat = np.array(list(data.values()))
feat.shape
(210, 1, 4096)

# reshape so that there are 210 samples of 4096 vectors
feat = feat.reshape(-1,4096)
feat.shape
(210, 4096)

# get the unique labels (from the image_labels.csv)
# df = pd.read_csv('image_labels.csv')
# label = df['label'].tolist()
# unique_labels = list(set(label))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 273ms/step
subject01.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step
subject01_1.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step
subject01_10.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step
subject01_2.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step
subject01_3.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step
subject01_4.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step
subject01_5.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step
subject01_6.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step
subject01_7.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step
subject01_8.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step
subject01_9.jpg
[1m1/1[0m [32m━━━━━

(210, 4096)

In [7]:
pca = PCA(n_components=100, random_state=22)
pca.fit(feat)
x = pca.transform(feat)

print(f"Components before PCA: {feat.shape[1]}")
print(f"Components after PCA: {pca.n_components}")

Components before PCA: 4096
Components after PCA: 100


In [8]:
kmeans = KMeans(n_clusters=15, random_state=22)
kmeans.fit(x)

kmeans.labels_

array([13, 11, 11, 11, 13, 11, 11,  6, 11, 11, 11, 14,  7,  7,  7,  7,  7,
        7,  7,  7,  7,  7,  4,  4,  4,  4,  2,  4,  4,  6,  4,  4,  4,  2,
        0,  2,  2,  2,  2,  2,  6,  2,  2,  2, 10, 10, 10, 10, 10, 10, 10,
        5, 10, 10, 10, 14,  6,  6,  6, 14,  6,  6, 14,  6,  6,  6,  1,  1,
        1,  1,  1,  1,  1,  5,  1,  1,  1,  0,  0,  0,  0,  0,  2,  0,  6,
        0,  0,  2, 14,  3,  3,  3,  3,  3,  3,  5,  3,  3,  3, 13, 13, 13,
       13,  0, 13, 13, 14, 13, 13, 13,  8,  8,  8,  8,  8,  8,  8,  8,  8,
        8,  8, 10,  5,  5,  5, 10,  5,  5, 10,  5,  5,  5,  9,  9,  9,  9,
        9, 14,  9,  6,  9,  9,  9, 12, 12, 12, 12,  2, 12, 12,  2, 12, 12,
       12,  7,  0, 14, 14,  6,  0, 14,  5, 14,  0, 13])

In [9]:
# holds the cluster id and the images { id: [images] }
groups = {}
for file, cluster in zip(filenames,kmeans.labels_):
    if cluster not in groups.keys():
        groups[cluster] = []
        groups[cluster].append(file)
    else:
        groups[cluster].append(file)


# view the filenames in cluster 0
groups[0]

['subject04_1.jpg',
 'subject08.jpg',
 'subject08_1.jpg',
 'subject08_10.jpg',
 'subject08_2.jpg',
 'subject08_3.jpg',
 'subject08_5.jpg',
 'subject08_7.jpg',
 'subject08_8.jpg',
 'subject10_3.jpg',
 'subject15_1.jpg',
 'subject15_4.jpg',
 'subject15_8.jpg']