In [26]:
import os
import numpy as np
import pandas as pd

from PIL import Image

# Sklearn libs
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics
from sklearn import decomposition
from sklearn.neural_network import MLPRegressor


In [27]:

def purity_score(y_true, y_pred):
    """Return the purity of the clustering ``y_pred`` w.r.t. ``y_true``.

    Every cluster (one column of the contingency matrix) contributes the
    count of its most frequent true label; the sum of those counts is
    divided by the total number of samples.
    """
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
    # Column-wise maximum = size of the dominant true label in each cluster.
    dominant_counts = table.max(axis=0)
    return dominant_counts.sum() / table.sum()

def f_score(y_true, y_pred):
    """Return the macro-averaged F1 score of a clustering against true labels.

    Cluster ids are arbitrary: cluster ``3`` has no relation to true class
    ``3``, so comparing the raw ids with ``f1_score`` gives a number that
    depends on an accidental numbering. Each cluster is therefore first
    relabelled with the true class that occurs most often inside it, and the
    F1 score is computed on those relabelled predictions.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth class labels.
    y_pred : array-like of shape (n_samples,)
        Cluster ids assigned by the clustering algorithm.

    Returns
    -------
    float
        Macro-averaged F1 score after the majority-class relabelling.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Sorted unique values; contingency_matrix orders its rows (classes) and
    # columns (clusters) the same way.
    classes = np.unique(y_true)
    _, cluster_index = np.unique(y_pred, return_inverse=True)

    contingency = metrics.cluster.contingency_matrix(y_true, y_pred)
    # For every cluster (column) pick the true class with the highest count.
    majority_class = classes[np.argmax(contingency, axis=0)]
    y_mapped = majority_class[cluster_index]

    return metrics.f1_score(y_true, y_mapped, average="macro")

# Centers
k = 10

# Dimensions (after reduction)
M = 100  # Test values: 100, 50, 25

# Side length (in pixels) of the square region read from every image
IMAGE_SIDE = 64

image_list = list()
faces = list()
# Load images from path
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # The context manager closes the underlying file as soon as the pixel
        # data has been copied into the array (the original never closed it).
        with Image.open(os.path.join(dirname, filename)) as img:
            image = np.asarray(img) / 255

        # Store each image's face (ytrue): the first two characters of the
        # file name are parsed as a 1-based id and shifted to 0-based.
        face = int(filename[0:2])
        faces.append(face - 1)

        # Keep the top-left IMAGE_SIDE x IMAGE_SIDE region and the first
        # three channels, exactly what the per-pixel loop used to read.
        patch = image[:IMAGE_SIDE, :IMAGE_SIDE, :3]
        if patch.shape[:2] != (IMAGE_SIDE, IMAGE_SIDE):
            raise ValueError(
                "image %r is smaller than %dx%d pixels: %r"
                % (filename, IMAGE_SIDE, IMAGE_SIDE, image.shape)
            )

        # Weighted RGB -> grayscale conversion, done for all pixels at once.
        # The per-element arithmetic and its order match the scalar version.
        grayscale_image = (
            0.299 * patch[..., 0] + 0.587 * patch[..., 1] + 0.114 * patch[..., 2]
        )
        # Row-major flattening gives the same 1x4096 layout as the i/j loops.
        image_list.append(grayscale_image.reshape(-1))

# One row per image, IMAGE_SIDE * IMAGE_SIDE columns
images = np.array(image_list)

images.shape

In [28]:
# Tabular view of the collected face ids, one row per entry in `faces`
faces_df = pd.DataFrame(data=faces)
faces_df

In [29]:
# Dimensionality reduction with PCA
#
# `random_state` is fixed because, with the default svd_solver="auto",
# scikit-learn may pick a randomized SVD solver for data of this shape, which
# would make the projection (and every score computed from it) change between
# runs. The seed matches the one used for KMeans in this notebook.
# `fit` and `transform` are kept as two calls on purpose: with a randomized
# solver `fit_transform` is not guaranteed to return exactly the same values.

pca = decomposition.PCA(n_components=M, random_state=10)
pca.fit(images)
X = pca.transform(images)

print(X.shape)


In [39]:
# NOTE: everything between the triple quotes below is a plain string literal,
# not executed code -- it parks an unfinished autoencoder experiment.
# NOTE(review): `d/4` is true division under Python 3 and yields a float
# (1024.0); hidden layer sizes are presumably expected to be integers, so
# `d//4` may be needed before re-enabling this -- TODO confirm.
'''
# Dimensionallity Reduction with Autoencoder (not working atm)

X=images.tolist()
d = 64*64
autoencoder = MLPRegressor(hidden_layer_sizes = (d, d/4, M, d/4, d), 
                   activation = 'tanh', 
                   solver = 'adam', 
                   learning_rate_init = 0.0001, 
                   max_iter = 20, 
                   tol = 0.0000001, 
                   verbose = True)
autoencoder.fit(X,X)
'''

In [31]:
# K-means clustering of the reduced data (Euclidean distance)

model = KMeans(n_clusters=k, random_state=10).fit(X)
labels = model.labels_
kmeans = labels  # the same array fit_predict(X) hands back
centroids = model.cluster_centers_

# Tip: collections.Counter(labels) shows how many samples each cluster holds.
pd.DataFrame(labels)


In [32]:
# Score the KMeans (Euclidean) clustering against the true face ids

purity = purity_score(faces, labels)
f_measure = f_score(faces, labels)
# A single print call; `!s` forces str() so the text matches concatenation.
print(
    "KMeans (Euclidean)",
    f"M = {M!s} K= {k!s}",
    f"purity = {purity!s}",
    f"f_measure = {f_measure!s}",
    sep="\n",
)

In [33]:
# Agglomerative (hierarchical) clustering of the reduced data

model_AC = AgglomerativeClustering(n_clusters=k).fit(X)
labels_AC = model_AC.labels_
agglomerative_clustering = labels_AC  # the same array fit_predict(X) hands back

pd.DataFrame(labels_AC)

In [34]:
# Score the agglomerative clustering against the true face ids

purity_AC = purity_score(faces, labels_AC)
f_measure_AC = f_score(faces, labels_AC)
# A single print call; `!s` forces str() so the text matches concatenation.
print(
    "Agglomerative Clustering",
    f"M = {M!s} K= {k!s}",
    f"purity = {purity_AC!s}",
    f"f_measure = {f_measure_AC!s}",
    sep="\n",
)