In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
from fastai.imports import *
from fastai.vision.all import *
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# analysing clustering,dimensionality reduction and feature extraction techniques with labeled data

in this notebook we use a subset of the [Food-101 dataset](https://www.kaggle.com/datasets/kmader/food41) and the [House Rooms & Streets Image Dataset](https://www.kaggle.com/datasets/mikhailma/house-rooms-streets-image-dataset)

first look at some sample images from our datasets

In [6]:
food_path=Path("tripadvisor_dataset/kaggle/archive (1)/images")
building_path=Path("tripadvisor_dataset/kaggle/archive/kaggle_room_street_data")

In [None]:
fig, ax = plt.subplots(figsize=(10,128),nrows=30, ncols=2, ) #make a figure to plot
for i,category in enumerate(os.listdir(food_path)): #loop over image categories
    for j, img in enumerate(os.listdir(os.path.join(food_path,category))): # loop over images in each category
        ax[i,j].imshow(PILImage.create(os.path.join(food_path,category,img)),label=category) #plot image
        ax[i,j].set_title(category,fontsize = 14)
        if(j==1):break
    if i==29:break

these were some food images, now let's look at some non-food images

In [None]:
fig, ax = plt.subplots(figsize=(10,10),nrows=2, ncols=3, ) #make a figure to plot
for i,category in enumerate(os.listdir(building_path)): #loop over street data categories
    for j, img in enumerate(os.listdir(os.path.join(building_path,category))): # loop over images in each category
        ax[i,j].imshow(PILImage.create(os.path.join(building_path,category,img)),label=category) #plot image
        ax[i,j].set_title(category,fontsize = 14)
        if(j==2):break


we will make our custom dataset based on the street and food data

In [5]:
num_food_class=len(os.listdir(food_path))
print(f"there are {num_food_class} food classes" )
print(f"there are 2 non-food classes" )

there are 101 food classes
there are 2 non-food classes


we will make a dataset consisting of 505 food images (5 images from each class) and 500 non-food images

In [9]:
# Resizing to this sizes
IMG_HEIGHT = 128
IMG_WIDTH = 128

def create_dataset():
    # n = amount of images
    n=1005
    counter=0
    images = np.zeros((n, IMG_HEIGHT* IMG_WIDTH* 3))
    
    for i,category in enumerate(os.listdir(food_path)): #loop over image food categories
        for j, img in enumerate(os.listdir(os.path.join(food_path,category))[:5]): # loop over first 5 images in each category
            img=PILImage.create(os.path.join(food_path,category,img))#plot image
            img_resized=img.resize((IMG_HEIGHT,IMG_WIDTH))
            img_np=np.array(img_resized).flatten()
            images[counter]=img_np/255
            counter+=1
    
    
    for i,category in enumerate(os.listdir(building_path)): #loop over street data categories
        for j, img in enumerate(os.listdir(os.path.join(building_path,category))[:250]): # loop over first 250 images in each category       
            img=PILImage.create(os.path.join(building_path,category,img))
            img_resized=img.resize((IMG_HEIGHT,IMG_WIDTH))
            img_np=np.array(img_resized).flatten()
            images[counter]=img_np/255
            counter+=1    
    return images
    
    



In [10]:
images = create_dataset()

After reading the images, extract the features with SIFT.

In [30]:
import cv2

def sift_features(images):
    sift_vectors = {}
    descriptor_list = []
    features = []
    sift = cv2.xfeatures2d.SIFT_create()
    for img in images:
        img = img.reshape(IMG_HEIGHT,IMG_WIDTH,3)
        # SIFT function only accepts images with 8 bit integer values
        image8bit = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX).astype('uint8')

        # Convert the training image to gray scale
        training_gray = cv2.cvtColor(image8bit, cv2.COLOR_RGB2GRAY)


        keypoints, descriptor = sift.detectAndCompute(training_gray, None)
        descriptor_list.extend(descriptor)
        features.append(keypoints)


    return (descriptor_list, np.array(features))


descriptors, features = sift_features(images)


  return (descriptor_list, np.array(features))


In [12]:
descriptors[0].shape # descriptors is a list

(128,)

aanpassen

We now have an array with a huge number of descriptors. We cannot use all of them to create the model, so we need to cluster them. A rule-of-thumb is to create k centers with k = number of categories * 10.

In [13]:
from sklearn.cluster import KMeans

number_of_categories = 10

def kmeans(descriptor_list):
    k = number_of_categories * 10
    kmeans = KMeans(n_clusters = k, n_init=10)
    kmeans.fit(descriptor_list)
    visual_words = kmeans.cluster_centers_ 
    return visual_words

visual_words = kmeans(descriptors)
visual_words

array([[ 8.86753446, 19.34532925, 82.81163859, ..., 17.70826953,
        19.15390505, 18.32388974],
       [32.64206009, 37.62145923, 26.47639485, ...,  9.78540773,
         8.98969957, 11.39227468],
       [66.10705596, 15.01216545,  9.56934307, ...,  9.08029197,
         8.81103001, 15.02514193],
       ...,
       [17.28239437, 16.84647887, 17.23450704, ...,  7.9528169 ,
         8.09507042, 10.53028169],
       [18.97676056, 21.73873239, 23.86126761, ...,  4.53802817,
         5.90211268, 10.16830986],
       [ 7.50725514,  9.70374849, 14.68681983, ...,  9.28234583,
         8.49032648, 12.22551391]])

### Making histograms


In [14]:
# Find the index of the closest central point to the each sift descriptor. 
# Takes 2 parameters the first one is a sift descriptor and the second one is the array of central points in k means
# Returns the index of the closest central point.  
from scipy.spatial import distance

def find_index(image, center):
    count = 0
    ind = 0
    for i in range(len(center)):
        if(i == 0):
           count = distance.euclidean(image, center[i]) 
           #count = L1_dist(image, center[i])
        else:
            dist = distance.euclidean(image, center[i]) 
            #dist = L1_dist(image, center[i])
            if(dist < count):
                ind = i
                count = dist
    return ind

In [15]:
def image_class(features, centers, descriptors):
    begin = 0
    histogrammen = []
    for f in features:
        histogram = np.zeros(len(centers))
        for i in range(begin, begin + len(f)):
            ind = find_index(descriptors[i], centers)
            histogram[ind] += 1
        histogrammen.append(histogram)
        begin += len(f)

    return histogrammen


image_features = image_class(features, visual_words, descriptors)

In [16]:
print(len(image_features)) # amount of images
image_features[0].shape # amount of centroids


1005


(100,)

In [6]:
images.shape
##eerste 505 afbeeldingen zijn voedsel, laatste 500 zijn niet voedsel


(1005, 49152)

first attempt making a score function for our kmeans

In [22]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
raw_scaled = sc.fit_transform(images)

kmeans = KMeans(n_clusters=5, random_state=0)
pred=kmeans.fit_predict(raw_scaled)

In [9]:
pred

array([1, 2, 3, ..., 2, 2, 1], dtype=int32)

Ideally, we'd like some way to try more clusters ans test our performance easily . We could create a function that returns how good our model is, in order to more quickly try out a few different methods. We'll create a score function to do this. Instead of returning the mean absolute error, we'll calculate a measure of impurity -- that is, how much our model creates clusters where the images in a group are each similar to each other, or dissimilar.

In [11]:
num_clusters=np.unique(pred)##will be the amount of clusters

we will explain this step by step for cluster 0

In [13]:
image_idx=np.where(pred == 0)#indexes of images in this cluster
image_idx

(array([ 10,  18,  22,  23,  29,  31,  32,  33,  34,  39,  40,  41,  42,
         44,  48,  54,  56,  58,  59,  64,  67,  71,  72,  83,  97,  98,
        103, 106, 109, 117, 126, 145, 150, 153, 157, 164, 165, 170, 173,
        176, 178, 182, 183, 188, 191, 192, 194, 197, 206, 207, 208, 210,
        211, 213, 220, 228, 235, 243, 245, 252, 255, 256, 258, 261, 262,
        272, 276, 278, 280, 284, 287, 288, 296, 299, 301, 311, 312, 320,
        322, 326, 328, 333, 337, 341, 348, 353, 364, 367, 371, 372, 373,
        375, 377, 380, 384, 394, 395, 398, 399, 401, 403, 416, 417, 418,
        422, 423, 425, 431, 433, 434, 435, 441, 444, 445, 450, 458, 464,
        467, 469, 475, 476, 479, 480, 481, 482, 490, 496, 501, 552, 559,
        581, 600, 651, 675, 691, 697, 702, 729, 742, 743, 744, 751, 760,
        774, 776, 778, 785, 791, 819, 823, 824, 828, 839, 845, 847, 888,
        891, 896, 925, 928, 931, 942, 964, 974, 977, 992, 994, 997]),)

In [14]:
binary=image_idx[0]<505 #image indexes smaller than 505 are food
binary

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

calculate the impurity of our cluster with the standard deviation

We can measure the similarity of images inside a group by taking the standard deviation of the dependent variable. If it's higher, then it means the images are more different to each other.

In [15]:
binary.std()

0.4259177099999599

now putting it all together

In [18]:
#for each cluster calculate the std

def cluster_score(pred):
    score=0
    num_clusters=np.unique(pred)
    for clusternr in num_clusters:
        image_idx=np.where(pred == clusternr)#indexes of images in this cluster
        binary=image_idx[0]<505
        score+=binary.std()
    return score/len(num_clusters) #score averaged out over the number of clusters

def calculate_score(pred):
    num_clusters = np.unique(pred)##will be the amount of clusters
    image_idx = np.where(pred == 0)#indexes of images in this cluster
    binary = image_idx[0]<505 #image indexes smaller than 505 are food
    return cluster_score(pred)

In [17]:
cluster_score(pred) #higher is bad

0.44348749509083696

Now we try the kmeans on the SIFT features

In [19]:
kmeans = KMeans(n_clusters=5, random_state=0)
pred=kmeans.fit_predict(image_features)
calculate_score(pred)

0.37863715855097313

The score is lower (which means better). The reason because of this is probably because when you flatten raw data like raw_scaled you lose important spacial information. SIFT captures all the important features. From now we fit using the SIFT features because it gives better results.

In [25]:
from sklearn.cluster import DBSCAN

dbscan = DBSCAN()
pred = dbscan.fit_predict(image_features)
calculate_score(pred)

0.4999938119960642

In [26]:
from sklearn.cluster import AgglomerativeClustering

hier = AgglomerativeClustering(n_clusters=5)
pred = hier.fit_predict(image_features)
calculate_score(pred)

0.2952638422721281

In [27]:
from sklearn.cluster import Birch

birch = Birch(n_clusters=5)
pred = birch.fit_predict(image_features)
calculate_score(pred)

0.2727707236328636