In [2]:
import numpy as np
from PIL import Image
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from os import listdir
from os.path import isfile, join
import os
from skimage.transform import resize, rescale
import matplotlib.pyplot as plt

In [3]:
def prep_im(im_id, im_dir_path = "", scalar = 1, output_shape = None):
    '''Prepare image from im_id and optional dictory path.
    If directory path is not passed, the whole filepath, including filetype notation, 
    should be given as im_id. If parameter scalar is passed, output image will be scaled by it. 
    Defualt 1 retains original size.
    
    Args:
        im_id (str): image ID
        im_dir_path (str, optional): image directory path
        scalar (float, optional): rescale coefficient

    Returns:
        im (numpy.ndarray): image.
    '''

    # Read and resize image
    if im_dir_path == "":
        im = plt.imread(im_id)[:, :, :3] #Some images have fourth, empty color chanel which we slice of here
    else:
        im = plt.imread(im_dir_path + im_id)[:, :, :3] #Some images have fourth, empty color chanel which we slice of here
    im = rescale(im, scalar, anti_aliasing=True, channel_axis = 2) #IDWE: Use channel_axis=2 to prevent picture from being turned bianry when rescaled
    if output_shape != None:
        im = resize(im, output_shape)

    return im

In [4]:
# Load images
image_folder_path = "test_images"
n_images = 100
paths = [f for f in listdir(image_folder_path) if isfile(join(image_folder_path, f))][:n_images]

images = []
for im_path in paths:
  image = prep_im(im_path, "test_images/", output_shape = (300,300))

  #image = Image.open(join(image_folder_path, im_path))
  #image = resize(image, (300,300)) # Make sure the image has the same size
  arr = np.asarray(image)
  images.append(arr)

In [5]:
#Load labels for images
data = np.array([i.strip().split(',') for i in open('metadata.csv')])

mask = data == ''
data[np.where(mask)] = np.nan

labels = np.asarray([data[np.where(data[:,-2]==paths[i])[0][0],17] for i in range(len(paths))])

In [6]:
# Flatten it, now each row represents a single image
X = np.stack(images, axis = 0)

dim1, dim2, chan = arr.shape
n_features = chan*dim1*dim2
X = X.reshape((len(images), n_features)) # flattened --> this goes to PCA

print(np.shape(X))

(90, 270000)


In [7]:
# Init the model (a.k.a. specify the hyper-parameters e.g. number of components)
final_n_features = 100 # Hyper-parameter - try different values
pca = PCA(n_components=final_n_features)

In [8]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size= 0.2, random_state =1,stratify = labels)

In [9]:
# Transformed features
X_train_transformed = pca.fit_transform(X_train) # X_new has final_n_features --> this can be fed to the classfier model
X_train_recovered = pca.inverse_transform(X_train_transformed)

X_test_transformed = pca.fit_transform(X_test) # X_new has final_n_features --> this can be fed to the classfier model
X_test_recovered = pca.inverse_transform(X_test_transformed)

ValueError: n_components=100 must be between 0 and min(n_samples, n_features)=72 with svd_solver='full'

In [None]:
plt.imshow(X_train_recovered[10,:].reshape(300,300,3))

In [None]:
plt.imshow(X_train[10,:].reshape(300,300,3))

In [None]:
# Define a classifer
clf = KNeighborsClassifier(n_neighbors=10)

In [None]:
# Train it --> need to define y first
clf.fit(X_train_transformed, y_train)

In [None]:
# TODO: Predict on validation dataset and measure accuracy, f1-score
clf.predict(X_test_transformed);

In [None]:
# Check accuracy of our model on the test data
clf.score(X_test_transformed, y_test)

In [None]:
components = list(range(5,151,10))
neighbors = list(range(1,11))

In [None]:
for component in components:
    pca = PCA(n_components=component)
    
    X_train_transformed = pca.fit_transform(X_train) # X_new has final_n_features --> this can be fed to the classfier model
    X_train_recovered = pca.inverse_transform(X_train_transformed)

    X_test_transformed = pca.fit_transform(X_test) # X_new has final_n_features --> this can be fed to the classfier model
    X_test_recovered = pca.inverse_transform(X_test_transformed)
    
    for n in neighbors:
        clf = KNeighborsClassifier(n_neighbors=n)
        
        clf.fit(X_train_transformed, y_train)
        
        clf.predict(X_test_transformed)
        
        score = clf.score(X_test_transformed, y_test)
        
        print(f'Components: {component}, neighbors: {n}, Score: {score}')