In [1]:
### Code Adapted from the following sources
### https://www.kaggle.com/camaskew/host-baseline-example?scriptVersionId=40287695 
### https://github.com/dvschultz/ml-art-colabs/blob/master/ML4A_image_search.ipynb ###
import gc
import os
import PIL

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow.keras as keras
import tensorflow as tf

from scipy import spatial
from tensorflow.keras import layers 
from tensorflow.keras import Model
from tqdm.notebook import tqdm
from keras.preprocessing import image
from keras.applications.imagenet_utils import decode_predictions, preprocess_input

In [2]:
# Root of the input data, one directory level above the working directory.
INPUT_DIR = os.path.join(os.pardir, 'input')
DATASET_DIR = os.path.join(INPUT_DIR, 'landmark-retrieval-2021')

# Image folders located directly inside the dataset directory.
TEST_IMAGE_DIR, TRAIN_IMAGE_DIR, INDEX_IMAGE_DIR = (
    os.path.join(DATASET_DIR, folder_name)
    for folder_name in ('test', 'train', 'index')
)

In [3]:
def load_image(path, target_size=None):
    """Load an image from disk and prepare it as a single-image network input.

    Args:
        path: Path of the image file to load.
        target_size: Optional (height, width) the image is resized to. When
            omitted, the size is read from the notebook-level ``model``
            (``model.input_shape[1:3]``), which must be defined before the
            first call.

    Returns:
        A tuple ``(img, x)``: ``img`` is the loaded PIL image and ``x`` is the
        same image as a preprocessed ndarray with a leading batch axis of
        size 1.
    """
    if target_size is None:
        # Fall back to the input size of the globally loaded network.
        target_size = model.input_shape[1:3]
    img = image.load_img(path, target_size=target_size)
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)  # add the batch dimension
    x = preprocess_input(x)
    return img, x

Let's load all the index images and store them in a list.

In [4]:
# File suffixes treated as images; suffixes are lower-cased before the
# comparison, so '.JPG' matches as well.
image_extensions = ['.jpg', '.png', '.jpeg']

# Walk the index directory recursively and keep every file with an image suffix.
index_images = [
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk(INDEX_IMAGE_DIR)
    for fname in fnames
    if os.path.splitext(fname)[1].lower() in image_extensions
]
print("keeping %d images to analyze" % len(index_images))

keeping 76176 images to analyze


Next we'll load our model and remove the classification layer. For more info see: https://github.com/dvschultz/ml-art-colabs/blob/master/ML4A_image_search.ipynb

In [5]:
from tensorflow.keras.applications import VGG16

# Build VGG16 from the locally stored weight file.
model = VGG16(
    weights='../input/tf-keras-pretrained-model-weights/Top/vgg16_weights_tf_dim_ordering_tf_kernels.h5'
)

# Use the activations of the "fc2" layer as the output, so the network
# returns feature vectors instead of class predictions.
fc2_layer = model.get_layer("fc2")
feat_extractor = Model(inputs=model.input, outputs=fc2_layer.output)
feat_extractor.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0     

After that, let's extract the features of each image in the index list and store them in a list. For the dataset of ~76k images, this should take around an hour using Kaggle's GPU. We'll also collect the ID of each image in a separate list.

In [6]:
features = []
ids = []
for image_path in tqdm(index_images):
    _pil_img, x = load_image(image_path)
    # predict() returns one row per input image; take the single row.
    feat = feat_extractor.predict(x)[0]
    features.append(feat)
    # The id is the file name without its extension. splitext copes with every
    # suffix accepted when the index images were listed ('.jpg', '.png',
    # '.jpeg'); slicing off a fixed 4 characters left a trailing '.' on
    # '.jpeg' names.
    ids.append(os.path.splitext(os.path.basename(image_path))[0])

print('finished extracting features for %d images' % len(index_images))

  0%|          | 0/76176 [00:00<?, ?it/s]

finished extracting features for 76176 images


Apply PCA to reduce the dimensionality of the image features, keeping the top 300 principal components.

In [7]:
from sklearn.decomposition import PCA
import numpy as np
# Stack the per-image feature vectors into a single 2-D array (one row per image).
features = np.array(features)
# Fix the seed so the projection is reproducible: for an input this large
# scikit-learn may pick its randomized SVD solver, which otherwise yields
# slightly different components (and therefore distances) on every run.
pca = PCA(n_components=300, random_state=0)
pca.fit(features)
# Project the index features with the fitted model; query images are projected
# later with the same pca.transform call.
pca_features = pca.transform(features)

The next function was taken from DV Schultz's notebook and returns the closest images based on a given distance threshold.

In [8]:
from scipy.spatial import distance
# function to return the closest-looking images from the index set
def get_closest_images(test_feat, threshold=0.2, index_feats=None):
    """Find the index images whose features lie close to a query feature vector.

    Args:
        test_feat: Feature vector of the query image. Any array holding a
            single vector is accepted, e.g. shape ``(d,)`` or the ``(1, d)``
            row returned by ``pca.transform``.
        threshold: Only matches whose cosine distance is strictly below this
            value are returned.
        index_feats: Optional ``(n, d)`` array of index feature vectors to
            search. Defaults to the notebook-level ``pca_features``.

    Returns:
        A tuple ``(idx_closest, dis_closest)``: the row positions of the
        matching index vectors ordered from nearest to farthest, and their
        cosine distances in the same order. Both are plain lists.
    """
    if index_feats is None:
        index_feats = pca_features
    index_feats = np.asarray(index_feats)
    if len(index_feats) == 0:
        return [], []

    # Flatten the query into exactly one row. The per-pair distance.cosine()
    # expects 1-D vectors (newer SciPy releases may reject a (1, d) array),
    # while cdist() takes 2-D inputs and handles all index rows in one call
    # instead of one Python-level call per index image.
    query = np.asarray(test_feat).reshape(1, -1)
    distances = distance.cdist(query, index_feats, metric='cosine')[0]

    # A stable sort keeps equal distances in index order.
    order = np.argsort(distances, kind='stable')
    # NaN distances (e.g. from an all-zero vector) compare False and drop out.
    order = order[distances[order] < threshold]
    return order.tolist(), distances[order].tolist()

In [9]:
# Empty submission table with the two output columns.
output = pd.DataFrame(columns=['id', 'images'])

# Recursively collect every '.jpg' file below the test directory.
test_paths = [
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk(TEST_IMAGE_DIR)
    for fname in fnames
    if fname.endswith('.jpg')
]

In [None]:
# Collect one [id, images] row per query image and build the table once at the
# end, instead of growing the DataFrame row by row.
rows = []
for path in tqdm(test_paths):
    _pil_img, x = load_image(path)
    # File name without its extension, independent of the suffix length.
    test_id = os.path.splitext(os.path.basename(path))[0]
    test_feat = feat_extractor.predict(x)[0].reshape(1, -1)
    # pca.transform returns one row per input; hand that single row on as a
    # 1-D vector, which is what the per-pair cosine distance expects.
    test_pca = pca.transform(test_feat)[0]
    idx_closest, dis_closest = get_closest_images(test_pca)
    # Space-separated ids of the matching index images, nearest first.
    index_ids = ' '.join(ids[idx] for idx in idx_closest)
    rows.append([test_id, index_ids])

output = pd.DataFrame(rows, columns=['id', 'images'])
# Write the file once, after all queries are processed, rather than rewriting
# it on every iteration. index=False keeps only the 'id' and 'images' columns;
# the default would prepend the row index as an extra unnamed column.
output.to_csv('submission.csv', index=False)

  0%|          | 0/1129 [00:00<?, ?it/s]