# Exploring distributions of distances in latent space of images
---

In [57]:
!pip install tensorflow==2.8.0
!pip install pyarrow==5.0.0
!pip install tensorflow-data-validation==1.7.0

In [48]:
#!pip list

In [51]:
import numpy as np
import time
import os
from sklearn.cluster import KMeans

import pandas as pd
from skimage import transform
import PIL.Image as Image
import matplotlib.pylab as plt

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_data_validation as tfdv

import datetime

%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [15]:
def load(filename):
    np_image = Image.open(filename)
    np_image = np.array(np_image).astype('float32')/255
    np_image = transform.resize(np_image, (224, 224, 3))
    np_image = np.expand_dims(np_image, axis=0)
    return np_image

def get_feature_vector(model, path):
    pred=[]
    for image in os.listdir(path):
        try: 
            img = load(path+'/'+image)
            ather=model.predict(img)
            pred.append((image, ather))
        except:
            continue
    return pred

## Load pre-trained CNN

In [16]:
base_model=tf.keras.applications.MobileNetV2(input_shape=(224,224,3),weights='imagenet')

2022-04-23 11:40:46.452440: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224.h5


In [17]:
fv_model=tf.keras.models.Model(
    inputs=base_model.inputs,
    outputs=tf.keras.layers.Flatten()(base_model.get_layer(name="global_average_pooling2d").output),
)

### Set of buildings

In [22]:
setlist = os.listdir("/kaggle/input/image-matching-challenge-2022/train/")

In [31]:
setlist.remove('scaling_factors.csv').remove('LICENSE.txt')

In [32]:
setlist

['british_museum',
 'piazza_san_marco',
 'trevi_fountain',
 'st_pauls_cathedral',
 'colosseum_exterior',
 'buckingham_palace',
 'temple_nara_japan',
 'sagrada_familia',
 'grand_place_brussels',
 'pantheon_exterior',
 'notre_dame_front_facade',
 'st_peters_square',
 'sacre_coeur',
 'taj_mahal',
 'lincoln_memorial_statue',
 'brandenburg_gate']

### Calculate feature vectors for all images

In [33]:
atest = []
for dire in setlist:
    atest.append(get_feature_vector(fv_model, '/kaggle/input/image-matching-challenge-2022/train/'+dire+'/images/'))

### Distance between feature vectors of the same building

In [35]:
dist00 = []
for m in range(len(setlist)):
    for vec1 in atest[m]:
        for vec2 in atest[m]:
            dist00.append(np.linalg.norm(vec1[1]-vec2[1]))
len(dist00)

2839306

In [70]:
dist00 = [ i for i in dist00 if i!=0.0 ]

In [71]:
len(dist00)

2833628

In [72]:
dist00_stats = tfdv.generate_statistics_from_dataframe(pd.DataFrame(dist00, columns=['distance']), stats_options = tfdv.StatsOptions(feature_allowlist=['distance']))

In [73]:
tfdv.visualize_statistics(dist00_stats)

### Distance between feature vectors of different places

In [60]:
dist01 = []
for m in range(len(setlist)):
    for n in range(len(setlist)):
        if m == n:
            continue
        else:
            for vec1 in atest[m]:
                for vec2 in atest[n]:
                    dist01.append(np.linalg.norm(vec1[1]-vec2[1]))
len(dist01)

29400378

In [61]:
dist01_stats = tfdv.generate_statistics_from_dataframe(pd.DataFrame(dist01, columns=['distance']), stats_options = tfdv.StatsOptions(feature_allowlist=['distance']))

In [62]:
tfdv.visualize_statistics(dist01_stats)

# Conclusion
---

Images of the same building have an average distance of 20.32 with a standard deviation of 4.11.
Images of different buildings are in average 24.33 units in latent space apart with an STD of 3.13.

Distances in latent space (on MobileNetV2 pre-trained on imagenet) are not a indicating well if two images show the same or different buildings.