## Importation des librairies

In [1]:
import tensorflow as tf
# Display the TensorFlow version this notebook is running against.
tf.__version__

'2.2.0'

In [2]:
import os
import numpy as np
import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt
from PIL import Image
import tqdm

# Annoy and Scipy for similarity calculation
from annoy import AnnoyIndex
from scipy import spatial

In [3]:
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras import Sequential
from tensorflow.keras.layers import GlobalAveragePooling2D

In [4]:
# Directory whose files are listed and encoded.
DATA_ROOT = './images/'
# Length of each feature vector (the encoder summary shows a 2048-wide output).
OUTPUT = 2048
# Size images are resized to; also the spatial part of the encoder input shape.
SIZE = (299, 299)
# Number of trees passed to the Annoy index build.
TREES = 100

## L'encodeur

In [5]:
# Convolutional base without the classification head; the input shape is SIZE
# plus three colour channels.
base_model = InceptionV3(input_shape=(*SIZE, 3), include_top=False)

In [6]:
# Stack global average pooling on the convolutional base so each image is
# reduced to a single flat feature vector.
encoder = Sequential()
encoder.add(base_model)
encoder.add(GlobalAveragePooling2D())
encoder.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inception_v3 (Model)         (None, 8, 8, 2048)        21802784  
_________________________________________________________________
global_average_pooling2d (Gl (None, 2048)              0         
Total params: 21,802,784
Trainable params: 21,768,352
Non-trainable params: 34,432
_________________________________________________________________


In [7]:
def load_image(image_path):
    """Load an image file as a batch of one, ready to feed to the encoder.

    The image is converted to RGB, resized to SIZE, scaled to [0, 1] and
    given a leading batch axis.

    Args:
        image_path: Path of the image file to open.

    Returns:
        A float NumPy array with a leading batch axis of length 1, followed
        by the resized image's rows, columns and 3 colour channels.
    """
    # The context manager releases the file handle deterministically, even if
    # decoding fails, which matters when looping over thousands of files.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB').resize(SIZE)

    # NOTE(review): Keras' inception_v3.preprocess_input scales pixels to
    # [-1, 1]; confirm that [0, 1] scaling is intended for this encoder.
    image = np.array(image) / 255.0

    # Add the batch axis without reinterpreting the pixel layout (a reshape
    # to (1,)+SIZE+(3,) would only be equivalent for a square SIZE).
    return np.expand_dims(image, axis=0)

In [8]:
def encode_image(image):
    """Run an image batch through the encoder and return its features.

    Args:
        image: Input passed directly to `encoder`.

    Returns:
        The encoder output with every length-1 axis removed by np.squeeze.
    """
    return np.squeeze(encoder(image))

## Compression des images

In [9]:
files = os.listdir(DATA_ROOT)
n = len(files)
n

3670

In [None]:
# Encoding every image is slow, so the feature table is cached on disk and
# reloaded on later runs instead of being recomputed.
if os.path.exists('features.csv'):
    dt = pd.read_csv('features.csv')
    # First column is image_id; the remaining columns are the feature values.
    features_array = dt.iloc[:, 1:].values
else:
    image_ids = []
    all_features = []

    for file_name in tqdm.tqdm(files):
        # splitext copes with extensions of any length (.jpg, .jpeg, ...),
        # unlike slicing off a fixed four characters.
        image_ids.append(os.path.splitext(file_name)[0])
        all_features.append(
            encode_image(load_image(os.path.join(DATA_ROOT, file_name)))
        )

    # One row per image; the reshape keeps a (0, OUTPUT) shape when no files
    # were found so the table below can still be built.
    features_array = np.array(all_features).reshape(-1, OUTPUT)

    # Build the wide table in one step instead of adding OUTPUT columns to an
    # existing frame and dropping a temporary object column.
    dt = pd.DataFrame(
        features_array,
        columns=['feature' + str(i) for i in range(OUTPUT)],
    )
    dt.insert(0, 'image_id', image_ids)
    # Cache the result for later runs.
    dt.to_csv('features.csv', index=False)

dt.head()

 40%|███▉      | 1456/3670 [1:04:06<2:47:14,  4.53s/it]

## Similarité

In [None]:
# Annoy index over OUTPUT-dimensional vectors, using the 'angular' metric.
t = AnnoyIndex(OUTPUT, 'angular')

In [None]:
# Annoy item ids must be non-negative integers, so each vector is stored under
# its row position; dt['image_id'].iloc[i] maps a position back to its image.
for i, vector in enumerate(features_array):
    t.add_item(i, vector)

In [None]:
# Build the index with TREES trees, then write it to 'test.ann'.
t.build(TREES)
t.save('test.ann')

## Applications

## Références
* [https://github.com/spotify/annoy](https://github.com/spotify/annoy)
* [Image Similarity Detection in Action with Tensorflow 2.0](https://towardsdatascience.com/image-similarity-detection-in-action-with-tensorflow-2-0-b8d9a78b2509)