In [None]:
from importation import extract
from LocalitySensitiveHashing import *
import numpy as np
from datasketch import MinHash, MinHashLSH
from datasketch import MinHashLSHEnsemble, MinHash

## Import des images

In [None]:
image,label,classe = extract("DATA")
image.shape

In [None]:
print(classe) # Affiche les classes présentent dans le dossier
print(label[0]) # Correspond au label de la première image. La place du 1 correspond au label, ici : D4.

### Réccupération des images de chaque classe

In [None]:
D21=[]

In [None]:
for img,lb in zip(image,label):
    if classe[np.argmax(lb)]=="D21" : #argmax renvoi dans la liste l'élément le plus gros 
        D21.append(img)

In [None]:
D4=[]

In [None]:
for img,lb in zip(image,label):
    if classe[np.argmax(lb)]=="D4" : #argmax renvoi dans la liste l'élément le plus gros *
        D4.append(img)

# * Puisque le nom de la classe est sous forme [0,0,1] at que la position du 1 correspond à la classe, le maximum de
# la liste nous donne la classe

## Definition des classes Hashtable et LSH

In [None]:
class HashTable:
    def __init__(self, hash_size, inp_dimensions):
        self.hash_size = hash_size
        self.inp_dimensions = inp_dimensions
        self.hash_table = dict()
        self.projections = np.random.randn(self.hash_size, inp_dimensions)
        
    def generate_hash(self, inp_vector):
        bools = (np.dot(inp_vector, self.projections.T) > 0).astype('int')
        return ''.join(bools.astype('str'))

    def __setitem__(self, inp_vec, label):
        hash_value = self.generate_hash(inp_vec)
        self.hash_table[hash_value] = self.hash_table\
            .get(hash_value, list()) + [label]
        
    def __getitem__(self, inp_vec):
        hash_value = self.generate_hash(inp_vec)
        return self.hash_table.get(hash_value, [])

In [None]:
test=HashTable(224,190)
test.generate_hash(image)

In [None]:
class LSH:
    def __init__(self, num_tables, hash_size, inp_dimensions):
        self.num_tables = num_tables
        self.hash_size = hash_size
        self.inp_dimensions = inp_dimensions
        self.hash_tables = list()
        for i in range(self.num_tables):
            self.hash_tables.append(HashTable(self.hash_size, self.inp_dimensions))
    
    def __setitem__(self, inp_vec, label):
        for table in self.hash_tables:
            table[inp_vec] = label
    
    def __getitem__(self, inp_vec):
        results = list()
        for table in self.hash_tables:
            results.extend(table[inp_vec])
        return list(set(results))

## Test de la création d'une classe LSH

In [1]:
# Import des librairies
from PIL import Image
import numpy as np
import os
from keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf
from keras.models import Sequential, Model
from keras import applications
from keras.layers import Dropout, Flatten, Dense
from keras import optimizers
from scipy import spatial
import pathlib
from sklearn.model_selection import train_test_split
import pickle
import matplotlib.pyplot as plt


Using TensorFlow backend.


In [2]:
# Import des images en 240 par 240 et noir et blanc
data_dir = pathlib.Path("DATA")
image_count = len(list(data_dir.glob('*/*')))
CLASS_NAMES = np.array([item.name for item in data_dir.glob('*') if item.name != "LICENSE.txt"]) 
image_generator = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255) # redimension des images
BATCH_SIZE = image_count
IMG_HEIGHT = 224
IMG_WIDTH = 224
STEPS_PER_EPOCH = np.ceil(image_count/BATCH_SIZE)
train_data_gen = image_generator.flow_from_directory(directory=str(data_dir),
                                                     batch_size=BATCH_SIZE,
                                                     shuffle=True,
                                                     color_mode='grayscale', # Transformation des images en noir et blanc
                                                     target_size=(IMG_HEIGHT, IMG_WIDTH),
                                                     classes = list(CLASS_NAMES))

Found 105 images belonging to 2 classes.


In [3]:
# Séparation images / labels
image_batch, label_batch = next(train_data_gen)
print(image_batch.shape)# Les données sont sur quatres dimensions

(105, 224, 224, 1)


In [4]:
data=image_batch.reshape(105,224*224) # On les veut sur une dimension : On a 988 images qui 
# correspondent chacunes à un vecteur de 50176 pixels
data.shape

don=np.array(data).reshape(1, -1)
don

array([[0.7725491 , 0.7725491 , 0.6627451 , ..., 0.5803922 , 0.58431375,
        0.58431375]], dtype=float32)

In [5]:
from copy import copy
from itertools import combinations
import numpy as np
from pandas import DataFrame
from sklearn.metrics.pairwise import pairwise_distances


class LSH:
    
    def __init__(self, data):
        self.data = data
        self.model = None

    def __generate_random_vectors(self, num_vector, dim):
        return np.random.randn(dim, num_vector)

    def train(self, num_vector, seed=None):
        dim = self.data.shape[1]
        if seed is not None:
            np.random.seed(seed)

        random_vectors = self.__generate_random_vectors(num_vector, dim)
        powers_of_two = 1 << np.arange(num_vector - 1, -1, -1)

        table = {}

        # Partition data points into bins
        bin_index_bits = (self.data.dot(random_vectors) >= 0)

        # Encode bin index bits into integers
        bin_indices = bin_index_bits.dot(powers_of_two)

        # Update `table` so that `table[i]` is the list of document ids with bin index equal to i.
        for data_index, bin_index in enumerate(bin_indices):
            if bin_index not in table:
                # If no list yet exists for this bin, assign the bin an empty list.
                table[bin_index] = []
            # Fetch the list of document ids associated with the bin and add the document id to the end.
            table[bin_index].append(data_index)

        self.model = {'bin_indices': bin_indices, 'table': table,
                      'random_vectors': random_vectors, 'num_vector': num_vector}
        return self

    def __search_nearby_bins(self, query_bin_bits, table, search_radius=2, initial_candidates=set()):
        num_vector = self.model['num_vector']
        powers_of_two = 1 << np.arange(num_vector - 1, -1, -1)

        # Allow the user to provide an initial set of candidates.
        candidate_set = copy(initial_candidates)

        for different_bits in combinations(range(num_vector), search_radius):
            alternate_bits = copy(query_bin_bits)
            for i in different_bits:
                alternate_bits[i] = 1 if alternate_bits[i] == 0 else 0

            # Convert the new bit vector to an integer index
            nearby_bin = alternate_bits.dot(powers_of_two)

            # Fetch the list of documents belonging to the bin indexed by the new bit vector.
            # Then add those documents to candidate_set
            if nearby_bin in table:
                candidate_set.update(table[nearby_bin])

        return candidate_set

    def query(self, query_vec, k, max_search_radius, initial_candidates=set()):

        if not self.model:
            print('Model not yet build. Exiting!')
            exit(-1)

        data = self.data
        table = self.model['table']
        random_vectors = self.model['random_vectors']

        bin_index_bits = (query_vec.dot(random_vectors) >= 0).flatten()

        candidate_set = set()
        # Search nearby bins and collect candidates
        for search_radius in xrange(max_search_radius + 1):
            candidate_set = self.__search_nearby_bins(bin_index_bits, table,
                                                      search_radius, initial_candidates=initial_candidates)
        # Sort candidates by their true distances from the query
        nearest_neighbors = DataFrame({'id': list(candidate_set)})
        candidates = data[np.array(list(candidate_set)), :]
        nearest_neighbors['distance'] = pairwise_distances(candidates, query_vec, metric='cosine').flatten()
        
        return nearest_neighbors.nsmallest(k, 'distance')

In [7]:
#assumes that data is a num_observations by num_features numpy matrix
lsh_model = LSH(don)
num_of_random_vectors = 105
lsh_model.train(num_of_random_vectors)
xrange=range
#find the 5 nearest neighbors of data[1] while searching in 10 buckets 
lsh_model.query(don[1,:], 5, 2)


IndexError: index 1 is out of bounds for axis 0 with size 1