In [1]:
import numpy as np
import cv2 as cv
import pickle
import nbimporter
import prime

Importing Jupyter notebook from prime.ipynb


In [2]:
class MinHashing:
    """Builds a min-hash signature matrix from a shingle matrix.

    The shingle matrix is indexed as ``shingles[row, column]``; an entry
    greater than zero marks the row as present in that column.  Each of the
    ``signature_size`` hash functions has the universal-hashing form
    ``h_j(x) = ((a_j * x + b_j) % p) % N`` where ``N`` is the number of rows.
    """

    def __init__(self, shingles, signature_size, k):
        """Store the inputs and draw the random hash coefficients.

        Parameters
        ----------
        shingles : 2-D array-like
            Shingle matrix; its first axis length becomes ``N``.
        signature_size : int
            Number of hash functions, i.e. rows of the signature matrix.
        k : int
            Exclusive upper bound for the random coefficients ``a`` and ``b``.
        """
        self.shingles = shingles
        self.signature_size = signature_size
        self.k = k
        self.N = np.shape(self.shingles)[0]

        # Universal hashing parameters: h_j(x) = ((a_j * x + b_j) % p) % N
        # NOTE(review): a_j is drawn from [0, k), so it can be 0, which makes
        # that hash function constant -- confirm whether this is acceptable.
        self.a = np.random.randint(self.k, size=self.signature_size)
        self.b = np.random.randint(self.k, size=self.signature_size)

        # The modulus is computed once here so that every hash helper shares
        # it.  Previously it only existed as a local inside
        # __get_all_row_hash, and the other helpers failed on `self.p`.
        # `prime.next_prime` comes from the imported prime notebook;
        # presumably it returns a prime derived from N -- TODO confirm.
        self.p = prime.next_prime(self.N)

    def generate_signature(self):
        """Generate the signature matrix from the shingle matrix.

        Returns
        -------
        numpy.ndarray
            Float array of shape ``(signature_size, n_columns)``.  Entry
            ``[j, c]`` is the minimum of ``h_j`` over the rows present in
            column ``c``; columns with no positive entry keep ``inf``.
        """
        n_columns = np.shape(self.shingles)[1]
        signature = np.full((n_columns, self.signature_size), np.inf)
        columns = np.transpose(self.shingles)
        hash_values = self.__get_all_row_hash()
        for col in range(n_columns):
            present_rows = np.flatnonzero(columns[col] > 0)
            if present_rows.size == 0:
                # Nothing to minimise over; leave this column at inf.
                continue
            signature[col] = hash_values[present_rows].min(axis=0)
        return np.transpose(signature)

    def __universal_hash(self, x):
        """Return a list with the value of every hash function at ``x``."""
        c = self.a * x + self.b
        return [(item % self.p) % self.N for item in c]

    def __get_row_hash(self, i):
        """Return the values of hash function ``i`` for all rows 0..N-1."""
        rows = np.arange(self.N)
        return (((self.a[i] * rows + self.b[i]) % self.p) % self.N).astype(int)

    def __get_all_row_hash(self):
        """Return an ``(N, signature_size)`` int array of all hash values.

        Entry ``[r, j]`` is ``h_j(r)``.
        """
        # Column vector of row indices broadcasts against the coefficient
        # vectors, evaluating every hash function on every row at once.
        rows = np.arange(self.N).reshape(-1, 1)
        return (((self.a * rows + self.b) % self.p) % self.N).astype(int)