In [55]:
import numpy as np
import random

def Generate_Quire_Set(d,qnum):
    """ d:维数 qnum:查询点数量 """
    quires=np.random.uniform(-10,10,size=(qnum,d))
    np.save('DSH_Quires',quires)
    return

def Generate_Data_Set(d,n):
    data=np.random.uniform(-10,10,size=(n,d))
    np.save('DSH_DataSet',data)
    return

Generate_Data_Set(10,20)
Generate_Quire_Set(10,10000)

In [68]:
import numpy as np
import math
import random
import heapq
import matplotlib.pyplot as plt

def E2Distance(x,y):  
    """L2范式下欧几里得空间距离计算"""
    return math.sqrt(sum(map(lambda i,j:math.pow(i-j,2),x,y)))

def kClosestSort(points,k,origin):
    """返回离点origin在L1范式下欧几里得空间距离最靠近的k个点（堆排序） 时间复杂度Θ(nlogn)"""
    return heapq.nsmallest(k, points, lambda p:E2Distance(p,origin) )

def k_means(data,k,p):
    d=len(data[0])
    groups=dict()
    for i in range(p):
        centers=list()
        if i==0:
            centers.append(np.random.uniform(-10,10,size=(1,d))[0])
            weight=[float("inf")] * len(data)
            for v in range(0,k-1):
                for u in range(len(data)):
                    mindis=float("inf")
                    for c in range(v,len(centers)):
                        dis=E2Distance(data[u],centers[c])
                        if dis<mindis:
                            mindis=dis
                    if weight[u]>mindis:
                        weight[u]=dis
                per_weight=1/sum(weight)
                standard_weight=list()
                for w in weight:
                    standard_weight.append(w*per_weight)
                center=np.random.choice(range(len(standard_weight)),1,p=standard_weight)[0]
                centers.append(data[center])
        else:
            for j in groups.keys():
                temp=0
                for point in groups[j]:
                    temp+=point
                center=temp/len(groups[j])
                centers.append(center)
            groups=dict()
        for m in range(len(data)):
            mindis=float("inf")
            flag=0
            for c in range(len(centers)):
                dis=E2Distance(data[m],centers[c])
                if dis<mindis:
                    mindis=dis
                    flag=c
            if flag not in groups.keys():
                groups[flag]=list()
            groups[flag].append(data[m])
    centers=np.array(centers)
    for c in groups.keys():
        groups[c]=np.array(groups[c])
    return groups,centers

def r_adjacent_groups(centers,r):
    num=len(centers)
    W=np.zeros((num,num))
    centers=centers.tolist()
    for c in range(num):
        r_adjacent=kClosestSort(centers,r,centers[c])
        for ad in r_adjacent:
            W[c][centers.index(ad)]=1
    return W

def get_total_projections(W,centers):
    projections=dict()
    m=0
    for i in range(len(W)):
        for j in range(len(W)):
            if W[i][j]==1 and i!=j:
                m+=1
                w=centers[i]-centers[j]
                t=(np.dot((centers[i]+centers[j])/2,(centers[i]-centers[j])))
                projections[m]=(w,t)
    return projections

def select_by_entropy(groups,centers,projections,L):
    total=0
    for g in groups.keys():
        total+=len(groups[g])
    delta=dict()
    for i,wt in projections.items():
        w=wt[0]
        t=wt[1]
        vs=list()
        vt=list()
        for c in range(len(centers)):
            if np.dot(w,centers[c])>=t:
                vs.append(len(groups[c])/total)
            else:
                vt.append(len(groups[c])/total)
        Pi0=sum(vs)
        Pi1=sum(vt)
        delta[i]=-Pi0*math.log(Pi0)-Pi1*math.log(Pi1)
    select=sorted(delta,key=delta.get)[:L]
    model=dict()
    i=0
    for s in select:
        i+=1
        model[i]=projections[s]
    return model

def DSH_Init(p,alpha,L,r):
    data=np.load('DSH_DataSet.npy')
    k=alpha*L
    groups,centers=k_means(data,k,p)
    W=r_adjacent_groups(centers,r)
    projections=get_total_projections(W,centers)
    model=select_by_entropy(groups,centers,projections,L)
    hashtable=dict()
    for i in data:
        hashcode=list()
        for j,wt in model.items():
            w=wt[0]
            t=wt[1]
            if np.dot(w,i)>=t:
                hashcode.append(1)
            else:
                hashcode.append(0)
        hashcode=tuple(hashcode)
        if hashcode not in hashtable.keys():
            hashtable[hashcode]=set()
        hashtable[hashcode].add(tuple(i))
    np.save('DSH_HashFunction',model)
    return hashtable

def DSH_Inquire(hashtable,K=20):
    quires=np.load('DSH_Quires.npy')
    model=np.load('DSH_HashFunction.npy').item()
    result=dict()
    for q in range(len(quires)):
        hashcode=list()
        S=set()
        for j,wt in model.items():
            w=wt[0]
            t=wt[1]
            if np.dot(w,quires[q])>=t:
                hashcode.append(1)
            else:
                hashcode.append(0)
        hashcode=tuple(hashcode)
        if hashcode in hashtable.keys():
            S=S|hashtable[hashcode]
        S=list(S)
        SK=list()
        if len(S)>K:
            SK=kClosestSort(S,K,quires[q])
        else:
            SK=S
        result[q]=SK
    return result

def Brute_ForceLinearSearch(K=20):
    """线性暴力搜索k近点"""
    data=np.load('DSH_DataSet.npy')
    quires=np.load('DSH_Quires.npy')
    data_temp=data.tolist()
    data=list()
    for i in data_temp:
        data.append(tuple(i))
    qnum=len(quires)
    real=dict()
    for q in range(qnum):
        q_real=kClosestSort(data,K,quires[q])
        real[q]=q_real
    return real

def Recall(result,real):
    quires=np.load('DSH_Quires.npy')
    qnum=len(real)
    temp1=0
    temp2=0
    for q in range(qnum):
        K=len(real[q])
        kdis=E2Distance(quires[q],real[q][len(real[q])-1])
        for point in result[q]:
            if E2Distance(quires[q],point)<=kdis:
                temp1+=1
        temp2+=K
    recall=temp1/temp2
    return recall


r=10
p=100
L=4
alpha=5
hashtable=DSH_Init(p,alpha,L,r)
result=DSH_Inquire(hashtable)
real=Brute_ForceLinearSearch()
recall=Recall(result,real)
print(recall)

0.43986
