Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add simhash #24

Merged
merged 18 commits into from
Apr 21, 2015
11 changes: 11 additions & 0 deletions Smile/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,17 @@
<version>4.11</version>
<scope>test</scope>
</dependency>
<!-- Guava: provides com.google.common.hash (Murmur3) used by smile.hash.SimHash, and the collection helpers used by smile.neighbor.SNLSH. -->
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>18.0</version>
</dependency>
<!-- Commons IO: declared with test scope, so it is only on the classpath when compiling and running tests. -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.4</version>
<scope>test</scope>
</dependency>
</dependencies>

<distributionManagement>
Expand Down
49 changes: 49 additions & 0 deletions Smile/src/main/java/smile/hash/SimHash.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package smile.hash;

import com.google.common.base.Charsets;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;

import java.util.Set;

import static smile.util.Tokenizer.words;

/**
 * SimHash fingerprints for text.
 * <p>
 * Every word of the input is hashed, and each of the 64 bit positions is
 * decided by a majority vote over the corresponding bits of the word hashes.
 *
 * @author Qiyang Zuo
 * @since 15/4/9
 */
public class SimHash {
    /** Hash function applied to each individual word. */
    private static HashFunction hf = Hashing.murmur3_128();

    /**
     * Computes the 64-bit simhash of a text.
     *
     * @param text the text to fingerprint; it is split into a set of words
     *             by {@code smile.util.Tokenizer.words}.
     * @return the 64-bit fingerprint of the word set.
     */
    public static long simhash64(String text) {
        Set<String> tokens = words(text);
        return simhash64(tokens);
    }

    /**
     * Computes the 64-bit simhash of a set of words.
     *
     * @param words the words to fingerprint.
     * @return the 64-bit fingerprint, or 0 if {@code words} is null or empty.
     */
    public static long simhash64(Set<String> words) {
        if (words == null || words.isEmpty()) {
            return 0L;
        }

        final int width = 64;
        // votes[p] = (#words whose hash has bit p set) - (#words whose hash has it clear)
        int[] votes = new int[width];
        for (String word : words) {
            long wordHash = hf.hashString(word, Charsets.UTF_8).padToLong();
            for (int pos = 0; pos < width; pos++) {
                boolean bitSet = ((wordHash >>> pos) & 1L) != 0L;
                votes[pos] += bitSet ? 1 : -1;
            }
        }

        // A bit of the fingerprint is set when the vote is non-negative, so ties count as 1.
        long fingerprint = 0L;
        for (int pos = 0; pos < width; pos++) {
            if (votes[pos] >= 0) {
                fingerprint |= 1L << pos;
            }
        }
        return fingerprint;
    }
}
189 changes: 189 additions & 0 deletions Smile/src/main/java/smile/neighbor/SNLSH.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
/**
* ****************************************************************************
* Copyright (c) 2010 Haifeng Li
* <p/>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p/>
* http://www.apache.org/licenses/LICENSE-2.0
* <p/>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* *****************************************************************************
*/
package smile.neighbor;

import com.google.common.collect.Lists;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import smile.math.distance.HammingDistance;
import smile.util.MaxHeap;

import java.lang.reflect.Array;
import java.util.Arrays;
import java.util.Set;
import java.util.List;
import java.util.LinkedList;
import java.util.HashSet;
import java.util.LinkedHashMap;

import static smile.hash.SimHash.simhash64;

/**
*
* Locality-Sensitive Hashing for Signatures.
* LSH is an efficient algorithm for approximate nearest neighbor search
* in high dimensional spaces by performing probabilistic dimension reduction of data.
* The basic idea is to hash the input items so that similar items are mapped to the same
* buckets with high probability (the number of buckets being much smaller
* than the universe of possible input items).
* To avoid computing the similarity of every pair of sets or their signatures.
* If we are given signatures for the sets, we may divide them into bands, and only
* measure the similarity of a pair of sets if they are identical in at least one band.
* By choosing the size of bands appropriately, we can eliminate from
* consideration most of the pairs that do not meet our threshold of similarity.
*
* <h2>References</h2>
* <ol>
* <li>Moses S. Charikar. Similarity Estimation Techniques from Rounding Algorithms</li>
* </ol>
*
* @see LSH
* @author Qiyang Zuo
*
*/
public class SNLSH<E> implements NearestNeighborSearch<String, E>, KNNSearch<String, E>, RNNSearch<String, E> {


private final int bandSize;
private final long mask;
private static final int BITS = 64;
/**
* Signature fractions
*/
private Band[] bands;
/**
* universal hash function
*/
private static HashFunction hf = Hashing.murmur3_128();
/**
* The data objects.
*/
private List<E> data;
/**
* The keys of data objects.
*/
private List<String> keys;
/**
* signatures generated by simhash
*/
private List<Long> signs;

/**
* Whether to exclude query object self from the neighborhood.
*/
private boolean identicalExcluded = true;

@SuppressWarnings("unchecked")
public SNLSH(int bandSize) {
if (bandSize < 2 || bandSize > 32) {
throw new IllegalArgumentException("Invalid band size!");
}
this.bandSize = bandSize;
bands = (Band[]) Array.newInstance(Band.class, bandSize);
Arrays.fill(bands, new Band());
this.mask = -1 >>> (BITS / bandSize * (bandSize - 1));
data = Lists.newArrayList();
keys = Lists.newArrayList();
signs = Lists.newArrayList();
}

public void put(String k, E v) {
int index = data.size();
data.add(v);
keys.add(k);
long sign = simhash64(k);
signs.add(sign);
for (int i = 0; i < bands.length; i++) {
long bandKey = bandHash(sign, i);
Bucket bucket = bands[i].get(bandKey);
if (bucket == null) {
bucket = new Bucket();
}
bucket.add(index);
bands[i].put(bandKey, bucket);
}
}

public Neighbor<String, E>[] knn(String q, int k) {
if(k < 1) {
throw new IllegalArgumentException("Invalid k: " + k);
}
long fpq = simhash64(q);
Set<Integer> candidates = obtainCandidates(q);
@SuppressWarnings("unchecked")
Neighbor<String, E>[] neighbors = (Neighbor<String, E>[])Array.newInstance(Neighbor.class, k);
MaxHeap<Neighbor<String, E>> heap = new MaxHeap<Neighbor<String, E>>(neighbors);
for (int index : candidates) {
long sign = signs.get(index);
double distance = HammingDistance.d(fpq, sign);
if (!keys.get(index).equals(q) && identicalExcluded) {
heap.add(new Neighbor<String, E>(keys.get(index), data.get(index), index, distance));
}
}
return heap.toSortedArray();
}

public Neighbor<String, E> nearest(String q) {
Neighbor<String, E>[] ns = knn(q, 1);
if(ns.length>0) {
return ns[0];
}
return new Neighbor<String, E>(null, null, -1, Double.MAX_VALUE);
}

public void range(String q, double radius, List<Neighbor<String, E>> neighbors) {
if (radius <= 0.0) {
throw new IllegalArgumentException("Invalid radius: " + radius);
}
long fpq = simhash64(q);
Set<Integer> candidates = obtainCandidates(q);
for (int index : candidates) {
double distance = HammingDistance.d(fpq, signs.get(index));
if (distance <= radius) {
if (keys.get(index).equals(q) && identicalExcluded) {
continue;
}
neighbors.add(new Neighbor<String, E>(keys.get(index), data.get(index), index, distance));
}
}
}

private class Band extends LinkedHashMap<Long, Bucket> {}

private class Bucket extends LinkedList<Integer> {}

private long bandHash(long hash, int bandNum) {
return hash >>> ((bandNum * (BITS / this.bandSize))) & mask;
}




private Set<Integer> obtainCandidates(String q) {
Set<Integer> candidates = new HashSet<Integer>();
long sign = simhash64(q);
for (int i = 0; i < bands.length; i++) {
long bandKey = bandHash(sign, i);
Bucket bucket = bands[i].get(bandKey);
if (bucket != null) {
candidates.addAll(bucket);
}
}
return candidates;
}
}
137 changes: 137 additions & 0 deletions Smile/src/main/java/smile/util/MaxHeap.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
/**
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you really want to keep this class? Doesn't HeapSelect already meet your needs?

* ****************************************************************************
* Copyright (c) 2010 Haifeng Li
* <p/>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p/>
* http://www.apache.org/licenses/LICENSE-2.0
* <p/>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* *****************************************************************************
*/
package smile.util;

import java.util.Arrays;


/**
 * A fixed-capacity max-heap that tracks the k smallest elements of a
 * one-pass stream.
 * <p>
 * The top of the heap is the largest element currently retained. Once the
 * heap is full, a new element is only admitted if it is smaller than the
 * top, which it then replaces. The heap therefore always holds the k (or
 * fewer, if fewer have been added) smallest elements seen so far.
 *
 * @param <E> the type of the elements.
 *
 * @author Qiyang Zuo
 */
public class MaxHeap<E extends Comparable<? super E>> {

    /**
     * The heap capacity.
     */
    private final int k;

    /**
     * The number of elements currently stored in the heap (at most k).
     */
    private int n;

    /**
     * The max heap array.
     * The top of the heap is the 0th element of the array.
     */
    private final E[] heap;

    /**
     * Constructor.
     *
     * @param heap the array used as backing storage; its length is the
     *             capacity of the heap. The heap starts empty, so existing
     *             contents of the array are overwritten as elements are added.
     */
    public MaxHeap(E[] heap) {
        this.heap = heap;
        k = heap.length;
        n = 0;
    }

    /**
     * Offers an element to the heap. It is stored if the heap is not full
     * yet, or if it is smaller than the current top, which it then replaces.
     * A heap of capacity zero ignores every element.
     *
     * @param e the element to add.
     */
    public void add(E e) {
        if (n < k) {
            heap[n++] = e;
            bubbleUp();
        } else if (k > 0 && e.compareTo(heap[0]) < 0) {
            // k > 0 guards the zero-capacity heap, which has no top to compare with.
            heap[0] = e;
            bubbleDown();
        }
    }

    /**
     * Gets, but doesn't remove, the top element of the heap.
     *
     * @return the largest retained element, or null if the heap is empty.
     */
    public E top() {
        if (n > 0) {
            return heap[0];
        } else {
            return null;
        }
    }

    /**
     * Gets the heap data sorted increasingly. The heap itself is not modified.
     *
     * @return a new array with the retained elements in increasing order.
     */
    public E[] toSortedArray() {
        E[] arr = Arrays.copyOf(heap, n);
        Arrays.sort(arr);
        return arr;
    }

    /** Returns the index of the left child of node i. */
    private int lchild(int i) {
        return i * 2 + 1;
    }

    /** Returns the index of the right child of node i. */
    private int rchild(int i) {
        return (i + 1) * 2;
    }

    /** Returns the index of the parent of node i. */
    private int parent(int i) {
        return (i - 1) / 2;
    }

    /**
     * Restores the heap order after an element was appended at index n - 1
     * by moving it up while it is larger than its parent.
     */
    private void bubbleUp() {
        int i = n - 1;
        E e = heap[i];
        while (i > 0 && heap[parent(i)].compareTo(e) < 0) {
            heap[i] = heap[parent(i)];
            i = parent(i);
        }
        heap[i] = e;
    }

    /**
     * Restores the heap order after the top was replaced by moving the new
     * top down while it is smaller than its larger child.
     */
    private void bubbleDown() {
        E e = heap[0];
        int i = 0;
        while (lchild(i) < n) {
            int left = lchild(i);
            int right = rchild(i);
            int maxChild = right < n && heap[left].compareTo(heap[right]) < 0 ? right : left;
            if (e.compareTo(heap[maxChild]) < 0) {
                heap[i] = heap[maxChild];
                i = maxChild;
            } else {
                break;
            }
        }
        heap[i] = e;
    }
}