-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add simhash #24
Merged
Merged
add simhash #24
Changes from 12 commits
Commits
Show all changes
18 commits
Select commit
Hold shift + click to select a range
e4c0271
add simhash
kid1412z cd3c7aa
add range and test
kid1412z af9faf2
merge from upstream
kid1412z f254602
add a test of HeapSelect
kid1412z 17d01a1
remove test
kid1412z 1d8a5fd
test knn recall -> 0.6
kid1412z b266640
Merge branch 'master' of https://github.com/haifengl/smile
kid1412z 923867a
add nearest recall
kid1412z 9a971ce
add range recall
kid1412z f9ec487
exclude identical
kid1412z d76a501
Merge branch 'master' of https://github.com/haifengl/smile
kid1412z 8025978
remove google lib and tokenizer
kid1412z d9b1480
refactor add licence
kid1412z ea28b40
recover .gitignore
kid1412z e00f30c
refector
kid1412z 70cb9f2
remove MaxHeap
kid1412z e8e899f
fix import
kid1412z e74b8ac
remove MaxHeap
kid1412z File filter
Filter by extension
Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
/** | ||
* **************************************************************************** | ||
* Copyright (c) 2010 Haifeng Li | ||
* <p/> | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* <p/> | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* <p/> | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* ***************************************************************************** | ||
*/ | ||
package smile.hash; | ||
|
||
|
||
import java.nio.ByteBuffer; | ||
import java.util.List; | ||
|
||
/** | ||
* @author Qiyang Zuo | ||
* @since 15/4/9 | ||
*/ | ||
public class SimHash { | ||
private static final long seed = System.currentTimeMillis(); | ||
|
||
public static long simhash64(List<String> tokens) { | ||
final int BITS = 64; | ||
if (tokens == null || tokens.isEmpty()) { | ||
return 0; | ||
} | ||
int[] bits = new int[BITS]; | ||
for (String s : tokens) { | ||
ByteBuffer buffer = ByteBuffer.wrap(s.getBytes()); | ||
long hc = MurmurHash.hash2_64(buffer, 0, buffer.array().length, seed); | ||
for (int i = 0; i < BITS; i++) { | ||
if (((hc >>> i) & 1) == 1) { | ||
bits[i]++; | ||
} else { | ||
bits[i]--; | ||
} | ||
} | ||
} | ||
long hash = 0; | ||
long one = 1; | ||
for (int i = 0; i < BITS; i++) { | ||
if (bits[i] >= 0) { | ||
hash |= one; | ||
} | ||
one <<= 1; | ||
} | ||
return hash; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,177 @@ | ||
/** | ||
* **************************************************************************** | ||
* Copyright (c) 2010 Haifeng Li | ||
* <p/> | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* <p/> | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* <p/> | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* ***************************************************************************** | ||
*/ | ||
package smile.neighbor; | ||
|
||
import smile.math.distance.HammingDistance; | ||
import smile.util.MaxHeap; | ||
|
||
import java.lang.reflect.Array; | ||
import java.util.*; | ||
|
||
import static smile.hash.SimHash.simhash64; | ||
|
||
/** | ||
* | ||
* Locality-Sensitive Hashing for Signatures. | ||
* LSH is an efficient algorithm for approximate nearest neighbor search | ||
* in high dimensional spaces by performing probabilistic dimension reduction of data. | ||
* The basic idea is to hash the input items so that similar items are mapped to the same | ||
* buckets with high probability (the number of buckets being much smaller | ||
* than the universe of possible input items). | ||
* To avoid computing the similarity of every pair of sets or their signatures. | ||
* If we are given signatures for the sets, we may divide them into bands, and only | ||
* measure the similarity of a pair of sets if they are identical in at least one band. | ||
* By choosing the size of bands appropriately, we can eliminate from | ||
* consideration most of the pairs that do not meet our threshold of similarity. | ||
* | ||
* <h2>References</h2> | ||
* <ol> | ||
* <li>Moses S. Charikar. Similarity Estimation Techniques from Rounding Algorithms</li> | ||
* </ol> | ||
* | ||
* @see LSH | ||
* @author Qiyang Zuo | ||
* | ||
*/ | ||
public class SNLSH<E> implements NearestNeighborSearch<List<String>, E>, KNNSearch<List<String>, E>, RNNSearch<List<String>, E> { | ||
|
||
|
||
private final int bandSize; | ||
private final long mask; | ||
private static final int BITS = 64; | ||
/** | ||
* Signature fractions | ||
*/ | ||
private Band[] bands; | ||
/** | ||
* The data objects. | ||
*/ | ||
private List<E> data; | ||
/** | ||
* The keys of data objects. | ||
*/ | ||
private List<List<String>> keys; | ||
/** | ||
* signatures generated by simhash | ||
*/ | ||
private List<Long> signs; | ||
|
||
/** | ||
* Whether to exclude query object self from the neighborhood. | ||
*/ | ||
private boolean identicalExcluded = true; | ||
|
||
@SuppressWarnings("unchecked") | ||
public SNLSH(int bandSize) { | ||
if (bandSize < 2 || bandSize > 32) { | ||
throw new IllegalArgumentException("Invalid band size!"); | ||
} | ||
this.bandSize = bandSize; | ||
bands = (Band[]) Array.newInstance(Band.class, bandSize); | ||
Arrays.fill(bands, new Band()); | ||
this.mask = -1 >>> (BITS / bandSize * (bandSize - 1)); | ||
data = new ArrayList<E>(); | ||
keys = new ArrayList<List<String>>(); | ||
signs = new ArrayList<Long>(); | ||
} | ||
|
||
public void put(List<String> tokens, E v) { | ||
int index = data.size(); | ||
data.add(v); | ||
keys.add(tokens); | ||
long sign = simhash64(tokens); | ||
signs.add(sign); | ||
for (int i = 0; i < bands.length; i++) { | ||
long bandKey = bandHash(sign, i); | ||
Bucket bucket = bands[i].get(bandKey); | ||
if (bucket == null) { | ||
bucket = new Bucket(); | ||
} | ||
bucket.add(index); | ||
bands[i].put(bandKey, bucket); | ||
} | ||
} | ||
|
||
public Neighbor<List<String>, E>[] knn(List<String> q, int k) { | ||
if(k < 1) { | ||
throw new IllegalArgumentException("Invalid k: " + k); | ||
} | ||
long fpq = simhash64(q); | ||
Set<Integer> candidates = obtainCandidates(q); | ||
@SuppressWarnings("unchecked") | ||
Neighbor<List<String>, E>[] neighbors = (Neighbor<List<String>, E>[])Array.newInstance(Neighbor.class, k); | ||
MaxHeap<Neighbor<List<String>, E>> heap = new MaxHeap<Neighbor<List<String>, E>>(neighbors); | ||
for (int index : candidates) { | ||
long sign = signs.get(index); | ||
double distance = HammingDistance.d(fpq, sign); | ||
if (!keys.get(index).equals(q) && identicalExcluded) { | ||
heap.add(new Neighbor<List<String>, E>(keys.get(index), data.get(index), index, distance)); | ||
} | ||
} | ||
return heap.toSortedArray(); | ||
} | ||
|
||
public Neighbor<List<String>, E> nearest(List<String> q) { | ||
Neighbor<List<String>, E>[] ns = knn(q, 1); | ||
if(ns.length>0) { | ||
return ns[0]; | ||
} | ||
return new Neighbor<List<String>, E>(null, null, -1, Double.MAX_VALUE); | ||
} | ||
|
||
public void range(List<String> q, double radius, List<Neighbor<List<String>, E>> neighbors) { | ||
if (radius <= 0.0) { | ||
throw new IllegalArgumentException("Invalid radius: " + radius); | ||
} | ||
long fpq = simhash64(q); | ||
Set<Integer> candidates = obtainCandidates(q); | ||
for (int index : candidates) { | ||
double distance = HammingDistance.d(fpq, signs.get(index)); | ||
if (distance <= radius) { | ||
if (keys.get(index).equals(q) && identicalExcluded) { | ||
continue; | ||
} | ||
neighbors.add(new Neighbor<List<String>, E>(keys.get(index), data.get(index), index, distance)); | ||
} | ||
} | ||
} | ||
|
||
private class Band extends LinkedHashMap<Long, Bucket> {} | ||
|
||
private class Bucket extends LinkedList<Integer> {} | ||
|
||
private long bandHash(long hash, int bandNum) { | ||
return hash >>> ((bandNum * (BITS / this.bandSize))) & mask; | ||
} | ||
|
||
|
||
|
||
|
||
private Set<Integer> obtainCandidates(List<String> q) { | ||
Set<Integer> candidates = new HashSet<Integer>(); | ||
long sign = simhash64(q); | ||
for (int i = 0; i < bands.length; i++) { | ||
long bandKey = bandHash(sign, i); | ||
Bucket bucket = bands[i].get(bandKey); | ||
if (bucket != null) { | ||
candidates.addAll(bucket); | ||
} | ||
} | ||
return candidates; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
/** | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you really want to keep it? HeapSelect doesn't meet your needs? |
||
* **************************************************************************** | ||
* Copyright (c) 2010 Haifeng Li | ||
* <p/> | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* <p/> | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* <p/> | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* ***************************************************************************** | ||
*/ | ||
package smile.util; | ||
|
||
import java.util.Arrays; | ||
|
||
|
||
/**
 * A fixed-size max-heap that tracks the k smallest elements of a one-pass
 * stream, where k is the length of the array given to the constructor.
 * The top of the heap is the largest of the elements currently kept, so a
 * new element only enters a full heap when it is smaller than the top.
 *
 * @param <E> the element type.
 *
 * @author Qiyang Zuo
 */
public class MaxHeap<E extends Comparable<? super E>> {

    /**
     * The heap capacity, i.e. the maximum number of elements kept.
     */
    private final int k;

    /**
     * The number of elements currently stored in the heap (at most k).
     */
    private int n;

    /**
     * The max heap array.
     * Top of the heap is 0th element of the heap.
     */
    private final E[] heap;

    /**
     * Constructor.
     *
     * @param heap the array to store smallest values to track. It is used
     *             directly as the backing storage and its length is the
     *             heap capacity.
     */
    public MaxHeap(E[] heap) {
        this.heap = heap;
        k = heap.length;
        n = 0;
    }

    /**
     * Offers an element to the heap. The element is kept if the heap is not
     * full yet, or if it is smaller than the current top, which it then
     * replaces. A heap of capacity zero ignores every element.
     *
     * @param e the element to add; must not be null.
     * @throws NullPointerException if {@code e} is null.
     */
    public void add(E e) {
        if (e == null) {
            throw new NullPointerException("element must not be null");
        }
        if (k == 0) {
            // Nothing can be tracked; there is no top to compare against.
            return;
        }
        if (n < k) {
            heap[n++] = e;
            bubbleUp();
        } else if (e.compareTo(heap[0]) < 0) {
            heap[0] = e;
            bubbleDown();
        }
    }

    /**
     * Gets but doesn't remove the top element of the heap.
     *
     * @return the largest element currently kept, or null if the heap is empty.
     */
    public E top() {
        return n > 0 ? heap[0] : null;
    }

    /**
     * Gets the heap data sorted increasingly.
     *
     * @return a new array holding the elements in the heap in increasing order.
     */
    public E[] toSortedArray() {
        E[] arr = Arrays.copyOfRange(heap, 0, n);
        Arrays.sort(arr);
        return arr;
    }

    /** Returns the index of the left child of node i. */
    private int lchild(int i) {
        return i * 2 + 1;
    }

    /** Returns the index of the right child of node i. */
    private int rchild(int i) {
        return (i + 1) * 2;
    }

    /** Returns the index of the parent of node i. */
    private int parent(int i) {
        return (i - 1) / 2;
    }

    /**
     * Moves the last element up until its parent is not smaller than it.
     */
    private void bubbleUp() {
        int i = n - 1;
        E e = heap[i];
        while (i > 0 && heap[parent(i)].compareTo(e) < 0) {
            heap[i] = heap[parent(i)];
            i = parent(i);
        }
        heap[i] = e;
    }

    /**
     * Moves the root element down until neither child is larger than it.
     */
    private void bubbleDown() {
        E e = heap[0];
        int i = 0;
        while (lchild(i) < n) {
            int left = lchild(i);
            int right = rchild(i);
            int maxChild = (right < n && heap[left].compareTo(heap[right]) < 0) ? right : left;
            if (e.compareTo(heap[maxChild]) >= 0) {
                break;
            }
            heap[i] = heap[maxChild];
            i = maxChild;
        }
        heap[i] = e;
    }
}
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will this be reused in other algorithms? If not, shall we move it into SNLSH?