### Akin - Example Usage

In [1]:
from akin import MinHash, LSH

In [2]:
# Nine sample sentences. Several are deliberate near-duplicates of one another
# (e.g. texts 1, 4 and 8), which gives the similarity queries something to find.
# Sentences that span two source lines are wrapped in parentheses so the
# adjacent string literals are visibly one list element, not a missing comma.
content = [
    'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',
    'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.',
    (
        'A helium atom has about four times as much mass as a hydrogen atom, so the composition changes '
        'when described as the proportion of mass contributed by different atoms.'
    ),
    'Jupiter is primarily composed of hydrogen and a quarter of its mass being helium',
    (
        'A helium atom has about four times as much mass as a hydrogen atom and the composition changes '
        'when described as a proportion of mass contributed by different atoms.'
    ),
    (
        'Theoretical models indicate that if Jupiter had much more mass than it does at present, it '
        'would shrink.'
    ),
    'This process causes Jupiter to shrink by about 2 cm each year.',
    'Jupiter is mostly composed of hydrogen with a quarter of its mass being helium',
    'The Great Red Spot is large enough to accommodate Earth within its boundaries.',
]

In [3]:
# Labels are the 1-based positions of the texts in `content`. Deriving them from
# the list length keeps labels and texts the same size if `content` is edited.
content_labels = list(range(1, len(content) + 1))

**Create MinHash object:**

In [4]:
# Build the MinHash object for every text in `content`. Settings are listed one
# per line so they are easy to compare with any later MinHash that is added to
# the same LSH model.
minhash = MinHash(
    content,
    n_gram=9,
    permutations=100,
    hash_bits=64,
    seed=3,
)

**Create LSH object:**

In [5]:
# Build the LSH model from the MinHash object and the label given to each text.
lsh = LSH(
    minhash,
    content_labels,
    no_of_bands=50,
)

**Query to find near duplicates for text 1:**

In [6]:
# Look up near duplicates of the text labelled 1, passing 0.5 as the
# `min_jaccard` threshold; the result is a list of matching labels.
lsh.query(1, min_jaccard=0.5)

[8, 4]

**Generate MinHash signatures for new texts and add them to the LSH model:**

In [7]:
# Two extra texts to add to the existing model; both repeat sentences already
# present in `content` (texts 1 and 2).
new_text = [
    'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',
    (
        'Jupiter moving out of the inner Solar System would have allowed the formation of '
        'inner planets.'
    ),
]

In [8]:
# Labels for the two new texts. These are strings, whereas the labels used for
# the original texts are integers, so the model ends up holding both kinds.
new_labels = [
    'doc1',
    'doc2',
]

In [9]:
# Hash the new texts with the same settings that were used for `minhash`
# (n_gram, permutations, hash_bits, seed) -- presumably required so the new
# signatures are comparable with those already in the model; confirm in akin docs.
new_minhash = MinHash(
    new_text,
    n_gram=9,
    permutations=100,
    hash_bits=64,
    seed=3,
)

# Add the new signatures and their labels to the existing LSH model.
lsh.update(new_minhash, new_labels)

**List the labels currently held by the LSH model:**

In [10]:
# Show the labels currently held by the model, as a list.
lsh.contains()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 'doc1', 'doc2']

**Remove text and label from model:**

In [11]:
# `remove` changes `lsh` in place, so this cell is not idempotent; the
# `contains()` call afterwards confirms that label 5 is no longer in the model.
lsh.remove(5)
lsh.contains()

[1, 2, 3, 4, 6, 7, 8, 9, 'doc1', 'doc2']

**Return adjacency list for all similar texts:**

In [12]:
# Build a dict that maps every label in the model to a list of the labels of
# its similar texts, passing 0.55 as the `min_jaccard` threshold. Labels with
# no match map to an empty list.
adjacency_list = lsh.adjacency_list(min_jaccard=0.55)
adjacency_list

{1: ['doc1', 4],
 2: ['doc2'],
 3: [],
 4: [1, 'doc1'],
 6: [],
 7: [],
 8: [],
 9: [],
 'doc1': [1, 4],
 'doc2': [2]}