In [21]:
from datasketch import MinHash
from packages.generateDataSets import SyntheticMatcherDataset
from packages.calculateStatistics import DatasetEvaluator
import time

In [3]:
from datasketch import MinHash, MinHashLSH

# Sample records
records = {
    "id1": ["N780", "C459", "H866", "G186", "E487"],
    "id2": ["N780", "C459", "H866", "V186", "S537"],
    "id3": ["L123", "M456", "X789"]
}

# Create LSH index
lsh = MinHashLSH(threshold=0.4, num_perm=128)
minhashes = {}

# Build index
for record_id, tokens in records.items():
    m = MinHash(num_perm=128)
    for token in tokens:
        m.update(token.encode('utf8'))
    minhashes[record_id] = m
    lsh.insert(record_id, m)

# Query: find similar records to id1
query_result = lsh.query(minhashes["id1"])
print(f"Similar records to id1: {query_result}")


Similar records to id1: ['id2', 'id1']


In [6]:
from datasketch import MinHash, MinHashLSH
import pandas as pd

# Your input data
data1 = [
    ['ID00005', 'N039', 'E298', 'Q412', 'V409', 'R232'],
    ['ID00009', 'R822', 'W179', 'H017', 'P323', 'F298'],
    ['ID00007', 'R449', 'X716', 'M948', 'G667', 'S702'],
    ['ID00004', 'N002', 'E396', 'N843', 'I458', 'S719'],
    ['ID10004', 'N002', 'E396', 'N853', 'I623', 'S569'],
    ['NEW72378', 'J547', 'B222', 'G492', 'R551', 'S490'],
    ['ID00008', 'N322', 'K685', 'T442', 'C825', 'W967'],
]

data2 = [
    ['ID00005', 'R746', 'E298', 'Q412', 'L291', 'R232'],
    ['ID00009', 'R822', 'W179', 'H017', 'P323', 'F298'],
    ['ID00007', 'Z011', 'X716', 'M948', 'W967', 'S702'],
    ['ID00004', 'N002', 'E396', 'N843', 'V935', 'S719'],
    ['ID10004', 'N002', 'E396', 'N553', 'I453', 'S459'],
    ['NEW80187', 'J547', 'B222', 'G492', 'W673', 'S490'],
    ['NEW30110', 'N322', 'K685', 'T432', 'C225', 'W967'],
]

# Create MinHash function
def get_minhash(features, num_perm=128):
    m = MinHash(num_perm=num_perm)
    for token in features:
        m.update(token.encode('utf8'))
    return m

# Build LSH index with data2
lsh = MinHashLSH(threshold=0.5, num_perm=128)
minhash_dict = {}

for row in data2:
    idx, features = row[0], row[1:]
    m = get_minhash(features)
    lsh.insert(idx, m)
    minhash_dict[idx] = m

# Query each data1 record
matches = {}
for row in data1:
    idx, features = row[0], row[1:]
    m = get_minhash(features)
    result = lsh.query(m)
    matches[idx] = result

# Print results
import json
print(json.dumps(matches, indent=4))

{
    "ID00005": [
        "ID00005"
    ],
    "ID00009": [
        "ID00009"
    ],
    "ID00007": [
        "ID00007"
    ],
    "ID00004": [
        "ID00004"
    ],
    "ID10004": [],
    "NEW72378": [
        "NEW80187"
    ],
    "ID00008": []
}


In [26]:
dataset = SyntheticMatcherDataset(size=10000 , true_positive_ratio=0.70, threshold=3)
df1, df2 = dataset.df1.values.tolist(), dataset.df2.values.tolist()
expected = dataset.expected

# Create MinHash function
def get_minhash(features, num_perm=128):
    m = MinHash(num_perm=num_perm)
    for token in features:
        m.update(token.encode('utf8'))
    return m

# Build LSH index with data2
lsh = MinHashLSH(threshold=0.5, num_perm=256)
minhash_dict = {}

start_time = time.time() 

for row in df2:
    idx, features = row[0], row[1:]
    m = get_minhash(features)
    lsh.insert(idx, m)
    minhash_dict[idx] = m

# Query each data1 record
matches = {}
for row in df1:
    idx, features = row[0], row[1:]
    m = get_minhash(features)
    result = lsh.query(m)
    matches[idx] = result

elapsed_time = time.time() - start_time

ValueError: Expecting minhash with length 256, got 128

In [23]:
print(json.dumps(matches, indent=4))

{
    "ID00005": [
        "ID00005"
    ],
    "ID00009": [
        "ID00009"
    ],
    "ID00007": [
        "ID00007"
    ],
    "ID00004": [
        "ID00004"
    ],
    "ID10004": [],
    "NEW72378": [
        "NEW80187"
    ],
    "ID00008": []
}
