In [30]:
import numpy as np
from datasketch import MinHash


def generate_minhash_signatures(data, num_perm=128):
    """Compute one MinHash signature per row of a feature matrix.

    Each row of ``data`` is treated as a set whose members are the column
    indices holding a value greater than zero; that set is fed to a fresh
    ``MinHash`` object and its digest becomes the row's signature.

    Args:
        data: 2-D array-like exposing ``shape`` and ``data[i, j]`` indexing.
        num_perm: Number of permutation functions, i.e. the signature length.

    Returns:
        ``np.ndarray`` of shape ``(data.shape[0], num_perm)`` and dtype
        ``uint64``. Hash values are kept as integers on purpose: float32 has
        a 24-bit significand, so casting to it can round distinct hash values
        to the same number and create false bucket collisions downstream.
    """
    n_rows, n_cols = data.shape
    # Preallocating keeps the result 2-D even when there are no rows.
    signatures = np.empty((n_rows, num_perm), dtype=np.uint64)
    for i in range(n_rows):
        minhash_obj = MinHash(num_perm=num_perm)
        for j in range(n_cols):
            if data[i, j] > 0:
                # The column index is the set element; it is passed as a
                # one-element array, exactly as before, so hashes are unchanged.
                minhash_obj.update(np.array([j]))
        signatures[i] = minhash_obj.digest()
    return signatures


# Toy stand-in for a high-dimensional sparse dataset: one row per sample,
# one column per feature.
data = np.array([
    [1, 0, 0, 1],
    [1, 1, 0, 1],
    [0, 1, 0, 1],
    [0, 1, 1, 1],
    [1, 1, 1, 1],
])

# Signature length: how many hash permutations are computed per sample.
num_perm = 128

# Generate the MinHash signatures, one per row of `data`.
minhash_signatures = generate_minhash_signatures(data, num_perm=num_perm)

In [31]:
# Sanity check: expect one signature row per sample and num_perm values per row.
minhash_signatures.shape

(5, 128)

In [35]:
# Turn the signatures into a 0/1 bucket-membership matrix: for every
# permutation, rows sharing the same hash value fall into the same bucket, and
# each bucket holding at least `min_bucket_size` rows becomes one column.
min_bucket_size = 2

n_rows = minhash_signatures.shape[0]
# Seed with an empty block so the concatenation below also works when there
# are no permutations or no bucket survives the size filter.
bucket_blocks = [np.zeros((n_rows, 0), dtype=np.uint8)]
for j in range(minhash_signatures.shape[1]):
    # Columns are sized from the buckets actually present instead of a fixed
    # per-permutation stride: a fixed stride of data.shape[1] overlaps with
    # the next permutation (or runs out of bounds) whenever a permutation has
    # more distinct values than there are features.
    _, bucket_of_row, bucket_sizes = np.unique(
        minhash_signatures[:, j], return_inverse=True, return_counts=True
    )
    kept_buckets = np.flatnonzero(bucket_sizes >= min_bucket_size)
    # Broadcasting compares every row's bucket id against every kept bucket,
    # giving an (n_rows, n_kept) indicator block.
    block = (bucket_of_row.reshape(-1, 1) == kept_buckets.reshape(1, -1)).astype(np.uint8)
    bucket_blocks.append(block)

minhash_matrix = np.concatenate(bucket_blocks, axis=1)

In [36]:
# Inspect the 0/1 bucket-membership matrix: one row per sample, one column per retained bucket.
minhash_matrix

array([[1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
        1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
        0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
        0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,
        1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0,
        1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1,
        1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0],
       [1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
        1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
        1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
        0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0,
        1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,

In [37]:
# Number of retained buckets that samples 0 and 1 share.
# NOTE(review): both operands are uint8, so the product is presumably computed in uint8 and would wrap above 255 — confirm before raising num_perm.
minhash_matrix[0] @ minhash_matrix[1]

88

In [38]:
# Number of retained buckets that samples 0 and 2 share.
# NOTE(review): both operands are uint8, so the product is presumably computed in uint8 and would wrap above 255 — confirm before raising num_perm.
minhash_matrix[0] @ minhash_matrix[2]

45

In [39]:
# Number of retained buckets that samples 3 and 4 share.
# NOTE(review): both operands are uint8, so the product is presumably computed in uint8 and would wrap above 255 — confirm before raising num_perm.
minhash_matrix[3] @ minhash_matrix[4]

95