In [73]:
import hashlib
import numpy as np

def hash_function(seed):
    """Build one member of a seeded MD5-based hash family.

    Args:
        seed: Identifies the hash function; its ``str()`` form is prepended
            to every item before hashing.

    Returns:
        A one-argument callable that maps an item to the integer value of
        the MD5 hex digest of ``str(seed) + str(item)`` (UTF-8 encoded).
    """
    prefix = str(seed)

    def hasher(x):
        # Coerce to str so non-string items (ints, floats, numpy scalars)
        # are hashed instead of raising TypeError on concatenation. Plain
        # strings are unaffected, so existing hash values do not change.
        return int(hashlib.md5((prefix + str(x)).encode('utf8')).hexdigest(), 16)

    return hasher

def minhash(signature_size, num_hashes):
    """Create a MinHash function backed by ``num_hashes`` seeded hashes.

    Args:
        signature_size: Accepted for interface compatibility only; the value
            is not used. The signature length is ``num_hashes``.
        num_hashes: Number of hash functions, seeded ``0 .. num_hashes - 1``.

    Returns:
        ``compute_minhash(chars)``: takes an iterable of items and returns a
        list of ``num_hashes`` values, the minimum hash of the items under
        each hash function. A slot is ``float('inf')`` when ``chars`` is
        empty.
    """
    # Build the hash family once instead of once per (item, seed) pair.
    hashers = [hash_function(seed) for seed in range(num_hashes)]

    def compute_minhash(chars):
        # Materialise the input so one-shot iterables can be scanned once
        # per hash function.
        items = list(chars)
        return [
            min((hasher(item) for item in items), default=float('inf'))
            for hasher in hashers
        ]

    return compute_minhash

In [89]:
import pandas as pd
import numpy as np

def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity of two collections.

    The result is ``len(intersection) / len(union)`` computed with
    ``np.intersect1d`` and ``np.union1d``, both of which return unique
    values, so duplicates in either input do not affect the result.

    Args:
        set1: First array-like of items.
        set2: Second array-like of items.

    Returns:
        A float between 0 and 1. Two empty inputs are treated as identical
        and give 1.0 instead of raising ZeroDivisionError.
    """
    union = len(np.union1d(set1, set2))
    if union == 0:
        # Both inputs are empty: avoid 0 / 0.
        return 1.0
    intersection = len(np.intersect1d(set1, set2))
    return intersection / union




# Load the event log and group every user's [from, to, type] triples.
# NOTE: this cell relies on the factory-style
# `minhash(signature_size, num_hashes)` defined in the earlier cell, not the
# `minhash(shingles, num_hashes=100)` variant defined further down.
df = pd.read_csv('SGD.csv')
dic = {}

# `event_type` avoids shadowing the builtin `type`; the timestamp column is
# not part of the hashed features, so it is not read here.
for fr, to, event_type, user_id in zip(df["from"], df["to"], df["type"], df["user_id"]):
    dic.setdefault(user_id, []).append([fr, to, event_type])

signature_size = 100
num_hashes = 1
compute_minhash = minhash(signature_size, num_hashes)

hashes = []
# Visit the user ids that are actually present, in sorted order, so output
# row k still corresponds to the k-th smallest id. Iterating over
# range(1, len(dic)) would assume contiguous ids starting at 1, drop the
# last user, and raise KeyError on any gap.
for user_id in sorted(dic):
    # Flatten the triples into one sequence of string tokens; the cast keeps
    # all-numeric rows hashable as text.
    tokens = np.array(dic[user_id]).astype(str).flatten()
    dic[user_id] = compute_minhash(tokens)
    hashes.append(dic[user_id])

df = pd.DataFrame(np.array(hashes).flatten(), columns=['hash'])

# Save the signatures as CSV.
df.to_csv('./SGD_hash.csv', index=False)

# print(compute_minhash(arr1))
# print(compute_minhash(arr2))
# jaccard_similarity(arr1, arr2)
##ANY value over 0.5 - get placed in same bucket?

# for i in range(1,len(dic)):
#     if jaccard_similarity(dic[i], dic[i+1]) > 0.95:
#         print(i,i+1)


In [1]:

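# MinHash on documents already represented as sets of k-shingles (the shingling step itself is not performed in this cell)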

import hashlib
import numpy as np


def hash_function(seed):
    """Build one member of a seeded MD5-based hash family.

    Args:
        seed: Identifies the hash function; its ``str()`` form is prepended
            to every item before hashing.

    Returns:
        A one-argument callable that maps an item to the integer value of
        the MD5 hex digest of ``str(seed) + str(item)`` (UTF-8 encoded).
    """
    prefix = str(seed)

    def hasher(x):
        # Coerce to str so non-string shingles (ints, tuples, numpy scalars)
        # are hashed instead of raising TypeError on concatenation. Plain
        # strings are unaffected, so existing hash values do not change.
        return int(hashlib.md5((prefix + str(x)).encode('utf8')).hexdigest(), 16)

    return hasher

def minhash(shingles, num_hashes=100):
    """Compute the MinHash signature of a collection of shingles.

    Args:
        shingles: Iterable of shingles to hash.
        num_hashes: Number of hash functions (seeds ``0 .. num_hashes - 1``)
            and therefore the length of the signature.

    Returns:
        A list of ``num_hashes`` values: for each hash function, the minimum
        hash over all shingles. A slot is ``float('inf')`` when ``shingles``
        is empty.
    """
    # Build the hash family once instead of once per (shingle, seed) pair.
    hashers = [hash_function(seed) for seed in range(num_hashes)]
    # Materialise the input so one-shot iterables can be scanned once per
    # hash function.
    items = list(shingles)
    return [
        min((hasher(shingle) for shingle in items), default=float('inf'))
        for hasher in hashers
    ]

# Demo: two toy documents, each given directly as a set of shingles.
document1 = {"shingle1", "shingle2", "shingle3"}
document2 = {"shingle2", "shingle3", "shingle4"}

# Build a 100-slot signature for each document with the same call.
signature1, signature2 = (
    minhash(document, num_hashes=100) for document in (document1, document2)
)

# Estimate the Jaccard similarity of two documents from their MinHash signatures
def jaccard_similarity(sig1, sig2):
    """Return the fraction of signature slots on which ``sig1`` and ``sig2`` agree.

    Args:
        sig1: First MinHash signature (sequence of hash minima).
        sig2: Second MinHash signature, of the same length as ``sig1``.

    Returns:
        A float between 0 and 1: matching slots divided by signature length.

    Raises:
        ValueError: If the signatures differ in length or are empty, since
            slot-by-slot comparison is undefined in both cases.
    """
    if len(sig1) != len(sig2):
        raise ValueError(
            f"signatures must have the same length, got {len(sig1)} and {len(sig2)}"
        )
    if len(sig1) == 0:
        raise ValueError("signatures must not be empty")
    matches = sum(1 for a, b in zip(sig1, sig2) if a == b)
    return matches / len(sig1)

# Fraction of signature slots on which the two example documents agree. For
# reference, the exact Jaccard similarity of the two example sets is
# 2 shared shingles / 4 distinct shingles = 0.5.
similarity = jaccard_similarity(signature1, signature2)
print(f"Estimated Jaccard Similarity: {similarity}")



Estimated Jaccard Similarity: 0.43
