In [7]:
import pandas as pd
import numpy as np
from datasketch import MinHash, MinHashLSH


In [10]:
df = pd.read_csv('SGD_books testing.csv')

user_process_dict = df.groupby('user_id').apply(lambda x: set((row['from'], row['to'], row['type']) for _, row in x.iterrows())).to_dict()
print(user_process_dict)

def jaccard_distance(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return 1 - intersection / union

user_ids = list(user_process_dict.keys())

distance_matrix = np.zeros((len(user_ids), len(user_ids)))

for i in range(len(user_ids)):
    for j in range(len(user_ids)):
        if i != j:
            set1 = user_process_dict[user_ids[i]]
            set2 = user_process_dict[user_ids[j]]
            distance_matrix[i, j] = jaccard_distance(set1, set2)
        else:
            distance_matrix[i, j] = 0  # Jaccard distance with itself is 0

jaccard_similarity_matrix = 1 - distance_matrix

jaccard_similarity_df = pd.DataFrame(jaccard_similarity_matrix, index=user_ids, columns=user_ids)

jaccard_similarity_df.to_csv('jaccard_similarity.csv', index=True)

print(jaccard_similarity_df.head())


{1: {(nan, 'S0', 'Req'), ('S4', 'S4_1', 'Req'), ('S4_1', 'S4', 'Res'), ('S0', nan, 'Res'), ('S0', 'S2', 'Req'), ('S4', 'S0', 'Res'), ('S2_2', 'S2', 'Res'), ('S2_1', 'S2', 'Res'), ('S0', 'S4', 'Req'), ('S0', 'S3', 'Req'), ('S4', 'S4_2', 'Req'), ('S2', 'S0', 'Res'), ('S1', 'S1_1', 'Req'), ('S1_1', 'S1', 'Res'), ('S4_2', 'S4', 'Res'), ('S3', 'S0', 'Res'), ('S2', 'S2_3', 'Req'), ('S0', 'S1', 'Req'), ('S3_3', 'S3', 'Res'), ('S3', 'S3_3', 'Req'), ('S2', 'S2_1', 'Req'), ('S2', 'S2_2', 'Req'), ('S1', 'S0', 'Res'), ('S2_3', 'S2', 'Res')}, 2: {(nan, 'S0', 'Req'), ('S4', 'S4_1', 'Req'), ('S4_1', 'S4', 'Res'), ('S0', nan, 'Res'), ('S0', 'S2', 'Req'), ('S4', 'S0', 'Res'), ('S2_2', 'S2', 'Res'), ('S2_1', 'S2', 'Res'), ('S0', 'S4', 'Req'), ('S4', 'S4_3', 'Req'), ('S0', 'S3', 'Req'), ('S2', 'S0', 'Res'), ('S3', 'S0', 'Res'), ('S2', 'S2_3', 'Req'), ('S0', 'S1', 'Req'), ('S1', 'S1_2', 'Req'), ('S3_1', 'S3', 'Res'), ('S3', 'S3_1', 'Req'), ('S2', 'S2_1', 'Req'), ('S2', 'S2_2', 'Req'), ('S1_2', 'S1', 'Res'

In [12]:
minhashes = {}
lsh = MinHashLSH(threshold=0.5, num_perm=128)

for user_id, process_set in user_process_dict.items():
    minhash = MinHash(num_perm=128)
    for process in process_set:
        process_str = ','.join(map(str, process))  # Convert tuple to string
        minhash.update(process_str.encode('utf-8'))  # Encode the string
    minhashes[user_id] = minhash
    lsh.insert(user_id, minhash)

similar_sets = {}
for user_id, minhash in minhashes.items():
    similar_sets[user_id] = lsh.query(minhash)

for user_id, similar_user_ids in similar_sets.items():
    print(f"Similar sets for user {user_id}:")
    for similar_user_id in similar_user_ids:
        if user_id != similar_user_id:
            print(f"    - User {similar_user_id}")

Similar sets for user 1:
    - User 2
    - User 3
    - User 4
    - User 5
Similar sets for user 2:
    - User 1
    - User 3
    - User 4
    - User 5
Similar sets for user 3:
    - User 1
    - User 2
    - User 4
    - User 5
Similar sets for user 4:
    - User 1
    - User 2
    - User 3
    - User 5
Similar sets for user 5:
    - User 1
    - User 2
    - User 3
    - User 4


In [13]:
minhashes = {}
lsh = MinHashLSH(threshold=0.5, num_perm=128)

# Compute MinHash for each set and add it to MinHashLSH
for user_id, process_set in user_process_dict.items():
    minhash = MinHash(num_perm=128)
    for process in process_set:
        process_str = ','.join(map(str, process)) 
        minhash.update(process_str.encode('utf-8'))  
    minhashes[user_id] = minhash
    lsh.insert(user_id, minhash)

# Find similar sets
similar_sets = {}
for user_id, minhash in minhashes.items():
    similar_sets[user_id] = lsh.query(minhash)

minhash_representations = {}
for user_id, similar_user_ids in similar_sets.items():
    minhash_representations[user_id] = minhashes[user_id].digest()

for user_id, minhash_representation in minhash_representations.items():
    print(f"MinHash representation for user number {user_id}: {minhash_representation}")

MinHash representation for user 1: [  1676610  69674835 223371395 186089373 402068757    202566  49736264
 109398140   6336338 142631300  35952084  19011179  29957138 383256750
  99639018  74278649 116596732  28772084 746444409 477728452 184840280
 244073869 149484579  62135177 932889344  54812726 722054857  45676398
  18878923 216942194 102211254  47422630 264753139  46279002   9989119
 325688895  72041476   7822719   3423636  32049267 231705661  29197968
 185715741 219726474 572323037  67448868   7682599  96164636 380147586
 147591171  10860472 152587942 120148635 116911840  74840904 148853043
 244106167 231409722 222261613 144681414  15470093 116976158  54917665
  76360820  92423857  45493194  24449920  83143141 147367173 150671446
  42423509 398944204  72394644  73065434 304567650  91658194 217893900
   8873528 296394220 158086906 264325788 291669474  34950770  73210824
 313611609 279143725  99473229 181675875  51720879  42498777  11853842
 114579177 218116369  49166094 179170661  