# Read Data

In [2]:
import pandas as pd
import numpy as np

# Load the raw event log; the loop below reads its
# from / to / type / user_id columns.
df = pd.read_csv('data/SGD_TESTING.csv')
print(df)

# Group the events per user: user_id -> list of [from, to, type] triples,
# in row order. The loop variable is called `event_type` (not `type`) so the
# builtin `type` is not shadowed for the rest of the notebook, and the
# timestamp column is no longer iterated because it was never used.
dic = {}
for fr, to, event_type, user_id in zip(df["from"], df["to"], df["type"], df["user_id"]):
    dic.setdefault(user_id, []).append([fr, to, event_type])

# Flatten each user's triples into a single 1-D array and collect them.
# Iterating over the dictionary keys (instead of range(1, len(dic))) makes
# sure the last user is processed too and does not require the ids to be
# exactly 1..N.
hashes = []
for user_id in sorted(dic):
    dic[user_id] = np.array(dic[user_id]).flatten()
    hashes.append(dic[user_id])


print(len(dic))

      from    to                timestamp type  user_id
0      NaN    S0  2024-05-26 14:07:50.000  Req        1
1       S0    S1  2024-05-26 14:07:50.000  Req        1
2       S1  S1_2  2024-05-26 14:07:50.000  Req        1
3     S1_2    S1  2024-05-26 14:07:50.003  Res        1
4       S1    S0  2024-05-26 14:07:50.003  Res        1
...    ...   ...                      ...  ...      ...
9381    S0    S4  2024-06-15 09:49:30.024  Req      361
9382    S4  S4_2  2024-06-15 09:49:30.024  Req      361
9383  S4_2    S4  2024-06-15 09:49:30.034  Res      361
9384    S4    S0  2024-06-15 09:49:30.034  Res      361
9385    S0   NaN  2024-06-15 09:49:30.034  Res      361

[9386 rows x 5 columns]
361


# Functions

In [3]:
from random import shuffle
import math
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.

    Both arguments are passed to numpy's set routines, so duplicates inside
    either collection are ignored. When both collections are empty the
    union is empty and 0 is returned instead of dividing by zero.
    """
    shared_count = len(np.intersect1d(set1, set2))
    total_count = len(np.union1d(set1, set2))
    if total_count == 0:
        return 0
    return shared_count / total_count

def create_hash_func(size: int, vocab):
    """Return one random permutation of the ranks 1..len(vocab).

    The permutation acts as a single minhash "hash function". Its length
    is taken from `vocab`; the `size` argument is accepted but not read.
    """
    permutation = [rank for rank in range(1, len(vocab) + 1)]
    shuffle(permutation)
    return permutation

def build_minhash_func(vocab_size: int, nbits: int, vocab: list):
    """Build `nbits` independent minhash permutations over `vocab`.

    Parameters:
    vocab_size (int): Forwarded to `create_hash_func` as its `size` argument.
    nbits (int): Number of permutations to generate (the signature length).
    vocab (list): The shingle vocabulary; it is a sized collection, not an
        int as the previous annotation claimed, because its length defines
        the size of each permutation.

    Returns:
    list: `nbits` lists, each produced by one `create_hash_func` call.
    """
    return [create_hash_func(vocab_size, vocab) for _ in range(nbits)]

def create_hash(vector: list, vocab, minhash_func):
    """Compute the minhash signature of a one-hot encoded `vector`.

    For every permutation in `minhash_func` the ranks 1..len(vocab) are
    visited in increasing order; the position of the first rank whose entry
    in `vector` equals 1 is appended to the signature.

    Parameters:
    vector (list): One-hot encoding, indexed by permutation position.
    vocab: Sized collection; only its length is used (highest rank visited).
    minhash_func: Iterable of permutations (lists of ranks).

    Returns:
    list: One position per permutation that hit a 1. A permutation that
    never hits a 1 (e.g. an all-zero vector) contributes no entry, so the
    signature can be shorter than `minhash_func`.

    Raises:
    ValueError: If a visited rank is missing from a permutation.
    """
    signature = []
    highest_rank = len(vocab)
    for func in minhash_func:
        # Invert the permutation once (rank -> first position holding it)
        # instead of calling func.index(rank) for every rank, which made
        # each permutation cost O(n^2).
        position_of_rank = {}
        for position, rank in enumerate(func):
            position_of_rank.setdefault(rank, position)
        for rank in range(1, highest_rank + 1):
            if rank not in position_of_rank:
                # Same failure mode as list.index on a missing value.
                raise ValueError(f"{rank} is not in list")
            idx = position_of_rank[rank]
            if vector[idx] == 1:
                signature.append(idx)
                break
    return signature


def shingle(text, k):
    """Return the set of all length-`k` contiguous slices of `text`.

    A window of width `k` slides over `text` one position at a time; when
    `text` is shorter than `k` the result is the empty set.
    """
    window_count = len(text) - k + 1
    return {text[start:start + k] for start in range(window_count)}



# Minhash and LSH

In [4]:
from datasketch import MinHash, MinHashLSH

# LSH configuration, named once so the values cannot drift apart.
# NUM_PERM is used for the index and for every signature; datasketch
# expects the MinHash objects inserted into an index to have been built
# with the same number of permutations as the index itself.
LSH_THRESHOLD = 0.8   # Jaccard threshold the LSH index is tuned for
NUM_PERM = 128        # number of permutations per MinHash signature
SHINGLE_SIZE = 3      # character width of each shingle

lsh = MinHashLSH(threshold=LSH_THRESHOLD, num_perm=NUM_PERM)
minhashes = {}
hashvalues = {}  # not used in this cell; kept so existing references still resolve

for user_id in dic:
    # Turn every entry of the user's event data into a string and concatenate
    # them all into one text, which is then cut into character shingles.
    flattened_list = ["".join(map(str, sublist)) for sublist in dic[user_id]]
    shingles = shingle("".join(flattened_list), SHINGLE_SIZE)
    m = MinHash(num_perm=NUM_PERM)
    for shingle_item in shingles:
        m.update(shingle_item.encode('utf8'))
    # Signatures are stored under the original user_id, while the LSH index
    # is keyed by the user_id rendered as a string.
    minhashes[user_id] = m
    lsh.insert(f"{user_id}", m)

def find_neighbors(user_id):
    """Return the LSH query result for `user_id`'s MinHash signature.

    Looks the signature up in the module-level `minhashes` mapping and
    queries the module-level `lsh` index with it. The result holds index
    keys, which were inserted as strings. A `user_id` without a stored
    signature yields an empty list.
    """
    if user_id not in minhashes:
        return []
    return lsh.query(minhashes[user_id])

# Spot check: print every key the LSH index returns for user 2's signature.
user_id_check = 2
neighbors = find_neighbors(user_id_check)
print(f"LSH neighbors for user {user_id_check}:", neighbors)

LSH neighbors for user 2: ['112', '354', '338', '3', '8', '14', '290', '258', '309', '115', '79', '281', '330', '143', '20', '131', '148', '182', '289', '252', '66', '207', '47', '61', '262', '237', '160', '236', '360', '21', '43', '244', '155', '29', '326', '288', '109', '282', '145', '10', '37', '225', '261', '270', '319', '5', '31', '305', '234', '214', '193', '230', '55', '94', '68', '227', '56', '154', '45', '70', '247', '54', '96', '248', '40', '36', '142', '38', '17', '235', '287', '260', '34', '297', '301', '132', '334', '88', '42', '276', '215', '83', '331', '146', '310', '251', '233', '198', '200', '140', '124', '219', '69', '275', '320', '174', '129', '149', '52', '76', '128', '75', '324', '294', '311', '30', '337', '224', '48', '210', '136', '278', '359', '39', '333', '152', '231', '65', '119', '158', '183', '217', '186', '19', '316', '93', '82', '240', '308', '150', '4', '357', '177', '91', '277', '303', '336', '162', '2', '257', '168', '114', '195', '27', '178', '239', '2

In [9]:

# Greedily partition the users into disjoint buckets: each user (in id
# order) seeds a bucket made of its LSH neighbors that have not already
# been placed in an earlier bucket. Seeds whose neighbors are all taken
# produce no bucket.
buckets = {}
assigned = set()  # every neighbor key already placed in some bucket
# Iterate over the actual user ids rather than range(1, len(dic)), which
# stopped one short and never used the last user as a seed.
for i in sorted(dic):
    initial_bucket = find_neighbors(i)
    end_bucket = [item for item in initial_bucket if item not in assigned]
    if end_bucket:
        buckets[i] = end_bucket
        # Update only after the bucket is complete, so membership is tested
        # against previously committed buckets exactly as before.
        assigned.update(end_bucket)
print(buckets)

print(buckets[1])
print(minhashes)


{1: ['225', '100', '306', '283', '295', '9', '187', '11', '253', '79', '101', '1', '71', '304', '189', '33', '345', '94', '80', '250', '104', '125', '243', '27', '153', '194', '322', '226', '63', '246', '291', '249', '92', '39', '333', '206', '190', '235', '64', '238', '106', '157', '111', '188', '88', '327', '353', '32', '202', '185', '244', '232', '241', '4', '325', '240', '74', '97', '208', '346', '315', '141', '299', '57', '50', '72', '37', '123'], 2: ['112', '354', '338', '3', '8', '14', '290', '258', '309', '115', '281', '330', '143', '20', '131', '148', '182', '289', '252', '66', '207', '47', '61', '262', '237', '160', '236', '360', '21', '43', '155', '29', '326', '288', '109', '282', '145', '10', '261', '270', '319', '5', '31', '305', '234', '214', '193', '230', '55', '68', '227', '56', '154', '45', '70', '247', '54', '96', '248', '40', '36', '142', '38', '17', '287', '260', '34', '297', '301', '132', '334', '42', '276', '215', '83', '331', '146', '310', '251', '233', '198', '2

In [7]:
# Estimate the Jaccard similarity for every ordered pair of distinct users
# that share a bucket. Both (a, b) and (b, a) are recorded.
sims = []
for key, members in buckets.items():
    for user_id_1 in members:
        for user_id_2 in members:
            if user_id_1 == user_id_2:
                continue
            # Bucket members are string keys; signatures are stored under ints.
            sig_1 = minhashes[int(user_id_1)]
            sig_2 = minhashes[int(user_id_2)]
            sim = sig_1.jaccard(sig_2)
            print(user_id_1, user_id_2, sim)
            sims.append([key, user_id_1, user_id_2, sim])

# NOTE(review): this rebinds `df`, which earlier held the raw event log.
result_columns = ["bucket", "user_1", "user_2", "jaccard-value"]
df = pd.DataFrame(sims, columns=result_columns)
df.to_csv("output/SGD_sim.csv")
df.head()

225 100 0.890625
225 306 0.7734375
225 283 0.859375
225 295 0.765625
225 9 0.765625
225 187 0.8828125
225 11 0.78125
225 253 0.7890625
225 79 0.9609375
225 101 0.859375
225 1 0.859375
225 71 0.7890625
225 304 0.921875
225 189 0.8203125
225 33 0.8046875
225 345 0.8515625
225 94 1.0
225 80 0.8125
225 250 0.7578125
225 104 0.796875
225 125 0.8125
225 243 0.8046875
225 27 0.9453125
225 153 0.796875
225 194 0.8125
225 322 0.8125
225 226 0.8046875
225 63 0.8359375
225 246 0.859375
225 291 0.7734375
225 249 0.921875
225 92 0.8515625
225 39 1.0
225 333 0.9453125
225 206 0.8515625
225 190 0.875
225 235 0.8125
225 64 0.859375
225 238 0.734375
225 106 0.8125
225 157 0.921875
225 111 0.78125
225 188 0.84375
225 88 0.9453125
225 327 0.8125
225 353 0.8125
225 32 0.8359375
225 202 0.96875
225 185 0.8046875
225 244 0.9453125
225 232 0.890625
225 241 0.765625
225 4 0.9453125
225 325 0.7578125
225 240 0.9453125
225 74 0.7109375
225 97 0.78125
225 208 0.921875
225 346 0.78125
225 315 0.8125
225 141 0.945

Unnamed: 0,bucket,user_1,user_2,approximate jaccard
0,1,225,100,0.890625
1,1,225,306,0.773438
2,1,225,283,0.859375
3,1,225,295,0.765625
4,1,225,9,0.765625


# TRASH

In [None]:
# #shingle
# print("".join(dic[1]))
# print("".join(dic[6]))

# arr1 = shingle("".join(dic[1]), 3)
# arr2 = shingle("".join(dic[6]), 3)

# print(arr1)
# print(arr2)
# #vocab for 1hot
# vocab = list(arr1.union(arr2))
# #print(vocab)

# #1hot-encoding
# x = [1 if item in arr1 else 0 for item in vocab]
# y = [1 if item in arr2 else 0 for item in vocab]

# print(x,y)


# minhash_func = build_minhash_func(len(vocab), 100, vocab)

# arr1_sig = create_hash(x, vocab, minhash_func)
# arr2_sig = create_hash(y, vocab, minhash_func)

# print(arr1_sig)
# print(arr2_sig)

# print(jaccard_similarity(arr1_sig, arr2_sig))
# print(jaccard_similarity(dic[1], dic[6]))



In [None]:
# import numpy as np
# from sklearn.utils import murmurhash3_32
# import pandas as pd
# from collections import defaultdict

# def lsh_hash_signature(signature, num_bands, rows_per_band):
#     """
#     Hashes a MinHash signature into buckets using Locality Sensitive Hashing (LSH).

#     Parameters:
#     signature (list): The MinHash signature.
#     num_bands (int): Number of bands.
#     rows_per_band (int): Number of rows per band.

#     Returns:
#     list: A list of hash values, one for each band.
#     """
#     hash_bands = []
#     for i in range(num_bands):
#         start_index = i * rows_per_band
#         end_index = start_index + rows_per_band
#         band = tuple(signature[start_index:end_index])
#         hash_bands.append(murmurhash3_32(band))
#     return hash_bands

# def bucket_signatures(signatures, num_bands, rows_per_band):
#     """
#     Buckets MinHash signatures into LSH buckets.

#     Parameters:
#     signatures (list): List of MinHash signatures.
#     num_bands (int): Number of bands.
#     rows_per_band (int): Number of rows per band.

#     Returns:
#     dict: A dictionary where keys are bucket identifiers and values are lists of indices of signatures in that bucket.
#     """
#     buckets = defaultdict(list)
#     for index, signature in enumerate(signatures):
#         hash_bands = lsh_hash_signature(signature, num_bands, rows_per_band)
#         for band_hash in hash_bands:
#             buckets[band_hash].append(index)
#     return buckets

# def compare_buckets(buckets, signatures):
#     """
#     Compare items within the same LSH bucket to identify similar pairs.

#     Parameters:
#     buckets (dict): LSH buckets.
#     signatures (list): List of MinHash signatures.

#     Returns:
#     list: List of tuples representing pairs of similar items and their Jaccard similarity.
#     """
#     similar_pairs = []
#     for bucket, indices in buckets.items():
#         for i in range(len(indices)):
#             for j in range(i + 1, len(indices)):
#                 index1, index2 = indices[i], indices[j]
#                 jacc = jaccard_similarity(signatures[index1], signatures[index2])
#                 similar_pairs.append((index1, index2, jacc))
#     return similar_pairs




# # Assuming you already have a function to create MinHash signatures, e.g., create_minhash_signature
# # and a function to calculate Jaccard similarity, e.g., jaccard_similarity

# # Example: MinHash signatures for your items
# #signatures = [create_minhash_signature(shingle("".join(dic[i]), 3), 100) for i in range(1, len(dic))]#

# # Parameters for LSH
# num_bands = 20
# rows_per_band = 5

# # Bucket signatures using LSH
# buckets = bucket_signatures(signatures, num_bands, rows_per_band)

# # Compare items within each bucket
# similar_pairs = compare_buckets(buckets, signatures)

# # Create a DataFrame to store the results
# df = pd.DataFrame(similar_pairs, columns=["Index1", "Index2", "JaccardSimilarity"])
# print(df.head())

# signatures = []
# for x in range(1,len(dic)):
#     for y in range(x+1, len(dic)-1):
#         arr1 = shingle("".join(dic[x]), 3)
#         arr2 = shingle("".join(dic[y]), 3)
#         vocab = list(arr1.union(arr2))
#         arr1_hot = [1 if item in arr1 else 0 for item in vocab]
#         arr2_hot = [1 if item in arr2 else 0 for item in vocab]
#         minhash_func = build_minhash_func(len(vocab), 20, vocab)
#         arr1_sig = create_hash(arr1_hot, vocab, minhash_func)
#         arr2_sig = create_hash(arr2_hot, vocab, minhash_func)
#         jacc = jaccard_similarity(arr1_sig, arr2_sig)
#         print(x,y,jacc)
#         signatures.append([x,y, jacc])

# df = pd.DataFrame(signatures)
# df.head()
        