In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("test").getOrCreate()

In [3]:
#df = spark.read.csv("data/SGD_TESTING.csv").show()

def shingle(text, k):
    shingle_set = []
    for i in range(len(text)-k +1):
        shingle_set.append(text[i:i+k])
    return set(shingle_set)


In [4]:
data = spark.read.csv("data/SGD.csv", header=True, inferSchema=True)

## First we find highly similar users with jaccard of 0.95

In [5]:
#from pyspark.ml.feature import MinHashLSH
from pyspark.ml.feature import Word2Vec, StringIndexer, VectorAssembler, DataFrame
from pyspark.sql.functions import col, array, explode, concat_ws, collect_list, udf, lit
import numpy as np 
from pyspark.sql.types import StringType, ArrayType, BooleanType
from datasketch import MinHash, MinHashLSH


df = data.withColumn("arrayColumn", concat_ws("","from", "to", "type")).withColumn("Minhash", lit(""))

df_grouped = df.groupBy("user_id").agg(
    concat_ws("",collect_list("arrayColumn")).alias("features")
)

print(f"Amount of processes: {df_grouped.count()}")

#replace stuff with 0.95 jacc with same rep
lsh = MinHashLSH(threshold=0.95, num_perm=128)
minhashes = {}

for features in df_grouped.collect():
    shingles = shingle(features["features"], 3)
    m = MinHash(num_perm=128)
    for shingle_item in shingles:
        m.update(shingle_item.encode("utf8"))
    minhashes[int(features["user_id"])] = m
    lsh.insert(int(features["user_id"]), m)

replacement_candidates = {}
for key in lsh.keys: 
    replacement_candidates[key] = lsh.query(minhashes[key]) 



Amount of processes: 38438


In [6]:
for i in range(1,4):
    print(len(replacement_candidates[i]),replacement_candidates[i])

visited_processes = set()
new_process_dictionary = {}

for key, values in replacement_candidates.items():
    new_values = []
    for value in values:
        if value not in visited_processes:
            visited_processes.add(value)
            new_values.append(value)
    if new_values:  # Only add non-empty lists
        new_process_dictionary[key] = sorted(new_values)


print(len(replacement_candidates))
print(len(new_process_dictionary))


375 [1, 34821, 4108, 34837, 32806, 20519, 30764, 12334, 14383, 2098, 34871, 18490, 16443, 28730, 26686, 64, 24656, 20575, 2143, 101, 16504, 22656, 12416, 20620, 10384, 10388, 4246, 30874, 16542, 2217, 28848, 34999, 26810, 10429, 35011, 10438, 10444, 35026, 2262, 37085, 10469, 12524, 8432, 6392, 4354, 37134, 12566, 20761, 22813, 12581, 22826, 37162, 16692, 6454, 33083, 18748, 35149, 4432, 2389, 4440, 33115, 20830, 24926, 2408, 22888, 22891, 31083, 18812, 6539, 10642, 10645, 8606, 27039, 24998, 10666, 8650, 2507, 8665, 475, 6624, 491, 33259, 2542, 6638, 33270, 10752, 2562, 14854, 35335, 23051, 29195, 10765, 527, 31253, 27159, 14871, 2588, 2591, 35362, 4645, 10789, 2603, 29236, 4674, 33348, 10825, 21074, 37461, 21086, 31333, 4710, 4715, 37488, 25201, 8818, 29302, 27255, 652, 2702, 33431, 666, 15014, 37555, 12985, 10943, 15043, 10948, 4806, 15046, 712, 37578, 4814, 8912, 31459, 23270, 31466, 29420, 21234, 4854, 37626, 11004, 23294, 4865, 13058, 27409, 21266, 8984, 19233, 8998, 11053, 21295

## Replace with same represenation

In [7]:
users = []
for key in new_process_dictionary:
    users.append(key)

# We do minhashlsh again on the smaller dataset to find similar items with jacc 0.8

In [43]:
lsh = MinHashLSH(threshold=0.8, num_perm=128)
minhashes = {}
hashvalues = {}


final_buckets = {}
filtered_df = df_grouped[df_grouped['user_id'].isin(users)]
print(filtered_df.count())

for features in filtered_df.collect():
    shingles = shingle(features["features"], 3)
    m = MinHash(num_perm=128)
    for shingle_item in shingles:
        m.update(shingle_item.encode("utf8"))
    minhashes[int(features["user_id"])] = m
    lsh.insert(int(features["user_id"]), m)
    neigbours = lsh.query(m)
    print(features["user_id"], neigbours)
    final_buckets[features["user_id"]] = neigbours

print(final_buckets)

118
496 [496]
243 [243]
392 [392]
31 [392, 31]
85 [243, 85]
580 [496, 580]
808 [580, 808, 392, 243, 31]
65 [392, 65, 808, 31]
53 [65, 580, 496, 85, 53]
255 [580, 808, 496, 243, 255]
133 [243, 85, 133]
296 [296, 85, 53, 133]
78 [65, 580, 392, 808, 78, 496, 53, 31, 255]
322 [322, 580, 133, 296, 496, 243, 85]
362 [808, 362, 580, 255]
673 [65, 673]
976 [673, 322, 580, 133, 296, 976, 85, 53]
108 [580, 808, 392, 108, 78, 496, 243, 53, 255]
34 [65, 34, 255, 580, 392, 808, 362, 31]
193 [193, 34, 65, 392, 808, 31]
211 [580, 392, 108, 78, 976, 496, 211, 53]
796 [193, 34, 673, 322, 243, 85, 796]
126 [65, 673, 193, 34, 392, 808, 78, 796, 126, 31]
81 [673, 322, 133, 296, 976, 81, 243, 53, 85]
28 [65, 580, 108, 78, 976, 81, 496, 211, 53, 28]
210 [193, 65, 34, 392, 362, 108, 210, 211, 126, 31]
300 [580, 296, 300, 108, 78, 496, 81, 211, 53, 85, 28, 126, 31]
2094 [580, 108, 300, 2094, 78, 976, 496, 211, 53, 28, 126, 31]
76 [65, 34, 193, 580, 255, 392, 808, 362, 76, 210, 126, 31]
688 [322, 580, 133, 296

## Verification

In [39]:
sims = {}
for key, value in final_buckets.items():
    for user_id_1 in final_buckets[key]:
        for user_id_2 in final_buckets[key]:
            if user_id_1 != user_id_2:
                sig_1 = minhashes[int(user_id_1)]
                sig_2 = minhashes[int(user_id_2)]
                sim = MinHash.jaccard(sig_1, sig_2)
                if key not in sims:
                    sims[key] = [sim]
                else:
                    sims[key].append(sim)

In [42]:
#Get averege jaccard value per bucket
from numpy import average

sims = dict(sorted(sims.items()))
for key,value in sims.items():
    print(key, average(sims[key]))


1 0.8404820261437909
3 0.8546811995967742
6 0.8566881613756614
12 0.859375
13 0.8548450630252101
16 0.8520653735632184
20 0.8580938057040999
22 0.8682598039215687
26 0.8404829545454545
27 0.8638888888888889
28 0.8585069444444444
31 0.828125
34 0.8643973214285714
40 0.8595142602495544
44 0.8810096153846154
47 0.8553605769230769
52 0.8513621794871795
53 0.8296875
54 0.8406708595387841
57 0.8478767641129032
65 0.8841145833333334
76 0.8631628787878788
78 0.8500434027777778
81 0.8745659722222222
85 0.875
86 0.8479959239130435
94 0.8509915329768271
96 0.8599443319838057
103 0.8712797619047619
108 0.85546875
111 0.8406704695767195
120 0.8560118140243902
122 0.8541209795321637
126 0.8609375
128 0.8593098958333333
133 0.890625
139 0.8611018270944741
157 0.861328125
159 0.8699404761904762
164 0.8497840447154471
168 0.8569556451612903
169 0.8526542467948718
185 0.8570449561403509
192 0.8395089285714286
193 0.88125
206 0.8520665322580645
210 0.8552083333333333
211 0.8557477678571429
212 0.84517457

In [11]:
# indexer = StringIndexer(inputCol="features", outputCol="from_to_type_index")
# indexed_data = indexer.fit(actual_routes_feature).transform(actual_routes_feature)
# assembler = VectorAssembler(inputCols=["from_to_type_index"], outputCol="vector")
# actual_feature_data = assembler.transform(indexed_data)

# actual_feature_data.show()
# # def is_non_zero_vector(vector):
# #     return vector.numNonzeros() > 0

# # is_non_zero_vector_udf = udf(is_non_zero_vector, BooleanType())

# # filtered_data = actual_feature_data.filter(is_non_zero_vector_udf(col("vector")))


# mh = MinHashLSH(inputCol="vector", outputCol="hashes", numHashTables=5, seed=1003)
# model = mh.fit(actual_feature_data)

# #transformed_filtered_data = model.transform(actual_feature_data).head()
# test = model.approxNearestNeighbors()

# # transformed_filtered_data.show(truncate=False, n=50)


# #similar_items.show(truncate=False)

# def is_non_zero_vector(vector):
#     return vector.numNonzeros() > 0

# from collections import defaultdict

# representative_mapping = {}

# group_mapping = defaultdict(list)

# # Iterate over the user neighbors dictionary
# for user, neighbors in new_process_dictionary.items():
#     neighbors_sorted = tuple(sorted(neighbors))
#     if neighbors_sorted in representative_mapping:
#         representative = representative_mapping[neighbors_sorted]
#     else:
#         representative = neighbors_sorted[0]
#         for neighbor in neighbors_sorted:
#             representative_mapping[neighbor] = representative
    
#     representative_mapping[user] = representative
#     group_mapping[representative].append(user)

# new_user_neighbors = {}
# for representative, users in group_mapping.items():
#     new_user_neighbors[representative] = users

# #print(new_user_neighbors)
# new_users = []
# for key, value in new_user_neighbors.items():
#     new_users.append(key)

# print(len(new_users))
# filtered_df = df_grouped[df_grouped['user_id'].isin(new_users)]
# print(f"Amount of processes: {filtered_df.count()}")
