In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("test").getOrCreate()

In [2]:
#df = spark.read.csv("data/SGD_TESTING.csv").show()

def shingle(text, k):
    shingle_set = []
    for i in range(len(text)-k +1):
        shingle_set.append(text[i:i+k])
    return set(shingle_set)


In [3]:
data = spark.read.csv("data/SGD_TESTING.csv", header=True, inferSchema=True)

## First we find highly similar users with jaccard of 0.95

In [20]:
#from pyspark.ml.feature import MinHashLSH
from pyspark.ml.feature import Word2Vec, StringIndexer, VectorAssembler, DataFrame
from pyspark.sql.functions import col, array, explode, concat_ws, collect_list, udf, lit
import numpy as np 
from pyspark.sql.types import StringType, ArrayType, BooleanType
from datasketch import MinHash, MinHashLSH


df = data.withColumn("arrayColumn", concat_ws("","from", "to", "type")).withColumn("Minhash", lit(""))

df_grouped = df.groupBy("user_id").agg(
    concat_ws("",collect_list("arrayColumn")).alias("features")
)

#replace stuff with 0.95 jacc with same rep
lsh = MinHashLSH(threshold=0.95, num_perm=128)
minhashes = {}
hashvalues = {}


replacement_candidates = {}

for features in df_grouped.collect():
    shingles = shingle(features["features"], 3)
    m = MinHash(num_perm=128)
    for shingle_item in shingles:
        m.update(shingle_item.encode("utf8"))
    minhashes[int(features["user_id"])] = m
    lsh.insert(int(features["user_id"]), m)
    neigbours = lsh.query(m)
    print(features["user_id"], neigbours)
    replacement_candidates[features["user_id"]] = neigbours

print(replacement_candidates)
print(minhashes)
# sig_1 = minhashes[int(1)]
# sig_2 = minhashes[int(2)]
# print(MinHash.jaccard(sig_1, sig_2))



1 [1]
2 [2]
3 [3]
4 [4]
5 [4, 5]
6 [6]
7 [7]
8 [8]
9 [9]
10 [10, 4, 5]
11 [11]
12 [12]
13 [13]
14 [14]
15 [15]
16 [16, 2]
17 [17]
18 [18]
19 [19]
20 [20]
21 [10, 5, 21]
22 [13, 22]
23 [23]
24 [24, 7]
25 [25]
26 [26]
27 [10, 27, 4, 5]
28 [28]
29 [29]
30 [17, 30]
31 [25, 31]
32 [32]
33 [33]
34 [34]
35 [8, 35]
36 [36]
37 [4, 5, 37, 10, 27]
38 [38, 14]
39 [39]
40 [40, 38, 14]
41 [41]
42 [42, 29]
43 [43]
44 [44]
45 [45]
46 [46]
47 [47]
48 [48, 20]
49 [49]
50 [49, 50]
51 [51, 13, 22]
52 [52]
53 [28, 53]
54 [54]
55 [40, 14, 38, 55]
56 [56, 42, 29]
57 [57]
58 [58]
59 [59]
60 [60]
61 [4, 5, 37, 10, 21, 27, 61]
62 [62]
63 [32, 63]
64 [64, 1]
65 [65, 17, 30]
66 [66, 4, 5, 37, 10, 21, 27, 61]
67 [67]
68 [56, 42, 68, 29]
69 [69]
70 [38, 70, 40, 14, 55]
71 [71]
72 [72]
73 [73]
74 [74]
75 [68, 42, 75, 56, 29]
76 [76]
77 [25, 77, 31]
78 [62, 78]
79 [4, 37, 79, 21, 27]
80 [72, 80]
81 [81, 9]
82 [68, 42, 75, 82, 56, 29]
83 [38, 70, 40, 14, 83, 55]
84 [51, 84, 13, 22]
85 [74, 85]
86 [49, 50, 86]
87 [87]


## Replace with same represenation

In [33]:
from collections import defaultdict

representative_mapping = {}

group_mapping = defaultdict(list)

# Iterate over the user neighbors dictionary
for user, neighbors in replacement_candidates.items():
    neighbors_sorted = tuple(sorted(neighbors))
    if neighbors_sorted in representative_mapping:
        representative = representative_mapping[neighbors_sorted]
    else:
        representative = neighbors_sorted[0]
        for neighbor in neighbors_sorted:
            representative_mapping[neighbor] = representative
    
    representative_mapping[user] = representative
    group_mapping[representative].append(user)

new_user_neighbors = {}
for representative, users in group_mapping.items():
    new_user_neighbors[representative] = users

print(new_user_neighbors)
new_users = []
for key, value in new_user_neighbors.items():
    new_users.append(key)

filtered_df = df_grouped[df_grouped['user_id'].isin(new_users)]

filtered_df.show()


{1: [1, 64, 101], 2: [2, 16, 96, 205, 257, 305, 349], 3: [3, 148, 160, 252, 270, 309, 326], 4: [4, 5, 10, 27, 37, 61, 66, 79, 88, 126, 132, 141, 240, 244, 251, 299, 333], 6: [6, 144, 209, 250], 7: [7, 24, 110, 138, 204, 268, 329, 332, 347], 8: [8, 35, 129, 197, 207, 282, 316], 9: [9, 81, 153, 241], 11: [11, 111, 118], 12: [12, 243, 264, 274, 318], 13: [13, 22, 51, 84, 176], 14: [14, 38, 40, 55, 70, 83, 91, 93, 109, 124, 140, 142, 145, 150, 152, 182, 183, 276, 288, 313, 324], 15: [15, 173, 181], 17: [17, 30, 65, 149, 198, 233, 260, 277, 278, 287, 290], 18: [18, 267], 19: [19, 168, 177, 210, 214, 258], 20: [20, 48, 119, 136, 146, 297, 301, 354], 5: [21], 23: [23, 122, 130, 213, 340], 25: [25, 31, 77, 115, 155, 186, 203, 221, 261, 281, 337], 26: [26, 137], 28: [28, 53, 158, 222, 248, 336], 29: [29, 42, 56, 68, 75, 82, 105, 112, 162, 230, 234, 262, 294, 359], 32: [32, 63, 189, 295], 33: [33], 34: [34, 134], 36: [36, 95, 231], 39: [39, 94, 193, 202, 225, 239], 41: [41, 184, 187, 254, 255], 

# We do minhashlsh again on the smaller dataset to find similar items with jacc 0.8

In [34]:
lsh = MinHashLSH(threshold=0.8, num_perm=128)
minhashes = {}
hashvalues = {}


final_buckets = {}

for features in filtered_df.collect():
    shingles = shingle(features["features"], 3)
    m = MinHash(num_perm=128)
    for shingle_item in shingles:
        m.update(shingle_item.encode("utf8"))
    minhashes[int(features["user_id"])] = m
    lsh.insert(int(features["user_id"]), m)
    neigbours = lsh.query(m)
    print(features["user_id"], neigbours)
    final_buckets[features["user_id"]] = neigbours

print(final_buckets)

85 [85]
296 [296, 85]
34 [34]
81 [296, 81, 85]
28 [81, 28]
76 [34, 76]
26 [296, 76, 81, 85, 26]
44 [28, 85, 44]
192 [192, 296, 76, 81, 85, 26]
253 [34, 253]
103 [28, 44, 103]
12 [192, 296, 12, 44, 81, 85, 26]
22 [103, 296, 44, 85, 22, 28]
128 [128, 34, 76]
319 [103, 44, 22, 26, 28, 319]
157 [253, 157, 319]
47 [192, 128, 34, 76, 47, 26, 28, 319]
291 [192, 291, 296, 44, 85, 26, 28]
1 [1, 291, 157, 253]
52 [128, 34, 76, 47, 52, 157, 253, 319]
212 [192, 34, 103, 76, 44, 47, 212, 52, 22, 26, 28, 319]
13 [34, 103, 296, 12, 13, 44, 76, 212, 85, 22, 28, 319]
6 [192, 6, 296, 12, 81, 85, 26]
168 [192, 128, 34, 168, 76, 44, 47, 212, 52, 26]
3 [128, 34, 3, 168, 76, 47, 212, 52, 319]
20 [128, 34, 3, 103, 168, 76, 47, 52, 20, 212, 22, 157, 319]
169 [1, 34, 6, 103, 169, 44, 13, 76, 212, 22, 28, 319]
57 [1, 20, 52, 57, 157, 253, 319]
54 [192, 103, 168, 169, 44, 76, 13, 47, 212, 54, 22, 26, 28, 319]
120 [1, 103, 168, 169, 12, 13, 44, 212, 85, 22, 54, 120, 28, 319]
96 [96, 128, 34, 3, 168, 76, 47, 52, 2

## Verification

In [35]:
sims = []
for key, value in final_buckets.items():
    for user_id_1 in final_buckets[key]:
        for user_id_2 in final_buckets[key]:
            if user_id_1 != user_id_2:
                sig_1 = minhashes[int(user_id_1)]
                sig_2 = minhashes[int(user_id_2)]
                sim = MinHash.jaccard(sig_1, sig_2)
                print(user_id_1, user_id_2, sim)
                sims.append([key, user_id_1, user_id_2, sim])


296 85 0.9609375
85 296 0.9609375
296 81 0.921875
296 85 0.9609375
81 296 0.921875
81 85 0.9140625
85 296 0.9609375
85 81 0.9140625
81 28 0.8125
28 81 0.8125
34 76 0.890625
76 34 0.890625
296 76 0.765625
296 81 0.921875
296 85 0.9609375
296 26 0.9140625
76 296 0.765625
76 81 0.7265625
76 85 0.8046875
76 26 0.8203125
81 296 0.921875
81 76 0.7265625
81 85 0.9140625
81 26 0.84375
85 296 0.9609375
85 76 0.8046875
85 81 0.9140625
85 26 0.875
26 296 0.9140625
26 76 0.8203125
26 81 0.84375
26 85 0.875
28 85 0.8515625
28 44 0.9765625
85 28 0.8515625
85 44 0.875
44 28 0.9765625
44 85 0.875
192 296 0.8671875
192 76 0.8671875
192 81 0.828125
192 85 0.90625
192 26 0.953125
296 192 0.8671875
296 76 0.765625
296 81 0.921875
296 85 0.9609375
296 26 0.9140625
76 192 0.8671875
76 296 0.765625
76 81 0.7265625
76 85 0.8046875
76 26 0.8203125
81 192 0.828125
81 296 0.921875
81 76 0.7265625
81 85 0.9140625
81 26 0.84375
85 192 0.90625
85 296 0.9609375
85 76 0.8046875
85 81 0.9140625
85 26 0.875
26 192 0.95

In [None]:
# indexer = StringIndexer(inputCol="features", outputCol="from_to_type_index")
# indexed_data = indexer.fit(actual_routes_feature).transform(actual_routes_feature)
# assembler = VectorAssembler(inputCols=["from_to_type_index"], outputCol="vector")
# actual_feature_data = assembler.transform(indexed_data)

# actual_feature_data.show()
# # def is_non_zero_vector(vector):
# #     return vector.numNonzeros() > 0

# # is_non_zero_vector_udf = udf(is_non_zero_vector, BooleanType())

# # filtered_data = actual_feature_data.filter(is_non_zero_vector_udf(col("vector")))


# mh = MinHashLSH(inputCol="vector", outputCol="hashes", numHashTables=5, seed=1003)
# model = mh.fit(actual_feature_data)

# #transformed_filtered_data = model.transform(actual_feature_data).head()
# test = model.approxNearestNeighbors()

# # transformed_filtered_data.show(truncate=False, n=50)


# #similar_items.show(truncate=False)

# def is_non_zero_vector(vector):
#     return vector.numNonzeros() > 0