In [97]:
import os
import shutil
import tempfile
from itertools import combinations

import numpy as np
from numpy import average
from datasketch import MinHash, MinHashLSH
from pyspark.ml.feature import Word2Vec, StringIndexer, VectorAssembler, DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, array, explode, concat_ws, collect_list, udf, lit
from pyspark.sql.types import StringType, ArrayType, BooleanType
from sklearn.cluster import KMeans


# All functions used

In [None]:
def shingle(text, k):
    """Return the set of all length-``k`` substrings (shingles) of ``text``.

    A window of width ``k`` is slid over ``text`` one position at a time.
    When ``text`` is shorter than ``k`` there is no full window and the
    result is an empty set.
    """
    window_count = len(text) - k + 1
    return {text[start:start + k] for start in range(window_count)}

def minhash_lsh(df, k_shingle, threshold=0.95, num_perm=128):
    """Find near-duplicate candidates for every user with MinHash LSH.

    Each row's ``features`` string is split into ``k_shingle``-character
    shingles, hashed into a MinHash signature and inserted into a
    ``MinHashLSH`` index. The index is then queried once per user.

    Args:
        df: object whose ``collect()`` yields rows with a ``"features"``
            string and a ``"user_id"`` convertible to ``int``.
        k_shingle: shingle length passed to ``shingle``.
        threshold: Jaccard threshold handed to ``MinHashLSH``. Defaults to
            0.95 (the value most experiments in this notebook use) so that
            two-argument calls keep working.
        num_perm: number of permutations used for both the signatures and
            the index (they must agree).

    Returns:
        dict mapping each user id to the list returned by ``lsh.query`` for
        that user's signature.
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    minhashes = {}

    # All rows are pulled to the driver; signatures are built in plain Python.
    for row in df.collect():
        user_id = int(row["user_id"])
        signature = MinHash(num_perm=num_perm)
        for shingle_item in shingle(row["features"], k_shingle):
            signature.update(shingle_item.encode("utf8"))
        minhashes[user_id] = signature
        lsh.insert(user_id, signature)

    # Query from our own id -> signature map instead of iterating the
    # index's key storage; both hold the same ids in insertion order.
    # Key: New representative, value: Similar items
    return {
        user_id: lsh.query(signature)
        for user_id, signature in minhashes.items()
    }

#Iteratively bucket unique processes together
def bucketing(replacement_candidates):
    """Greedily assign every candidate to the first bucket that lists it.

    The mapping is walked in iteration order. For each key, only the
    candidates not yet claimed by an earlier key are kept (sorted); keys
    left with nothing to claim are dropped. A key that was itself claimed
    by an earlier bucket can still open a bucket for its unclaimed
    candidates.

    Returns:
        dict mapping key -> sorted list of newly claimed candidates.
    """
    claimed = set()
    buckets = {}
    for representative, candidates in replacement_candidates.items():
        fresh = []
        for candidate in candidates:
            if candidate in claimed:
                continue
            claimed.add(candidate)
            fresh.append(candidate)
        if not fresh:
            # Everything was already claimed: no bucket for this key.
            continue
        fresh.sort()
        buckets[representative] = fresh
    return buckets


def kmeans_clustering(df, n_clusters, max_iter, k_shingle=5, num_perm=128, random_state=None):
    """Cluster users with KMeans on their MinHash signature vectors.

    Args:
        df: object whose ``collect()`` yields rows with a ``"features"``
            string and a ``"user_id"`` convertible to ``int``.
        n_clusters: number of clusters passed to ``KMeans``.
        max_iter: iteration cap passed to ``KMeans``.
        k_shingle: shingle length (default 5, the previously fixed value).
        num_perm: MinHash permutations (default 128, the previously fixed value).
        random_state: optional seed passed to ``KMeans``; ``None`` keeps the
            previous unseeded behaviour, an int makes runs repeatable.

    Returns:
        ``(final_buckets, minhash_dict)`` where ``final_buckets`` maps a
        cluster label (plain ``int``) to the list of user ids in it and
        ``minhash_dict`` maps user id -> ``MinHash`` (kept for the Jaccard
        verification step).
    """
    signatures = []
    # for jaccard verification
    minhash_dict = {}
    user_ids = []
    for row in df.collect():
        user_id = int(row["user_id"])
        m = MinHash(num_perm=num_perm)
        for shingle_item in shingle(row["features"], k_shingle):
            m.update(shingle_item.encode("utf8"))
        signatures.append(m.hashvalues)
        minhash_dict[user_id] = m
        user_ids.append(user_id)

    # Nothing to cluster: return empty results instead of failing in fit().
    if not user_ids:
        return {}, minhash_dict

    # NOTE(review): KMeans compares the raw hash values numerically, which is
    # not the MinHash Jaccard estimate -- hence the separate verification.
    kmeans = KMeans(
        n_clusters=n_clusters,
        max_iter=max_iter,
        random_state=random_state,
    ).fit(signatures)

    user_clusters = dict(zip(user_ids, kmeans.labels_))
    final_buckets = {}
    for user_id, label in user_clusters.items():
        # int() turns the numpy label into a plain int key (hashes and
        # compares equal to the numpy value).
        final_buckets.setdefault(int(label), []).append(user_id)

    return final_buckets, minhash_dict


# Get the average Jaccard similarity per bucket

def get_averege_jaccard_sim(final_buckets, minhashes):
    """Print the mean pairwise Jaccard estimate per bucket and overall.

    Args:
        final_buckets: dict mapping a bucket key to a list of user ids.
        minhashes: dict mapping ``int`` user id to a signature object that
            provides ``jaccard(other)`` (e.g. a datasketch ``MinHash``).

    Returns:
        The overall average over all compared pairs, or 0 when no bucket
        holds two different users. The same value is printed.

    Buckets without at least two different user ids produce no line.
    """
    sims = {}
    for key, members in final_buckets.items():
        # Each unordered pair is evaluated once. The previous nested loops
        # scored (a, b) and (b, a) separately, which doubled the work
        # without changing any average.
        pair_sims = [
            minhashes[int(user_id_1)].jaccard(minhashes[int(user_id_2)])
            for user_id_1, user_id_2 in combinations(members, 2)
            if user_id_1 != user_id_2
        ]
        if pair_sims:
            sims[key] = pair_sims

    total_sum = 0
    total_count = 0
    for key in sorted(sims):
        values = sims[key]
        print(key, average(values))
        total_sum += sum(values)
        total_count += len(values)

    overall_average = total_sum / total_count if total_count != 0 else 0
    print("Overall Average Jaccard Similarity:", overall_average)
    return overall_average


# Experiments

In [142]:
# Start (or reuse) the Spark session and load the raw event log.
spark = SparkSession.builder.appName("spark_session_1").getOrCreate()
data = spark.read.csv("data/SDG_dataset2.csv", header=True, inferSchema=True)

# "arrayColumn" holds the "to" value as a string; "Minhash" is an empty
# placeholder column.
df = (
    data
    .withColumn("arrayColumn", concat_ws("", "to"))
    .withColumn("Minhash", lit(""))
)

# Keep request events only.
df_filtered_m = df.filter(col("type").isin(["Req"]))

# One feature string per user: all of the user's "to" values glued together.
df_grouped = (
    df_filtered_m
    .groupBy("user_id")
    .agg(concat_ws("", collect_list("arrayColumn")).alias("features"))
)


In [143]:
# Preview the per-user feature strings.
df_grouped.show()

+-------+--------------------+
|user_id|            features|
+-------+--------------------+
|     12|S0S6S7S5S5_1S5_1_...|
|     22|S0S5S5_1S5_1_1S6S...|
|     26|S0S2S2_4S5S5_1S5_1_1|
|     27|S0S2S2_3S2_1S2_4S...|
|     28|S0S7S3S3_1S3_2S3_...|
|     31|S0S3S3_1S3_2S3_3S...|
|     34|S0S6S7S1S1_3S3S3_...|
|     44|          S0S7S1S1_1|
|     47|S0S4S4_1S7S5S5_2S...|
|     53|S0S1S1_3S3S3_1S3_...|
|     65|S0S5S5_1S5_1_1S4S...|
|     76|S0S3S3_1S3_2S3_3S...|
|     78|S0S5S5_2S5_2_1S3S...|
|     81|S0S2S2_2S4S4_2S7S...|
|     85|S0S3S3_1S3_2S3_3S...|
|     91|S0S2S2_2S2_4S2_1S...|
|     93|S0S4S4_3S3S3_1S3_...|
|    101|S0S3S3_1S3_2S3_3S...|
|    103|S0S1S1_3S2S2_4S2_...|
|    108|S0S3S3_1S3_2S3_3S...|
+-------+--------------------+
only showing top 20 rows



                                                                                

In [139]:
# Preview the rows kept by the type == "Req" filter.
df_filtered_m.show()

+----+------+--------------------+----+-------+
|from|    to|           timestamp|type|user_id|
+----+------+--------------------+----+-------+
|null|    S0| 2024-06-03 09:50:18| Req|      1|
|  S0|    S3| 2024-06-03 09:50:18| Req|      1|
|  S3|  S3_1| 2024-06-03 09:50:18| Req|      1|
|  S3|  S3_2|2024-06-03 09:50:...| Req|      1|
|  S3|  S3_3|2024-06-03 09:50:...| Req|      1|
|  S0|    S4|2024-06-03 09:50:...| Req|      1|
|  S4|  S4_3|2024-06-03 09:50:...| Req|      1|
|  S0|    S5|2024-06-03 09:50:...| Req|      1|
|  S5|  S5_1|2024-06-03 09:50:...| Req|      1|
|S5_1|S5_1_3|2024-06-03 09:50:...| Req|      1|
|  S0|    S1| 2024-06-03 09:50:36| Req|      1|
|  S1|  S1_1| 2024-06-03 09:50:36| Req|      1|
|null|    S0| 2024-06-03 09:48:27| Req|      2|
|  S0|    S7| 2024-06-03 09:48:27| Req|      2|
|  S0|    S4| 2024-06-03 09:48:27| Req|      2|
|  S4|  S4_3| 2024-06-03 09:48:27| Req|      2|
|  S0|    S2|2024-06-03 09:48:...| Req|      2|
|  S2|  S2_3|2024-06-03 09:48:...| Req| 

In [134]:
# Preview the event log together with the derived columns.
# NOTE(review): the stored output shows arrayColumn as "from" + "to"
# (e.g. "nullS0"), which presumably comes from an earlier version of the
# setup cell -- re-run to refresh.
df.show()

+------+------+--------------------+----+-------+-----------+-------+
|  from|    to|           timestamp|type|user_id|arrayColumn|Minhash|
+------+------+--------------------+----+-------+-----------+-------+
|  null|    S0| 2024-06-03 09:50:18| Req|      1|     nullS0|       |
|    S0|    S3| 2024-06-03 09:50:18| Req|      1|       S0S3|       |
|    S3|  S3_1| 2024-06-03 09:50:18| Req|      1|     S3S3_1|       |
|  S3_1|    S3|2024-06-03 09:50:...| Res|      1|     S3_1S3|       |
|    S3|  S3_2|2024-06-03 09:50:...| Req|      1|     S3S3_2|       |
|  S3_2|    S3|2024-06-03 09:50:...| Res|      1|     S3_2S3|       |
|    S3|  S3_3|2024-06-03 09:50:...| Req|      1|     S3S3_3|       |
|  S3_3|    S3|2024-06-03 09:50:...| Res|      1|     S3_3S3|       |
|    S3|    S0|2024-06-03 09:50:...| Res|      1|       S3S0|       |
|    S0|    S4|2024-06-03 09:50:...| Req|      1|       S0S4|       |
|    S4|  S4_3|2024-06-03 09:50:...| Req|      1|     S4S4_3|       |
|  S4_3|    S4|2024-

In [136]:
# Preview the per-user feature strings.
# NOTE(review): this cell repeats an earlier one, and its stored output looks
# like it was built from "from" + "to" pairs, i.e. presumably from an older
# run -- re-run or remove.
df_grouped.show()

[Stage 155:=====>                                                 (1 + 10) / 11]

+-------+--------------------+
|user_id|            features|
+-------+--------------------+
|     12|S0S6S0S7S0S5S5_1S...|
|     26|S0S2S2_4S0S5S5_1S...|
|     27|S0S2S2_3S2_1S2_4S...|
|     28|S0S7S0S3S3_1S3S3_...|
|     31|S0S3S3_1S3S3_2S3S...|
|     34|S0S6S0S7S0S1S1_3S...|
|     44|S0S7S0S1S1_1S1S0null|
|     53|S0S1S1_3S1S0S3S3_...|
|     65|S0S5S5_1S5_1_1S5_...|
|     76|S0S3S3_1S3S3_2S3S...|
|     78|S0S5S5_2S5_2_1S5_...|
|     81|S0S2S2_2S0S4S4_2S...|
|     85|S0S3S3_1S3S3_2S3S...|
|     91|S0S2S2_2S2_4S2_1S...|
|    101|S0S3S3_1S3S3_2S3S...|
|    103|S0S1S1_3S1S0S2S2_...|
|    108|S0S3S3_1S3S3_2S3S...|
|    115|S0S2S2_2S0S5S5_1S...|
|    126|S0S3S3_1S3S3_2S3S...|
|    133|S0S2S2_5S2_3S0S7S...|
+-------+--------------------+
only showing top 20 rows



                                                                                

## Parameter-tuning for k-shingles

In [111]:
def _merge_with_lsh(k_shingle, threshold):
    """Run MinHash-LSH and bucketing on ``df_grouped``; print before/after counts."""
    candidates = minhash_lsh(df_grouped, k_shingle, threshold)
    merged = bucketing(candidates)
    print(f"Initial processes: {len(candidates)}")
    print(f"After merging processes: {len(merged)}")
    return candidates, merged


replacement_candidates3, new_process_dictionary3 = _merge_with_lsh(3, 0.95)

replacement_candidates5, new_process_dictionary5 = _merge_with_lsh(5, 0.95)

# The k=7 run uses threshold 0.99, not the 0.95 of the two runs above.
replacement_candidates7, new_process_dictionary7 = _merge_with_lsh(7, 0.99)

Initial processes: 45579
After merging processes: 8244
Initial processes: 45579
After merging processes: 12410
Initial processes: 45579
After merging processes: 33562


In [144]:
# Re-run the merge with 5-character shingles and threshold 0.98.
# NOTE(review): the "...7" variable names are reused although k_shingle is 5
# here, so any earlier values stored under these names are overwritten.
replacement_candidates7 = minhash_lsh(df_grouped,5,0.98)
new_process_dictionary7 = bucketing(replacement_candidates7)
print(f"Initial processes: {len(replacement_candidates7)}")
print(f"After merging processes: {len(new_process_dictionary7)}")

                                                                                

Initial processes: 45525
After merging processes: 35443


In [146]:
# Show the user ids bucketed under representative 26.
new_process_dictionary7[26]
# = data.filter(df.user_id.isin(user_ids))

[26, 30205, 32948, 44407]

In [151]:
# Raw events of three individual users, for a manual side-by-side comparison.
data1 = data.filter(col("user_id").isin([30205]))
data2 = data.filter(col("user_id").isin([32948]))
data3 = data.filter(col("user_id").isin([26]))
data3.show()


+------+------+-------------------+----+-------+
|  from|    to|          timestamp|type|user_id|
+------+------+-------------------+----+-------+
|  null|    S0|2024-06-03 10:44:00| Req|     26|
|    S0|    S2|2024-06-03 10:44:00| Req|     26|
|    S2|  S2_4|2024-06-03 10:44:00| Req|     26|
|    S2|    S0|2024-06-03 10:44:01| Res|     26|
|    S0|    S5|2024-06-03 10:44:01| Req|     26|
|    S5|  S5_1|2024-06-03 10:44:01| Req|     26|
|  S5_1|S5_1_1|2024-06-03 10:44:01| Req|     26|
|S5_1_1|  S5_1|2024-06-03 10:44:08| Res|     26|
|  S5_1|    S5|2024-06-03 10:44:08| Res|     26|
|    S5|    S0|2024-06-03 10:44:08| Res|     26|
|    S0|  null|2024-06-03 10:44:08| Res|     26|
+------+------+-------------------+----+-------+



In [150]:
# Raw events of user 32948.
data2.show()

+------+------+--------------------+----+-------+
|  from|    to|           timestamp|type|user_id|
+------+------+--------------------+----+-------+
|  null|    S0| 2024-06-03 09:16:09| Req|  32948|
|    S0|    S2| 2024-06-03 09:16:09| Req|  32948|
|    S2|  S2_4| 2024-06-03 09:16:09| Req|  32948|
|    S2|    S0|2024-06-03 09:16:...| Res|  32948|
|    S0|    S5|2024-06-03 09:16:...| Req|  32948|
|    S5|  S5_1|2024-06-03 09:16:...| Req|  32948|
|  S5_1|S5_1_1|2024-06-03 09:16:...| Req|  32948|
|S5_1_1|  S5_1|2024-06-03 09:16:...| Res|  32948|
|  S5_1|    S5|2024-06-03 09:16:...| Res|  32948|
|    S5|    S0|2024-06-03 09:16:...| Res|  32948|
|    S0|  null|2024-06-03 09:16:...| Res|  32948|
+------+------+--------------------+----+-------+



In [148]:
import numpy as np

# Collect the "to" column of each user as a plain Python list.
desired_column_list1 = [row["to"] for row in data1.select("to").collect()]
desired_column_list2 = [row["to"] for row in data2.select("to").collect()]

# Distinct values present for both users / for at least one of them.
common_elements = np.intersect1d(desired_column_list1, desired_column_list2)
union_elements = np.union1d(desired_column_list1, desired_column_list2)


In [149]:
# Ratio of shared to combined distinct "to" values of the two users
# (a set-based Jaccard similarity; order and repetition are ignored).
print(len(common_elements)/len(union_elements))

1.0


['S0',
 'S4',
 'S4_4',
 'S4',
 'S0',
 'S5',
 'S5_2',
 'S5_2_2',
 'S5_2',
 'S5',
 'S0',
 'S7',
 'S0',
 'S6',
 'S0',
 'S2',
 'S2_1',
 'S2_4',
 'S2_2',
 'S2_5',
 'S0',
 'S3',
 'S3_1',
 'S3',
 'S3_2',
 'S3',
 'S3_3',
 'S3',
 'S0',
 'S1',
 'S1_1',
 'S1',
 'S0',
 'null']

In [110]:
import shutil
import os

def write_df(df,file_name):
    """Write ``df`` as a single CSV file (with header) at ``file_name``.

    Spark writes a directory of part files, so the frame is coalesced to one
    partition, written into a private scratch directory, and the single
    part file is then moved to ``file_name``.

    Args:
        df: object offering ``coalesce(n)`` and ``write.csv(path, header, mode)``.
        file_name: destination path of the CSV file; its parent directory
            is created when missing.

    Raises:
        FileNotFoundError: if the write produced no ``part-*`` file.
    """
    target_dir = os.path.dirname(os.path.abspath(file_name))
    os.makedirs(target_dir, exist_ok=True)

    # A unique scratch directory next to the target: unlike a fixed "temp"
    # folder it can never collide with (or delete) a user's existing data,
    # and the final move stays on one filesystem.
    scratch_dir = tempfile.mkdtemp(prefix="spark_csv_", dir=target_dir)
    try:
        spark_out = os.path.join(scratch_dir, "csv")
        # coalesce(1) guarantees one part file, so no partition is dropped.
        df.coalesce(1).write.csv(spark_out, header=True, mode="overwrite")

        part_files = sorted(
            name for name in os.listdir(spark_out) if name.startswith("part-")
        )
        if not part_files:
            raise FileNotFoundError(
                f"no part file was written to {spark_out!r}"
            )
        shutil.move(os.path.join(spark_out, part_files[0]), file_name)
    finally:
        # Remove Spark's side files (_SUCCESS, .crc, ...) even on failure.
        shutil.rmtree(scratch_dir, ignore_errors=True)

def output_part1(dataset,k,threshold,output_path='Output/part1Output.csv'):
    """Merge near-duplicate users and write the events of the representatives.

    Reads ``dataset`` with the notebook-level ``spark`` session, builds one
    feature string per user from the concatenated "from"/"to" values of all
    of that user's rows, runs ``minhash_lsh`` + ``bucketing`` and writes the
    original rows of the surviving bucket keys as a single CSV file.

    Args:
        dataset: path of the CSV file to read (header row expected).
        k: shingle length passed to ``minhash_lsh``.
        threshold: LSH threshold passed to ``minhash_lsh``.
        output_path: destination CSV file; defaults to the previously
            hard-coded 'Output/part1Output.csv'.

    Returns:
        Whatever ``write_df`` returns.
    """
    data = spark.read.csv(dataset, header=True, inferSchema=True)

    # NOTE(review): no filter on "type" is applied here, unlike the
    # experiments cell -- confirm this is intended.
    df = data.withColumn("arrayColumn", concat_ws("", "from", "to"))
    df_grouped = df.groupBy("user_id").agg(
        concat_ws("", collect_list("arrayColumn")).alias("features")
    )

    replacement_candidates = minhash_lsh(df_grouped, k, threshold)
    new_process_dictionary = bucketing(replacement_candidates)

    # Keep only the rows of users that ended up as bucket keys.
    user_ids = list(new_process_dictionary.keys())
    final_df = data.filter(col("user_id").isin(user_ids)).coalesce(1)
    return write_df(final_df, output_path)

# Produce the output file with k=7 and threshold 0.95; the commented-out
# calls are alternative k settings.
# output_part1("data/SDG_dataset2.csv",3,0.95)
# output_part1("data/SDG_dataset2.csv",5,0.95)
output_part1("data/SDG_dataset2.csv",7,0.95)


                                                                                

## Parameter tuning for KMeans

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

# One MinHash signature (5-character shingles, 128 permutations) per user.
minhashes = []
user_ids = []
final_buckets = {}
for row in df_grouped.collect():
    signature = MinHash(num_perm=128)
    for piece in shingle(row["features"], 5):
        signature.update(piece.encode("utf8"))
    minhashes.append(signature.hashvalues)
    user_ids.append(int(row["user_id"]))

# Candidate KMeans settings to compare.
param_grid = dict(
    n_clusters=[100, 250, 500],
    max_iter=[100, 500, 1000],
)

# NOTE(review): no `scoring` is given, so GridSearchCV presumably falls back
# to KMeans.score, which tends to favour the largest n_clusters -- confirm
# that this is the intended selection criterion.
kmeans = KMeans()
grid_search = GridSearchCV(kmeans, param_grid, cv=5)
grid_search.fit(minhashes)

best_param = grid_search.best_params_
best_model = grid_search.best_estimator_

print("The best parameters are: " , best_param )
print("The best model is: ", best_model)

## STEP 1: Find and merge

In [None]:
# minhash_lsh needs an explicit threshold; the two-argument call raised a
# TypeError. 0.95 matches the k=7 setting used for the output_part1 run.
replacement_candidates = minhash_lsh(df_grouped, 7, 0.95)
new_process_dictionary = bucketing(replacement_candidates)
print(len(replacement_candidates))
print(len(new_process_dictionary))

## STEP 2: Find/cluster similar items

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Only the bucket keys that survived the merge step are clustered.
users = list(new_process_dictionary)

filtered_df = df_grouped.filter(df_grouped['user_id'].isin(users))

# 500 clusters, at most 100 KMeans iterations.
final_buckets, minhashes = kmeans_clustering(filtered_df,500,100)

## Verification

In [None]:
# Print the average pairwise Jaccard estimate of every cluster and overall.
get_averege_jaccard_sim(final_buckets, minhashes)

In [None]:
# indexer = StringIndexer(inputCol="features", outputCol="from_to_type_index")
# indexed_data = indexer.fit(actual_routes_feature).transform(actual_routes_feature)
# assembler = VectorAssembler(inputCols=["from_to_type_index"], outputCol="vector")
# actual_feature_data = assembler.transform(indexed_data)

# actual_feature_data.show()
# # def is_non_zero_vector(vector):
# #     return vector.numNonzeros() > 0

# # is_non_zero_vector_udf = udf(is_non_zero_vector, BooleanType())

# # filtered_data = actual_feature_data.filter(is_non_zero_vector_udf(col("vector")))


# mh = MinHashLSH(inputCol="vector", outputCol="hashes", numHashTables=5, seed=1003)
# model = mh.fit(actual_feature_data)

# #transformed_filtered_data = model.transform(actual_feature_data).head()
# test = model.approxNearestNeighbors()

# # transformed_filtered_data.show(truncate=False, n=50)


# #similar_items.show(truncate=False)

# def is_non_zero_vector(vector):
#     return vector.numNonzeros() > 0

# from collections import defaultdict

# representative_mapping = {}

# group_mapping = defaultdict(list)

# # Iterate over the user neighbors dictionary
# for user, neighbors in new_process_dictionary.items():
#     neighbors_sorted = tuple(sorted(neighbors))
#     if neighbors_sorted in representative_mapping:
#         representative = representative_mapping[neighbors_sorted]
#     else:
#         representative = neighbors_sorted[0]
#         for neighbor in neighbors_sorted:
#             representative_mapping[neighbor] = representative
    
#     representative_mapping[user] = representative
#     group_mapping[representative].append(user)

# new_user_neighbors = {}
# for representative, users in group_mapping.items():
#     new_user_neighbors[representative] = users

# #print(new_user_neighbors)
# new_users = []
# for key, value in new_user_neighbors.items():
#     new_users.append(key)

# print(len(new_users))
# filtered_df = df_grouped[df_grouped['user_id'].isin(new_users)]
# print(f"Amount of processes: {filtered_df.count()}")
#     shingles = shingle(features["features"], 5)
#     m = MinHash(num_perm=128)
#     for shingle_item in shingles:
#         m.update(shingle_item.encode("utf8"))
#     minhashes[int(features["user_id"])] = m
#     lsh.insert(int(features["user_id"]), m)
#     neigbours = lsh.query(m)
#     print(features["user_id"], neigbours)
#     final_buckets[features["user_id"]] = neigbours

# print(final_buckets)