In [None]:
from data_generation import *
from main import *

# Synthetic Dataset Generation

In [None]:
# Task catalogue handed to generate_dataset: task name -> [mode tag, options, (low, high)].
# NOTE(review): the meaning of the mode tags ('one', 'rand', 'all', 'con', 'opt') and of the
# numeric pair is defined in data_generation, not here -- confirm against generate_dataset.
tasks = {
    'log_in': ['one', ['credentials check', 'sing up', 'recover pw and log in'], (0, 10)],
    'search_book': ['rand', ['history', 'fantasy', 'crime', 'poetry', 'biography'], (5, 15)],
    'shipment': ['all', ['adress', 'door number', 'zip code'], (15, 50)],
    'payment': ['one', ['visa', 'master card', 'revolut', 'paypal', 'apple pay'], (10, 100)],
    'new_site': ['con', ['site1', 'site2'], (10, 100)],
    'site1': ['opt', ['site1_1', 'site1_2', 'site1_3'], (10, 100)],
    'site2': ['opt', ['site2_1', 'site2_2'], (10, 100)],
}

# Time window passed to the generator.
start_time = datetime(2024, 6, 3, 9, 0, 0)
end_time = datetime(2024, 6, 3, 10, 45, 0)

# Larger run (1,000,000 as the second argument), kept for reference:
#generate_dataset(tasks, 1000000,start_time,end_time,file_name="dataset")
#data = spark.read.csv("data/SDG_dataset.csv", header=True, inferSchema=True)

# Smaller test run used by the rest of the notebook.
# NOTE(review): `spark` must already exist here (presumably exported by `main`) -- confirm,
# since the notebook only creates a session explicitly in a later cell.
generate_dataset(tasks, 100000, start_time, end_time, file_name="dataset_test")
data = spark.read.csv("data/SDG_dataset_test.csv", header=True, inferSchema=True)

# Part 1

## k = 7

In [114]:
def shingle(text, k=7):
    """Return the distinct length-``k`` character windows of ``text``.

    Args:
        text: String to slide the window over.
        k: Window (shingle) length in characters.

    Returns:
        A list of the unique substrings ``text[i:i+k]`` for every start
        position ``i`` in ``range(len(text) - k + 1)``. The list is built from
        a set, so its order is unspecified; it is empty when ``text`` is
        shorter than ``k``.
    """
    window_starts = range(len(text) - k + 1)
    unique_windows = {text[start:start + k] for start in window_starts}
    return list(unique_windows)

# Reuse the active Spark session, or create one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Keep only rows whose `type` is 'Req', then build one trace string per user by
# concatenating that user's `to` values.
# NOTE(review): collect_list after a groupBy does not guarantee element order, so
# the concatenated trace may not follow event order unless rows are explicitly
# sorted first -- confirm whether an ordering column should be applied.
df_filtered1_7 = data.filter(data.type.isin(['Req']))
df_grouped1_7 = df_filtered1_7.groupBy("user_id").agg(
    concat_ws("", collect_list("to")).alias("features")
)

# Shingles are derived from the trace column built above instead of repeating
# the same groupBy/collect_list aggregation a second time, so both frames are
# defined from a single trace expression.
shingles_udf1_7 = udf(shingle, ArrayType(StringType()))
df_shingles1_7 = df_grouped1_7 \
    .withColumn("shingles", shingles_udf1_7(col("features"))) \
    .select("user_id", "shingles")

# Averages over all users: trace length in characters and number of distinct shingles.
average_length1_7 = df_grouped1_7.select(avg(length(col('features')))).collect()[0][0]
average_shingles7_1 = df_shingles1_7.select(avg(size(col("shingles")))).collect()[0][0]

print('average trace length:',average_length1_7)
print('average trace # shingles:',average_shingles7_1)

average trace length: 38.158701120512234
average trace # shingles: 32.158701120512234


In [115]:
print(f"Initial number of cases: {df_grouped1_7.count()}")

# Threshold is (n - 1) / n, where n is the average shingle count truncated to an int.
truncated_avg_shingles = int(average_shingles7_1)
threshold = (truncated_avg_shingles - 1) / truncated_avg_shingles

# minhash_lsh is called with the grouped traces, the shingle size (7) and the threshold;
# the first two items of its result are used below.
ans = minhash_lsh(df_grouped1_7, 7, threshold)
replacement_candidates1_7 = ans[0]
minhash_dic1_7 = ans[1]

new_process_dictionary1_7 = bucketing(replacement_candidates1_7)
merged_count1_7 = len(new_process_dictionary1_7)
print(f"Number of unique processes after merging them with {threshold} threshold using 7-shingles: {merged_count1_7}")

Initial number of cases: 4373
Number of unique processes after merging them with 0.96875 threshold using 7-shingles: 3615


In [116]:
# Similarity values for the Part 1 merge candidates. The next cell treats the
# result as a mapping from a key to an iterable of similarity values.
# NOTE(review): the effect of `get=False` is defined in the helper, not visible here -- confirm.
sims1_7 = get_averege_jaccard_sim(replacement_candidates1_7, minhash_dic1_7,get=False)

In [117]:
# Inspect the merged cases with the lowest approximate similarity below 1.0 and
# print their traces side by side.

# Every similarity value that is not exactly 1.0 (built once, reused below).
non_identical_sims1_7 = {
    value for values in sims1_7.values() for value in values if value != 1.0
}

if non_identical_sims1_7:
    # Lowest similarity among the non-identical values.
    ans1_7 = min(non_identical_sims1_7)

    # Keys that have at least one value equal to that minimum.
    final_values = [
        key
        for key, values in sims1_7.items()
        for value in values
        if value == ans1_7
    ]
    dissimilar1_7 = set(final_values)

    # Recompute the similarity of each such key against all of its candidates.
    new_sims = []
    for key in dissimilar1_7:
        # Loop-invariant: the key's shingles do not depend on the candidate.
        key_shingles = get_shingles(key, df_shingles1_7)
        for value in replacement_candidates1_7[key]:
            similarity = jaccard_similarity(get_shingles(value, df_shingles1_7), key_shingles)
            new_sims.append((key, value, similarity))

    investigate1_7 = [case for case in new_sims if case[-1] != 1.0]
    for case in investigate1_7:
        print(f'######################### {case[0]} vs {case[1]} ################################')
        # Reuse the similarity stored in the tuple instead of recomputing it.
        # NOTE(review): assumes jaccard_similarity is symmetric and deterministic -- confirm.
        print('jaccard similarity:', case[-1])
        print(get_traces(case[0],df_grouped1_7))
        print(get_traces(case[1],df_grouped1_7))
        print('#######################################################################')

else:
    print('all processes have approximate jaccard sim = 1')

######################### 3737 vs 594 ################################
jaccard similarity: 0.9310344827586207
S0S3S3_1S3_2S3_3S5S5_1S5_1_2S4S4_2
S0S3S3_1S3_2S3_3S5S5_1S5_1_2S4S4_3
#######################################################################
######################### 594 vs 3737 ################################
jaccard similarity: 0.9310344827586207
S0S3S3_1S3_2S3_3S5S5_1S5_1_2S4S4_3
S0S3S3_1S3_2S3_3S5S5_1S5_1_2S4S4_2
#######################################################################


# Part 2

### Approach 1

In [None]:
# NOTE(review): this re-defines `shingle` with the same body as the Part 1 cell;
# the later definition shadows the earlier one.
def shingle(text, k=7):
    """Return the distinct length-``k`` character windows of ``text``.

    Args:
        text: String to slide the window over.
        k: Window (shingle) length in characters.

    Returns:
        A list of the unique substrings ``text[i:i+k]`` for every start
        position ``i`` in ``range(len(text) - k + 1)``. The list is built from
        a set, so its order is unspecified; it is empty when ``text`` is
        shorter than ``k``.
    """
    window_starts = range(len(text) - k + 1)
    unique_windows = {text[start:start + k] for start in window_starts}
    return list(unique_windows)

# Reuse the active Spark session, or create one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Keep only rows whose `type` is 'Req', then build one trace string per user by
# concatenating that user's `to` values.
# NOTE(review): collect_list after a groupBy does not guarantee element order, so
# the concatenated trace may not follow event order unless rows are explicitly
# sorted first -- confirm whether an ordering column should be applied.
df_filtered2_1 = data.filter(data.type.isin(['Req']))
df_grouped2_1 = df_filtered2_1.groupBy("user_id").agg(
    concat_ws("", collect_list("to")).alias("features")
)

# Shingles are derived from the trace column built above instead of repeating
# the same groupBy/collect_list aggregation a second time, so both frames are
# defined from a single trace expression.
shingles_udf2_1 = udf(shingle, ArrayType(StringType()))
df_shingles2_1 = df_grouped2_1 \
    .withColumn("shingles", shingles_udf2_1(col("features"))) \
    .select("user_id", "shingles")

# Averages over all users: trace length in characters and number of distinct shingles.
average_length2_1 = df_grouped2_1.select(avg(length(col('features')))).collect()[0][0]
average_shingles2_1 = df_shingles2_1.select(avg(size(col("shingles")))).collect()[0][0]

print('average trace length:',average_length2_1)
print('average trace # shingles:',average_shingles2_1)

### Approach 2

In [None]:
def shingle2(text, k=3):
    """Return the length-``k`` character windows of ``text`` after de-duplication.

    Args:
        text: String to slide the window over.
        k: Window (shingle) length in characters.

    Returns:
        Whatever ``distinct_elements_in_order`` returns for the list of windows
        ``text[i:i+k]``, taken left to right.
        NOTE(review): presumably a list with repeats removed and first-occurrence
        order kept -- confirm against the helper's definition.
    """
    windows = [text[start:start + k] for start in range(len(text) - k + 1)]
    return distinct_elements_in_order(windows)

spark = SparkSession.builder.appName("Filter Dataset").getOrCreate()

# Approach 2 keeps only rows whose `from` is "S0" and whose `to` contains
# neither the text "null" nor an underscore.
from_is_s0 = col("from") == "S0"
to_contains_null = col("to").contains("null")
to_contains_underscore = col("to").contains("_")
df_filtered2_2 = data.filter(from_is_s0 & ~to_contains_null & ~to_contains_underscore)

# One trace string per user: that user's remaining `to` values concatenated.
per_user2_2 = df_filtered2_2.groupBy("user_id")
df_grouped2_2 = per_user2_2.agg(concat_ws("", collect_list("to")).alias("features"))

# shingles_udata2_2 = udf(shingle2, ArrayType(StringType()))
# df_shingles2_2 = df_filtered2_2.groupBy("user_id").agg(concat_ws("", collect_list("to")).alias("trace")) \
#     .withColumn("shingles", shingles_udata2_2(col("trace"))) \
#     .select("user_id", "shingles")

# average_length2_2 = df_grouped2_2.select(avg(length(col('features')))).collect()[0][0]
# average_shingles2_2 = df_shingles2_2.select(avg(size(col('shingles')))).collect()[0][0]

# print('average trace length:',average_length2_2)
# print('average trace # shingles:',average_shingles2_2)

In [None]:
# MinHash-LSH parameters for approach 2, named once so the summary message below
# always reports the values that were actually used (the previous message
# hard-coded "0.97" and "7-shingles" while the call used 0.98 and 3).
k2_2 = 3
threshold2_2 = 0.98

print(f"Initial number of cases: {df_grouped2_2.count()}")

# minhash_lsh is called with the grouped traces, the shingle size and the threshold;
# the first two items of its result are used below.
ans = minhash_lsh(df_grouped2_2, k2_2, threshold2_2)
replacement_candidates2_2, minhash_dic2_2 = ans[0],ans[1]
new_process_dictionary2_2= bucketing(replacement_candidates2_2)
print(f"Number of unique processes after merging them with {threshold2_2} threshold using {k2_2}-shingles: {len(new_process_dictionary2_2)}")

In [None]:
# Similarity values for approach 2. The next cell treats the result as a
# mapping from a key to an iterable of similarity values.
# NOTE(review): this passes the bucketed dictionary (new_process_dictionary2_2),
# whereas the Part 1 cell passes the raw candidates (replacement_candidates1_7)
# to the same helper -- confirm which input is intended.
sims2_2 = get_averege_jaccard_sim(new_process_dictionary2_2, minhash_dic2_2,get=False)

In [None]:
# Find the keys whose similarity values include the lowest value below 1.0.

# Every similarity value that is not exactly 1.0 (built once, reused below).
non_identical_sims2_2 = {
    value for values in sims2_2.values() for value in values if value != 1.0
}

if non_identical_sims2_2:
    # Lowest similarity among the non-identical values.
    # NOTE: this rebinds `ans`, as the original cell did.
    ans = min(non_identical_sims2_2)

    # Iterate sims2_2: the previous code looped over the undefined name `sims`,
    # which raises NameError on a fresh kernel.
    final_values = [
        key
        for key, values in sims2_2.items()
        for value in values
        if value == ans
    ]
    dissimilar2_2 = set(final_values)
else:
    # Keep the name defined on both branches so later cells can rely on it.
    dissimilar2_2 = set()
    print('all processes have approximate jaccard sim = 1')

## Performance evaluation