In [None]:
!pip install jellyfish

import os
import sys
from pyspark.sql import SparkSession, functions as F

# Resolve the directory two levels above the current working directory and
# put it at the front of the import search path (only once), so that the
# project-local import that follows can presumably be resolved from there.
project_root = os.path.abspath(
    os.path.join(os.getcwd(), "../..")
)
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)
    
from packages.pyspark.entity_matching_pipeline import run_entity_matching



This experiment runs the refactored entity matching pipeline on two small synthetic datasets (a left and a right table) to verify basic functionality.


In [2]:
# Small synthetic fixtures for the entity matching run.
# Every record is a list of six strings. The TP / FN / FP tags below are the
# author's expected outcome for each record; rows are kept in their original
# order, only grouped by tag.

# Records for df1
data1 = (
    # TP1 .. TP4
    [
        ["ID00005", "N039", "E298", "Q412", "V409", "R232"],
        ["ID00009", "R822", "W179", "H017", "P323", "F298"],
        ["ID00007", "R449", "X716", "M948", "G667", "S702"],
        ["ID00004", "N002", "E396", "N843", "I458", "S719"],
    ]
    # FN1
    + [["ID10004", "N002", "E396", "N853", "I623", "S569"]]
    # FP1, FP-tie (this should be skipped), FP2
    + [
        ["ID50004", "J547", "B222", "G492", "R551", "S490"],
        ["IDTIE00", "N322", "K685", "T442", "C225", "W967"],
        ["ID50008", "N322", "K685", "T442", "C225", "W967"],
    ]
    # Untagged records
    + [
        ["ID00000", "W815", "L281", "R155", "F768", "B914"],
        ["ID00001", "C172", "B326", "X400", "M508", "O776"],
        ["ID00002", "V683", "C265", "J127", "D589", "F482"],
        ["ID00003", "E851", "P721", "F745", "D863", "K229"],
        ["ID00016", "T873", "D670", "U046", "Z181", "X621"],
        ["ID00017", "F327", "G856", "E567", "O929", "Q721"],
        ["ID00010", "O283", "T723", "Z034", "V319", "X338"],
    ]
)

# Records for df2
data2 = (
    # TP1 .. TP4
    [
        ["ID00005", "R746", "E298", "Q412", "L291", "R232"],
        ["ID00009", "R822", "W179", "H017", "P323", "F298"],
        ["ID00007", "Z011", "X716", "M948", "W967", "S702"],
        ["ID00004", "N002", "E396", "N843", "V935", "S719"],
    ]
    # FN1
    + [["ID10004", "N002", "E396", "N553", "I453", "S459"]]
    # FP1, FP2
    + [
        ["NEW80187", "J547", "B222", "G492", "W673", "S490"],
        ["NEW30110", "N322", "K685", "T432", "C225", "W967"],
    ]
    # Untagged records.
    # NOTE(review): "NEW30110" is used a second time here (it is also the FP2
    # record above) - confirm the duplicate identifier is intentional.
    + [
        ["NEW72832", "F875", "Q768", "H822", "Z154", "X678"],
        ["NEW30110", "R560", "C434", "M687", "Q689", "Q863"],
        ["NEW81243", "R762", "N687", "A109", "K476", "R637"],
        ["NEW52689", "A089", "V733", "W158", "A640", "H331"],
        ["NEW67368", "Z079", "J617", "G878", "W111", "Q500"],
        ["NEW72348", "J547", "B222", "G492", "R551", "S490"],
        ["NEW34469", "Y990", "H898", "W673", "L967", "M829"],
        ["NEW34462", "Y990", "H898", "W673", "L967", "M829"],
    ]
)


In [3]:
# Start (or reuse) a local Spark session on all available cores, with adaptive
# query execution, partition coalescing and Kryo serialization switched on.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("SmallSampleExperiment")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

# Everything that uses the session runs inside try/finally so the session is
# stopped even when DataFrame creation or the pipeline raises.
try:
    # Both inputs share the same six positional column names.
    column_names = ["0", "1", "2", "3", "4", "5"]
    df1 = spark.createDataFrame(data1, column_names)
    df2 = spark.createDataFrame(data2, column_names)

    print("df1 (Left dataset):")
    df1.show()
    print("df2 (Right dataset):")
    df2.show()

    # Run the entity matching pipeline: df1 is the left side, df2 the only
    # right-hand DataFrame.
    print("Running entity matching pipeline...")
    buckets, metrics = run_entity_matching(
        spark=spark,
        left_df=df1,
        right_dataframes=[df2],
        similarity_threshold=0.6,
        min_matching_columns=3,
    )

    print("\nBuckets (Matching Results):")
    buckets.show()

    print("\nEvaluation Metrics:")
    for key, value in metrics.items():
        print(f"{key}: {value}")

    # Hand-computed reference values for comparison with the metrics above.
    # Ground truth = TP + FN = 4 + 1 = 5 pairs (ID10004 is the expected miss);
    # this is the count that yields the recall of 0.8 (= 4 / 5) printed below.
    print("\nExpected Results Analysis:")
    print("- Ground truth matches: 5 (ID00005, ID00009, ID00007, ID00004, ID10004)")
    print("- Expected True Positives: 4")
    print("- Expected False Positives: 3")
    print("- Expected False Negatives: 1")
    print("- Expected Precision: 0.5714285714285714")
    print("- Expected Recall: 0.8")
    print("- Expected F1-Score: 0.6666666666666666")
finally:
    # Stop Spark session
    spark.stop()


df1 (Left dataset):
+-------+----+----+----+----+----+
|      0|   1|   2|   3|   4|   5|
+-------+----+----+----+----+----+
|ID00005|N039|E298|Q412|V409|R232|
|ID00009|R822|W179|H017|P323|F298|
|ID00007|R449|X716|M948|G667|S702|
|ID00004|N002|E396|N843|I458|S719|
|ID10004|N002|E396|N853|I623|S569|
|ID50004|J547|B222|G492|R551|S490|
|IDTIE00|N322|K685|T442|C225|W967|
|ID50008|N322|K685|T442|C225|W967|
|ID00000|W815|L281|R155|F768|B914|
|ID00001|C172|B326|X400|M508|O776|
|ID00002|V683|C265|J127|D589|F482|
|ID00003|E851|P721|F745|D863|K229|
|ID00016|T873|D670|U046|Z181|X621|
|ID00017|F327|G856|E567|O929|Q721|
|ID00010|O283|T723|Z034|V319|X338|
+-------+----+----+----+----+----+

df2 (Right dataset):
+--------+----+----+----+----+----+
|       0|   1|   2|   3|   4|   5|
+--------+----+----+----+----+----+
| ID00005|R746|E298|Q412|L291|R232|
| ID00009|R822|W179|H017|P323|F298|
| ID00007|Z011|X716|M948|W967|S702|
| ID00004|N002|E396|N843|V935|S719|
| ID10004|N002|E396|N553|I453|S459|
|NEW8