In [None]:
!pip install jellyfish

# DF1 vs DF2, DF3, DF4, DF5 Multi-Dataset Experiment

# This experiment tests the entity matching pipeline using df1 as the left dataset and df2, df3, df4, df5 as the right datasets, demonstrating the multi-dataset matching capability.

import os
import sys
from pyspark.sql import SparkSession, functions as F

# Setup paths
# The project root is resolved as two directories above the current working
# directory and prepended to sys.path (only if not already present) so that the
# project-local `packages` import below can be resolved.
# NOTE(review): this relies on the notebook being launched from a directory two
# levels below the repository root — confirm before running from elsewhere.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    
# Must stay after the sys.path tweak above; the import depends on it.
from packages.pyspark.entity_matching_pipeline import run_entity_matching

# Create the Spark session for this experiment, or reuse one that is already
# running (getOrCreate). It uses a local master with all available cores, turns
# on adaptive query execution with partition coalescing, and switches the
# serializer to Kryo.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("DF1vsDF2345MultiExperiment")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)


# Dataset loading: announce the step and define the shared CSV reader.
print("Loading datasets...")


def load_dataset(filename):
    """Read one headerless CSV from the project's data directory.

    The file is looked up under ``<project_root>/data/`` and parsed with schema
    inference. Only the first six columns (``_c0`` .. ``_c5``) are kept, and
    they are renamed to the strings ``"0"`` .. ``"5"`` so that every dataset
    shares the same column layout.

    Args:
        filename: Name of the CSV file inside the ``data`` directory.

    Returns:
        A Spark DataFrame with columns "0", "1", "2", "3", "4", "5".
    """
    csv_path = os.path.join(project_root, f"data/{filename}")
    raw_frame = spark.read.csv(csv_path, header=False, inferSchema=True)
    first_six = raw_frame.select("_c0", "_c1", "_c2", "_c3", "_c4", "_c5")
    return first_six.toDF("0", "1", "2", "3", "4", "5")

Loading datasets...


In [4]:
# Multi-dataset experiment: load the five CSV files, match df1 against
# df2..df5 with the entity matching pipeline, then print bucket and
# evaluation statistics.
import numpy as np

# Tunable pipeline parameters, named so they are easy to find and change.
SIMILARITY_THRESHOLD = 0.6
MIN_MATCHING_COLUMNS = 3

# Load left dataset
df1 = load_dataset("df1.csv")

# Load right datasets
df2 = load_dataset("df2.csv")
df3 = load_dataset("df3.csv")
df4 = load_dataset("df4.csv")
df5 = load_dataset("df5.csv")
right_datasets = {"df2": df2, "df3": df3, "df4": df4, "df5": df5}

# DataFrame.count() is a Spark action, so every call runs a job over the CSV.
# Count each dataset exactly once and reuse the numbers for the total instead
# of counting the right datasets a second time.
print(f"df1 (left) size: {df1.count()}")
right_counts = {}
for label, frame in right_datasets.items():
    right_counts[label] = frame.count()
    print(f"{label} size: {right_counts[label]}")

total_right_records = sum(right_counts.values())
print(f"Total right dataset records: {total_right_records}")

# Run multi-dataset entity matching pipeline
print("Running multi-dataset entity matching pipeline...")
print("This will match df1 against the union of df2, df3, df4, and df5")

buckets, metrics = run_entity_matching(
    spark=spark,
    left_df=df1,
    right_dataframes=list(right_datasets.values()),
    similarity_threshold=SIMILARITY_THRESHOLD,
    min_matching_columns=MIN_MATCHING_COLUMNS,
)

print("\nPipeline completed successfully!")

# Analyze results
print("Multi-Dataset Matching Results:")
print("=" * 50)

# NOTE(review): `buckets` is evaluated by three separate actions below
# (count, show, collect); consider persisting it if run_entity_matching does
# not already do so — confirm against the pipeline implementation.
print(f"Total buckets created: {buckets.count()}")

# Show top buckets by size
print("\nTop 10 largest buckets:")
buckets.select("bucket_id", "bucket_size", "avg_similarity") \
    .orderBy(F.desc("bucket_size")) \
    .show(10)

print("\nEvaluation Metrics:")
for key, value in metrics.items():
    print(f"{key}: {value}")

print("\nPerformance Summary:")
print(f"Precision: {metrics['precision']:.4f}")
print(f"Recall: {metrics['recall']:.4f}")
print(f"F1-Score: {metrics['f1_score']:.4f}")
print(f"Average Bucket Size: {metrics['average_bucket_size']:.2f}")


# Additional analysis
print("\nDetailed Analysis:")
print("=" * 50)

# Bucket size distribution: collect the single column straight from the
# DataFrame instead of round-tripping through a Python RDD map.
bucket_sizes = [row[0] for row in buckets.select("bucket_size").collect()]

print("Bucket Size Statistics:")
if bucket_sizes:
    print(f"  Min: {min(bucket_sizes)}")
    print(f"  Max: {max(bucket_sizes)}")
    print(f"  Mean: {np.mean(bucket_sizes):.2f}")
    print(f"  Median: {np.median(bucket_sizes):.2f}")
    print(f"  Std Dev: {np.std(bucket_sizes):.2f}")
else:
    # min()/max() raise ValueError on an empty sequence; report it instead.
    print("  No buckets were created; statistics are unavailable.")

# Stop Spark session. After this the session-creation cell has to be run again
# before this cell can be re-executed, because load_dataset uses `spark`.
spark.stop()

df1 (left) size: 100000
df2 size: 100000
df3 size: 100000
df4 size: 100000
df5 size: 100000
Total right dataset records: 400000
Running multi-dataset entity matching pipeline...
This will match df1 against the union of df2, df3, df4, and df5
This may take several minutes depending on dataset sizes...
Starting Multi-Dataset Entity Matching Pipeline...
Processing 1 left dataset vs 4 right datasets
Step 1: Preprocessing left dataframe...
Step 1b: Preprocessing and union right dataframes...
Unified right dataset has 400000 total records
Step 2: Creating entity keys...
Step 3: Calculating similarity matrix...
Step 4: Applying similarity threshold...
Step 5: Assigning entities to buckets...
Step 6: Calculating ground truth...
Step 7: Evaluating results...
Multi-dataset pipeline completed successfully!

Pipeline completed successfully!
Multi-Dataset Matching Results:
Total buckets created: 36882

Top 10 largest buckets:
+---------+-----------+------------------+
|bucket_id|bucket_size|    avg