In [1]:
# Install with the %pip magic rather than a `!pip` shell call: %pip targets the
# environment of the running kernel, whereas `!pip` uses whichever pip the shell
# resolves first.
# NOTE(review): jellyfish is not imported directly in this notebook; presumably it
# is required by the entity matching pipeline package -- confirm.
%pip install jellyfish

# DF1 vs DF2 Sample Experiment (Seed 42)

# This experiment tests the entity matching pipeline on a 10% sample of each of the df1 and df2 datasets, drawn with a fixed random seed (42) for reproducibility.

import os
import sys
from pyspark.sql import SparkSession, functions as F
import pandas as pd

# Locate the directory two levels above the current working directory and put
# it at the front of the import search path (only if it is not listed already),
# so that project-local packages can be imported from this notebook.
two_levels_up = os.path.join(os.getcwd(), "../..")
project_root = os.path.abspath(two_levels_up)
if project_root not in sys.path:
    sys.path[:0] = [project_root]
    
from packages.pyspark.entity_matching_pipeline import run_entity_matching

# Create (or reuse) the local-mode Spark session for this experiment
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("DF1vsDF2SampleSeed42")
    .getOrCreate()
)

# Both inputs are read and sampled the same way, with a fixed seed for reproducibility
print("Loading and sampling datasets...")


def _read_sample(relative_csv_path):
    """Read the first six columns of a header-less CSV under ``project_root``
    and return a 10% row sample drawn with ``random_state=42``."""
    first_six = [0, 1, 2, 3, 4, 5]
    frame = pd.read_csv(
        os.path.join(project_root, relative_csv_path),
        usecols=first_six,
        header=None,
    )
    # header=None makes pandas label the columns with integers, so the same
    # list serves both as usecols and as the column selection.
    return frame[first_six].sample(frac=0.1, random_state=42)


df1_pandas = _read_sample("data/df1.csv")

df2_pandas = _read_sample("data/df2.csv")

Loading and sampling datasets...


In [3]:

# Hand both pandas samples over to Spark (df1 first, then df2)
df1, df2 = (spark.createDataFrame(sample) for sample in (df1_pandas, df2_pandas))

# Report the number of rows in each Spark DataFrame
df1_rows = df1.count()
print(f"df1 sample size: {df1_rows}")
df2_rows = df2.count()
print(f"df2 sample size: {df2_rows}")

# Reference counts from a previous analysis, keyed by abbreviation:
# gt = ground truth, tp = true positives, fp = false positives, fn = false negatives
expected_results = {"gt": 2531, "tp": 1775, "fp": 92, "fn": 756}
print(f"Expected results: {expected_results}")


# Match df1 (left side) against df2, the only right-hand dataset
print("Running entity matching pipeline...")
buckets, metrics = run_entity_matching(
    spark=spark,
    left_df=df1,
    right_dataframes=[df2],
    similarity_threshold=0.6,
    min_matching_columns=3,
)

# Bucket overview: the total count, then the first 10 rows of three columns
print("\nBucket Summary:")
bucket_total = buckets.count()
print(f"Total buckets: {bucket_total}")
bucket_preview = buckets.select("bucket_id", "bucket_size", "avg_similarity")
bucket_preview.show(10)

# Dump every metric returned by the pipeline, one "name: value" pair per line
print("\nEvaluation Metrics:")
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value}")

    
# Put the previously recorded counts next to the counts from this run
print("\nComparison with Expected Results:")
expected_gt, actual_gt = expected_results['gt'], metrics['ground_truth']
print(f"Expected Ground Truth: {expected_gt}, Actual: {actual_gt}")
expected_tp, actual_tp = expected_results['tp'], metrics['true_positives']
print(f"Expected True Positives: {expected_tp}, Actual: {actual_tp}")
expected_fp, actual_fp = expected_results['fp'], metrics['false_positives']
print(f"Expected False Positives: {expected_fp}, Actual: {actual_fp}")
expected_fn, actual_fn = expected_results['fn'], metrics['false_negatives']
print(f"Expected False Negatives: {expected_fn}, Actual: {actual_fn}")

# Headline quality scores, printed with four decimal places
precision = metrics['precision']
print(f"\nPrecision: {precision:.4f}")
recall = metrics['recall']
print(f"Recall: {recall:.4f}")
f1_score = metrics['f1_score']
print(f"F1-Score: {f1_score:.4f}")

# Shut the Spark session down now that the experiment is finished
spark.stop()

df1 sample size: 10000
df2 sample size: 10000
Expected results: {'gt': 2531, 'tp': 1775, 'fp': 92, 'fn': 756}
Running entity matching pipeline...
Starting Multi-Dataset Entity Matching Pipeline...
Processing 1 left dataset vs 1 right datasets
Step 1: Preprocessing left dataframe...
Step 1b: Preprocessing and union right dataframes...
Unified right dataset has 10000 total records
Step 2: Creating entity keys...
Step 3: Calculating similarity matrix...
Step 4: Applying similarity threshold...
Step 5: Assigning entities to buckets...
Step 6: Calculating ground truth...
Step 7: Evaluating results...
Multi-dataset pipeline completed successfully!

Bucket Summary:
Total buckets: 1867
+---------+-----------+------------------+
|bucket_id|bucket_size|    avg_similarity|
+---------+-----------+------------------+
| AA100187|          1|               0.6|
| AA100252|          1|               0.6|
| AA100290|          1|               0.8|
| AA100327|          1|               0.6|
| AA100360| 

In [None]:
# Preview the sampled df1 rows directly from pandas. The Spark session is
# stopped at the end of the experiment cell, so building a Spark DataFrame here
# would fail when the notebook is run top to bottom. 20 rows matches the number
# of rows Spark's .show() prints by default.
df1_pandas.head(20)