In [None]:
!pip install jellyfish

# DF1 vs DF2 Full Dataset Experiment

# This experiment tests the entity matching pipeline using the complete df1 and df2 datasets without sampling.

import os
import sys
from pyspark.sql import SparkSession, functions as F

# Setup paths: resolve the directory two levels above the current working
# directory and put it at the front of the import path (once) so that
# project-local packages can be imported.
_two_levels_up = os.path.join(os.getcwd(), os.pardir, os.pardir)
project_root = os.path.abspath(_two_levels_up)
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    
from packages.pyspark.entity_matching_pipeline import run_entity_matching

# Initialize Spark session with optimized configuration for larger datasets:
# a local master using all available cores, with adaptive query execution and
# adaptive partition coalescing both switched on.
session_builder = SparkSession.builder.master("local[*]").appName("DF1vsDF2FullExperiment")
for option_key in (
    "spark.sql.adaptive.enabled",
    "spark.sql.adaptive.coalescePartitions.enabled",
):
    session_builder = session_builder.config(option_key, "true")
spark = session_builder.getOrCreate()


# Load full datasets
print("Loading full datasets...")


def _load_dataset(file_name):
    """Read one headerless CSV from the project's ``data`` directory.

    The first six columns (Spark's default names ``_c0`` .. ``_c5``) are kept
    and renamed to the strings "0" .. "5". The resulting DataFrame is marked
    for caching because it is evaluated several times afterwards (row count,
    sample display, and the matching pipeline); without caching every one of
    those actions would re-read and re-parse the CSV file.

    Args:
        file_name: Name of the CSV file inside ``<project_root>/data``.

    Returns:
        A cached Spark DataFrame with columns "0" through "5".
    """
    csv_path = os.path.join(project_root, f"data/{file_name}")
    source_columns = [f"_c{i}" for i in range(6)]
    target_columns = [str(i) for i in range(6)]
    raw_frame = spark.read.csv(csv_path, header=False, inferSchema=True)
    return raw_frame.select(*source_columns).toDF(*target_columns).cache()


# Load complete df1 and df2
df1 = _load_dataset("df1.csv")
df2 = _load_dataset("df2.csv")

# count() is the first action on each frame, so it also populates the cache
# that the show() calls below and any later action will read from.
print(f"df1 size: {df1.count()}")
print(f"df2 size: {df2.count()}")

# Show sample of data
print("\nSample from df1:")
df1.show(5)
print("\nSample from df2:")
df2.show(5)

# Run entity matching pipeline
print("Running entity matching pipeline on full datasets...")
print("This may take several minutes depending on dataset size...")

# Tuning values are collected in one mapping so they are easy to locate and
# adjust; they are forwarded unchanged as keyword arguments.
matching_options = {
    "similarity_threshold": 0.6,
    "min_matching_columns": 3,
}

# df1 is the left-hand dataset; df2 is the only right-hand dataset supplied.
buckets, metrics = run_entity_matching(
    spark=spark,
    left_df=df1,
    right_dataframes=[df2],
    **matching_options,
)

# `buckets` is evaluated by several separate actions (count and show here,
# plus any later action on it). Persist it so the matching plan behind it is
# computed once and reused instead of being re-run for every action.
buckets = buckets.cache()

print("\nBucket Summary:")
print(f"Total buckets: {buckets.count()}")
# Ten largest buckets first.
buckets.select("bucket_id", "bucket_size", "avg_similarity").orderBy(F.desc("bucket_size")).show(10)

# Dump every metric the pipeline returned, in the mapping's own order.
print("\nEvaluation Metrics:")
for key, value in metrics.items():
    print(f"{key}: {value}")

# Detailed analysis: the headline metrics again, with fixed labels and
# rounded formatting for the ratio-style values.
print("\nDetailed Analysis:")
print(f"Ground Truth Matches: {metrics['ground_truth']}")
print(f"True Positives: {metrics['true_positives']}")
print(f"False Positives: {metrics['false_positives']}")
print(f"False Negatives: {metrics['false_negatives']}")
print(f"Precision: {metrics['precision']:.4f}")
print(f"Recall: {metrics['recall']:.4f}")
print(f"F1-Score: {metrics['f1_score']:.4f}")
print(f"Average Bucket Size: {metrics['average_bucket_size']:.2f}")

# Bucket size distribution
import numpy as np

print("\nBucket Size Distribution:")
# Collect the single projected column straight from the DataFrame; each Row is
# indexed by position, so no intermediate Python RDD map step is needed.
bucket_sizes = [row[0] for row in buckets.select("bucket_size").collect()]

if bucket_sizes:
    print(f"Min bucket size: {min(bucket_sizes)}")
    print(f"Max bucket size: {max(bucket_sizes)}")
    print(f"Mean bucket size: {np.mean(bucket_sizes):.2f}")
    print(f"Median bucket size: {np.median(bucket_sizes):.2f}")
else:
    # min()/max() raise ValueError on an empty sequence and the numpy
    # statistics would be NaN, so report the empty result explicitly.
    print("No buckets were produced; size statistics are unavailable.")

# Stop the Spark session now that all results above have been printed.
# NOTE: DataFrames created from this session (df1, df2, buckets) cannot be
# evaluated after this call.
spark.stop()