In [1]:
import pandas as pd
import pickle as pkl
from collections import deque
import time
from blender import BlendER, Table, SqlOrderBy
import blender.resolution_function as resolution_function
import networkx as nx
from itertools import combinations
import numpy as np

In [2]:
# Change to raw dataset path
data = pd.read_csv("datasets/alaska_cameras_small/dataset.csv")

# Change to matches (dataframe with l_id and r_id pairs)
matches = pd.read_csv("datasets/alaska_cameras_small/matches.csv")

# Change to a pickled candidate pairs file (list of pairs)
with open(
    "datasets/alaska_cameras_small/blocking_functions/candidates_sparker.pkl",
    "rb",
) as f:
    candidates = pkl.load(f)

In [3]:
# If your matches aren't using l_id and r_id, or you have a different matching function change here
def matcher(l, r):
    global count
    return (
        ((matches["l_id"] == l) & (matches["r_id"] == r))
        | ((matches["l_id"] == r) & (matches["r_id"] == l))
    ).any()

In [4]:
# Change based on the query and resolution functions you specified on blender, for accurate comparison
def resolve(records):
    entity = {}
    entity["_id"] = records['_id'].mode()[0]
    entity["description"] = records['description'].mode()[0]
    entity["brand"] = records['brand'].mode()[0]
    entity["price"] = records["price"].min()
    entity["mp"] = records["mp"].mean()
    return entity

In [5]:
def match(candidate_pairs):
    seen = set()
    matches_set = set()
    not_matches_set = set()
    result = []
    comparisons = 0
    for idx, row in data.iterrows():
        # Change if your id column is different
        root = row["_id"]
        if root in seen:
            continue

        pairs = [pair for pair in candidate_pairs if root in pair]
        if not pairs:
            continue

        block = set().union(*pairs)

        entity_cluster = set([root])
        to_analyze = deque([root])

        while to_analyze:
            id = to_analyze.popleft()
            for candidate in block:
                if candidate in entity_cluster:
                    continue
                if id == candidate:
                    entity_cluster.add(candidate)
                    continue
                if (id, candidate) in matches_set:
                    entity_cluster.add(candidate)
                    to_analyze.append(candidate)
                    continue
                if (id, candidate) in not_matches_set:
                    continue
                comparisons += 1
                if matcher(id, candidate):
                    entity_cluster.add(candidate)
                    matches_set.add((id, candidate))
                    matches_set.add((candidate, id))
                    to_analyze.append(candidate)
                else:
                    not_matches_set.add((id, candidate))
                    not_matches_set.add((candidate, id))

        seen.update(entity_cluster)
        result.append(entity_cluster)
    return result, comparisons


In [6]:
start = time.time()

results, batch_comparisons = match(candidates)

batch_elapsed_time = time.time() - start

In [7]:
# Change if your id column is different
resolved = pd.DataFrame([resolve(data[data["_id"].isin(entity)]) for entity in results])
# Change to the where query you used on blender, and the order by
batch_filtered = resolved[resolved["brand"] == "sony"].sort_values(
    "mp", ascending=False
)
batch_filtered_pairs = set()
for group in (results[index] for index in batch_filtered.index):
    if len(group) > 1:
        batch_filtered_pairs.update([tuple(sorted(pair)) for pair in combinations(group, 2)])
    else:
        batch_filtered_pairs.add(tuple(group))

In [20]:
batch_filtered_pairs

{('www.ebay.com//45125', 'www.ebay.com//60260'),
 ('www.ebay.com//42340', 'www.ebay.com//53866'),
 ('www.gosale.com//1149', 'www.gosale.com//787'),
 ('www.ebay.com//45981', 'www.walmart.com//800'),
 ('www.ebay.com//23894', 'www.ebay.com//53866'),
 ('www.ebay.com//47161', 'www.ebay.com//56042'),
 ('www.ebay.com//53972', 'www.ebay.com//55723'),
 ('www.ebay.com//47921', 'www.ebay.com//58580'),
 ('www.ebay.com//46046', 'www.gosale.com//483'),
 ('www.ebay.com//44145', 'www.ebay.com//55524'),
 ('www.ebay.com//53441', 'www.henrys.com//43'),
 ('buy.net//6051', 'www.gosale.com//569'),
 ('www.ebay.com//59645', 'www.gosale.com//821'),
 ('www.ebay.com//44798', 'www.ebay.com//60278'),
 ('www.ebay.com//48490',),
 ('buy.net//6417', 'www.ebay.com//55122'),
 ('www.ebay.com//45827', 'www.ebay.com//46754'),
 ('www.ebay.com//46395', 'www.ebay.com//48264'),
 ('www.ebay.com//56042', 'www.gosale.com//1044'),
 ('www.ebay.com//46605', 'www.ebay.com//59859'),
 ('www.ebay.com//43825', 'www.ebay.com//55468'),
 ('

In [8]:
print("Batch comparisons:", batch_comparisons)
print("Batch elapsed time:", batch_elapsed_time)
print("Batch emitted:", len(batch_filtered))

Batch comparisons: 22073
Batch elapsed time: 64.68334484100342
Batch emitted: 206


In [9]:
# Change based on you matches file
def matcher_table(l, r):
    return (
        ((matches["l_id"] == l["table__id"]) & (matches["r_id"] == r["table__id"]))
        | ((matches["l_id"] == r["table__id"]) & (matches["r_id"] == l["table__id"]))
    ).any()

In [51]:
blender_start = None
blender_results = []
def listener(entity, cluster, comparisons):
    global blender_start       

    i = len(blender_results)

    if len(cluster) > 1:
        cluster_pairs = {tuple(sorted(pair)) for pair in combinations(cluster, 2)}
    else:
        cluster_pairs = set((tuple(cluster),))

    # Change based on the attribute you used to sort
    blender_results.append({
        "elapsed_time": time.time() - blender_start,
        "comparisons": comparisons,
        "correct": (round(entity["table_mp"], 3) == round(batch_filtered.iloc[i]['mp'], 3)) if i < len(batch_filtered) else False,
        "tp": len(cluster_pairs & batch_filtered_pairs),
        "fp": len(cluster_pairs - batch_filtered_pairs),
    })

In [52]:
G = nx.Graph()
G.add_edges_from(candidates)
blocks = [list(set(x)) for x in nx.connected_components(G)]

In [53]:
# Configure
blender = (BlendER()
    .from_table(Table(data, blocks, matcher_table, "table", "_id"))
    .select(
        ("table._id", resolution_function.VOTE),
        ("table.description", resolution_function.VOTE),
        ("table.brand", resolution_function.VOTE),
        ("table.price", resolution_function.MIN),
        ("table.mp", resolution_function.AVG),
        order_by=("table.mp", SqlOrderBy.DESC),
    )
    .where('table.brand = "sony"')
    .subscribe(listener)
)

In [54]:
blender_start = time.time()
blender_results = []
blender.run()

In [55]:
blender_dataframe = pd.DataFrame(blender_results)

blender_dataframe["running_tp"] = blender_dataframe["tp"].cumsum()
blender_dataframe["running_fp"] = blender_dataframe["fp"].cumsum()
blender_dataframe["running_fn"] = len(batch_filtered_pairs) - blender_dataframe["running_tp"]
blender_dataframe["running_correct"] = blender_dataframe["correct"].cumsum()

blender_dataframe["recall"] = (blender_dataframe["running_tp"] / (blender_dataframe["running_tp"] + blender_dataframe["running_fn"])).replace(np.nan, 0)
blender_dataframe["precision"] = (blender_dataframe["running_tp"] / (blender_dataframe["running_tp"] + blender_dataframe["running_fp"])).replace(np.nan, 0)

blender_dataframe["accuracy"] = blender_dataframe["running_correct"] / len(blender_dataframe)

blender_dataframe.to_csv("output/blender_results.csv", index=False)

In [56]:
blender_dataframe

Unnamed: 0,elapsed_time,comparisons,correct,tp,fp,running_tp,running_fp,running_fn,running_correct,recall,precision,accuracy
0,1.052290,3,True,3,0,3,0,3505,1,0.000855,1.0,0.004854
1,1.188300,45,True,253,0,256,0,3252,2,0.072976,1.0,0.009709
2,1.270835,75,True,55,0,311,0,3197,3,0.088655,1.0,0.014563
3,1.325509,95,True,1,0,312,0,3196,4,0.088940,1.0,0.019417
4,1.375694,115,True,1,0,313,0,3195,5,0.089225,1.0,0.024272
...,...,...,...,...,...,...,...,...,...,...,...,...
201,29.801769,12631,True,1,0,3504,0,4,202,0.998860,1.0,0.980583
202,29.809757,12638,True,1,0,3505,0,3,203,0.999145,1.0,0.985437
203,29.815067,12645,True,1,0,3506,0,2,204,0.999430,1.0,0.990291
204,29.817820,12652,True,1,0,3507,0,1,205,0.999715,1.0,0.995146
