In [41]:
import pandas as pd
import pickle as pkl
from collections import deque
import time
from blender import BlendER, Table, SqlOrderBy
import blender.resolution_function as resolution_function
import networkx as nx

In [None]:
# Change to raw dataset path
data = pd.read_csv("datasets/alaska_cameras_small/dataset.csv")

# Change to matches (dataframe with l_id and r_id pairs)
matches = pd.read_csv("datasets/alaska_cameras_small/matches.csv")

# Change to a pickled candidate pairs file (list of pairs)
with open(
    "datasets/alaska_cameras_small/blocking_functions/candidates_sparker.pkl",
    "rb",
) as f:
    candidates = pkl.load(f)

In [None]:
# If your matches aren't using l_id and r_id, or you have a different matching function change here
def matcher(l, r):
    global count
    return (
        ((matches["l_id"] == l) & (matches["r_id"] == r))
        | ((matches["l_id"] == r) & (matches["r_id"] == l))
    ).any()

In [None]:
# Change based on the query and resolution functions you specified on blender, for accurate comparison
def resolve(records):
    entity = {}
    entity["_id"] = records['_id'].mode()[0]
    entity["description"] = records['description'].mode()[0]
    entity["brand"] = records['brand'].mode()[0]
    entity["price"] = records["price"].min()
    entity["mp"] = records["mp"].mean()
    return entity

In [None]:
def match(candidate_pairs):
    seen = set()
    matches_set = set()
    not_matches_set = set()
    result = []
    comparisons = 0
    for idx, row in data.iterrows():
        # Change if your id column is different
        root = row["_id"]
        if root in seen:
            continue

        pairs = [pair for pair in candidate_pairs if root in pair]
        if not pairs:
            continue

        block = set().union(*pairs)

        entity_cluster = set([root])
        to_analyze = deque([root])

        while to_analyze:
            id = to_analyze.popleft()
            for candidate in block:
                if candidate in entity_cluster:
                    continue
                if id == candidate:
                    entity_cluster.add(candidate)
                    continue
                if (id, candidate) in matches_set:
                    entity_cluster.add(candidate)
                    to_analyze.append(candidate)
                    continue
                if (id, candidate) in not_matches_set:
                    continue
                comparisons += 1
                if matcher(id, candidate):
                    entity_cluster.add(candidate)
                    matches_set.add((id, candidate))
                    matches_set.add((candidate, id))
                    to_analyze.append(candidate)
                else:
                    not_matches_set.add((id, candidate))
                    not_matches_set.add((candidate, id))

        seen.update(entity_cluster)
        result.append(entity_cluster)
    return result, comparisons


In [None]:
start = time.time()

results, batch_comparisons = match(candidates)
# Change if your id column is different
resolved = pd.DataFrame([resolve(data[data["_id"].isin(entity)]) for entity in results])
# Change to the where query you used on blender, and the order by
batch_filtered = resolved[resolved["brand"] == "sony"].sort_values("mp", ascending=False)

batch_elapsed_time = time.time() - start

In [None]:
batch_filtered.to_csv('output/output_batch.csv', index=False)

In [48]:
print("Batch comparisons:", batch_comparisons)
print("Batch elapsed time:", batch_elapsed_time)
print("Batch emitted:", len(batch_filtered))

Batch comparisons: 22073
Batch elapsed time: 60.75658655166626
Batch emitted: 206


In [None]:
# Change based on you matches file
blender_comparisons = 0
def matcher_table(l, r):
    global blender_comparisons
    blender_comparisons += 1
    return (
        ((matches["l_id"] == l["table__id"]) & (matches["r_id"] == r["table__id"]))
        | ((matches["l_id"] == r["table__id"]) & (matches["r_id"] == l["table__id"]))
    ).any()

In [None]:
blender_start = None
emitted = []
def listener(entity):
    global blender_start
    global emitted
    emitted.append(entity)
    if blender_start is None:
        blender_start = time.time()
    
    # Change based on the attribute you used to sort
    correct = sum(
        round(e["table_mp"], 3) == round(batch_filtered.iloc[i]["mp"], 3) for i, e in enumerate(emitted)
    )
    print("BlendER comparisons:", blender_comparisons)
    print("BlendER correct:", correct)
    print("BlendER elapsed time:", time.time() - blender_start)
    print("BlendER emitted:", len(emitted))
    print("BlendER recall:", len(emitted) / len(batch_filtered))
    print("BlendER precision:", correct / len(batch_filtered))
    print(entity.to_dict())

In [69]:
G = nx.Graph()
G.add_edges_from(candidates)
blocks = [list(set(x)) for x in nx.connected_components(G)]

In [None]:
# Configure
blender = (BlendER()
    .from_table(Table(data, blocks, matcher_table, "table", "_id"))
    .select(
        ("table._id", resolution_function.VOTE),
        ("table.description", resolution_function.VOTE),
        ("table.brand", resolution_function.VOTE),
        ("table.price", resolution_function.MIN),
        ("table.mp", resolution_function.AVG),
        order_by=("table.mp", SqlOrderBy.DESC),
    )
    .where('table.brand = "sony"')
    .subscribe(listener)
)

In [71]:
blender.run()

BlendER comparisons: 11361
BlendER correct: 1
BlendER elapsed time: 0.0002551078796386719
BlendER emitted: 1
BlendER recall: 0.0048543689320388345
BlendER precision: 0.0048543689320388345
{'table__id': 'buy.net//5881', 'table_description': 'sony a7r black interchangeable lens digital slr camera body only (36.4 mp, memory stick/sd card slot) price comparison at buy.net', 'table_brand': 'sony', 'table_price': 1586.99, 'table_mp': 36.4}
BlendER comparisons: 11403
BlendER correct: 2
BlendER elapsed time: 0.11588120460510254
BlendER emitted: 2
BlendER recall: 0.009708737864077669
BlendER precision: 0.009708737864077669
{'table__id': 'www.ebay.com//42034', 'table_description': 'sony a alpha nex 7 24 3 mp digital camera black body only | ebay', 'table_brand': 'sony', 'table_price': 827.99, 'table_mp': 24.3}
BlendER comparisons: 11433
BlendER correct: 3
BlendER elapsed time: 0.19450616836547852
BlendER emitted: 3
BlendER recall: 0.014563106796116505
BlendER precision: 0.014563106796116505
{'ta

In [None]:
pd.DataFrame(emitted).to_csv("output/output_blender.csv", index=False)