# Walkthrough

Take some **fictional** name / address data, which we imagine has come from different sources and therefore isn't always the same, and figure out which ones refer to the same thing.

In [4]:
import sys
sys.path.append('../src/address_deduplication')

In [5]:
import pandas as pd
import matplotlib.pyplot as plt

from normalisation import (
    normalise_names, 
    normalise_addresses, 
    split_address
)
from helpers import (
    save_all_normalised,
    read_normalised,
    deterministic_row_id
)
from deduplication import (
    get_cosine_similarity
)

In [None]:
# Root data directory (relative path) and the sub-folders of it used by this notebook.
DATA_FOLDER = '../data'
INPUT_FOLDER = '/'.join((DATA_FOLDER, 'input'))
NORMALISED_FOLDER = '/'.join((DATA_FOLDER, 'normalised'))
OUTPUT_FOLDER = '/'.join((DATA_FOLDER, 'output'))

## Load some (fictional) data

Small amount of LLM-generated names and addresses, covering two postcodes.

In [None]:
# read the raw restaurant records from the input folder
restaurants_csv = f'{INPUT_FOLDER}/restaurants.csv'
restaurants = pd.read_csv(restaurants_csv)

In [None]:

# give every restaurant an identifier computed from the contents of its row
row_ids = restaurants.apply(deterministic_row_id, axis=1)
restaurants['id'] = row_ids

# move the id to the front so that it is the first column
id_column = restaurants.pop('id')
restaurants.insert(0, 'id', id_column)



In [None]:
# show the table as loaded (the id should now be the first column)
restaurants

## Normalise the data

Consistent capitalisation, split out postcode etc

In [None]:
# separate the postcode from the rest of each address
split = split_address(restaurants['address'])

# normalise the two text fields that are compared later on
clean_addresses = normalise_addresses(split['address_no_postcode'])
clean_names = normalise_names(restaurants['name'])

# swap in the normalised values and attach the postcode as its own column
restaurants = restaurants.assign(
    name=clean_names,
    address=clean_addresses,
).join(split['postcode'])

In [None]:
# The outcode is whatever precedes the first space in the postcode. The
# deduplication is done per outcode, so we don't have to compare every pair
# of restaurants.
postcode_parts = restaurants['postcode'].str.split(' ')
restaurants['outcode'] = postcode_parts.str.get(0)

In [None]:
import re

# common street-type abbreviations and the full word each one stands for
ABBREVIATIONS = {
    'St': 'Street',
    'Rd': 'Road',
    'Ave': 'Avenue',
    'Dr': 'Drive',
    'Pl': 'Place',
    'Ln': 'Lane',
    'Sq': 'Square',
    'Terr': 'Terrace',
}

# With regex=True every key is a regular expression that is substituted
# wherever it matches, including in the middle of a word: a bare 'St' would
# turn 'Station Street' into 'Streetation Streetreet'. Anchoring each
# abbreviation with word boundaries (\b) restricts the match to whole words.
ABBREVIATION_PATTERNS = {
    rf'\b{re.escape(abbreviation)}\b': expansion
    for abbreviation, expansion in ABBREVIATIONS.items()
}

# replace common abbreviations in the address
restaurants['address'] = (
    restaurants['address']
      .replace(ABBREVIATION_PATTERNS, regex=True)
)

## Save the normalised data

For each group being compared (in this case, defined by outcode), save the normalised data as a csv.

From now on, we'll work with one group at a time.

In [None]:
# hand the normalised restaurants to the helper that saves them under NORMALISED_FOLDER
save_all_normalised(restaurants, NORMALISED_FOLDER)

In [None]:
# the outcode group that is deduplicated in the cells below
OUTCODE = 'SW4'
group = read_normalised(OUTCODE, NORMALISED_FOLDER)

## Pairwise similarity

Within each outcode group, compare all pairs and get a similarity score in the 0-1 range.

In [None]:
# compare all pairs of restaurants in the group, once on their names and once
# on their addresses; the results are indexed as [i, j] by row position
name_cosine_sim = get_cosine_similarity(group['name'])
address_cosine_sim = get_cosine_similarity(group['address'])


## Graph

Create a graph where the nodes are the locations and edges are the similarity scores. Remove edges where similarity is beneath some threshold, then find connected components - which we take to be multiple references to a single location.

(This is also a convenient step to manually add or remove pairs, if we happen to know certain locations are or are not the same)

In [None]:
import networkx as nx

# undirected graph with restaurant ids as nodes; an edge links two restaurants
# that look similar enough to be the same place
G = nx.Graph()

# a pair is linked only when BOTH its name and its address similarity reach the threshold
name_threshold = 0.1
address_threshold = 0.1

# number of restaurants in the group
n = group.shape[0]

# ids in row order, so that position i lines up with row/column i of the similarity matrices
restaurant_ids = group['id'].tolist()

# visit every unordered pair (i < j) exactly once
for i in range(n):
    for j in range(i + 1, n):
        similar_name = name_cosine_sim[i, j] >= name_threshold
        similar_address = address_cosine_sim[i, j] >= address_threshold
        if similar_name and similar_address:
            G.add_edge(restaurant_ids[i], restaurant_ids[j])

In [None]:
# draw the graph -- just for visualisation (with more than a few nodes it's an illegible hairball)
# Only restaurants with at least one edge are shown: nodes are created solely by G.add_edge.


plt.figure(figsize=(8, 8))
pos = nx.spring_layout(G)  # node positions, shared by the three draw calls below
nx.draw_networkx_nodes(G, pos, node_size=500)
nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.5)
nx.draw_networkx_labels(G, pos, font_size=10)
plt.title('Cosine Similarity Graph for Restaurants in SW4')
plt.show()

## Manual changes

The easiest way to link two restaurants that aren't being matched is to add an edge at this point. We can also remove edges if things are being matched that shouldn't be.

In [None]:
# ids of restaurants that should end up connected to one another
node_a = 'e97ca3f4e4ee26f55b4f1e9e58ca0182'
node_b = '2d29b645a21b5a5d9e1809e8c2e48274'
node_c = 'b6413c2b889bf7f4df0255df4dddfb43'
node_d = 'b4941ace111cbd25bc01342bfbbb1103'

# link them in a chain (a-b, b-c, c-d); adding an edge that already exists changes nothing
G.add_edges_from([
    (node_a, node_b),
    (node_b, node_c),
    (node_c, node_d),
])

In [None]:
# TODO wrap in a function that takes tuples of node IDs

## Identify duplicates

We'll say that connected components in our graph are duplicates.

In [None]:
# one entry per connected component of G, materialised as a list so it can be reused
connected_components = list(nx.connected_components(G))

In [None]:
# convert the connected components to a lookup dictionary keyed by one representative id per component
def connected_components_to_lookup(components):
    """Convert connected components into a ``{representative_id: member_ids}`` lookup.

    Args:
        components: Iterable of collections of restaurant ids, one collection
            per connected component (for example the sets produced by
            ``nx.connected_components``).

    Returns:
        A dict with one entry per non-empty component. The key is the
        smallest id in the component and the value is the sorted list of all
        ids in that component (including the key).
    """
    lookup = {}
    for component in components:
        # Sets have no stable iteration order (string hashing varies between
        # interpreter runs), so taking "the first element" would give a
        # different group id from one run to the next. Sorting makes both the
        # representative and the member order reproducible.
        members = sorted(component)
        if not members:
            # nothing to represent; skip rather than fail on an empty component
            continue
        lookup[members[0]] = members
    return lookup

# one entry per component: a representative id mapped to the list of every id in that component
lookup = connected_components_to_lookup(connected_components)

In [None]:
# Invert the lookup into {restaurant id -> group id} once, so that assigning the
# group ids is a single dictionary lookup per row instead of a scan through
# every component for every row.
id_to_group = {}
for representative, component_ids in lookup.items():
    for component_id in component_ids:
        # setdefault keeps the first group seen for an id
        id_to_group.setdefault(component_id, representative)

# create a new group ID column in the group DataFrame; a restaurant that is in
# no component (it has no edges in the graph) becomes a group of its own,
# identified by its own id
group['group_id'] = group['id'].map(
    lambda restaurant_id: id_to_group.get(restaurant_id, restaurant_id)
)

# make sure the group id is the first column
group.insert(0, 'group_id', group.pop('group_id'))

In [None]:
# order the rows by group id so members of the same group sit next to each
# other, and renumber the index to follow the new order
group = group.sort_values('group_id', ignore_index=True)

In [None]:
# TODO wrap in a function

In [None]:
# TODO main.py that runs the whole process from the command line

## Results

In [None]:
# show each group of suspected duplicates as its own small table
RESULT_COLUMNS = ['id', 'name', 'address', 'postcode']

for group_id, duplicates in group.groupby('group_id'):
    print(f'Group ID: {group_id}')
    display(duplicates[RESULT_COLUMNS])