Clustering approaches to create numerical data for blocking rules. 

This is a general example, however semantic clustering based on tf-idf may be inappropiate for matching organisation names.
1. Names may be too short to have meaningful tf-idf values.
2. Names may have common or semantically similar words related to the domain, which may not be useful for matching.

In [36]:
import json
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.random_projection import GaussianRandomProjection
from sklearn.cluster import KMeans
from collections import defaultdict
import matplotlib.pyplot as plt

# from mlxtend.plotting import plot_decision_regions
import os
import sys
from pprint import pprint
import ast

def move_working_dir_to_repo_root(repo_name="orgsync"):
    """
    Move the current working directory to the root of the repository.
    """
    current_dir = os.getcwd()
    while os.path.basename(current_dir).lower() != repo_name:
        current_dir = os.path.dirname(current_dir)
    os.chdir(current_dir)
    print("Current working directory: ", os.getcwd())

move_working_dir_to_repo_root(repo_name="orgsync")


Current working directory:  c:\Users\dec2g\GitHub\OrgSync


In [14]:
data_dir = os.path.join("data", "splink")

with open(os.path.join(data_dir, "all_data.json"), "r") as f:
    uk_data = json.load(f)

# remove any data with "dataset" == "gtr"
# uk_data = [d for d in uk_data if d["dataset"] != "gtr"]


Clustering method taken from `fuzzy-llm.py` on main branch.  

In [None]:
# Function to preprocess the names
def preprocess_name(name):
    name = name.lower()  # Convert to lowercase
    name = re.sub(r'\s+', ' ', name)  # Replace multiple spaces with a single space
    name = re.sub(r'[^\w\s]', '', name)  # Remove punctuation
    return name.strip()  # Strip leading and trailing whitespace

# Combine the name, short_name, and standardized_name fields
def combine_names(entry):
    combined_name = ' '.join(filter(None, [entry.get('name', ''), entry.get('short_name', ''), entry.get('standardized_name', '')]))
    return preprocess_name(combined_name)

# Group similar names based on the specified threshold
def group_similar_names(unique_names, indices, distances, threshold=0.3, ):
    grouped_names = defaultdict(list)
    used_names = set()

    for i, name in enumerate(unique_names):
        if name in used_names:
            continue

        # Get similar names based on the cosine similarity
        similar_names = [unique_names[idx] for j, idx in enumerate(indices[i]) if distances[i][j] <= threshold and idx != i]

        if similar_names:  # Only consider if there are matches
            # Add to the group and mark as used
            grouped_names[name].extend(similar_names)
            used_names.add(name)
            used_names.update(similar_names)

    return grouped_names



In [None]:
# get field "name" as list from .json containing list of dicts
all_names = [entry['name'] for entry in uk_data]
unique_names = list(set(all_names))
# sort by smallest to largest and alphabetically
unique_names = sorted(all_names, key=lambda x: (len(x), x))
pprint(unique_names[:1000])

['bq',
 'cw',
 'do',
 'i4',
 'r3',
 'rd',
 're',
 'rg',
 '3ie',
 'aad',
 'adi',
 'afm',
 'ams',
 'aql',
 'art',
 'asr',
 'avl',
 'aww',
 'axa',
 'bce',
 'bdo',
 'bos',
 'btg',
 'bvt',
 'byd',
 'c4s',
 'cbi',
 'cbi',
 'cdp',
 'cel',
 'ceo',
 'cgg',
 'cgi',
 'cpi',
 'crv',
 'dha',
 'dhl',
 'dri',
 'dsm',
 'dsv',
 'e3g',
 'eat',
 'eef',
 'eef',
 'eia',
 'eko',
 'emc',
 'emg',
 'ewm',
 'f2g',
 'fic',
 'flf',
 'fxg',
 'gfm',
 'gkn',
 'gom',
 'gsi',
 'gsk',
 'gw4',
 'gwe',
 'heg',
 'hst',
 'ibm',
 'ibm',
 'ics',
 'idd',
 'igo',
 'iic',
 'isn',
 'jcb',
 'kek',
 'key',
 'kla',
 'ktn',
 'law',
 'lux',
 'lux',
 'mbs',
 'mdc',
 'mod',
 'mrc',
 'mtn',
 'nct',
 'nhh',
 'nkt',
 'npl',
 'nrc',
 'nva',
 'odi',
 'odi',
 'odi',
 'odi',
 'odi',
 'ofs',
 'ogi',
 'ona',
 'one',
 'pax',
 'pi3',
 'psi',
 'pss',
 'qi3',
 'r20',
 'rag',
 'rms',
 'rwe',
 's3c',
 's4b',
 's4c',
 'skf',
 'slb',
 'smc',
 'snv',
 'spf',
 'srg',
 'ssp',
 'tep',
 'tg4',
 'thp',
 'tmf',
 'tpp',
 'trw',
 'ubs',
 'uui',
 'wmg',
 'wpp',


In [17]:
# Create a list of names
# names = [preprocess_name(entry.get("name")) for entry in uk_data]
names = [entry.get('name') for entry in uk_data]

Perform NearestNeighbors clustering on Tf-Idf vectors of the text data. Could also do this on Levenshtein distance or other similarity metrics.

In [None]:
# Remove exact duplicates to keep only unique combined names
unique_names = list(set(names))
total_unique_names = len(unique_names)

# Vectorize the unique combined names using TF-IDF
# vectorizer = TfidfVectorizer().fit(unique_names)
# name_vectors = vectorizer.transform(unique_names)
name_vectors = TfidfVectorizer().fit_transform(unique_names)




In [41]:
# use kmeans 
n_clusters = min(100, len(unique_names)//10)
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(name_vectors)
name_to_cluster = dict(zip(unique_names, cluster_labels))
pprint(name_to_cluster)

# pprint clusters of names
cluster_to_names = defaultdict(list)

{'0 infinity limited': np.int32(28),
 '1 mutation 1 medicine': np.int32(37),
 '100 per cent open': np.int32(85),
 '100 percent it ltd': np.int32(48),
 '100 resilient cities': np.int32(30),
 '100 years of coconuts': np.int32(71),
 '1000 days llc': np.int32(28),
 '1000 women trust': np.int32(1),
 '1000livesplus': np.int32(8),
 '100daysofnocode ltd': np.int32(84),
 '100open': np.int32(33),
 '1010': np.int32(28),
 '1010 climate action': np.int32(44),
 '108center military hospital': np.int32(37),
 '10be5 ltd': np.int32(28),
 '10x genomics inc': np.int32(94),
 '113 botanicals': np.int32(48),
 '11kbw': np.int32(1),
 '123 limited': np.int32(70),
 '1248 limited': np.int32(54),
 '12tree finance gmbh': np.int32(71),
 '1715labs': np.int32(37),
 '1771 guitars ltd': np.int32(1),
 '1796 ltd': np.int32(1),
 '17cicada ltd': np.int32(7),
 '1947 partition archive': np.int32(28),
 '198 contemporary arts and learning': np.int32(28),
 '1aesthetic limited': np.int32(27),
 '1qbit': np.int32(85),
 '1spatial gr

In [42]:
# create dataframe of unique names and cluster labels
unique_names_df = pd.DataFrame({"name": unique_names, "cluster": cluster_labels})

# groupby cluster
for cluster, group in unique_names_df.groupby("cluster"):
    cluster_to_names[cluster] = list(group["name"])

pprint(cluster_to_names) 

defaultdict(<class 'list'>,
            {0: ['bdo',
                 'cifas',
                 'hillrom',
                 'optophono',
                 'coptrz ltd',
                 'her bodywear',
                 'apoha limited',
                 'plexus systems',
                 'stylemania ltd',
                 'img partnership',
                 'hot knife media',
                 'quantum imaging',
                 'imperative space',
                 'zbd displays ltd',
                 'open medical ltd',
                 'bio data networks',
                 'childrens society',
                 'anax technologies',
                 'messrs avxkyocera',
                 'manholemetrics ltd',
                 'birzeit university',
                 'arup group limited',
                 'fray julián garcés',
                 'profit in focus ltd',
                 'lavision uk limited',
                 '3d industries limited',
                 'oakwood fuels limited',
     

In [None]:
# Use Nearest Neighbors to find similar names
nbrs = NearestNeighbors(n_neighbors=10, metric='cosine', algorithm='brute').fit(name_vectors)
distances, indices = nbrs.kneighbors(name_vectors)

In [19]:
# sort unique_names by shortes to longest and alphabetical
unique_names = sorted(unique_names, key=len)
pprint(unique_names[:1000])

['do',
 'r3',
 'rg',
 'i4',
 'bq',
 're',
 'cw',
 'rd',
 'f2g',
 'gsi',
 'byd',
 'pss',
 '3ie',
 'spf',
 'wpp',
 'igo',
 'cdp',
 'dhl',
 'nkt',
 'ibm',
 'kla',
 'c4s',
 'tpp',
 'uui',
 's3c',
 'wmg',
 'axa',
 'bos',
 'mdc',
 'eko',
 'avl',
 'gfm',
 'cpi',
 'asr',
 'snv',
 'dsv',
 'smc',
 'bdo',
 'tg4',
 'adi',
 'odi',
 's4b',
 'srg',
 'rwe',
 'eat',
 'gwe',
 'isn',
 'emc',
 'dsm',
 'rag',
 'flf',
 'cbi',
 'crv',
 'mod',
 'cgg',
 'cgi',
 'dha',
 'aql',
 'idd',
 'rms',
 'btg',
 'tep',
 'gkn',
 'tmf',
 'psi',
 'jcb',
 'mrc',
 'ona',
 'wrc',
 's4c',
 'ams',
 'fxg',
 'gw4',
 'pax',
 'pi3',
 'bce',
 'key',
 'aad',
 'hst',
 'afm',
 'r20',
 'eef',
 'cel',
 'dri',
 'ceo',
 'nrc',
 'ypo',
 'qi3',
 'ofs',
 'skf',
 'eia',
 'ssp',
 'art',
 'ewm',
 'gsk',
 'one',
 'heg',
 'law',
 'nhh',
 'mbs',
 'slb',
 'kek',
 'ogi',
 'thp',
 'nva',
 'nct',
 'gom',
 'lux',
 'emg',
 'e3g',
 'npl',
 'aww',
 'fic',
 'ics',
 'bvt',
 'trw',
 'ubs',
 'mtn',
 'iic',
 'ktn',
 'agas',
 'rift',
 'iter',
 'eut0',
 'daye',
 'w

* distances: similarity scores (0-1, lower = more similar)
* indices: positions of similar names in unique_names list

In [21]:
# vectors_as_array = name_vectors.toarray()

In [22]:
len(distances) == len(unique_names)

True

In [34]:
grouped_names = group_similar_names(unique_names, indices, distances, threshold=0.3)

some weird ones

```
'shell international trading and shipping company limited': 
    ['shell '
    'international '
    'trading '
    'and '
    'shipping '
    'company '
    'ltd',
    'shell '
    'international '
    'trading '
    'shipping '
    'c'],
```

In [35]:
pprint(grouped_names)

{'100 per cent open': ['100 per cent open', 'arthur sanderson and sons'],
 '1000 women trust': ['1000 women trust',
                      'betty and taylors of harrogate limited'],
 '1000livesplus': ['1000livesplus',
                   'swadhinata trust',
                   'molecular biology institute of barcelona'],
 '1010': ['1010', 'cornish audio visual archive cava'],
 '10x genomics inc': ['scibotics limited', '10x genomics inc'],
 '1715labs': ['senckenberg research institute and nature museum '
              'senckenberganlage',
              'benchmark plc',
              '1715labs'],
 '1947 partition archive': ['institut pierre richet ipr',
                            'dundee institute of architects dia',
                            'turintech ai',
                            '1947 partition archive'],
 '1spatial group limited': ['1spatial group limited',
                            'china executive leadership academy pudong'],
 '1via ltd': ['rts life science',
              '1

In [None]:
# map clusters to indices, and lookup 