In [1]:
import json
import pandas as pd
from fuzzywuzzy import fuzz, process
from collections import defaultdict

# Load the UK names data
with open('../data/raw/uk_data.json', 'r') as file:
    uk_data = json.load(file)

# Extract unique names
uk_names = list(set(entry['name'] for entry in uk_data))

# Function to group similar names using fuzzy matching
def group_similar_names(names, threshold=80):
    grouped_names = defaultdict(list)
    used_names = set()

    for name in names:
        if name in used_names:
            continue

        # Find the best matches for the current name
        matches = process.extract(name, names, scorer=fuzz.token_sort_ratio)

        # Filter matches that meet the threshold
        similar_names = [match[0] for match in matches if match[1] >= threshold and match[0] != name]

        # Add the name and its matches to the grouped names
        grouped_names[name].extend(similar_names)
        
        # Mark these names as used
        used_names.add(name)
        used_names.update(similar_names)

    return grouped_names

# Group the UK names
threshold = 80  # Adjust the threshold as needed
grouped_names = group_similar_names(uk_names, threshold=threshold)

# Convert the grouped names into a DataFrame for easier viewing
grouped_names_df = pd.DataFrame({
    'Name': grouped_names.keys(),
    'Similar Names': ['; '.join(names) for names in grouped_names.values()]
})

# Display the first few groups
grouped_names_df.head(20)



KeyboardInterrupt: 

In [None]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from collections import defaultdict

# Load the UK names data
with open('../data/raw/uk_data.json', 'r') as file:
    uk_data = json.load(file)

# Extract unique names
uk_names = list(set(entry['name'] for entry in uk_data))

# Vectorize the names using TF-IDF
vectorizer = TfidfVectorizer().fit(uk_names)
name_vectors = vectorizer.transform(uk_names)

# Use Nearest Neighbors to find similar names
nbrs = NearestNeighbors(n_neighbors=10, metric='cosine', algorithm='brute').fit(name_vectors)
distances, indices = nbrs.kneighbors(name_vectors)

# Group similar names
threshold = 0.9  # Cosine similarity threshold (0 to 1, lower means more similar)
grouped_names = defaultdict(list)
used_names = set()

for i, name in enumerate(uk_names):
    if name in used_names:
        continue

    # Get similar names based on the cosine similarity
    similar_names = [uk_names[idx] for j, idx in enumerate(indices[i]) if distances[i][j] <= threshold and idx != i]
    
    if similar_names:  # Only consider if there are matches
        # Add to the group and mark as used
        grouped_names[name].extend(similar_names)
        used_names.add(name)
        used_names.update(similar_names)

# Convert the grouped names into a DataFrame for easier viewing
grouped_names_df = pd.DataFrame({
    'Name': grouped_names.keys(),
    'Similar Names': ['; '.join(names) for names in grouped_names.values()]
})

# Filter to show only items with matches
grouped_names_df = grouped_names_df[grouped_names_df['Similar Names'].str.len() > 0]

# Output statistics
total_unique_names = len(uk_names)
total_matched_names = len(grouped_names_df)
total_matches = grouped_names_df['Similar Names'].apply(lambda x: len(x.split('; '))).sum()

print(f"Total unique UK names: {total_unique_names}")
print(f"Total unique names with matches: {total_matched_names}")
print(f"Total number of matches found: {total_matches}")
print(f"Percentage of unique names with matches: {total_matched_names / total_unique_names * 100:.2f}%")

# Display the first few groups with matches
grouped_names_df.head(20)


Total unique UK names: 5422
Total unique names with matches: 2684
Total number of matches found: 14620
Percentage of unique names with matches: 49.50%


Unnamed: 0,Name,Similar Names
0,CHAS A BLATCHFORD AND SONS LIMITED,G R WRIGHT AND SONS LTD; John Wiley and Sons L...
1,LIMITEDSKIES LIMITED,I.C.T.S.(U.K.) LIMITED; C.D. LIMITED
2,PIXELMILL LTD,T.L.R. LTD; B G RESEARCH LTD; Q TECHNOLOGIES LTD
3,BFM LIMITED,I.C.T.S.(U.K.) LIMITED; C.D. LIMITED
4,DESAP SYSTEM SOLUTIONS LIMITED,X-SYSTEM; DESAP ENTERPRISES LIMITED; INTEGRATE...
5,REPORTBRAIN LIMITED,I.C.T.S.(U.K.) LIMITED; C.D. LIMITED
6,MAIER UK LTD,O.S. Energy (UK) Ltd; T.L.R. LTD; DAIRY UK LTD...
7,Nile HQ Ltd,T.L.R. LTD
8,CLOUDWEAVERS LTD,T.L.R. LTD; B G RESEARCH LTD; Q TECHNOLOGIES LTD
9,ICONAL TECHNOLOGY LTD,INTEGRATION TECHNOLOGY LTD; MICROWAVE TECHNOLO...


In [None]:
# save the df to tmp/uk_names_grouped.csv
# grouped_names_df.to_csv('tmp/uk_names_grouped.csv', index=False)

In [2]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from collections import defaultdict

# Load the UK names data
with open('../data/raw/uk_data.json', 'r') as file:
    uk_data = json.load(file)

# Combine the name, short_name, and standardized_name fields
def combine_names(entry):
    return ' '.join(filter(None, [entry.get('name', ''), entry.get('short_name', ''), entry.get('standardized_name', '')]))

# Create a list of combined names
combined_names = [combine_names(entry) for entry in uk_data]

# Remove exact duplicates to keep only unique combined names
unique_combined_names = list(set(combined_names))

# Vectorize the unique combined names using TF-IDF
vectorizer = TfidfVectorizer().fit(unique_combined_names)
name_vectors = vectorizer.transform(unique_combined_names)

# Use Nearest Neighbors to find similar names
nbrs = NearestNeighbors(n_neighbors=10, metric='cosine', algorithm='brute').fit(name_vectors)
distances, indices = nbrs.kneighbors(name_vectors)

# Group similar names
threshold = 0.2  # Cosine similarity threshold (0 to 1, lower means more similar)
grouped_names = defaultdict(list)
used_names = set()

for i, name in enumerate(unique_combined_names):
    if name in used_names:
        continue

    # Get similar names based on the cosine similarity
    similar_names = [unique_combined_names[idx] for j, idx in enumerate(indices[i]) if distances[i][j] <= threshold and idx != i]
    
    if similar_names:  # Only consider if there are matches
        # Add to the group and mark as used
        grouped_names[name].extend(similar_names)
        used_names.add(name)
        used_names.update(similar_names)

# Convert the grouped names into a DataFrame for easier viewing
grouped_names_df = pd.DataFrame({
    'Name': grouped_names.keys(),
    'Similar Names': ['; '.join(names) for names in grouped_names.values()]
})

# Filter to show only items with matches
grouped_names_df = grouped_names_df[grouped_names_df['Similar Names'].str.len() > 0]

# Output statistics
total_unique_names = len(unique_combined_names)
total_matched_names = len(grouped_names_df)
total_matches = grouped_names_df['Similar Names'].apply(lambda x: len(x.split('; '))).sum()

print(f"Total unique combined names: {total_unique_names}")
print(f"Total unique combined names with matches: {total_matched_names}")
print(f"Total number of matches found: {total_matches}")
print(f"Percentage of unique combined names with matches: {total_matched_names / total_unique_names * 100:.2f}%")

# Display the first few groups with matches
grouped_names_df.head(20)


Total unique combined names: 5422
Total unique combined names with matches: 60
Total number of matches found: 64
Percentage of unique combined names with matches: 1.11%


Unnamed: 0,Name,Similar Names
0,UK CENTRE FOR ECOLOGY & HYDROLOGY,Centre for Ecology and Hydrology
1,ELEMENT SIX LIMITED,ELEMENT SIX (UK) LIMITED
2,IAN CATLING CONSULTANCY,Ian Catling Consultancy
3,SCRIPTORIA LTD,Scriptoria
4,Rockwell Collins (UK) Ltd.,ROCKWELL COLLINS UK LIMITED
5,Medicines and Healthcare products Regulatory Agency,MEDICINES AND HEALTHCARE PRODUCTS REGULATORY AGENCY
6,THE QUEEN'S UNIVERSITY OF BELFAST,Queen's University Belfast
7,Diamond Biopharm Limited,DIAMOND BIOPHARM LIMITED
8,TRL Limited,TRL LIMITED
9,TATA STEEL UK LIMITED,TATA STEEL UK CONSULTING LIMITED


In [9]:
import pandas as pd

# Function to group similar names based on a specified threshold
def group_similar_names(threshold=0.2):
    grouped_names = defaultdict(list)
    used_names = set()

    for i, name in enumerate(unique_combined_names):
        if name in used_names:
            continue

        # Get similar names based on the cosine similarity
        similar_names = [unique_combined_names[idx] for j, idx in enumerate(indices[i]) if distances[i][j] <= threshold and idx != i]

        if similar_names:  # Only consider if there are matches
            # Add to the group and mark as used
            grouped_names[name].extend(similar_names)
            used_names.add(name)
            used_names.update(similar_names)

    # Convert to DataFrame
    grouped_names_df = pd.DataFrame({
        'Unique Combination': grouped_names.keys(),
        'Matched Combinations': ['; '.join(names) for names in grouped_names.values()]
    })

    # Filter to show only items with matches
    grouped_names_df = grouped_names_df[grouped_names_df['Matched Combinations'].str.len() > 0]

    # Set the max column width to display full content
    pd.set_option('display.max_colwidth', None)

    # Output statistics
    total_matched_names = len(grouped_names_df)
    total_matches = grouped_names_df['Matched Combinations'].apply(lambda x: len(x.split('; '))).sum()

    print(f"Threshold: {threshold}")
    print(f"Total unique combined names with matches: {total_matched_names}")
    print(f"Total number of matches found: {total_matches}")
    print(f"Percentage of unique combined names with matches: {total_matched_names / total_unique_names * 100:.2f}%")

    # Display the first few groups with matches
    display(grouped_names_df.head(20))

# Example usage: play with the threshold
group_similar_names(threshold=0.1)
group_similar_names(threshold=0.3)
group_similar_names(threshold=0.5)


Threshold: 0.1
Total unique combined names with matches: 26
Total number of matches found: 28
Percentage of unique combined names with matches: 0.48%


Unnamed: 0,Unique Combination,Matched Combinations
0,ERICSSON LIMITED,Ericsson Limited
1,Queen's University Belfast,THE QUEEN'S UNIVERSITY OF BELFAST
2,OFFSHORE RENEWABLE ENERGY CATAPULT,Offshore Renewable Energy Catapult Limited
3,TRL LIMITED,TRL Limited
4,BLACK MOUNTAIN INSULATION LTD,Black Mountain Insulation Ltd
5,THE CHANCELLOR MASTERS AND SCHOLARS OF THE UNIVERSITY OF CAMBRIDGE,"The Chancellor, Masters and Scholars of the University of Cambridge; THE CHANCELLOR, MASTERS AND SCHOLARS OF THE UNIVERSITY OF OXFORD"
6,CPL SCIENTIFIC PUBLISHING SERVICES LTD,CPL Scientific Publishing Services Ltd
7,UK CENTRE FOR ECOLOGY & HYDROLOGY,Centre for Ecology and Hydrology
8,MEDICINES AND HEALTHCARE PRODUCTS REGULATORY AGENCY,Medicines and Healthcare products Regulatory Agency
9,PRINTED ELECTRONICS LIMITED,Printed Electronics Ltd


Threshold: 0.3
Total unique combined names with matches: 146
Total number of matches found: 155
Percentage of unique combined names with matches: 2.69%


Unnamed: 0,Unique Combination,Matched Combinations
0,STOLI CATALYSTS LTD,I.G. CATALYSTS LTD
1,POLICE AND CRIME COMMISSIONER FOR NORTH YORKSHIRE,POLICE AND CRIME COMMISSIONER FOR WEST YORKSHIRE; THE POLICE AND CRIME COMMISSIONER FOR SOUTH YORKSHIRE
2,ERICSSON LIMITED,Ericsson Limited
3,UNIVERSITY HOSPITALS SOUTHAMPTON NHS FOUNDATION TRUST,Cambridge University Hospitals NHS Foundation Trust; OXFORD UNIVERSITY HOSPITALS NHS FOUNDATION TRUST
4,Queen's University Belfast,THE QUEEN'S UNIVERSITY OF BELFAST
5,OPEN SOURCE INNOVATION LTD,OPEN SOURCE MANAGEMENT LIMITED
6,P1VITAL LIMITED,P1VITAL PRODUCTS LIMITED
7,BAE SYSTEMS INTEGRATED SYSTEM TECHNOLOGIES LTD,INTEGRATED SYSTEM TECHNOLOGIES LTD
8,COSTAIN GROUP PLC,COSTAIN LTD
9,WEST MIDLANDS POLICE AND CRIME COMMISSIONER,POLICE AND CRIME COMMISSIONER FOR WEST YORKSHIRE


Threshold: 0.5
Total unique combined names with matches: 732
Total number of matches found: 1136
Percentage of unique combined names with matches: 13.50%


Unnamed: 0,Unique Combination,Matched Combinations
0,OXFORD NANOPORE TECHNOLOGIES LTD,OXFORD TECHNOLOGIES LTD
1,MINERVA PUBLIC RELATIONS & COMMUNICATIONS LIMITED,MINERVA HEALTH & CARE COMMUNICATIONS LTD
2,URBAN HAWK LIMITED,Hawk Associates Limited
3,FIRE SERVICE COLLEGE LIMITED,SCOTTISH FIRE AND RESCUE SERVICE
4,DATA CENTRE ALLIANCE LIMITED,RESEARCH DATA ALLIANCE FOUNDATION
5,STOLI CATALYSTS LTD,I.G. CATALYSTS LTD
6,APPLIED GENOMICS LTD,Genomics England Limited
7,Preston EV Limited,PRESTON SOLUTIONS LIMITED
8,THE ROYAL COLLEGE OF GENERAL PRACTITIONERS,RCGP RSC-The Royal College of General Practitioners and Surveillance Centre; THE ROYAL COLLEGE OF ART
9,OLDHAM METROPOLITAN BOROUGH COUNCIL,BARNSLEY METROPOLITAN BOROUGH COUNCIL; DONCASTER METROPOLITAN BOROUGH COUNCIL; STOCKPORT METROPOLITAN BOROUGH COUNCIL


In [1]:
import json
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from collections import defaultdict

# Load the UK names data
with open('../data/raw/uk_data.json', 'r') as file:
    uk_data = json.load(file)

# Function to preprocess the names
def preprocess_name(name):
    name = name.lower()  # Convert to lowercase
    name = re.sub(r'\s+', ' ', name)  # Replace multiple spaces with a single space
    name = re.sub(r'[^\w\s]', '', name)  # Remove punctuation
    return name.strip()  # Strip leading and trailing whitespace

# Combine the name, short_name, and standardized_name fields
def combine_names(entry):
    combined_name = ' '.join(filter(None, [entry.get('name', ''), entry.get('short_name', ''), entry.get('standardized_name', '')]))
    return preprocess_name(combined_name)

# Create a list of combined names
combined_names = [combine_names(entry) for entry in uk_data]

# Remove exact duplicates to keep only unique combined names
unique_combined_names = list(set(combined_names))
total_unique_names = len(unique_combined_names)

# Vectorize the unique combined names using TF-IDF
vectorizer = TfidfVectorizer().fit(unique_combined_names)
name_vectors = vectorizer.transform(unique_combined_names)

# Use Nearest Neighbors to find similar names
nbrs = NearestNeighbors(n_neighbors=10, metric='cosine', algorithm='brute').fit(name_vectors)
distances, indices = nbrs.kneighbors(name_vectors)

# Group similar names based on the specified threshold
def group_similar_names(threshold=0.2):
    grouped_names = defaultdict(list)
    used_names = set()

    for i, name in enumerate(unique_combined_names):
        if name in used_names:
            continue

        # Get similar names based on the cosine similarity
        similar_names = [unique_combined_names[idx] for j, idx in enumerate(indices[i]) if distances[i][j] <= threshold and idx != i]

        if similar_names:  # Only consider if there are matches
            # Add to the group and mark as used
            grouped_names[name].extend(similar_names)
            used_names.add(name)
            used_names.update(similar_names)

    # Convert to DataFrame
    grouped_names_df = pd.DataFrame({
        'Unique Combination': grouped_names.keys(),
        'Matched Combinations': ['; '.join(names) for names in grouped_names.values()]
    })

    # Filter to show only items with matches
    grouped_names_df = grouped_names_df[grouped_names_df['Matched Combinations'].str.len() > 0]

    # Set the max column width to display full content
    pd.set_option('display.max_colwidth', None)

    # Output statistics
    total_matched_names = len(grouped_names_df)
    total_matches = grouped_names_df['Matched Combinations'].apply(lambda x: len(x.split('; '))).sum()

    print(f"Threshold: {threshold}")
    print(f"Total unique combined names with matches: {total_matched_names}")
    print(f"Total number of matches found: {total_matches}")
    print(f"Percentage of unique combined names with matches: {total_matched_names / total_unique_names * 100:.2f}%")

    # Display the first few groups with matches
    display(grouped_names_df.head(20))

# Example usage: play with the threshold
# group_similar_names(threshold=0.1)
# group_similar_names(threshold=0.3)
group_similar_names(threshold=0.5)
# group_similar_names(threshold=0.7)
# group_similar_names(threshold=0.9)


Threshold: 0.5
Total unique combined names with matches: 670
Total number of matches found: 1010
Percentage of unique combined names with matches: 12.38%


Unnamed: 0,Unique Combination,Matched Combinations
0,comma press,the press association ltd
1,faculty of pharmaceutical medicine of the royal colleges of physicians of the united kingdom,royal college of nursing of the united kingdom royal charter
2,vision sense limited,molecular sense ltd; molecular vision limited
3,stirling dynamics ltd,the university of stirling
4,coal products limited,cymru coal limited
5,innova biosciences limited,innova integra limited
6,royal liverpool and broadgreen university hospitals nhs trust,nottingham university hospitals nhs trust; liverpool university hospital nhs foundation trust; cambridge university hospitals nhs foundation trust; oxford university hospitals nhs foundation trust
7,wilding butler construction ltd,g w butler limited
8,rs consulting limited,rs hydro ltd
9,arralis technologies ltd,q technologies ltd
