In [1]:
import json
import pandas as pd
from fuzzywuzzy import fuzz, process
from collections import defaultdict

# Load the UK names data
with open('../data/raw/uk_data.json', 'r') as file:
    uk_data = json.load(file)

# Extract unique names
uk_names = list(set(entry['name'] for entry in uk_data))

# Function to group similar names using fuzzy matching
def group_similar_names(names, threshold=80):
    grouped_names = defaultdict(list)
    used_names = set()

    for name in names:
        if name in used_names:
            continue

        # Find the best matches for the current name
        matches = process.extract(name, names, scorer=fuzz.token_sort_ratio)

        # Filter matches that meet the threshold
        similar_names = [match[0] for match in matches if match[1] >= threshold and match[0] != name]

        # Add the name and its matches to the grouped names
        grouped_names[name].extend(similar_names)
        
        # Mark these names as used
        used_names.add(name)
        used_names.update(similar_names)

    return grouped_names

# Group the UK names
threshold = 80  # Adjust the threshold as needed
grouped_names = group_similar_names(uk_names, threshold=threshold)

# Convert the grouped names into a DataFrame for easier viewing
grouped_names_df = pd.DataFrame({
    'Name': grouped_names.keys(),
    'Similar Names': ['; '.join(names) for names in grouped_names.values()]
})

# Display the first few groups
grouped_names_df.head(20)



Unnamed: 0,Name,Similar Names
0,CHAS A BLATCHFORD AND SONS LIMITED,
1,LIMITEDSKIES LIMITED,RED SKIES LIMITED
2,PIXELMILL LTD,
3,BFM LIMITED,ABB LIMITED; BOC LIMITED; JVM LIMITED; ARM LIM...
4,DESAP SYSTEM SOLUTIONS LIMITED,DAPHNE WATER SOLUTIONS LIMITED
5,REPORTBRAIN LIMITED,REPOWERING LIMITED
6,MAIER UK LTD,SAHER UK LTD
7,Nile HQ Ltd,
8,CLOUDWEAVERS LTD,
9,ICONAL TECHNOLOGY LTD,Cogent Technology Ltd; FISCAL Technologies Ltd...


In [20]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from collections import defaultdict

# Load the UK names data
with open('../data/raw/uk_data.json', 'r') as file:
    uk_data = json.load(file)

# Extract unique names
uk_names = list(set(entry['name'] for entry in uk_data))

# Vectorize the names using TF-IDF
vectorizer = TfidfVectorizer().fit(uk_names)
name_vectors = vectorizer.transform(uk_names)

# Use Nearest Neighbors to find similar names
nbrs = NearestNeighbors(n_neighbors=10, metric='cosine', algorithm='brute').fit(name_vectors)
distances, indices = nbrs.kneighbors(name_vectors)

# Group similar names
threshold = 0.9  # Cosine similarity threshold (0 to 1, lower means more similar)
grouped_names = defaultdict(list)
used_names = set()

for i, name in enumerate(uk_names):
    if name in used_names:
        continue

    # Get similar names based on the cosine similarity
    similar_names = [uk_names[idx] for j, idx in enumerate(indices[i]) if distances[i][j] <= threshold and idx != i]
    
    if similar_names:  # Only consider if there are matches
        # Add to the group and mark as used
        grouped_names[name].extend(similar_names)
        used_names.add(name)
        used_names.update(similar_names)

# Convert the grouped names into a DataFrame for easier viewing
grouped_names_df = pd.DataFrame({
    'Name': grouped_names.keys(),
    'Similar Names': ['; '.join(names) for names in grouped_names.values()]
})

# Filter to show only items with matches
grouped_names_df = grouped_names_df[grouped_names_df['Similar Names'].str.len() > 0]

# Output statistics
total_unique_names = len(uk_names)
total_matched_names = len(grouped_names_df)
total_matches = grouped_names_df['Similar Names'].apply(lambda x: len(x.split('; '))).sum()

print(f"Total unique UK names: {total_unique_names}")
print(f"Total unique names with matches: {total_matched_names}")
print(f"Total number of matches found: {total_matches}")
print(f"Percentage of unique names with matches: {total_matched_names / total_unique_names * 100:.2f}%")

# Display the first few groups with matches
grouped_names_df.head(20)


Total unique UK names: 5422
Total unique names with matches: 2684
Total number of matches found: 14620
Percentage of unique names with matches: 49.50%


Unnamed: 0,Name,Similar Names
0,CHAS A BLATCHFORD AND SONS LIMITED,G R WRIGHT AND SONS LTD; John Wiley and Sons L...
1,LIMITEDSKIES LIMITED,I.C.T.S.(U.K.) LIMITED; C.D. LIMITED
2,PIXELMILL LTD,T.L.R. LTD; B G RESEARCH LTD; Q TECHNOLOGIES LTD
3,BFM LIMITED,I.C.T.S.(U.K.) LIMITED; C.D. LIMITED
4,DESAP SYSTEM SOLUTIONS LIMITED,X-SYSTEM; DESAP ENTERPRISES LIMITED; INTEGRATE...
5,REPORTBRAIN LIMITED,I.C.T.S.(U.K.) LIMITED; C.D. LIMITED
6,MAIER UK LTD,O.S. Energy (UK) Ltd; T.L.R. LTD; DAIRY UK LTD...
7,Nile HQ Ltd,T.L.R. LTD
8,CLOUDWEAVERS LTD,T.L.R. LTD; B G RESEARCH LTD; Q TECHNOLOGIES LTD
9,ICONAL TECHNOLOGY LTD,INTEGRATION TECHNOLOGY LTD; MICROWAVE TECHNOLO...


In [21]:
# save the df to tmp/uk_names_grouped.csv
grouped_names_df.to_csv('tmp/uk_names_grouped.csv', index=False)