In [1]:
import numpy as np
import pandas as pd
from fuzzywuzzy import process, fuzz
import re



In [2]:
# Load the two input tables: the canonical names and the observed
# spellings that have to be matched back to them.
base = pd.read_csv(filepath_or_buffer='base_names.csv')
variations = pd.read_csv(filepath_or_buffer='name_variations.csv')

In [3]:
# Show the base-name table as loaded from 'base_names.csv'.
base

Unnamed: 0,Base_Name_ID,Base_Name
0,1,John Smith
1,2,Jennifer Brown
2,3,Michael O'Connor
3,4,Maria Garcia
4,5,Robert Lee
5,6,Linda Johnson
6,7,William Davis
7,8,Elizabeth Wilson
8,9,David Martinez
9,10,Susan Clark


In [4]:
# Show the raw variations table as loaded from 'name_variations.csv'.
variations

Unnamed: 0,Variation,Matches_With_Base_Name
0,Thomas King,Thomas King
1,ThomasKing,Thomas King
2,Maria Garcia,Maria Garcia
3,MaryLewis,Mary Lewis
4,Nancy W.,Nancy Wright
...,...,...
95,Jennifer- Brown,Jennifer Brown
96,Daniel- Scott,Daniel Scott
97,David M.,David Martinez
98,Paul Allen.,Paul Allen


In [5]:
def preprocess(name):
    """Normalize a name string for fuzzy comparison.

    The name is lower-cased, punctuation is removed, runs of whitespace are
    collapsed to a single space, and leading/trailing whitespace is trimmed.

    Punctuation is stripped *before* whitespace is collapsed: removing a
    character that sits between two spaces (e.g. ``"john - smith"``) would
    otherwise leave a double space behind, and trailing punctuation preceded
    by a space (e.g. ``"paul allen ."``) would leave a dangling space.

    Parameters
    ----------
    name : str
        The raw name.

    Returns
    -------
    str
        The normalized name. Note that ``\\w`` matches underscores, so they
        are kept.
    """
    name = name.lower()
    name = re.sub(r'[^\w\s]', '', name)  # Remove punctuation
    name = re.sub(r'\s+', ' ', name)  # Collapse whitespace runs to one space
    return name.strip()  # Drop leading/trailing whitespace left over

# Normalize every name column element-wise so that later comparisons are
# made on lower-cased, punctuation-free strings.
base['Base_Name'] = base['Base_Name'].map(preprocess)
variations['Variation'] = variations['Variation'].map(preprocess)
variations['Matches_With_Base_Name'] = variations['Matches_With_Base_Name'].map(preprocess)

In [6]:
# Re-display the variations table to inspect the result of preprocessing.
variations

Unnamed: 0,Variation,Matches_With_Base_Name
0,thomas king,thomas king
1,thomasking,thomas king
2,maria garcia,maria garcia
3,marylewis,mary lewis
4,nancy w,nancy wright
...,...,...
95,jennifer brown,jennifer brown
96,daniel scott,daniel scott
97,david m,david martinez
98,paul allen,paul allen


In [7]:
def fuzzy_match(name, choices, scorer=fuzz.partial_token_sort_ratio, threshold=60):
    """Pick the entry of ``choices`` that best matches ``name``.

    Parameters
    ----------
    name : str
        The query string.
    choices
        Candidate strings, passed straight to ``process.extractOne``.
    scorer : callable, optional
        Similarity function handed to ``process.extractOne``.
    threshold : int, optional
        Minimum score the best candidate must reach to be accepted.

    Returns
    -------
    The best-scoring candidate, or ``None`` when ``process.extractOne``
    yields nothing or the best score is below ``threshold``.
    """
    best = process.extractOne(name, choices, scorer=scorer)
    if not best:
        # Nothing came back from the extractor.
        return None

    # The first two items of the result are the candidate and its score.
    candidate, score = best[0], best[1]
    if score >= threshold:
        return candidate
    return None

def match_names(base, variations):
    """Match each name variation to its closest base name.

    Parameters
    ----------
    base : pandas.DataFrame
        Must contain a ``'Base_Name'`` column.
    variations : pandas.DataFrame
        Must contain a ``'Variation'`` column.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``variations`` (fresh 0..n-1 index) with columns
        ``'Variation'`` (the value as found in ``variations``) and
        ``'Match_With_Base_Name'`` (the preprocessed base name chosen by
        ``fuzzy_match``, or ``None`` when no candidate was accepted).
        The two columns are always present, even when ``variations`` is empty.
    """
    # Preprocess base names once, up front, for consistent comparison.
    base_names = base['Base_Name'].apply(preprocess).tolist()

    # Only the 'Variation' column is needed, so iterate it directly instead
    # of materializing a full row Series per record with iterrows().
    records = [
        {
            'Variation': raw_variation,
            'Match_With_Base_Name': fuzzy_match(preprocess(raw_variation), base_names),
        }
        for raw_variation in variations['Variation']
    ]

    # Explicit columns: without them an empty input would give a frame with
    # no columns at all, and downstream column lookups would raise KeyError.
    return pd.DataFrame(records, columns=['Variation', 'Match_With_Base_Name'])

In [8]:
# Run the matcher over all variations and display the resulting table.
matches_df = match_names(base, variations)
matches_df

Unnamed: 0,Variation,Match_With_Base_Name
0,thomas king,thomas king
1,thomasking,thomas king
2,maria garcia,maria garcia
3,marylewis,
4,nancy w,nancy wright
...,...,...
95,jennifer brown,jennifer brown
96,daniel scott,daniel scott
97,david m,david martinez
98,paul allen,paul allen


In [9]:
# Rows for which no base name reached the acceptance threshold.
matches_df.loc[matches_df['Match_With_Base_Name'].isna()]

Unnamed: 0,Variation,Match_With_Base_Name
3,marylewis,
