In [6]:
import pandas as pd
from rapidfuzz import process, fuzz
import sys
import argparse

def fuzzy_match(df_a, df_b, col_a, col_b, threshold=80, output_file='matches.csv'):
    """
    Match names from two DataFrames using fuzzy matching.

    Every non-blank value of ``df_a[col_a]`` is looked up in ``df_b[col_b]``
    with two rapidfuzz scorers (``token_sort_ratio`` and ``token_set_ratio``).
    The higher-scoring candidate is kept; on a tie the token_sort result wins.
    Matches are printed and, when there is at least one, written to
    ``output_file`` as CSV.

    Args:
        df_a: DataFrame holding the names to look up.
        df_b: DataFrame holding the candidate names.
        col_a: Name column in ``df_a``.
        col_b: Name column in ``df_b``.
        threshold: Minimum score passed to rapidfuzz as ``score_cutoff``.
        output_file: Path of the CSV written when matches are found.

    Returns:
        None. If either column is missing an error is printed and the
        function returns without matching.
    """

    # Ensure columns exist
    if col_a not in df_a.columns:
        print(f"Error: Column '{col_a}' not found in DataFrame A")
        return
    if col_b not in df_b.columns:
        print(f"Error: Column '{col_b}' not found in DataFrame B")
        return

    # Fill missing values BEFORE casting to str: astype(str) turns NaN/None
    # into the literal text 'nan'/'None', which fillna() can no longer see and
    # which would then be fuzzy-matched as if it were a real name.
    names_a = df_a[col_a].fillna('').astype(str)
    names_b = df_b[col_b].fillna('').astype(str)

    results = []

    print(f"Matching {len(names_a)} names from DataFrame A against {len(names_b)} names from DataFrame B...")

    # Iterate through names in A and find best match in B
    for idx, name in names_a.items():
        if not name.strip():
            continue

        candidates = [
            # Token Sort: good for "Smith, John" vs "John Smith"
            ("token_sort", process.extractOne(
                name, names_b, scorer=fuzz.token_sort_ratio, score_cutoff=threshold
            )),
            # Token Set: good for "John Smith" vs "John Smith (CEO)"
            ("token_set", process.extractOne(
                name, names_b, scorer=fuzz.token_set_ratio, score_cutoff=threshold
            )),
        ]
        # extractOne returns None when nothing reaches the cutoff.
        hits = [(method, hit) for method, hit in candidates if hit]
        if not hits:
            continue

        # max() keeps the first of equal scores, so token_sort wins ties.
        method_used, best_match = max(hits, key=lambda item: item[1][1])
        matched_name, score, match_idx = best_match
        results.append({
            'Original Name (A)': name,
            'Matched Name (B)': matched_name,
            'Score': score,
            'Method': method_used,
            'Index A': idx,
            'Index B': match_idx
        })

    # Create results DataFrame
    results_df = pd.DataFrame(results)

    if not results_df.empty:
        print(f"\nFound {len(results_df)} matches with score >= {threshold}:")
        print(results_df.to_string(index=False))

        # Save to CSV
        results_df.to_csv(output_file, index=False)
        print(f"\nMatches saved to {output_file}")
    else:
        print("\nNo matches found above the threshold.")

if __name__ == "__main__":
    # Two small in-memory tables: A holds the names to look up, B the candidates.
    data_a = dict(
        id=[1, 2, 3, 4, 5],
        name=['John Smith', 'Jane Doe', 'Robert Johnson', 'Michael Brown', 'Emily Davis'],
    )
    data_b = dict(
        id=[101, 102, 103, 104, 105, 106],
        full_name=['Smith, John', 'J. Doe', 'Bob Johnson', 'Mike Brown', 'Emily J. Davis', 'Unmatched Person'],
    )
    df_a = pd.DataFrame(data_a)
    df_b = pd.DataFrame(data_b)

    parser = argparse.ArgumentParser(description="Fuzzy match names between two inline DataFrames.")
    parser.add_argument("--threshold", type=int, default=80, help="Matching threshold (0-100)")
    parser.add_argument("--output", default="matches.csv", help="Output CSV filename")

    # Inside a Jupyter/IPython kernel sys.argv carries the kernel's own
    # arguments, so an empty list is parsed instead; passing None makes
    # argparse fall back to the real command line.
    args = parser.parse_args([] if 'ipykernel' in sys.modules else None)

    # Match the inline frames on their respective name columns.
    fuzzy_match(
        df_a,
        df_b,
        'name',
        'full_name',
        threshold=args.threshold,
        output_file=args.output,
    )


Matching 5 names from DataFrame A against 6 names from DataFrame B...

Found 3 matches with score >= 80:
Original Name (A) Matched Name (B)      Score    Method  Index A  Index B
       John Smith      Smith, John  95.238095 token_set        0        0
   Robert Johnson      Bob Johnson  80.000000 token_set        2        2
      Emily Davis   Emily J. Davis 100.000000 token_set        4        4

Matches saved to matches.csv


In [1]:
import pandas as pd
from rapidfuzz import process, fuzz

def fuzzy_match(df_a, df_b, col_a, col_b, threshold=80):
    """
    Fuzzy-match each name in ``df_a[col_a]`` against ``df_b[col_b]`` with WRatio.

    Missing values are replaced by empty strings and everything is cast to
    ``str``; blank names in A are skipped. For every remaining name the single
    best candidate reaching ``threshold`` (rapidfuzz ``score_cutoff``) is kept.

    Returns:
        DataFrame with the columns 'Original', 'Matched' and 'Score', one row
        per matched name (an empty DataFrame when nothing matched).
    """
    # Plain Python lists of strings on both sides.
    queries = df_a[col_a].fillna('').astype(str).tolist()
    choices = df_b[col_b].fillna('').astype(str).tolist()

    rows = []
    for query in queries:
        if query.strip():
            # extractOne yields (matched_string, score, index) or None.
            hit = process.extractOne(
                query,
                choices,
                scorer=fuzz.WRatio,
                score_cutoff=threshold,
            )
            if hit:
                matched_text, score = hit[0], hit[1]
                rows.append({'Original': query, 'Matched': matched_text, 'Score': score})

    return pd.DataFrame(rows)

# --- Usage Example ---
if __name__ == "__main__":
    # Names to look up (A) and the candidate pool (B).
    df_a = pd.Series(['John Smith', 'Jane Doe', 'Robert Johnson'], name='name').to_frame()
    df_b = pd.Series(['Smith, John', 'J. Doe', 'Bob Johnson', 'Unrelated'], name='full_name').to_frame()

    matches = fuzzy_match(df_a, df_b, col_a='name', col_b='full_name')

    print(matches)
    # To persist the result: matches.to_csv("matches.csv", index=False)

         Original      Matched     Score
0      John Smith  Smith, John  90.47619
1  Robert Johnson  Bob Johnson  80.00000


In [4]:
import pandas as pd
import re
import unicodedata
import jellyfish
from rapidfuzz import process, fuzz

class SmartMatcher:
    """Name normaliser and scorer combining text similarity with a phonetic bonus."""

    def __init__(self):
        # Tokens dropped during normalisation: company suffixes first,
        # then personal titles and generational suffixes.
        self.noise_words = set(
            "inc corp llc ltd limited company "
            "mr mrs ms dr phd jr sr ii iii".split()
        )

    def normalize(self, text):
        """
        Reduce ``text`` to lowercase ASCII words with noise words removed.

        Non-string input yields "". If every token is a noise word
        (e.g. the input was just "Inc."), the tokens are kept instead of
        returning an empty string.
        """
        if not isinstance(text, str):
            return ""

        # Decompose accented characters and drop whatever is not ASCII
        # (e.g. café -> cafe).
        ascii_only = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

        # Lowercase, then delete everything except letters, digits and whitespace.
        stripped = re.sub(r'[^a-z0-9\s]', '', ascii_only.lower())

        words = stripped.split()
        kept = [word for word in words if word not in self.noise_words]

        # Fall back to the unfiltered words when filtering removed everything.
        return " ".join(kept or words)

    def get_match_score(self, name_a, name_b):
        """
        Return a composite score from text similarity and phonetic similarity.

        The base score is ``fuzz.WRatio`` on the normalised names. When both
        names have the same non-empty Metaphone code, 15 points are added.
        The result is capped at 100. Returns 0 if either name normalises to
        an empty string.
        """
        clean_a, clean_b = self.normalize(name_a), self.normalize(name_b)
        if not (clean_a and clean_b):
            return 0

        # Text similarity of the cleaned names.
        text_score = fuzz.WRatio(clean_a, clean_b)

        # Phonetic check, aimed at pairs like "Stephen"/"Steven" or "Smith"/"Smyth".
        sound_a = jellyfish.metaphone(clean_a)
        sound_b = jellyfish.metaphone(clean_b)
        sounds_identical = bool(sound_a) and bool(sound_b) and sound_a == sound_b

        bonus = 15 if sounds_identical else 0
        return min(text_score + bonus, 100)

def match_datasets(df_a, df_b, col_a, col_b, threshold=85):
    """
    Find the best-scoring name in ``df_b[col_b]`` for each distinct name in ``df_a[col_a]``.

    Names are normalised with ``SmartMatcher.normalize`` and compared with
    ``fuzz.WRatio``. A pair scoring strictly between 60 and 100 receives a
    10-point boost when both names share the same non-empty Metaphone code;
    the boosted score is capped at 100.

    Args:
        df_a: DataFrame with the names to look up.
        df_b: DataFrame with the candidate names.
        col_a: Name column in ``df_a``.
        col_b: Name column in ``df_b``.
        threshold: Minimum score a best match must reach to be reported.

    Returns:
        DataFrame with the columns 'Input Name', 'Best Match' and
        'Confidence Score'. The columns are present even when no row matched.
    """
    matcher = SmartMatcher()

    # Pre-calculate normalized B names as (OriginalName, CleanName) tuples.
    b_lookup = [
        (original, matcher.normalize(original))
        for original in df_b[col_b].dropna().unique()
    ]

    # Metaphone codes are cached per cleaned name: the code of an A name was
    # previously recomputed for every B candidate inside the inner loop.
    phonetic_codes = {}

    def phonetic(clean_name):
        code = phonetic_codes.get(clean_name)
        if code is None:
            code = phonetic_codes[clean_name] = jellyfish.metaphone(clean_name)
        return code

    results = []

    for name_a in df_a[col_a].dropna().unique():
        clean_a = matcher.normalize(name_a)
        if not clean_a:
            # Non-string or symbol-only input has nothing to compare.
            continue

        best_match = None
        best_score = 0

        # Scan B (this can be optimized with blocking for massive datasets).
        for original_b, clean_b in b_lookup:

            # Quick skip: pairs whose cleaned lengths differ by more than
            # 5 characters are never scored.
            if abs(len(clean_a) - len(clean_b)) > 5:
                continue

            score = fuzz.WRatio(clean_a, clean_b)

            # Phonetic boost only for close-but-imperfect text matches.
            if 60 < score < 100:
                phone_a = phonetic(clean_a)
                # Empty codes carry no phonetic information, so two empty
                # codes must not count as "sounding the same".
                if phone_a and phone_a == phonetic(clean_b):
                    # Cap so the reported confidence never exceeds 100.
                    score = min(score + 10, 100)

            if score > best_score:
                best_score = score
                best_match = original_b

        # Require an actual candidate: with threshold <= 0 a name without any
        # scored candidate would otherwise be reported with a None match.
        if best_match is not None and best_score >= threshold:
            results.append({
                'Input Name': name_a,
                'Best Match': best_match,
                'Confidence Score': best_score
            })

    return pd.DataFrame(results, columns=['Input Name', 'Best Match', 'Confidence Score'])

# --- Usage ---
if __name__ == "__main__":
    # Incoming user names and the reference names they are matched against.
    df_users = pd.Series(['Cathy Smith', 'Johnathan Doe', 'Apple Inc.'], name='user').to_frame()
    df_db = pd.Series(['Kathy Smyth', 'John Doe', 'Apple'], name='db_name').to_frame()

    matches = match_datasets(df_users, df_db, col_a='user', col_b='db_name')
    print(matches)

      Input Name   Best Match  Confidence Score
0    Cathy Smith  Kathy Smyth         91.818182
1  Johnathan Doe     John Doe         85.500000
2     Apple Inc.        Apple        100.000000


In [None]:
import pandas as pd
import recordlinkage
import jellyfish

# 1. Dummy data for both sides of the linkage (scale this to 100k+ rows).
#    Each table is keyed by its 'id' column.
data_a = dict(
    id=[1, 2, 3, 4],
    name=['Jonathon Smith', 'Cathy E. Jones', 'Robert White', 'William Black'],
    city=['New York', 'Los Angeles', 'Chicago', 'Houston'],
)

data_b = dict(
    id=[101, 102, 103, 104],
    name=['Johnathan Smyth', 'Kathy Jones', 'Bob White', 'Bill Black'],
    city=['New York', 'Los Angeles', 'Chicago', 'Houston'],
)

df_a = pd.DataFrame.from_dict(data_a).set_index('id')
df_b = pd.DataFrame.from_dict(data_b).set_index('id')

# --- STEP 1: Feature Engineering (The "Hedge") ---
# We create a 'blocking_key' based on how the name SOUNDS.
# This ensures 'Smith' and 'Smyth' end up in the same bucket.

def get_phonetic_key(text):
    """Return the Metaphone code of ``text``; any non-string input yields ""."""
    # Metaphone is great for English names
    return jellyfish.metaphone(text) if isinstance(text, str) else ""

# Apply to both dataframes
# We focus on the last name for blocking (usually more stable)

def _surname_key(full_name):
    """Phonetic key of the last whitespace-separated token, or None if there is none."""
    # Guard here, before .split(): a NaN or blank name would otherwise raise
    # AttributeError / IndexError before get_phonetic_key's own check runs.
    if not isinstance(full_name, str):
        return None
    tokens = full_name.split()
    if not tokens:
        return None
    # Blank keys are stored as missing values so unnamed rows are not all
    # placed in one block.
    # NOTE(review): relies on recordlinkage's Block index skipping missing
    # keys -- confirm for the installed version.
    return get_phonetic_key(tokens[-1]) or None

df_a['phonetic_key'] = df_a['name'].apply(_surname_key)
df_b['phonetic_key'] = df_b['name'].apply(_surname_key)

print("Blocking Keys Created (A):")
print(df_a[['name', 'phonetic_key']])

# --- STEP 2: Indexing (The "Blocker") ---
indexer = recordlinkage.Index()

# BLOCK: Only compare rows where 'phonetic_key' is identical.
# This reduces comparisons from N*M to a tiny fraction.
indexer.block('phonetic_key')

# Generate candidate pairs
candidate_links = indexer.index(df_a, df_b)

print(f"\nPairs to compare: {len(candidate_links)} (instead of {len(df_a)*len(df_b)})")

# --- STEP 3: Comparison Logic ---
compare_cl = recordlinkage.Compare()

# Exact match on city (High confidence feature)
compare_cl.exact('city', 'city', label='city_match')

# Fuzzy match on full name (The nuanced check)
# 'jarowinkler' is often faster/better for names than Levenshtein.
# No `threshold` is passed here: with a threshold the comparison is reduced to
# 0/1, which makes the two-tier rule in step 4 impossible to express.
compare_cl.string('name', 'name', method='jarowinkler', label='name_score')

# Compute features for candidate pairs
features = compare_cl.compute(candidate_links, df_a, df_b)

# --- STEP 4: Scoring & Filtering ---
# Simple rule: Must have name match >= 0.85 OR (Name >= 0.7 AND City Match)
NAME_STRONG = 0.85
NAME_WEAK = 0.70

strong_name = features['name_score'] >= NAME_STRONG
weak_name_same_city = (features['name_score'] >= NAME_WEAK) & (features['city_match'] == 1)
matches = features[strong_name | weak_name_same_city]

# Add the original names back for readability
results = matches.copy()
ids_a = results.index.get_level_values(0)  # index level 0 -> rows of df_a
ids_b = results.index.get_level_values(1)  # index level 1 -> rows of df_b
results['Name A'] = df_a.loc[ids_a, 'name'].to_numpy()
results['Name B'] = df_b.loc[ids_b, 'name'].to_numpy()

print("\n--- Final Matches ---")
print(results[['Name A', 'Name B', 'name_score', 'city_match']])