### Evaluate traditional methods:

1. Implement and test Levenshtein distance, N-gram similarity, and Jaro-Winkler similarity on a diverse dataset of business name pairs.
Analyze their performance, strengths, and weaknesses.


2. Explore modern NLP approaches:

- Utilize pre-trained transformer models (e.g., BERT, RoBERTa) for semantic similarity matching.
- Investigate OpenAI's embedding models for business name representation.


3. Leverage LLM reasoning:

- Design prompts for LLMs (like GPT-3 or GPT-4) to perform business name matching.
- Explore few-shot learning techniques to improve performance.
- Analyze the LLM's ability to handle complex cases that traditional methods struggle with.


4. Comparative analysis:

- Create a comprehensive evaluation framework to compare all methods fairly.
- Assess performance metrics such as accuracy, precision, recall, and F1-score.
- Analyze computational efficiency and scalability of each approach.

In [3]:
business_pairs = [
    ("Apple Inc.", "Apple Incorporated"),
    ("Microsoft Corporation", "Microsoft Corp."),
    ("International Business Machines", "IBM"),
    ("McDonald's", "McDonalds"),
    ("Walmart Inc.", "Wal-Mart Stores Inc"),
    ("Amazon.com, Inc.", "Amazon"),
    ("The Coca-Cola Company", "Coca Cola Co"),
    ("Johnson & Johnson", "Johnson and Johnson"),
    ("Procter & Gamble", "P&G Company"),
    ("General Electric Company", "GE")
]

In [1]:
from Levenshtein import distance as levenshtein_distance
from nltk.util import ngrams
from jellyfish import jaro_winkler_similarity

def levenshtein_similarity(s1, s2):
    return 1 - levenshtein_distance(s1, s2) / max(len(s1), len(s2))

def ngram_similarity(s1, s2, n=2):
    s1_ngrams = set(''.join(ng) for ng in ngrams(s1, n))
    s2_ngrams = set(''.join(ng) for ng in ngrams(s2, n))
    return len(s1_ngrams.intersection(s2_ngrams)) / len(s1_ngrams.union(s2_ngrams))

# Jaro-Winkler similarity is already implemented in the jellyfish library

In [7]:
def evaluate_methods(pairs):
    results = []
    for method in [levenshtein_similarity, ngram_similarity, jaro_winkler_similarity]:
        method_name = method.__name__
        for pair in pairs:
            similarity = method(pair[0].lower(), pair[1].lower())
            results.append((method_name, pair[0], pair[1], similarity))
    return results

evaluation_results = evaluate_methods(business_pairs)

# Print results
for method, name1, name2, similarity in evaluation_results:
    print(f"{method}: {name1} vs {name2} = {similarity:.4f}")

levenshtein_similarity: Apple Inc. vs Apple Incorporated = 0.5000
levenshtein_similarity: Microsoft Corporation vs Microsoft Corp. = 0.6667
levenshtein_similarity: International Business Machines vs IBM = 0.0968
levenshtein_similarity: McDonald's vs McDonalds = 0.9000
levenshtein_similarity: Walmart Inc. vs Wal-Mart Stores Inc = 0.5263
levenshtein_similarity: Amazon.com, Inc. vs Amazon = 0.3750
levenshtein_similarity: The Coca-Cola Company vs Coca Cola Co = 0.5238
levenshtein_similarity: Johnson & Johnson vs Johnson and Johnson = 0.8421
levenshtein_similarity: Procter & Gamble vs P&G Company = 0.1250
levenshtein_similarity: General Electric Company vs GE = 0.0833
ngram_similarity: Apple Inc. vs Apple Incorporated = 0.4706
ngram_similarity: Microsoft Corporation vs Microsoft Corp. = 0.6500
ngram_similarity: International Business Machines vs IBM = 0.0000
ngram_similarity: McDonald's vs McDonalds = 0.7000
ngram_similarity: Walmart Inc. vs Wal-Mart Stores Inc = 0.4500
ngram_similarity: Am

In [8]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load a pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

def embedding_similarity(s1, s2):
    # Generate embeddings
    embeddings = model.encode([s1, s2])
    
    # Calculate cosine similarity
    similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    
    return similarity

# Function to evaluate the method
def evaluate_embedding_method(pairs):
    results = []
    for pair in pairs:
        similarity = embedding_similarity(pair[0], pair[1])
        results.append(("Embedding Similarity", pair[0], pair[1], similarity))
    return results

# Evaluate the method
embedding_results = evaluate_embedding_method(business_pairs)

# Print results
for method, name1, name2, similarity in embedding_results:
    print(f"{method}: {name1} vs {name2} = {similarity:.4f}")

  from tqdm.autonotebook import tqdm, trange


Embedding Similarity: Apple Inc. vs Apple Incorporated = 0.8850
Embedding Similarity: Microsoft Corporation vs Microsoft Corp. = 0.9297
Embedding Similarity: International Business Machines vs IBM = 0.5283
Embedding Similarity: McDonald's vs McDonalds = 0.9524
Embedding Similarity: Walmart Inc. vs Wal-Mart Stores Inc = 0.9239
Embedding Similarity: Amazon.com, Inc. vs Amazon = 0.8189
Embedding Similarity: The Coca-Cola Company vs Coca Cola Co = 0.8977
Embedding Similarity: Johnson & Johnson vs Johnson and Johnson = 0.9458
Embedding Similarity: Procter & Gamble vs P&G Company = 0.4062
Embedding Similarity: General Electric Company vs GE = 0.4141


In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load pre-trained model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

def transformer_classification(s1, s2):
    # Prepare input
    inputs = tokenizer(s1, s2, return_tensors="pt", padding=True, truncation=True)
    
    # Get model output
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get prediction (0 for not match, 1 for match)
    prediction = outputs.logits.argmax().item()
    
    return prediction

# Function to evaluate the method
def evaluate_transformer_method(pairs):
    results = []
    for pair in pairs:
        prediction = transformer_classification(pair[0], pair[1])
        results.append(("Transformer Classification", pair[0], pair[1], prediction))
    return results

# Evaluate the method
transformer_results = evaluate_transformer_method(business_pairs)

# Print results
for method, name1, name2, prediction in transformer_results:
    print(f"{method}: {name1} vs {name2} = {'Match' if prediction == 1 else 'No Match'}")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Transformer Classification: Apple Inc. vs Apple Incorporated = Match
Transformer Classification: Microsoft Corporation vs Microsoft Corp. = Match
Transformer Classification: International Business Machines vs IBM = Match
Transformer Classification: McDonald's vs McDonalds = Match
Transformer Classification: Walmart Inc. vs Wal-Mart Stores Inc = Match
Transformer Classification: Amazon.com, Inc. vs Amazon = Match
Transformer Classification: The Coca-Cola Company vs Coca Cola Co = Match
Transformer Classification: Johnson & Johnson vs Johnson and Johnson = Match
Transformer Classification: Procter & Gamble vs P&G Company = Match
Transformer Classification: General Electric Company vs GE = Match


Let's explore how we can leverage Large Language Models (LLMs) like GPT-3 or GPT-4 for business name matching. LLMs offer powerful reasoning capabilities that can potentially handle complex matching scenarios more effectively than traditional or even modern embedding-based methods.
Here's how we can approach using LLMs for this task:

Prompt Engineering:
We'll need to design effective prompts that instruct the LLM to perform business name matching. Here's an example prompt:

You are an expert system for matching business names. Given two business names, your task is to determine if they refer to the same company. Consider variations in spelling, abbreviations, legal suffixes, and word order. Respond with 'Match' if the names likely refer to the same company, or 'No Match' if they likely refer to different companies. Also provide a brief explanation for your decision.

Business Name 1: {name1}
Business Name 2: {name2}

Are these names a match?

**API Integration:**
We'll need to integrate with an LLM API. For this example, let's assume we're using OpenAI's GPT-3.5 or GPT-4 API. Here's a Python function to interact with the API:



SyntaxError: invalid syntax (345355743.py, line 1)

In [13]:
from openai import OpenAI
import os

client = OpenAI()

def llm_match(name1, name2):
    prompt = f"""
    You are an expert system for matching business names. Given two business names, your task is to determine if they refer to the same company. Consider variations in spelling, abbreviations, legal suffixes, and word order. Respond with 'Match' if the names likely refer to the same company, or 'No Match' if they likely refer to different companies. Also provide a brief explanation for your decision.

    Business Name 1: {name1}
    Business Name 2: {name2}

    Are these names a match?
    """

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",  # or "gpt-4" if available
        messages=[
            {"role": "system", "content": "You are an expert in business name matching."},
            {"role": "user", "content": prompt}
        ]
    )

    return response.choices[0].message.content

# Example usage
try:
    result = llm_match("Apple Inc.", "Apple Incorporated")
    print(result)
except Exception as e:
    print(f"An error occurred: {e}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Match

Explanation: The two business names are a match as they refer to the same company, Apple Inc., even though one uses "Inc." and the other uses "Incorporated". The main identifying word "Apple" remains the same.


In [15]:
import pandas as pd
from Levenshtein import distance as levenshtein_distance
from nltk.util import ngrams
from jellyfish import jaro_winkler_similarity
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI
import os

# 1. Set up the dataset
businessNames1 = ["HANAN ATHER TRUCKING"]
businessNames2 = [
    "HANAN TAHER TRUCKING",
    "TRUCKING INC HANAN ATHER",
    "ATHER TRUCKING INC",
    "GODBOUT TRUCKING INC",
    "HANAN ATHER PHARMACY INC",
    "Ather INC"
]

# 2. Implement traditional methods
def levenshtein_similarity(s1, s2):
    return 1 - levenshtein_distance(s1.lower(), s2.lower()) / max(len(s1), len(s2))

def ngram_similarity(s1, s2, n=2):
    s1_ngrams = set(''.join(ng) for ng in ngrams(s1.lower(), n))
    s2_ngrams = set(''.join(ng) for ng in ngrams(s2.lower(), n))
    return len(s1_ngrams.intersection(s2_ngrams)) / len(s1_ngrams.union(s2_ngrams))

# Jaro-Winkler similarity is already implemented in the jellyfish library

# 3. Implement modern NLP method
model = SentenceTransformer('all-MiniLM-L6-v2')

def embedding_similarity(s1, s2):
    embeddings = model.encode([s1, s2])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

# 4. Implement LLM-based method
client = OpenAI()

def llm_match_with_score(name1, name2):
    prompt = f"""
    You are an expert system for matching business names. Given two business names, 
    determine if they refer to the same company. 
    Consider variations in spelling, abbreviations, legal suffixes, and word order.

    Respond with a confidence score between 0 and 1, where:
    0 means definitely not a match
    1 means definitely a match

    Provide your response in this format:
    Score: [Your score between 0 and 1]
    Explanation: [Brief explanation for your score]

    Business Name 1: {name1}
    Business Name 2: {name2}

    What is your confidence score for these names matching?
    """
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are an expert in business name matching."},
                {"role": "user", "content": prompt}
            ]
        )
        content = response.choices[0].message.content.strip()
        score_line = content.split('\n')[0]
        score = float(score_line.split(':')[1].strip())
        return score
    except Exception as e:
        return f"Error: {str(e)}"

def llm_match_with_category(name1, name2):
    prompt = f"""
    You are an expert system for matching business names. 
    Given two business names, determine if they refer to the same company.
    Consider variations in spelling, abbreviations, legal suffixes, and word order.

    Categorize your confidence in the match using one of these categories:
    - Definite Match
    - Likely Match
    - Possible Match
    - Unlikely Match
    - Definite Non-Match

    Provide your response in this format:
    Category: [Your chosen category]
    Explanation: [Brief explanation for your category choice]

    Business Name 1: {name1}
    Business Name 2: {name2}

    What is your confidence category for these names matching?
    """
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are an expert in business name matching."},
                {"role": "user", "content": prompt}
            ]
        )
        content = response.choices[0].message.content.strip()
        category_line = content.split('\n')[0]
        category = category_line.split(':')[1].strip()
        return category
    except Exception as e:
        return f"Error: {str(e)}"

# Update the compare_methods function
def compare_methods(name1, name2):
    return {
        'Levenshtein': levenshtein_similarity(name1, name2),
        'N-gram': ngram_similarity(name1, name2),
        'Jaro-Winkler': jaro_winkler_similarity(name1.lower(), name2.lower()),
        'Embedding': embedding_similarity(name1, name2),
        'LLM Score': llm_match_with_score(name1, name2),
        'LLM Category': llm_match_with_category(name1, name2)
    }

# Run comparisons and create a dataframe
results = []
for name2 in businessNames2:
    result = compare_methods(businessNames1[0], name2)
    result['Name 1'] = businessNames1[0]
    result['Name 2'] = name2
    results.append(result)

df = pd.DataFrame(results)

# Reorder columns
columns_order = ['Name 1', 'Name 2', 'Levenshtein', 'N-gram', 'Jaro-Winkler', 'Embedding', 'LLM Score', 'LLM Category']
df = df[columns_order]

# Display the results
print(df.to_string(index=False))

              Name 1                   Name 2  Levenshtein   N-gram  Jaro-Winkler  Embedding  LLM Score   LLM Category
HANAN ATHER TRUCKING     HANAN TAHER TRUCKING     0.900000 0.750000      0.990000   0.929135        0.8   Likely Match
HANAN ATHER TRUCKING TRUCKING INC HANAN ATHER     0.083333 0.695652      0.618254   0.875217        1.0   Likely Match
HANAN ATHER TRUCKING       ATHER TRUCKING INC     0.500000 0.619048      0.729630   0.642829        0.7   Likely Match
HANAN ATHER TRUCKING     GODBOUT TRUCKING INC     0.300000 0.285714      0.548485   0.465248        0.2 Unlikely Match
HANAN ATHER TRUCKING HANAN ATHER PHARMACY INC     0.666667 0.392857      0.893333   0.561208        0.0 Unlikely Match
HANAN ATHER TRUCKING                Ather INC     0.400000 0.300000      0.637963   0.357234        0.2 Unlikely Match


# Bayesian Record Linkage Algorithm

## Step-by-Step Instructions

### **Step 1: Define the Prior**
- Use the similarity scores (e.g., `Levenshtein`, `N-gram`, `Jaro-Winkler`) to compute the prior probability $P(H = \text{Match})$.
- Compute the prior for $P(H = \text{Non-match})$ as:
  $$
  P(H = \text{Non-match}) = 1 - P(H = \text{Match})
  $$
- If combining multiple similarity scores, compute $P(H = \text{Match})$ using a weighted average:
  $$
  P(H = \text{Match}) = w_1 \cdot \text{Levenshtein} + w_2 \cdot \text{N-gram} + w_3 \cdot \text{Jaro-Winkler}
  $$

### **Step 2: Define the Likelihoods**
- Assume the LLM’s accuracy is parameterized by $\theta$:
  $$
  P(D = \text{Match} \mid H = \text{Match}) = \theta, \quad P(D = \text{Non-match} \mid H = \text{Match}) = 1 - \theta
  $$
  $$
  P(D = \text{Match} \mid H = \text{Non-match}) = 1 - \theta, \quad P(D = \text{Non-match} \mid H = \text{Non-match}) = \theta
  $$

### **Step 3: Normalization**
#### **Step 3.1: Compute $P(D = \text{Match})$**
- Compute the marginal probability of observing $D = \text{Match}$:
  $$
  P(D = \text{Match}) = P(D = \text{Match} \mid H = \text{Match}) \cdot P(H = \text{Match}) + P(D = \text{Match} \mid H = \text{Non-match}) \cdot P(H = \text{Non-match})
  $$

#### **Step 3.2: Compute $P(D = \text{Non-match})$**
- Compute the marginal probability of observing $D = \text{Non-match}$:
  $$
  P(D = \text{Non-match}) = P(D = \text{Non-match} \mid H = \text{Match}) \cdot P(H = \text{Match}) + P(D = \text{Non-match} \mid H = \text{Non-match}) \cdot P(H = \text{Non-match})
  $$

### **Step 4: Compute the Final Posterior Probability**
#### **Step 4.1: Compute $P(H = \text{Match} \mid D)$**
- For each record:
  - If $D = \text{Match}$:
    $$
    P(H = \text{Match} \mid D = \text{Match}) = \frac{P(D = \text{Match} \mid H = \text{Match}) \cdot P(H = \text{Match})}{P(D = \text{Match})}
    $$
  - If $D = \text{Non-match}$:
    $$
    P(H = \text{Match} \mid D = \text{Non-match}) = \frac{P(D = \text{Non-match} \mid H = \text{Match}) \cdot P(H = \text{Match})}{P(D = \text{Non-match})}
    $$

- Add the final posterior probability as a new column $P(H = \text{Match} \mid D)$ in the dataset.

---

### Summary of Steps:
1. Compute priors $P(H = \text{Match})$ and $P(H = \text{Non-match})$.
2. Define likelihoods $P(D = \text{Match} \mid H)$ and $P(D = \text{Non-match} \mid H)$.
3. Normalize the evidence by calculating $P(D = \text{Match})$ and $P(D = \text{Non-match})$.
4. Compute the final posterior $P(H = \text{Match} \mid D)$ for each record based on its observed $D$.


In [21]:
import pandas as pd

# Creating the dataset from the provided data
data = {
    "Name 1": [
        "HANAN ATHER TRUCKING",
        "HANAN ATHER TRUCKING TRUCKING INC HANAN ATHER",
        "HANAN ATHER TRUCKING",
        "HANAN ATHER TRUCKING",
        "HANAN ATHER TRUCKING HANAN ATHER PHARMACY INC",
        "HANAN ATHER TRUCKING"
    ],
    "Name 2": [
        "HANAN TAHER TRUCKING",
        "TRUCKING INC HANAN ATHER",
        "ATHER TRUCKING INC",
        "GODBOUT TRUCKING INC",
        "HANAN ATHER PHARMACY INC",
        "Ather INC"
    ],
    "Levenshtein": [0.900000, 0.083333, 0.500000, 0.300000, 0.666667, 0.400000],
    "N-gram": [0.750000, 0.695652, 0.619048, 0.285714, 0.392857, 0.300000],
    "Jaro-Winkler": [0.990000, 0.618254, 0.729630, 0.548485, 0.893333, 0.637963],
    "LLM Score": [0.8, 1.0, 0.7, 0.2, 0.0, 0.2],
    "LLM Category": [
        "Likely Match",
        "Likely Match",
        "Likely Match",
        "Unlikely Match",
        "Unlikely Match",
        "Unlikely Match"
    ]
}

# Converting to DataFrame
df = pd.DataFrame(data)
df


Unnamed: 0,Name 1,Name 2,Levenshtein,N-gram,Jaro-Winkler,LLM Score,LLM Category
0,HANAN ATHER TRUCKING,HANAN TAHER TRUCKING,0.9,0.75,0.99,0.8,Likely Match
1,HANAN ATHER TRUCKING TRUCKING INC HANAN ATHER,TRUCKING INC HANAN ATHER,0.083333,0.695652,0.618254,1.0,Likely Match
2,HANAN ATHER TRUCKING,ATHER TRUCKING INC,0.5,0.619048,0.72963,0.7,Likely Match
3,HANAN ATHER TRUCKING,GODBOUT TRUCKING INC,0.3,0.285714,0.548485,0.2,Unlikely Match
4,HANAN ATHER TRUCKING HANAN ATHER PHARMACY INC,HANAN ATHER PHARMACY INC,0.666667,0.392857,0.893333,0.0,Unlikely Match
5,HANAN ATHER TRUCKING,Ather INC,0.4,0.3,0.637963,0.2,Unlikely Match


In [22]:
# Redefine likelihoods based on the correct interpretation
# "Unlikely Match" corresponds to D = Non-match
df["D"] = df["LLM Category"].apply(lambda x: "Non-match" if x == "Unlikely Match" else "Match")

# Reset all calculations based on this redefinition

# Step 1: Define the Prior
weights = {"Levenshtein": 0.4, "N-gram": 0.3, "Jaro-Winkler": 0.3}
df["P(H=Match)"] = (
    weights["Levenshtein"] * df["Levenshtein"]
    + weights["N-gram"] * df["N-gram"]
    + weights["Jaro-Winkler"] * df["Jaro-Winkler"]
)
df["P(H=Non-match)"] = 1 - df["P(H=Match)"]
df

Unnamed: 0,Name 1,Name 2,Levenshtein,N-gram,Jaro-Winkler,LLM Score,LLM Category,D,P(H=Match),P(H=Non-match)
0,HANAN ATHER TRUCKING,HANAN TAHER TRUCKING,0.9,0.75,0.99,0.8,Likely Match,Match,0.882,0.118
1,HANAN ATHER TRUCKING TRUCKING INC HANAN ATHER,TRUCKING INC HANAN ATHER,0.083333,0.695652,0.618254,1.0,Likely Match,Match,0.427505,0.572495
2,HANAN ATHER TRUCKING,ATHER TRUCKING INC,0.5,0.619048,0.72963,0.7,Likely Match,Match,0.604603,0.395397
3,HANAN ATHER TRUCKING,GODBOUT TRUCKING INC,0.3,0.285714,0.548485,0.2,Unlikely Match,Non-match,0.37026,0.62974
4,HANAN ATHER TRUCKING HANAN ATHER PHARMACY INC,HANAN ATHER PHARMACY INC,0.666667,0.392857,0.893333,0.0,Unlikely Match,Non-match,0.652524,0.347476
5,HANAN ATHER TRUCKING,Ather INC,0.4,0.3,0.637963,0.2,Unlikely Match,Non-match,0.441389,0.558611


In [23]:
# Define the LLM accuracy (theta)
theta = 0.9
# Step 2: Likelihoods
theta = 0.9  # LLM accuracy
df["P(D=Match|H=Match)"] = theta
df["P(D=Non-match|H=Match)"] = 1 - theta
df["P(D=Match|H=Non-match)"] = 1 - theta
df["P(D=Non-match|H=Non-match)"] = theta

# Display the updated table with likelihoods
df


Unnamed: 0,Name 1,Name 2,Levenshtein,N-gram,Jaro-Winkler,LLM Score,LLM Category,D,P(H=Match),P(H=Non-match),P(D=Match|H=Match),P(D=Non-match|H=Match),P(D=Match|H=Non-match),P(D=Non-match|H=Non-match)
0,HANAN ATHER TRUCKING,HANAN TAHER TRUCKING,0.9,0.75,0.99,0.8,Likely Match,Match,0.882,0.118,0.9,0.1,0.1,0.9
1,HANAN ATHER TRUCKING TRUCKING INC HANAN ATHER,TRUCKING INC HANAN ATHER,0.083333,0.695652,0.618254,1.0,Likely Match,Match,0.427505,0.572495,0.9,0.1,0.1,0.9
2,HANAN ATHER TRUCKING,ATHER TRUCKING INC,0.5,0.619048,0.72963,0.7,Likely Match,Match,0.604603,0.395397,0.9,0.1,0.1,0.9
3,HANAN ATHER TRUCKING,GODBOUT TRUCKING INC,0.3,0.285714,0.548485,0.2,Unlikely Match,Non-match,0.37026,0.62974,0.9,0.1,0.1,0.9
4,HANAN ATHER TRUCKING HANAN ATHER PHARMACY INC,HANAN ATHER PHARMACY INC,0.666667,0.392857,0.893333,0.0,Unlikely Match,Non-match,0.652524,0.347476,0.9,0.1,0.1,0.9
5,HANAN ATHER TRUCKING,Ather INC,0.4,0.3,0.637963,0.2,Unlikely Match,Non-match,0.441389,0.558611,0.9,0.1,0.1,0.9


In [24]:
# Step 3.1: Compute P(D=Match)
df["P(D=Match)"] = (
    df["P(D=Match|H=Match)"] * df["P(H=Match)"] +
    df["P(D=Match|H=Non-match)"] * df["P(H=Non-match)"]
)

# Step 3.2: Compute P(D=Non-match)
df["P(D=Non-match)"] = (
    df["P(D=Non-match|H=Match)"] * df["P(H=Match)"] +
    df["P(D=Non-match|H=Non-match)"] * df["P(H=Non-match)"]
)

# Display the updated table with the normalization column
df


Unnamed: 0,Name 1,Name 2,Levenshtein,N-gram,Jaro-Winkler,LLM Score,LLM Category,D,P(H=Match),P(H=Non-match),P(D=Match|H=Match),P(D=Non-match|H=Match),P(D=Match|H=Non-match),P(D=Non-match|H=Non-match),P(D=Match),P(D=Non-match)
0,HANAN ATHER TRUCKING,HANAN TAHER TRUCKING,0.9,0.75,0.99,0.8,Likely Match,Match,0.882,0.118,0.9,0.1,0.1,0.9,0.8056,0.1944
1,HANAN ATHER TRUCKING TRUCKING INC HANAN ATHER,TRUCKING INC HANAN ATHER,0.083333,0.695652,0.618254,1.0,Likely Match,Match,0.427505,0.572495,0.9,0.1,0.1,0.9,0.442004,0.557996
2,HANAN ATHER TRUCKING,ATHER TRUCKING INC,0.5,0.619048,0.72963,0.7,Likely Match,Match,0.604603,0.395397,0.9,0.1,0.1,0.9,0.583683,0.416317
3,HANAN ATHER TRUCKING,GODBOUT TRUCKING INC,0.3,0.285714,0.548485,0.2,Unlikely Match,Non-match,0.37026,0.62974,0.9,0.1,0.1,0.9,0.396208,0.603792
4,HANAN ATHER TRUCKING HANAN ATHER PHARMACY INC,HANAN ATHER PHARMACY INC,0.666667,0.392857,0.893333,0.0,Unlikely Match,Non-match,0.652524,0.347476,0.9,0.1,0.1,0.9,0.622019,0.377981
5,HANAN ATHER TRUCKING,Ather INC,0.4,0.3,0.637963,0.2,Unlikely Match,Non-match,0.441389,0.558611,0.9,0.1,0.1,0.9,0.453111,0.546889


In [25]:
# Step 4: Compute the final posterior probability based on D
df["P(H=Match|D)"] = 0.0

for index, row in df.iterrows():
    if row["D"] == "Match":
        # Use P(H=Match|D=Match)
        posterior = (
            row["P(D=Match|H=Match)"] * row["P(H=Match)"] / row["P(D=Match)"]
        )
    else:
        # Use P(H=Match|D=Non-match)
        posterior = (
            row["P(D=Non-match|H=Match)"] * row["P(H=Match)"] / row["P(D=Non-match)"]
        )
    df.at[index, "P(H=Match|D)"] = posterior

# Display the updated table
display_columns = [
    "Name 1", "Name 2", "D", "P(H=Match)", "P(H=Non-match)",
    "P(H=Match|D)"
]
df


Unnamed: 0,Name 1,Name 2,Levenshtein,N-gram,Jaro-Winkler,LLM Score,LLM Category,D,P(H=Match),P(H=Non-match),P(D=Match|H=Match),P(D=Non-match|H=Match),P(D=Match|H=Non-match),P(D=Non-match|H=Non-match),P(D=Match),P(D=Non-match),P(H=Match|D)
0,HANAN ATHER TRUCKING,HANAN TAHER TRUCKING,0.9,0.75,0.99,0.8,Likely Match,Match,0.882,0.118,0.9,0.1,0.1,0.9,0.8056,0.1944,0.985353
1,HANAN ATHER TRUCKING TRUCKING INC HANAN ATHER,TRUCKING INC HANAN ATHER,0.083333,0.695652,0.618254,1.0,Likely Match,Match,0.427505,0.572495,0.9,0.1,0.1,0.9,0.442004,0.557996,0.870477
2,HANAN ATHER TRUCKING,ATHER TRUCKING INC,0.5,0.619048,0.72963,0.7,Likely Match,Match,0.604603,0.395397,0.9,0.1,0.1,0.9,0.583683,0.416317,0.932258
3,HANAN ATHER TRUCKING,GODBOUT TRUCKING INC,0.3,0.285714,0.548485,0.2,Unlikely Match,Non-match,0.37026,0.62974,0.9,0.1,0.1,0.9,0.396208,0.603792,0.061322
4,HANAN ATHER TRUCKING HANAN ATHER PHARMACY INC,HANAN ATHER PHARMACY INC,0.666667,0.392857,0.893333,0.0,Unlikely Match,Non-match,0.652524,0.347476,0.9,0.1,0.1,0.9,0.622019,0.377981,0.172634
5,HANAN ATHER TRUCKING,Ather INC,0.4,0.3,0.637963,0.2,Unlikely Match,Non-match,0.441389,0.558611,0.9,0.1,0.1,0.9,0.453111,0.546889,0.080709
