## Imports and Utility Functions

In [2]:
import Levenshtein
from nltk.util import ngrams
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import AutoTokenizer, AutoModel, AdamW
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split

# Function to compute Levenshtein distance
def levenshtein_distance(str1, str2):
    distance = Levenshtein.distance(str1, str2)
    max_len = max(len(str1), len(str2))
    return 1 - (distance / max_len)

# Function to compute N-gram similarity
def n_gram_similarity(str1, str2, n=3):
    str1_ngrams = set(ngrams(str1, n))
    str2_ngrams = set(ngrams(str2, n))
    return len(str1_ngrams & str2_ngrams) / float(len(str1_ngrams | str2_ngrams))

# Function to compute Jaro-Winkler similarity
def jaro_winkler_similarity(str1, str2):
    return Levenshtein.jaro_winkler(str1, str2)


## Dataset Preparation

In [3]:
# Dataset class for business names
class BusinessNamesDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=128):
        self.data = pd.read_csv(file_path)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        name1 = self.data.iloc[idx]['name1']
        name2 = self.data.iloc[idx]['name2']
        label = self.data.iloc[idx]['label']
        inputs = self.tokenizer(name1, name2, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)
        return input_ids, attention_mask, torch.tensor(label, dtype=torch.float)


## Model Definition

In [4]:
# Model class for business names similarity
class BusinessNamesModel(torch.nn.Module):
    def __init__(self, model_name):
        super(BusinessNamesModel, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.similarity = torch.nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state.mean(dim=1)  # Pool the last hidden state
        similarity_score = self.similarity(pooled_output)
        return similarity_score


## Training the Model

In [5]:
# Training the model
def train_model(model, train_dataloader, val_dataloader=None, num_epochs=3, learning_rate=2e-5):
    criterion = torch.nn.BCEWithLogitsLoss()
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs.squeeze(-1), labels.float())
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_dataloader)}')

        # Validation
        if val_dataloader:
            model.eval()
            val_loss = 0
            with torch.no_grad():
                for batch in val_dataloader:
                    input_ids, attention_mask, labels = batch
                    input_ids = input_ids.to(device)
                    attention_mask = attention_mask.to(device)
                    labels = labels.to(device)
                    
                    outputs = model(input_ids, attention_mask)
                    loss = criterion(outputs.squeeze(-1), labels.float())
                    val_loss += loss.item()

            print(f'Epoch {epoch+1}, Validation Loss: {val_loss/len(val_dataloader)}')


## Pre-trained Embeddings

In [13]:
# Class for pre-trained embeddings
class PreTrainedEmbedding:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def get_embedding(self, text):
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

## Fine-Tuned Embedding Class

In [14]:
# Function to get fine-tuned embeddings
class FineTunedEmbedding:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def get_embedding(self, text):
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model.bert(**inputs)
        return outputs.last_hidden_state.mean(dim=1).cpu().numpy()


## Comparing Business Names

In [15]:
# Function to compare different algorithms
def compare_algorithms(records, pre_trained_embedding, fine_tuned_embedding):
    result = []
    for i in range(len(records)):
        for j in range(i + 1, len(records)):
            name1 = records[i]
            name2 = records[j]
            
            lev_dist = levenshtein_distance(name1, name2)
            ngram_sim = n_gram_similarity(name1, name2)
            jw_sim = jaro_winkler_similarity(name1, name2)

            pre_emb1 = pre_trained_embedding.get_embedding(name1)
            pre_emb2 = pre_trained_embedding.get_embedding(name2)
            pre_embedding_sim = cosine_similarity(pre_emb1, pre_emb2)[0, 0]

            fine_emb1 = fine_tuned_embedding.get_embedding(name1)
            fine_emb2 = fine_tuned_embedding.get_embedding(name2)
            fine_embedding_sim = cosine_similarity(fine_emb1, fine_emb2)[0, 0]

            result.append({
                "Record 1": name1,
                "Record 2": name2,
                "Levenshtein Distance": lev_dist,
                "N-Gram Similarity": ngram_sim,
                "Jaro-Winkler Similarity": jw_sim,
                "Pre-trained Embedding Similarity": pre_embedding_sim,
                "Fine-tuned Embedding Similarity": fine_embedding_sim
            })
    return pd.DataFrame(result)

## Main Execution

In [16]:
# Main execution
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Load pre-trained BERT model
pre_trained_model = AutoModel.from_pretrained("distilbert-base-uncased")
pre_trained_embedding = PreTrainedEmbedding(pre_trained_model, tokenizer)

# Load dataset
dataset = BusinessNamesDataset('test_business_names.csv', tokenizer)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Initialize and train model
fine_tuned_model = BusinessNamesModel("distilbert-base-uncased")
train_model(fine_tuned_model, train_dataloader, val_dataloader)

# Initialize fine-tuned embedding class
fine_tuned_embedding = FineTunedEmbedding(fine_tuned_model, tokenizer)


# Define business names to compare
records = [
    "HANAN TAHER TRUCKING",
    "TRUCKING INC HANAN ATHER",
    "ATHER TRUCKING INC",
    "GODBOUT TRUCKING INC",
    "HANAN ATHER PHARMACY INC",
    "Ather INC"
]

# Compare business names and print results
results = compare_algorithms(records, pre_trained_embedding, fine_tuned_embedding)
print(results)



Epoch 1, Loss: 0.7044497132301331
Epoch 1, Validation Loss: 0.6895326972007751
Epoch 2, Loss: 0.6726543307304382
Epoch 2, Validation Loss: 0.6965304017066956
Epoch 3, Loss: 0.6389493942260742
Epoch 3, Validation Loss: 0.7096562385559082
                    Record 1                  Record 2  Levenshtein Distance  \
0       HANAN TAHER TRUCKING  TRUCKING INC HANAN ATHER              0.083333   
1       HANAN TAHER TRUCKING        ATHER TRUCKING INC              0.500000   
2       HANAN TAHER TRUCKING      GODBOUT TRUCKING INC              0.300000   
3       HANAN TAHER TRUCKING  HANAN ATHER PHARMACY INC              0.583333   
4       HANAN TAHER TRUCKING                 Ather INC              0.200000   
5   TRUCKING INC HANAN ATHER        ATHER TRUCKING INC              0.250000   
6   TRUCKING INC HANAN ATHER      GODBOUT TRUCKING INC              0.166667   
7   TRUCKING INC HANAN ATHER  HANAN ATHER PHARMACY INC              0.083333   
8   TRUCKING INC HANAN ATHER               

In [20]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
import numpy as np
import Levenshtein
from nltk.util import ngrams
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity

# Ensure we're using CPU
device = torch.device('cpu')

# Initialize OpenAI client
client = OpenAI()

# Your provided dataset
businessNames1 = ["HANAN ATHER TRUCKING"]
businessNames2 = [
    "HANAN TAHER TRUCKING",
    "TRUCKING INC HANAN ATHER",
    "ATHER TRUCKING INC",
    "GODBOUT TRUCKING INC",
    "HANAN ATHER PHARMACY INC",
    "Ather INC"
]

def evaluate_model_on_dataset(model, names1, names2):
    with torch.no_grad():
        embeddings1 = model.encode(names1, convert_to_tensor=True, device=device)
        embeddings2 = model.encode(names2, convert_to_tensor=True, device=device)
    
    similarities = torch.nn.functional.cosine_similarity(embeddings1, embeddings2).cpu().numpy()
    
    return similarities

def levenshtein_distance(str1, str2):
    distance = Levenshtein.distance(str1.lower(), str2.lower())
    max_len = max(len(str1), len(str2))
    return 1 - (distance / max_len)

def n_gram_similarity(str1, str2, n=3):
    str1_ngrams = set(ngrams(str1.lower(), n))
    str2_ngrams = set(ngrams(str2.lower(), n))
    return len(str1_ngrams & str2_ngrams) / float(len(str1_ngrams | str2_ngrams))

def jaro_winkler_similarity(str1, str2):
    return Levenshtein.jaro_winkler(str1.lower(), str2.lower())

def get_embedding(text, model="text-embedding-3-small"):
    text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding

models = {
    'SBERT': 'all-mpnet-base-v2',
    'RoBERTa': 'all-distilroberta-v1',
    'Mini': 'all-MiniLM-L6-v2',
}

openai_models = [
    'text-embedding-3-small',
    'text-embedding-3-large',
    'text-embedding-ada-002'
]

results = []

# Get embeddings for businessNames1
openai_embeddings1 = {model: get_embedding(businessNames1[0], model) for model in openai_models}

for i, name2 in enumerate(businessNames2):
    result = {
        'Business Name 1': businessNames1[0],
        'Business Name 2': name2,
        'Levenshtein': levenshtein_distance(businessNames1[0], name2),
        'N-gram': n_gram_similarity(businessNames1[0], name2),
        'Jaro-Winkler': jaro_winkler_similarity(businessNames1[0], name2)
    }
    
    for model_name, model_path in models.items():
        model = SentenceTransformer(model_path, device=device)
        similarities = evaluate_model_on_dataset(model, businessNames1, [name2])
        result[f'{model_name}_Pre-fine-tuned'] = similarities[0]
    
    # Calculate OpenAI embeddings similarities
    for openai_model in openai_models:
        embedding2 = get_embedding(name2, openai_model)
        similarity = cosine_similarity([openai_embeddings1[openai_model]], [embedding2])[0][0]
        result[f'OpenAI_{openai_model}'] = similarity
    
    results.append(result)

# Create DataFrame
df_results = pd.DataFrame(results)

# Reorder columns
columns = ['Business Name 1', 'Business Name 2', 'Levenshtein', 'N-gram', 'Jaro-Winkler']
columns.extend([f'{model}_Pre-fine-tuned' for model in models])
columns.extend([f'OpenAI_{model}' for model in openai_models])

df_results = df_results[columns]

print(df_results)

# Optionally, save to CSV
df_results.to_csv('business_name_similarities.csv', index=False)

Python(61308) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


        Business Name 1           Business Name 2  Levenshtein    N-gram  \
0  HANAN ATHER TRUCKING      HANAN TAHER TRUCKING     0.900000  0.636364   
1  HANAN ATHER TRUCKING  TRUCKING INC HANAN ATHER     0.083333  0.600000   
2  HANAN ATHER TRUCKING        ATHER TRUCKING INC     0.500000  0.545455   
3  HANAN ATHER TRUCKING      GODBOUT TRUCKING INC     0.300000  0.241379   
4  HANAN ATHER TRUCKING  HANAN ATHER PHARMACY INC     0.666667  0.333333   
5  HANAN ATHER TRUCKING                 Ather INC     0.400000  0.190476   

   Jaro-Winkler  SBERT_Pre-fine-tuned  RoBERTa_Pre-fine-tuned  \
0      0.990000              0.911981                0.913551   
1      0.618254              0.930841                0.876942   
2      0.729630              0.725133                0.701722   
3      0.548485              0.587563                0.523199   
4      0.893333              0.590314                0.497729   
5      0.637963              0.483883                0.419634   

   Mini_Pre

In [21]:
df_results

Unnamed: 0,Business Name 1,Business Name 2,Levenshtein,N-gram,Jaro-Winkler,SBERT_Pre-fine-tuned,RoBERTa_Pre-fine-tuned,Mini_Pre-fine-tuned,OpenAI_text-embedding-3-small,OpenAI_text-embedding-3-large,OpenAI_text-embedding-ada-002
0,HANAN ATHER TRUCKING,HANAN TAHER TRUCKING,0.9,0.636364,0.99,0.911981,0.913551,0.929135,0.888799,0.898423,0.968436
1,HANAN ATHER TRUCKING,TRUCKING INC HANAN ATHER,0.083333,0.6,0.618254,0.930841,0.876942,0.875216,0.888552,0.896059,0.963726
2,HANAN ATHER TRUCKING,ATHER TRUCKING INC,0.5,0.545455,0.72963,0.725133,0.701722,0.642829,0.75253,0.671729,0.877687
3,HANAN ATHER TRUCKING,GODBOUT TRUCKING INC,0.3,0.241379,0.548485,0.587563,0.523199,0.465248,0.570155,0.539397,0.860756
4,HANAN ATHER TRUCKING,HANAN ATHER PHARMACY INC,0.666667,0.333333,0.893333,0.590314,0.497729,0.561207,0.693619,0.698483,0.904037
5,HANAN ATHER TRUCKING,Ather INC,0.4,0.190476,0.637963,0.483883,0.419634,0.357233,0.350427,0.384743,0.815603
