# In [None]:
import Levenshtein
from nltk.util import ngrams
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import AutoTokenizer, AutoModel, AdamW
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader

# Function to compute a normalized Levenshtein similarity
def levenshtein_distance(str1, str2):
    """Return a length-normalized Levenshtein similarity of two strings.

    Despite the name, the value returned is a similarity, not a distance:
    ``1 - distance / max(len(str1), len(str2))``. Identical strings score
    1.0, and lower values mean more edits are needed.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        The normalized similarity as a float. Two empty strings score 1.0.
    """
    max_len = max(len(str1), len(str2))
    if max_len == 0:
        # Two empty strings are identical; returning here also avoids
        # dividing by zero below.
        return 1.0
    distance = Levenshtein.distance(str1, str2)
    return 1 - (distance / max_len)

def n_gram_similarity(str1, str2, n=3):
    """Return the Jaccard similarity of the character n-gram sets of two strings.

    The score is ``|A & B| / |A | B|`` where ``A`` and ``B`` are the sets of
    n-grams produced by ``ngrams`` for each string.

    Args:
        str1: First string.
        str2: Second string.
        n: Size of the n-grams (default 3).

    Returns:
        A float in [0, 1]. When both strings are shorter than ``n`` neither
        has any n-gram, so the Jaccard ratio is undefined; in that case the
        result is 1.0 if the strings are equal and 0.0 otherwise.
    """
    if len(str1) < n and len(str2) < n:
        # Neither input yields an n-gram: the union would be empty and the
        # division below would raise ZeroDivisionError.
        return 1.0 if str1 == str2 else 0.0

    str1_ngrams = set(ngrams(str1, n))
    str2_ngrams = set(ngrams(str2, n))
    shared = str1_ngrams & str2_ngrams
    combined = str1_ngrams | str2_ngrams
    return len(shared) / float(len(combined))

def jaro_winkler_similarity(str1, str2):
    """Return the score ``Levenshtein.jaro_winkler`` computes for the two strings."""
    score = Levenshtein.jaro_winkler(str1, str2)
    return score

# Dataset class for business names
class BusinessNamesDataset(Dataset):
    """Pairs of business names with a label, read from a CSV file.

    The CSV is loaded with ``pandas.read_csv`` and must provide the columns
    ``name1``, ``name2`` and ``label``. Each item is the tokenized name pair
    plus the label as a float tensor.
    """

    def __init__(self, file_path, tokenizer, max_length=128):
        """Load the CSV and remember the tokenizer settings.

        Args:
            file_path: Path of the CSV file to read.
            tokenizer: Callable used to encode a ``(name1, name2)`` pair.
            max_length: Length every encoded pair is padded/truncated to.
        """
        self.data = pd.read_csv(file_path)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the loaded table."""
        return len(self.data.index)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for row ``idx``."""
        row = self.data.iloc[idx]
        encoded = self.tokenizer(
            row['name1'],
            row['name2'],
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
        )
        # The tokenizer output carries a leading batch dimension of size one;
        # drop it so a DataLoader can stack items itself.
        return (
            encoded['input_ids'].squeeze(0),
            encoded['attention_mask'].squeeze(0),
            torch.tensor(row['label'], dtype=torch.float),
        )

# Model class for business names similarity
class BusinessNamesModel(torch.nn.Module):
    """Pretrained encoder followed by a linear layer producing one logit.

    The encoder is loaded with ``AutoModel.from_pretrained`` and exposed as
    ``self.bert``; ``self.similarity`` maps one pooled vector per example to
    a single score.
    """

    def __init__(self, model_name):
        """Load the encoder named ``model_name`` and build the scoring head."""
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.similarity = torch.nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one similarity logit per example, shaped ``(batch, 1)``.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)

        # Encoders such as BERT expose a pooled vector; DistilBERT-style
        # encoders return only the hidden states, so indexing position 1
        # unconditionally fails for them.
        pooled_output = getattr(outputs, "pooler_output", None)
        if pooled_output is None and isinstance(outputs, tuple) and len(outputs) > 1:
            # Plain-tuple output: keep the (sequence_output, pooled_output)
            # convention the original code relied on.
            pooled_output = outputs[1]
        if pooled_output is None:
            # No pooler available: use the hidden state of the first token.
            pooled_output = outputs[0][:, 0]

        return self.similarity(pooled_output)

# Training the model
def train_model(model, dataloader, num_epochs=3, learning_rate=2e-5):
    """Fine-tune ``model`` in place with a binary cross-entropy objective.

    The model is moved to CUDA when available (CPU otherwise) and is left in
    training mode when the function returns. The mean batch loss is printed
    after every epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning one logit per example.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)``
            batches.
        num_epochs: Number of passes over ``dataloader``.
        learning_rate: Learning rate handed to the optimizer.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    # transformers.AdamW is deprecated in favour of torch.optim.AdamW.
    # eps / weight_decay are pinned to keep the deprecated class's defaults
    # (presumably eps=1e-6, weight_decay=0.0 -- TODO confirm).
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        # Count batches by hand so loaders without __len__ also work.
        num_batches = 0
        for input_ids, attention_mask, labels in dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            # Drop the trailing logit dimension so the shape matches labels.
            loss = criterion(outputs.squeeze(-1), labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # An empty loader has no meaningful average; report NaN instead of
        # raising ZeroDivisionError.
        average_loss = total_loss / num_batches if num_batches else float("nan")
        print(f'Epoch {epoch+1}, Loss: {average_loss}')

# Wrapper that turns text into embeddings using the fine-tuned encoder
class FineTunedEmbedding:
    """Compute mean-pooled embeddings from ``model.bert``.

    Args:
        model: Module exposing the encoder as its ``bert`` attribute.
        tokenizer: Callable that encodes text into the encoder's keyword
            inputs (tensors).
    """

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # get_embedding moves the inputs to self.device, so the model has to
        # be there as well or the forward pass fails on a device mismatch.
        self.model.to(self.device)

    def get_embedding(self, text):
        """Return the embedding of ``text`` as a NumPy array ``(batch, hidden)``.

        The hidden states of the last layer are averaged over the tokens of
        each sequence; positions masked out by the attention mask (padding)
        are excluded from the average.
        """
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Training leaves the model in train mode; switch to eval so dropout
        # is disabled and the same text always yields the same embedding.
        self.model.eval()
        with torch.no_grad():
            outputs = self.model.bert(**inputs)

        hidden = outputs.last_hidden_state
        mask = inputs.get("attention_mask")
        if mask is None:
            return hidden.mean(dim=1).cpu().numpy()

        weights = mask.unsqueeze(-1).to(hidden.dtype)
        token_counts = weights.sum(dim=1).clamp(min=1)
        pooled = (hidden * weights).sum(dim=1) / token_counts
        return pooled.cpu().numpy()

# Function to compare different algorithms
def compare_algorithms(records, fine_tuned_embedding):
    """Score every record against the first record with each similarity measure.

    ``records[0]`` is the reference. It is compared with every entry of
    ``records``, itself included, so the first row is the self-comparison.

    Args:
        records: Sequence of name strings.
        fine_tuned_embedding: Object whose ``get_embedding(text)`` returns a
            2-D array suitable for ``cosine_similarity``.

    Returns:
        A ``pandas.DataFrame`` with one row per record and the columns
        "Record 1", "Record 2", "Levenshtein Distance", "N-Gram Similarity",
        "Jaro-Winkler Similarity" and "Embedding Similarity". The
        "Levenshtein Distance" column holds whatever ``levenshtein_distance``
        returns. Empty ``records`` yields an empty frame.
    """
    if len(records) == 0:
        return pd.DataFrame([])

    reference = records[0]
    # The reference never changes, so embed it once rather than running an
    # extra forward pass on every iteration.
    reference_embedding = fine_tuned_embedding.get_embedding(reference)

    rows = []
    for candidate in records:
        candidate_embedding = fine_tuned_embedding.get_embedding(candidate)
        rows.append({
            "Record 1": reference,
            "Record 2": candidate,
            "Levenshtein Distance": levenshtein_distance(reference, candidate),
            "N-Gram Similarity": n_gram_similarity(reference, candidate),
            "Jaro-Winkler Similarity": jaro_winkler_similarity(reference, candidate),
            "Embedding Similarity": cosine_similarity(
                reference_embedding, candidate_embedding
            )[0, 0],
        })
    return pd.DataFrame(rows)

# Main execution
# The tokenizer and the encoder are loaded from the same checkpoint name.
MODEL_NAME = "distilbert-base-uncased"

# Training data: the CSV is read by BusinessNamesDataset.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
dataset = BusinessNamesDataset('business_names.csv', tokenizer)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Fine-tune the encoder on the labelled name pairs.
model = BusinessNamesModel(MODEL_NAME)
train_model(model, dataloader)

fine_tuned_embedding = FineTunedEmbedding(model, tokenizer)

# Sample names; the first entry is the one every other entry is compared to.
records = [
    "HANAN TAHER TRUCKING",
    "TRUCKING INC HANAN ATHER",
    "ATHER TRUCKING INC",
    "GODBOUT TRUCKING INC",
    "HANAN ATHER PHARMACY INC",
    "Ather INC",
]

results = compare_algorithms(records, fine_tuned_embedding)
print(results)
