In [17]:
import Levenshtein
from nltk import edit_distance
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import pandas as pd

In [53]:
# function to compute Levenshtein distance
def levenshtein_distance(str1, str2):
    """Return the length-normalised Levenshtein similarity of two strings.

    Despite the name, the value returned is a similarity score rather than a
    raw edit distance: ``1 - distance / max(len(str1), len(str2))``.
    Identical strings score 1.0; the score drops towards 0.0 as more edits
    are needed relative to the longer string.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float between 0.0 and 1.0. Two empty strings score 1.0.
    """
    max_len = max(len(str1), len(str2))
    if max_len == 0:
        # Both strings are empty, hence identical. Without this guard the
        # normalisation below would divide by zero.
        return 1.0
    distance = Levenshtein.distance(str1, str2)
    return 1 - (distance / max_len)

def n_gram_similarity(str1, str2, n=3):
    """Return the Jaccard similarity of the character n-gram sets of two strings.

    Each string is broken into its set of length-``n`` character n-grams and
    the score is ``|A & B| / |A | B|``.

    Args:
        str1: First string.
        str2: Second string.
        n: Size of the n-grams (defaults to 3, i.e. trigrams).

    Returns:
        A float between 0.0 and 1.0. When neither string is long enough to
        produce a single n-gram, returns 1.0 if the strings are equal and
        0.0 otherwise.
    """
    if len(str1) < n and len(str2) < n:
        # Neither input yields an n-gram, so both sets would be empty and the
        # Jaccard ratio would be 0 / 0. Fall back to plain equality instead
        # of raising ZeroDivisionError.
        return 1.0 if str1 == str2 else 0.0
    str1_ngrams = set(ngrams(str1, n))
    str2_ngrams = set(ngrams(str2, n))
    return len(str1_ngrams & str2_ngrams) / float(len(str1_ngrams | str2_ngrams))

def jaro_winkler_similarity(str1, str2):
    """Return the Jaro-Winkler similarity of ``str1`` and ``str2``.

    Delegates to ``Levenshtein.jaro_winkler`` and passes its result through
    unchanged.
    """
    score = Levenshtein.jaro_winkler(str1, str2)
    return score

**Tokenization:** The `AutoTokenizer.from_pretrained(model_name)` method loads a pre-trained tokenizer. The tokenizer converts text inputs into a format that the model can process. The `return_tensors='pt'` argument tells the tokenizer to return PyTorch tensors.

**Model Inference:** The `**` operator unpacks the dictionary `inputs` into keyword arguments. If `inputs` is a dictionary like `{'input_ids': tensor, 'attention_mask': tensor}`, then `self.model(**inputs)` is equivalent to `self.model(input_ids=tensor, attention_mask=tensor)`.


**Model Output: Hidden States**

When you pass a sequence of text (like a sentence) through a transformer model, the output is a set of vectors. Specifically, the model produces a hidden state for each token (or sub-word) in the input sequence. Taking the mean of the hidden states is a common technique to derive a single, fixed-size vector representation for an entire input sequence. This vector can then be used for various downstream tasks such as similarity comparison, classification, etc.



In [54]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np

class HuggingFaceEmbedding:
    """Fixed-size text embeddings from a Hugging Face transformer.

    The embedding of a text is the mean of the model's last-layer hidden
    states over the tokens of that text.
    """

    def __init__(self, model_name = "distilbert-base-uncased", api_key = None):
        """Load the tokenizer and model for ``model_name``.

        Args:
            model_name: Hugging Face model identifier.
            api_key: Optional access token, forwarded to ``from_pretrained``
                only when it is not None.
        """
        # Forward the token only when one was actually supplied, so the
        # default path never passes an auth keyword at all.
        # NOTE(review): recent transformers releases prefer `token=` over
        # `use_auth_token=`; confirm against the installed version before
        # switching.
        auth_kwargs = {} if api_key is None else {"use_auth_token": api_key}
        # The tokenizer turns raw text into the tensors the model consumes.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, **auth_kwargs)
        self.model = AutoModel.from_pretrained(model_name, **auth_kwargs)

    def get_embedding(self, text):
        """Return mean-pooled embeddings for ``text`` as a NumPy array.

        Args:
            text: A single string, or a list of strings to embed as a batch.

        Returns:
            A 2-D array with one row per input text (a single string gives
            one row).
        """
        inputs = self.tokenizer(
            text,
            return_tensors = 'pt',
            # Padding is only needed to align a batch; a single string is
            # tokenized exactly as before.
            padding = not isinstance(text, str),
            # Cut over-long input down to the model's maximum length instead
            # of failing inside the forward pass.
            truncation = True,
        )
        # Pure inference: skip building the autograd graph.
        with torch.no_grad():
            outputs = self.model(**inputs)

        hidden = outputs.last_hidden_state
        mask = inputs.get("attention_mask")
        if mask is None:
            pooled = hidden.mean(dim = 1)
        else:
            # Average over real tokens only, so padding positions in a batch
            # do not dilute the embedding. With no padding the mask is all
            # ones and this equals the plain mean.
            weights = mask.unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * weights).sum(dim = 1) / weights.sum(dim = 1).clamp(min = 1.0)
        return pooled.detach().cpu().numpy()
        

In [55]:
# Sample business names used to exercise the similarity measures.
# compare_algorithms() treats the first entry as the reference and scores it
# against every entry in the list, itself included.
records = [
    "HANAN TAHER TRUCKING",
    "TRUCKING INC HANAN ATHER",
    "ATHER TRUCKING INC",
    "GODBOUT TRUCKING INC",
    "HANAN ATHER PHARMACY INC",
    "Ather INC"
]

In [56]:
def compare_algorithms(records):
    """Score the first record against every record with each similarity measure.

    The first entry of ``records`` is the reference. It is compared with every
    entry (itself included) using the normalised Levenshtein score, n-gram
    Jaccard similarity, Jaro-Winkler similarity and the cosine similarity of
    the transformer embeddings.

    Args:
        records: Sequence of strings; ``records[0]`` is the reference.

    Returns:
        A pandas DataFrame with one row per record. The
        "Levenshtein Distance" column holds the value returned by
        ``levenshtein_distance``. An empty input gives an empty DataFrame.
    """
    result = []
    if len(records) == 0:
        # Nothing to compare: return before loading the embedding model.
        return pd.DataFrame(result)

    hf_embedding = HuggingFaceEmbedding()

    # The reference and its embedding are the same for every row, so the
    # model forward pass for it runs once instead of once per record.
    name1 = records[0]
    emb1 = hf_embedding.get_embedding(name1)

    for name2 in records:
        lev_dist = levenshtein_distance(name1, name2)
        ngram_sim = n_gram_similarity(name1, name2)
        jw_sim = jaro_winkler_similarity(name1, name2)

        emb2 = hf_embedding.get_embedding(name2)
        # cosine_similarity returns a matrix; both inputs hold a single row.
        embedding_sim = cosine_similarity(emb1, emb2)[0,0]

        result.append({
            "Record 1": name1,
            "Record 2": name2,
            "Levenshtein Distance": lev_dist,
            "N-Gram Similarity": ngram_sim,
            "Jaro-Winkler Similarity": jw_sim,
            "Embedding Similarity": embedding_sim
        })
    return pd.DataFrame(result)

In [57]:
# Score the first record against every record and display the resulting table.
df_results = compare_algorithms(records)
df_results

Unnamed: 0,Record 1,Record 2,Levenshtein Distance,N-Gram Similarity,Jaro-Winkler Similarity,Embedding Similarity
0,HANAN TAHER TRUCKING,HANAN TAHER TRUCKING,1.0,1.0,1.0,1.0
1,HANAN TAHER TRUCKING,TRUCKING INC HANAN ATHER,0.083333,0.37931,0.618254,0.898988
2,HANAN TAHER TRUCKING,ATHER TRUCKING INC,0.5,0.416667,0.72963,0.794868
3,HANAN TAHER TRUCKING,GODBOUT TRUCKING INC,0.3,0.241379,0.548485,0.776235
4,HANAN TAHER TRUCKING,HANAN ATHER PHARMACY INC,0.583333,0.176471,0.880833,0.888444
5,HANAN TAHER TRUCKING,Ather INC,0.2,0.0,0.464815,0.682862


In [None]:
import Levenshtein
from nltk import edit_distance
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import pandas as pd

# function to compute Levenshtein distance
def levenshtein_distance(str1, str2):
    """Return the length-normalised Levenshtein similarity of two strings.

    Despite the name, the value returned is a similarity score rather than a
    raw edit distance: ``1 - distance / max(len(str1), len(str2))``.
    Identical strings score 1.0; the score drops towards 0.0 as more edits
    are needed relative to the longer string.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float between 0.0 and 1.0. Two empty strings score 1.0.
    """
    max_len = max(len(str1), len(str2))
    if max_len == 0:
        # Both strings are empty, hence identical. Without this guard the
        # normalisation below would divide by zero.
        return 1.0
    distance = Levenshtein.distance(str1, str2)
    return 1 - (distance / max_len)

def n_gram_similarity(str1, str2, n=3):
    """Return the Jaccard similarity of the character n-gram sets of two strings.

    Each string is broken into its set of length-``n`` character n-grams and
    the score is ``|A & B| / |A | B|``.

    Args:
        str1: First string.
        str2: Second string.
        n: Size of the n-grams (defaults to 3, i.e. trigrams).

    Returns:
        A float between 0.0 and 1.0. When neither string is long enough to
        produce a single n-gram, returns 1.0 if the strings are equal and
        0.0 otherwise.
    """
    if len(str1) < n and len(str2) < n:
        # Neither input yields an n-gram, so both sets would be empty and the
        # Jaccard ratio would be 0 / 0. Fall back to plain equality instead
        # of raising ZeroDivisionError.
        return 1.0 if str1 == str2 else 0.0
    str1_ngrams = set(ngrams(str1, n))
    str2_ngrams = set(ngrams(str2, n))
    return len(str1_ngrams & str2_ngrams) / float(len(str1_ngrams | str2_ngrams))

def jaro_winkler_similarity(str1, str2):
    """Return the Jaro-Winkler similarity of ``str1`` and ``str2``.

    Delegates to ``Levenshtein.jaro_winkler`` and passes its result through
    unchanged.
    """
    score = Levenshtein.jaro_winkler(str1, str2)
    return score

import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np

class HuggingFaceEmbedding:
    """Fixed-size text embeddings from a Hugging Face transformer.

    The embedding of a text is the mean of the model's last-layer hidden
    states over the tokens of that text.
    """

    def __init__(self, model_name = "distilbert-base-uncased", api_key = None):
        """Load the tokenizer and model for ``model_name``.

        Args:
            model_name: Hugging Face model identifier.
            api_key: Optional access token, forwarded to ``from_pretrained``
                only when it is not None.
        """
        # Forward the token only when one was actually supplied, so the
        # default path never passes an auth keyword at all.
        # NOTE(review): recent transformers releases prefer `token=` over
        # `use_auth_token=`; confirm against the installed version before
        # switching.
        auth_kwargs = {} if api_key is None else {"use_auth_token": api_key}
        # The tokenizer turns raw text into the tensors the model consumes.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, **auth_kwargs)
        self.model = AutoModel.from_pretrained(model_name, **auth_kwargs)

    def get_embedding(self, text):
        """Return mean-pooled embeddings for ``text`` as a NumPy array.

        Args:
            text: A single string, or a list of strings to embed as a batch.

        Returns:
            A 2-D array with one row per input text (a single string gives
            one row).
        """
        inputs = self.tokenizer(
            text,
            return_tensors = 'pt',
            # Padding is only needed to align a batch; a single string is
            # tokenized exactly as before.
            padding = not isinstance(text, str),
            # Cut over-long input down to the model's maximum length instead
            # of failing inside the forward pass.
            truncation = True,
        )
        # Pure inference: skip building the autograd graph.
        with torch.no_grad():
            outputs = self.model(**inputs)

        hidden = outputs.last_hidden_state
        mask = inputs.get("attention_mask")
        if mask is None:
            pooled = hidden.mean(dim = 1)
        else:
            # Average over real tokens only, so padding positions in a batch
            # do not dilute the embedding. With no padding the mask is all
            # ones and this equals the plain mean.
            weights = mask.unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * weights).sum(dim = 1) / weights.sum(dim = 1).clamp(min = 1.0)
        return pooled.detach().cpu().numpy()


# Sample business names used to exercise the similarity measures.
# compare_algorithms() treats the first entry as the reference and scores it
# against every entry in the list, itself included.
records = [
    "HANAN TAHER TRUCKING",
    "TRUCKING INC HANAN ATHER",
    "ATHER TRUCKING INC",
    "GODBOUT TRUCKING INC",
    "HANAN ATHER PHARMACY INC",
    "Ather INC"
]

def compare_algorithms(records):
    """Score the first record against every record with each similarity measure.

    The first entry of ``records`` is the reference. It is compared with every
    entry (itself included) using the normalised Levenshtein score, n-gram
    Jaccard similarity, Jaro-Winkler similarity and the cosine similarity of
    the transformer embeddings.

    Args:
        records: Sequence of strings; ``records[0]`` is the reference.

    Returns:
        A pandas DataFrame with one row per record. The
        "Levenshtein Distance" column holds the value returned by
        ``levenshtein_distance``. An empty input gives an empty DataFrame.
    """
    result = []
    if len(records) == 0:
        # Nothing to compare: return before loading the embedding model.
        return pd.DataFrame(result)

    hf_embedding = HuggingFaceEmbedding()

    # The reference and its embedding are the same for every row, so the
    # model forward pass for it runs once instead of once per record.
    name1 = records[0]
    emb1 = hf_embedding.get_embedding(name1)

    for name2 in records:
        lev_dist = levenshtein_distance(name1, name2)
        ngram_sim = n_gram_similarity(name1, name2)
        jw_sim = jaro_winkler_similarity(name1, name2)

        emb2 = hf_embedding.get_embedding(name2)
        # cosine_similarity returns a matrix; both inputs hold a single row.
        embedding_sim = cosine_similarity(emb1, emb2)[0,0]

        result.append({
            "Record 1": name1,
            "Record 2": name2,
            "Levenshtein Distance": lev_dist,
            "N-Gram Similarity": ngram_sim,
            "Jaro-Winkler Similarity": jw_sim,
            "Embedding Similarity": embedding_sim
        })
    return pd.DataFrame(result)