# What are embeddings?
- Embeddings are a fundamental concept in natural language processing (NLP), in which words, phrases, or entire documents are represented in numerical form.
- More fundamentally, embedding models map text onto a multi-dimensional space, or vector space, and the numbers output by the model represent the location of the text in that space.
- Similar pieces of text or words, like "teacher" and "student", are mapped closer together in the space, and dissimilar words are mapped further apart.
- This ability to map similar and dissimilar words means that embedding models can be used to capture the **semantic meaning** of text. (By semantic meaning, we mean that the full context and intent of the word are captured.)
- For example, "Which way is it to the supermarket?" and "Could I have directions to the shop?" have only two words in common but are semantically very similar.

In [5]:
import pandas as pd

def load_data(file_path):
    """Read the CSV at ``file_path`` into a pandas DataFrame.

    The argument is handed to ``pd.read_csv`` unchanged, using that
    function's default parsing options.
    """
    frame = pd.read_csv(file_path)
    return frame

def preprocess_data(df):
    """Lower-case and strip surrounding whitespace from ``name1`` and ``name2``.

    Both columns are overwritten on the DataFrame that was passed in, and
    that same DataFrame object is returned.
    """
    for column in ('name1', 'name2'):
        lowered = df[column].str.lower()
        df[column] = lowered.str.strip()
    return df



In [16]:
import Levenshtein
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def levenshtein_distance(str1, str2):
    """Return the Levenshtein edit distance normalised by the longer length.

    Despite the name, the result is a ratio rather than a raw edit count.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        ``Levenshtein.distance(str1, str2)`` divided by
        ``max(len(str1), len(str2))``. Returns 0.0 when both strings are
        empty, since they are identical and the ratio would otherwise
        divide by zero.
    """
    max_len = max(len(str1), len(str2))
    if max_len == 0:
        # Two empty strings: nothing to edit, and nothing to divide by.
        return 0.0
    return Levenshtein.distance(str1, str2) / max_len

def n_gram_similarity(str1, str2, n=3):
    """Return the Jaccard similarity of the strings' character n-gram sets.

    Args:
        str1: First string.
        str2: Second string.
        n: Length of the character n-grams (default 3).

    Returns:
        ``|A & B| / |A | B|`` where ``A`` and ``B`` are the sets of n-grams
        of ``str1`` and ``str2``. When both strings are shorter than ``n``
        (so neither yields any n-gram), returns 1.0 if the strings are
        equal and 0.0 otherwise.
    """
    if len(str1) < n and len(str2) < n:
        # Neither string is long enough to produce an n-gram, so the union
        # below would be empty and the ratio would raise ZeroDivisionError.
        return 1.0 if str1 == str2 else 0.0

    str1_ngrams = set(ngrams(str1, n))
    str2_ngrams = set(ngrams(str2, n))
    shared = str1_ngrams & str2_ngrams
    combined = str1_ngrams | str2_ngrams
    return len(shared) / len(combined)

def jaro_winkler_similarity(str1, str2):
    """Return the Jaro-Winkler score of the two strings.

    Thin wrapper that delegates to ``Levenshtein.jaro_winkler``.
    """
    score = Levenshtein.jaro_winkler(str1, str2)
    return score

def tfidf_cosine_similarity(corpus):
    """Return pairwise cosine similarities between TF-IDF vectors of ``corpus``.

    A ``TfidfVectorizer`` with default settings is fitted on ``corpus``; the
    resulting matrix is converted to a dense array and handed to
    ``cosine_similarity``, whose result is returned as-is.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform(corpus)
    dense_vectors = tfidf_matrix.toarray()
    similarities = cosine_similarity(dense_vectors)
    return similarities


In [7]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np

class HuggingFaceEmbedding:
    """Text embedder built on a Hugging Face tokenizer/model pair.

    An embedding is the mean of the model's ``last_hidden_state`` over
    dimension 1, returned as a NumPy array.
    """

    def __init__(self, model_name="distilbert-base-uncased", api_key=None):
        """Load the tokenizer and model for ``model_name``.

        Args:
            model_name: Identifier passed to ``from_pretrained``.
            api_key: Optional access token, forwarded as ``use_auth_token``.
        """
        # NOTE(review): newer transformers releases appear to prefer
        # ``token=`` over ``use_auth_token=`` -- confirm against the pinned
        # version before switching.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=api_key)
        self.model = AutoModel.from_pretrained(model_name, use_auth_token=api_key)

    def get_embedding(self, text):
        """Return the mean-pooled embedding of ``text`` as a NumPy array.

        Args:
            text: Input passed to the tokenizer.

        Returns:
            ``last_hidden_state`` averaged over dimension 1 (presumably the
            token axis for standard transformers outputs -- confirm for
            custom models), converted to a NumPy array.
        """
        # truncation=True asks the tokenizer to cut inputs that exceed the
        # model's maximum length instead of passing them through unchanged.
        inputs = self.tokenizer(text, return_tensors='pt', truncation=True)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).detach().numpy()


In [17]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


def compare_algorithms(df, api_key=None):
    """Score every ``name1``/``name2`` pair in ``df`` with each similarity measure.

    Args:
        df: DataFrame with string columns ``name1`` and ``name2``.
        api_key: Optional Hugging Face access token. When omitted, the
            ``HF_TOKEN`` environment variable is used if it is set;
            otherwise no token is passed.

    Returns:
        A new DataFrame with one row per input row and the columns
        ``name1``, ``name2``, ``levenshtein_distance``,
        ``n_gram_similarity``, ``jaro_winkler_similarity`` and
        ``embedding_similarity``.
    """
    import os  # local import so this cell does not depend on another cell's imports

    # Credentials must not be committed in source. The token that used to be
    # hard-coded here should be treated as leaked and revoked.
    if api_key is None:
        api_key = os.environ.get("HF_TOKEN")
    hf_embedding = HuggingFaceEmbedding(api_key=api_key)

    results = []
    for name1, name2 in zip(df['name1'], df['name2']):
        emb1 = hf_embedding.get_embedding(name1)
        emb2 = hf_embedding.get_embedding(name2)

        results.append({
            "name1": name1,
            "name2": name2,
            "levenshtein_distance": levenshtein_distance(name1, name2),
            "n_gram_similarity": n_gram_similarity(name1, name2),
            "jaro_winkler_similarity": jaro_winkler_similarity(name1, name2),
            # cosine_similarity returns a 2-D array; take its single entry.
            "embedding_similarity": cosine_similarity(emb1, emb2)[0, 0],
        })

    return pd.DataFrame(results)



In [13]:
# Load the sample name pairs and normalise both name columns.
df = load_data("data/sample_data.csv")
df = preprocess_data(df)
# The full comparison is left disabled in this cell.
#results = compare_algorithms(df)
#print(results)

In [14]:
# Display the preprocessed DataFrame as the cell's output.
df

Unnamed: 0,name1,name2
0,apple inc.,apple incorporated
1,google llc,googol l.l.c.
2,microsoft corporation,micro soft corp
3,amazon.com inc.,amazon incorporated
4,facebook inc.,face book incorporated
5,"netflix, inc.",net flix
6,"tesla, inc.",teslar inc.
7,oracle corporation,orakel corp.
8,ibm,international business machines
9,adobe systems inc.,adoby systems


In [18]:
# Reload and preprocess the sample data, then score every name pair with
# each similarity measure and print the resulting table.
df = load_data("data/sample_data.csv")
df = preprocess_data(df)
results = compare_algorithms(df)
print(results)



                                     name1                            name2  \
0                               apple inc.               apple incorporated   
1                               google llc                    googol l.l.c.   
2                    microsoft corporation                  micro soft corp   
3                          amazon.com inc.              amazon incorporated   
4                            facebook inc.           face book incorporated   
5                            netflix, inc.                         net flix   
6                              tesla, inc.                      teslar inc.   
7                       oracle corporation                     orakel corp.   
8                                      ibm  international business machines   
9                       adobe systems inc.                    adoby systems   
10                       statistics canada                          statcan   
11                   canada revenue agency          