# Demo 8 - Text embeddings

In [None]:
!pip install --quiet "python-dotenv>=1.0.0" "sentence_transformers>=2.7.0" "openai>=1.30.1"

In [None]:
import getpass
import os

from dotenv import load_dotenv

from sentence_transformers import SentenceTransformer
import openai

## Tools

In [None]:
def vector_norm(v):
    """Return the Euclidean (L2) norm of a vector.

    Args:
        v: an iterable of numbers.

    Returns:
        The square root of the sum of the squared components.
    """
    squares = [component * component for component in v]
    return sum(squares) ** 0.5

def normalize_vector(v):
    """Return a copy of ``v`` rescaled to unit length.

    Args:
        v: a re-iterable collection of numbers (it is traversed twice:
            once to compute the norm, once to rescale).

    Returns:
        A new list holding each component of ``v`` divided by the norm of ``v``.
    """
    length = vector_norm(v)
    unit_vector = []
    for component in v:
        unit_vector.append(component / length)
    return unit_vector

def cosine_sim(v1, v2):
    """Return the cosine similarity of two vectors, rescaled to [0, 1].

    Both vectors are first normalized to unit length; their dot product
    (the cosine of the angle between them, nominally in [-1, 1]) is then
    mapped through ``(1 + cos) / 2``. Hence 1 means "same direction",
    0.5 means "orthogonal" and 0 means "opposite direction".

    Args:
        v1: a sized collection of numbers with non-zero norm
            (each vector gets divided by its own norm).
        v2: a sized collection of numbers with non-zero norm,
            having the same length as ``v1``.

    Returns:
        The rescaled similarity. Floating-point rounding may leave the value
        marginally off the exact result (e.g. a hair away from 1 for
        parallel vectors).

    Raises:
        ValueError: if the two vectors have different lengths.
    """
    if len(v1) != len(v2):
        # Without this check zip() would silently drop the extra components
        # of the longer vector and return a meaningless similarity.
        raise ValueError(
            f"Vectors must have the same length (got {len(v1)} and {len(v2)})"
        )
    nv1 = normalize_vector(v1)
    nv2 = normalize_vector(v2)
    dot_product = sum(v1_i * v2_i for v1_i, v2_i in zip(nv1, nv2))
    return (1 + dot_product) / 2


# Quick sanity checks of the helpers above, printed as "<expression> = <value>".
# The three cosine_sim cases use parallel, orthogonal and opposite vectors.
for label, value in [
    ("vector_norm([3, 4])", vector_norm([3, 4])),
    ("normalize_vector([3, 4])", normalize_vector([3, 4])),
    ("cosine_sim([10, 0], [3, 0])", cosine_sim([10, 0], [3, 0])),
    ("cosine_sim([4, 0], [0, 5])", cosine_sim([4, 0], [0, 5])),
    ("cosine_sim([91, 0], [-16, 0])", cosine_sim([91, 0], [-16, 0])),
]:
    print(f"{label} = {value}")

## HuggingFace local embedding model

_Note: the next cell, on its first run, may take some time to download all required assets._

In [None]:
# Instantiate the sentence-embedding model once, at notebook level;
# get_hf_embeddings (defined right below) reuses this instance on every call.
hf_model = SentenceTransformer("paraphrase-albert-small-v2")

def get_hf_embeddings(texts):
    """Embed texts with the local HuggingFace model, returning unit-length vectors.

    Args:
        texts: the texts to embed, handed over as-is to ``hf_model.encode``.

    Returns:
        A list with one normalized vector (itself a list of numbers)
        for each vector produced by the model.
    """
    # The raw vectors coming from this model are NOT normalized,
    # so each of them is rescaled to unit norm before being returned.
    normalized_vectors = []
    for raw_vector in hf_model.encode(texts):
        normalized_vectors.append(normalize_vector(raw_vector))
    return normalized_vectors

In [None]:
# Sample sentences to embed. The first one is used as the reference
# when similarities are computed further down in the notebook.
sentences = [
    "At dawn, the pond fills with dragonflies, frogs, and all sorts of critters.",
    "The deep roots of this oak can reach the tiniest amount of stored water.",
    "When plate tectonics was first proposed, not many took it seriously.",
    "Look closely and you'll notice that these 'twigs' are in fact weird insects...",
]

In [None]:
# One normalized embedding vector per sentence, computed with the local model.
hf_embeddings = get_hf_embeddings(sentences)

In [None]:
# Peek at the first embedding: a 64-character preview of its textual form,
# the number of components, and its norm.
print("(HuggingFace) embedding of first sentence:")
hf_vector0 = hf_embeddings[0]
hf_preview = str(hf_vector0)[:64]
hf_dimension = len(hf_vector0)
print(f"    {hf_preview} ... ({hf_dimension} numbers)")
hf_norm0 = vector_norm(hf_vector0)
print(f"Norm = {hf_norm0}")

### Similarities

In [None]:
# Compare every sentence (the first one included) against the first sentence.
print(f"(HuggingFace) similarities to '{sentences[0][:24]}...':")
for hf_vector, sentence in zip(hf_embeddings, sentences):
    sim = cosine_sim(hf_vector0, hf_vector)
    # Each sentence is shown truncated to 24 characters, within single quotes
    # (the original format string had a stray extra closing quote).
    print(f"  sim={sim:.3f} '{sentence[:24]}...'")

## OpenAI embedding model

In [None]:
# Pick up environment variables from a local .env file, if there is one.
load_dotenv()

# Ask for the key interactively when it is missing OR blank: a placeholder
# such as "OPENAI_API_KEY=" in a .env file must not count as a usable key,
# otherwise the failure would only surface later, at the first API call.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Please input your OpenAI API Key:")

In [None]:
# No credentials are passed explicitly: the client is expected to pick up
# the OPENAI_API_KEY environment variable prepared in the previous cell.
client = openai.OpenAI()

def get_openai_embeddings(texts, model="text-embedding-ada-002"):
    """Compute embedding vectors for a batch of texts through the OpenAI API.

    All texts are sent in a single ``embeddings.create`` request made with
    the notebook-level ``client``.

    Args:
        texts: the texts to embed, passed as the ``input`` of the request.
        model: name of the OpenAI embedding model to use. Defaults to
            "text-embedding-ada-002", the model this notebook was written for.

    Returns:
        A list with the ``embedding`` of each item in the API response data.
    """
    api_response = client.embeddings.create(input=texts, model=model)
    return [item.embedding for item in api_response.data]

In [None]:
# Embed the same sample sentences again, this time through the OpenAI API.
openai_embeddings = get_openai_embeddings(sentences)

In [None]:
# Peek at the first embedding: a 64-character preview of its textual form,
# the number of components, and its norm.
print("(openAI) embedding of first sentence:")
openai_vector0 = openai_embeddings[0]
openai_preview = str(openai_vector0)[:64]
openai_dimension = len(openai_vector0)
print(f"    {openai_preview} ... ({openai_dimension} numbers)")
openai_norm0 = vector_norm(openai_vector0)
print(f"Norm = {openai_norm0}")

In [None]:
# Compare every sentence (the first one included) against the first sentence.
print(f"(openAI) similarities to '{sentences[0][:24]}...':")
for openai_vector, sentence in zip(openai_embeddings, sentences):
    sim = cosine_sim(openai_vector0, openai_vector)
    # Each sentence is shown truncated to 24 characters, within single quotes
    # (the original format string had a stray extra closing quote).
    print(f"  sim={sim:.3f} '{sentence[:24]}...'")

### Randomness!

(The effect is small - the very same text can yield slightly different vectors - but it may bring nasty surprises.)

In [None]:
# Embed the very same text 40 times within one request ...
repeated_texts = ["This is always the same text."] * 40
results = get_openai_embeddings(repeated_texts)

# ... and print the similarity of each resulting vector to the first one.
reference_vector = results[0]
for result in results:
    print(cosine_sim(reference_vector, result))

## The Cone

(this uses OpenAI's embeddings right now, but can be changed)

In [None]:
# A second set of texts: three ordinary English sentences,
# followed by two nearly identical strings of gibberish.
sentences2 = [
    "Whereof one cannot speak, thereof one must be silent.",
    "The world is the totality of facts, not of things.",
    "I was so amazed to find out he accepted to have dinner with me!",
    "Gh gh gh ghghgh bo bobobobobo bobobo",
    # note the tiny difference just on the last 'word':
    "Gh gh gh ghghgh bo bobobobobo bababa",
]

In [None]:
# One OpenAI embedding vector per text of the second set.
openai_embeddings2 = get_openai_embeddings(sentences2)

In [None]:
# Header line: the column indices, padded so as to line up with the values below.
header_cells = "".join(
    f"  {column_i:<5}" for column_i in range(len(openai_embeddings2))
)
print("     " + header_cells)

# One line per text: its index, its similarity to every text (itself included),
# and finally the first and last 16 characters of the text as a reminder.
for row_i, (row_emb, row_sentence) in enumerate(zip(openai_embeddings2, sentences2)):
    row_cells = "".join(
        f"  {cosine_sim(row_emb, column_emb):0.3f}"
        for column_emb in openai_embeddings2
    )
    row_excerpt = f"      ({row_sentence[:16]} ... {row_sentence[-16:]})"
    print(f"{row_i:<4}:{row_cells}{row_excerpt}")

#### In other words ...

In [None]:
def text_sim(text1, text2):
    """Return the similarity between two texts, based on their OpenAI embeddings.

    Both texts are embedded with a single call to ``get_openai_embeddings``
    and the two resulting vectors are compared with ``cosine_sim``.

    Args:
        text1: the first text.
        text2: the second text.

    Returns:
        Whatever ``cosine_sim`` returns for the two embedding vectors.
    """
    embeddings = get_openai_embeddings([text1, text2])
    first_embedding, second_embedding = embeddings
    return cosine_sim(first_embedding, second_embedding)

# Three pairs to compare: gibberish vs. near-identical gibberish,
# a plain sentence vs. gibberish, and two plain sentences.
text_pairs = [
    ("Ghghgh kokoko", "Ghghgh kokoky"),
    ("Something that makes sense", "Ghghgh kokoky"),
    ("Something that makes sense", "This sentence is largely unrelated to the previous"),
]
for first_text, second_text in text_pairs:
    print(text_sim(first_text, second_text))

## The End