# Overview of Embeddings

In [None]:
import json
import os
import tiktoken

In [None]:
# Read the OpenAI API key from a JSON secrets file one directory up, then
# export it as an environment variable for this process.
with open("../secrets.json") as secrets_file:
    secrets = json.load(secrets_file)
os.environ["OPENAI_API_KEY"] = secrets["OPENAI_API_KEY"]

# OpenAI Embeddings

* https://platform.openai.com/docs/guides/embeddings/what-are-embeddings
* https://github.com/openai/openai-cookbook/blob/main/examples/Get_embeddings.ipynb
* https://github.com/openai/openai-python/blob/main/openai/embeddings_utils.py

Parameters

* context length: 8192
* embedding dimension: 1536

# Python OpenAI Utilities

Let's look at the function signature provided by the openai Python package function `get_embedding`.
Note that the function removes new lines from the input text before embedding it.
This is a reminder that it's always useful to examine implementations in code in addition to reading docs.

In [None]:
# Look up the tokenizer tiktoken associates with the embedding model, then
# show how a sample word splits into token ids and their byte strings.
enc = tiktoken.encoding_for_model("text-embedding-ada-002")
print(enc)
token_ids = enc.encode("self-evident")
token_strs = list(map(enc.decode_single_token_bytes, token_ids))
print("token_ids: ", token_ids)
print("token_strs: ", token_strs)

In [None]:
from openai.embeddings_utils import get_embedding

In [None]:
get_embedding??

In [None]:
text = """We hold these truths to be self-evident, that all men are created equal,
that they are endowed by their Creator with certain unalienable Rights,
that among these are Life, Liberty and the pursuit of Happiness."""

In [None]:
print(text)

In [None]:
print(text.replace("\n", " "))

In [None]:
model_name = 'text-embedding-ada-002'  # it's good practice to specify a model instead of relying on defaults

In [None]:
import openai
import numpy as np

In [None]:
def check_close(vec1, vec2, rtol=1e-3, atol=1e-3):
    """Check that two vectors are element-wise close to equal.

    Args:
        vec1: First vector (any array-like, e.g. a list of floats).
        vec2: Second vector (any array-like).
        rtol: Relative tolerance forwarded to ``np.allclose``.
        atol: Absolute tolerance forwarded to ``np.allclose``.

    Returns:
        bool: True when both inputs have the same shape and every pair of
        elements is within tolerance according to ``np.allclose``; False
        otherwise, including when the shapes differ.
    """
    arr1, arr2 = np.asarray(vec1), np.asarray(vec2)
    # np.allclose broadcasts its arguments, so without this guard a length-1
    # vector would compare "close" to a longer one, and incompatible shapes
    # would raise ValueError instead of simply answering "not close".
    if arr1.shape != arr2.shape:
        return False
    return bool(np.allclose(arr1, arr2, rtol=rtol, atol=atol))

In [None]:
# Embed the text as-is (new lines included) with a direct OpenAI API call, then
# display the vector's length (expected to be 1536) and its first ten components.
response_w_new_lines = openai.Embedding.create(input=text, model=model_name)
oai_embd_w_new_lines = response_w_new_lines['data'][0]['embedding']
len(oai_embd_w_new_lines), oai_embd_w_new_lines[:10]

In [None]:
# Repeat the direct API call, this time with every new line swapped for a space.
text_wo_new_lines = text.replace("\n", " ")
oai_embd_wo_new_lines = openai.Embedding.create(input=text_wo_new_lines, model=model_name)['data'][0]['embedding']
len(oai_embd_wo_new_lines), oai_embd_wo_new_lines[:10]

In [None]:
# Embed the original text (new lines included) through the openai helper
# `get_embedding` and preview the first ten components.
oai_embd_util = get_embedding(text, engine=model_name)
oai_embd_util[:10]

In [None]:
# Compare the util's embedding of the raw text against the direct API embedding
# of the text with new lines replaced by spaces; a True result is consistent
# with the embedding util function removing new lines before embedding.
check_close(oai_embd_util, oai_embd_wo_new_lines)

# Langchain Wrapper Around OpenAI Embeddings

In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.base import Embeddings

The abstract base class `Embeddings` defines two abstract methods, `embed_documents` and `embed_query`, that serve the primary use case for embeddings in LangChain, namely retrieval. In the case of the OpenAI wrapper, both call the same endpoint, but it's good to keep in mind that other implementations may embed documents and queries differently.

In [None]:
Embeddings??

In [None]:
embeddings = OpenAIEmbeddings(model=model_name)

In [None]:
embeddings.embed_documents??

In [None]:
embeddings.embed_query??

## Activity 

* discover what `_get_len_safe_embeddings` does

In [None]:
# embed the text (new lines replaced by spaces) with LangChain's `embed_query`
lc_embd_query = embeddings.embed_query(text.replace("\n", " "))

In [None]:
# embed the same text with LangChain's `embed_documents`, called here with a
# one-element list; keep the first (only) resulting vector
lc_embd_docs = embeddings.embed_documents([text.replace("\n", " ")])[0]

In [None]:
# demonstrate the two LangChain methods return the same thing
check_close(lc_embd_query, lc_embd_docs)

In [None]:
# demonstrate the LangChain and direct OpenAI methods return the same thing
check_close(lc_embd_query, oai_embd_wo_new_lines)

# Vector Distances

## L1 (Manhattan) Norm

$$\Large  
\lVert \vec{u} \rVert_1 = 
\sum_{i=1}^{n} |u_i|
$$ 

## L2 (Euclidean) Norm

$$\Large  
\lVert \vec{u} \rVert_2 = 
\left[ \sum_{i=1}^{n} u_i^2 \right]^{1/2} 
$$ 

## Lp Norm

$$\Large  
\lVert \vec{u} \rVert_p = 
\left[ \sum_{i=1}^{n} |u_i|^p \right]^{1/p} 
$$ 

## L2 Distance

$$\Large  
d_{2} = \lVert \vec{x} - \vec{y} \rVert_2 = 
\left[ \sum_{i=1}^{n} \left( x_i - y_i\right)^2 \right]^{1/2} 
$$ 

## Cosine Distance

$$\Large  
1 - \cos {\theta} = 1 - \frac{\vec{x} \cdot \vec{y}}{\lVert \vec{x} \rVert_2 \lVert \vec{y} \rVert_2}
$$ 


In [None]:
from scipy import spatial

In [None]:
# Embed three short words one at a time and stack the vectors into a 2-D array
# (one row per word), then unpack the rows into individually named vectors.
words = ["cat", "dog", "chair"]
wembd = np.array(list(map(embeddings.embed_query, words)))
cat, dog, chair = wembd

In [None]:
wembd

In [None]:
wembd.shape

In [None]:
# L2 distance by hand: square root of the summed squared component differences
np.sqrt(np.square(cat - dog).sum())

In [None]:
# Euclidean (L2) distance from "cat" to each of the other two words, via SciPy.
for label, other in (("cat to dog", dog), ("cat to chair", chair)):
    print(label, spatial.distance.euclidean(cat, other))

In [None]:
# Cosine distance (1 - cosine similarity) from "cat" to each of the other two words.
for label, other in (("cat to dog", dog), ("cat to chair", chair)):
    print(label, spatial.distance.cosine(cat, other))