# Overview of Embeddings

In [1]:
import json
import os

In [2]:
# Load credentials from the JSON secrets file one directory up and export the
# OpenAI key through the process environment.
with open("../secrets.json") as fp:
    secrets = json.loads(fp.read())
os.environ['OPENAI_API_KEY'] = secrets["OPENAI_API_KEY"]

# OpenAI Embeddings

* https://platform.openai.com/docs/guides/embeddings/what-are-embeddings
* https://github.com/openai/openai-cookbook/blob/main/examples/Get_embeddings.ipynb
* https://github.com/openai/openai-python/blob/main/openai/embeddings_utils.py

Parameters (for `text-embedding-ada-002`, the model used in this notebook)

* context length: 8192
* embedding dimension: 1536

# Python OpenAI Utilities

Let's look at the signature of the `get_embedding` function provided by the openai Python package.
Note that the function removes newlines from the input text before embedding it.
This is a reminder that it's always useful to examine implementations in code in addition to reading docs.

In [3]:
from openai.embeddings_utils import get_embedding

In [4]:
get_embedding??

[0;31mSignature:[0m [0mget_embedding[0m[0;34m([0m[0mtext[0m[0;34m:[0m [0mstr[0m[0;34m,[0m [0mengine[0m[0;34m=[0m[0;34m'text-similarity-davinci-001'[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m [0;34m->[0m [0mList[0m[0;34m[[0m[0mfloat[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mSource:[0m   
[0;34m@[0m[0mretry[0m[0;34m([0m[0mwait[0m[0;34m=[0m[0mwait_random_exponential[0m[0;34m([0m[0mmin[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0mmax[0m[0;34m=[0m[0;36m20[0m[0;34m)[0m[0;34m,[0m [0mstop[0m[0;34m=[0m[0mstop_after_attempt[0m[0;34m([0m[0;36m6[0m[0;34m)[0m[0;34m)[0m[0;34m[0m
[0;34m[0m[0;32mdef[0m [0mget_embedding[0m[0;34m([0m[0mtext[0m[0;34m:[0m [0mstr[0m[0;34m,[0m [0mengine[0m[0;34m=[0m[0;34m"text-similarity-davinci-001"[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m [0;34m->[0m [0mList[0m[0;34m[[0m[0mfloat[0m[0;34m][0m[0;34m:[0m[0

In [5]:
# Sample passage spanning three lines; the embedded newline characters are kept
# on purpose so later cells can compare embeddings with and without them.
text = """We hold these truths to be self-evident, that all men are created equal,
that they are endowed by their Creator with certain unalienable Rights,
that among these are Life, Liberty and the pursuit of Happiness."""

In [6]:
print(text)

We hold these truths to be self-evident, that all men are created equal,
that they are endowed by their Creator with certain unalienable Rights,
that among these are Life, Liberty and the pursuit of Happiness.


In [7]:
print(text.replace("\n", " "))

We hold these truths to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness.


In [8]:
model_name = 'text-embedding-ada-002'  # it's good practice to specify a model instead of relying on defaults

In [9]:
import openai
import numpy as np

In [10]:
def check_close(vec1, vec2, rtol=1e-3, atol=1e-3):
    """Check that two vectors are element-wise close to equal.

    Args:
        vec1: First vector (any array-like of numbers).
        vec2: Second vector (any array-like of numbers).
        rtol: Relative tolerance forwarded to ``np.allclose``.
        atol: Absolute tolerance forwarded to ``np.allclose``.

    Returns:
        True if both vectors have the same shape and every pair of elements
        satisfies ``|a - b| <= atol + rtol * |b|``; False otherwise.
    """
    arr1 = np.asarray(vec1)
    arr2 = np.asarray(vec2)
    # np.allclose broadcasts its inputs, so a length-1 vector would be reported
    # "close" to a longer vector of repeated values. Vectors of different
    # dimension are never equal, so reject mismatched shapes up front.
    if arr1.shape != arr2.shape:
        return False
    return bool(np.allclose(arr1, arr2, rtol=rtol, atol=atol))

In [11]:
# Embed the raw text (newlines intact) through the OpenAI API directly and
# show that the returned vector is 1536-dimensional.
response_w_new_lines = openai.Embedding.create(model=model_name, input=text)
oai_embd_w_new_lines = response_w_new_lines['data'][0]['embedding']
len(oai_embd_w_new_lines), oai_embd_w_new_lines[:10]

(1536,
 [0.01782410405576229,
  -0.006171564571559429,
  -0.008730811066925526,
  -0.03427327796816826,
  -0.01978650502860546,
  0.0006976211443543434,
  -0.013924299739301205,
  0.0058997031301259995,
  -0.012824354693293571,
  -0.004977874457836151])

In [12]:
# Repeat the direct API embedding, this time with every newline replaced by a space.
text_wo_new_lines = text.replace("\n", " ")
response_wo_new_lines = openai.Embedding.create(model=model_name, input=text_wo_new_lines)
oai_embd_wo_new_lines = response_wo_new_lines['data'][0]['embedding']
len(oai_embd_wo_new_lines), oai_embd_wo_new_lines[:10]

(1536,
 [0.016644857823848724,
  -0.006680168677121401,
  -0.007779124658554792,
  -0.03541353717446327,
  -0.019657723605632782,
  -0.0028754977975040674,
  -0.015447119250893593,
  0.005457735154777765,
  -0.012273729778826237,
  -0.0038833883590996265])

In [13]:
# Embed the same raw text (newlines intact) with the OpenAI utility function;
# passing `engine` explicitly overrides the helper's default model.
oai_embd_util = get_embedding(text, engine=model_name)
oai_embd_util[0:10]

[0.0165903028100729,
 -0.006615120451897383,
 -0.007510724943131208,
 -0.035503000020980835,
 -0.019703300669789314,
 -0.0028906408697366714,
 -0.015478516928851604,
 0.0055774543434381485,
 -0.012260517105460167,
 -0.0037800688296556473]

In [14]:
# The utility function was given the text WITH newlines, yet its result matches
# the direct API embedding of the newline-free text: this shows that the
# embedding util function removes newlines before embedding.
check_close(oai_embd_util, oai_embd_wo_new_lines)

True

# Langchain Wrapper Around OpenAI Embeddings

In [15]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.base import Embeddings

The abstract base class for `Embeddings` defines two abstract methods related to the primary use case in LangChain, namely retrieval. In the case of the OpenAI wrapper, they call the same endpoint, but it's good to keep in mind the potential for differentiation.

In [16]:
Embeddings??

[0;31mInit signature:[0m [0mEmbeddings[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mEmbeddings[0m[0;34m([0m[0mABC[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""Interface for embedding models."""[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m    [0;34m@[0m[0mabstractmethod[0m[0;34m[0m
[0;34m[0m    [0;32mdef[0m [0membed_documents[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mtexts[0m[0;34m:[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m)[0m [0;34m->[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mfloat[0m[0;34m][0m[0;34m][0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;34m"""Embed search docs."""[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m    [0;34m@[0m[0mabstractmethod[0m[0;34m[0m
[0;34m[0m    [0;32mdef[0m [0membed_query[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mtext[0m[0;34m:[0m [0mstr[0m[0;34m)[0m [0;34m->[0m [0mList[0m[0;34m[[0m[

In [17]:
# LangChain wrapper configured with the same model name used for the direct OpenAI calls
embeddings = OpenAIEmbeddings(model=model_name)

In [18]:
embeddings.embed_documents??

[0;31mSignature:[0m
[0membeddings[0m[0;34m.[0m[0membed_documents[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtexts[0m[0;34m:[0m [0;34m'List[str]'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mchunk_size[0m[0;34m:[0m [0;34m'Optional[int]'[0m [0;34m=[0m [0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;34m'List[List[float]]'[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
    [0;32mdef[0m [0membed_documents[0m[0;34m([0m[0;34m[0m
[0;34m[0m        [0mself[0m[0;34m,[0m [0mtexts[0m[0;34m:[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mchunk_size[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mint[0m[0;34m][0m [0;34m=[0m [0;36m0[0m[0;34m[0m
[0;34m[0m    [0;34m)[0m [0;34m->[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mfloat[0m[0;34m][0m[0;34m][0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;34m"""Call out to OpenAI's embedding endpoint for embedding search docs.[0m
[0;34m

In [19]:
embeddings.embed_query??

[0;31mSignature:[0m [0membeddings[0m[0;34m.[0m[0membed_query[0m[0;34m([0m[0mtext[0m[0;34m:[0m [0;34m'str'[0m[0;34m)[0m [0;34m->[0m [0;34m'List[float]'[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
    [0;32mdef[0m [0membed_query[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mtext[0m[0;34m:[0m [0mstr[0m[0;34m)[0m [0;34m->[0m [0mList[0m[0;34m[[0m[0mfloat[0m[0;34m][0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;34m"""Call out to OpenAI's embedding endpoint for embedding query text.[0m
[0;34m[0m
[0;34m        Args:[0m
[0;34m            text: The text to embed.[0m
[0;34m[0m
[0;34m        Returns:[0m
[0;34m            Embedding for the text.[0m
[0;34m        """[0m[0;34m[0m
[0;34m[0m        [0membedding[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0m_embedding_func[0m[0;34m([0m[0mtext[0m[0;34m,[0m [0mengine[0m[0;34m=[0m[0mself[0m[0;34m.[0m[0mdeployment[0m[0;34m)[0m[0;34m[0m
[0;34m[0m        [0;32mreturn[

## Activity 

* discover what `_get_len_safe_embeddings` does

In [20]:
# embed the newline-free text with LangChain's `embed_query` (one string in, one vector out)
lc_embd_query = embeddings.embed_query(text.replace("\n", " "))

In [21]:
# embed the same text with LangChain's `embed_documents`; it takes a list of texts,
# so wrap the single text in a list and keep the first (only) result
lc_embd_docs = embeddings.embed_documents([text.replace("\n", " ")])[0]

In [22]:
# demonstrate the two LangChain methods return the same thing
check_close(lc_embd_query, lc_embd_docs)

True

In [23]:
# demonstrate the LangChain and direct OpenAI methods return the same thing
check_close(lc_embd_query, oai_embd_wo_new_lines)

True

# Vector Distances

## L1 (Manhattan) Norm

$$\Large  
\lVert \vec{u} \rVert_1 = 
\sum_{i=1}^{n} |u_i|
$$ 

## L2 (Euclidean) Norm

$$\Large  
\lVert \vec{u} \rVert_2 = 
\left[ \sum_{i=1}^{n} u_i^2 \right]^{1/2} 
$$ 

## Lp Norm

$$\Large  
\lVert \vec{u} \rVert_p = 
\left[ \sum_{i=1}^{n} |u_i|^p \right]^{1/p} 
$$ 

## L2 Distance

$$\Large  
d_{2} = \lVert \vec{x} - \vec{y} \rVert_2 = 
\left[ \sum_{i=1}^{n} \left( x_i - y_i\right)^2 \right]^{1/2} 
$$ 

## Cosine Distance

$$\Large  
d_{\cos} = 1 - \cos {\theta} = 1 - \frac{\vec{x} \cdot \vec{y}}{\lVert \vec{x} \rVert_2 \lVert \vec{y} \rVert_2}
$$ 


In [24]:
from scipy import spatial

In [25]:
# Embed three single words so their pairwise distances can be compared below.
words = ["cat", "dog", "chair"]
wembd = np.array(list(map(embeddings.embed_query, words)))
cat, dog, chair = wembd  # one row per word, in the same order as `words`

In [26]:
wembd

array([[-0.00702348, -0.01733333, -0.009632  , ..., -0.01430958,
        -0.02342342, -0.01420311],
       [-0.00334666, -0.0177677 , -0.0159152 , ..., -0.00609159,
         0.00458052, -0.01906579],
       [ 0.01898345, -0.01368758, -0.01306182, ..., -0.00837854,
         0.00442969, -0.00634318]])

In [27]:
wembd.shape

(3, 1536)

In [28]:
# implement the L2 (Euclidean) distance by hand:
# square root of the sum of squared element-wise differences
np.sqrt(np.sum((cat-dog)**2))

0.5223519681672937

In [29]:
# Euclidean (L2) distance from "cat" to each of the other words, via SciPy.
for label, other in (("cat to dog", dog), ("cat to chair", chair)):
    print(label, spatial.distance.euclidean(cat, other))

cat to dog 0.5223519681672937
cat to chair 0.5927160911041964


In [30]:
# Cosine distance (1 - cosine similarity) from "cat" to each of the other words, via SciPy.
for label, other in (("cat to dog", dog), ("cat to chair", chair)):
    print(label, spatial.distance.cosine(cat, other))

cat to dog 0.13642577705644987
cat to chair 0.1756561792455582
