## Generating Simple Embeddings

In [14]:
# Locate a .env file and load its variables into the process environment;
# override=True lets values from the file replace variables that are already set.
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [15]:
from openai import OpenAI
# No API key is passed explicitly — presumably the client picks it up from the
# environment populated by the .env loading step (TODO confirm).
client = OpenAI()

In [4]:
# Sample input spanning two lines; the line break is flattened to a space
# before the text is sent to the embeddings endpoint.
text = '''This is a random text
OpenAI o3 is a GPT model developed by OpenAI as a successor to OpenAI o1. '''
text = ' '.join(text.split('\n'))

# Request one embedding for the flattened text.
embedding = client.embeddings.create(model='text-embedding-3-small', input=text)


In [5]:
# Print the embedding vector of the first item in the response.
print(embedding.data[0].embedding)

[0.0023240922018885612, 0.016592085361480713, 0.05409766733646393, -0.0340111069381237, 0.003961293492466211, -0.040599923580884933, -0.0008356895996257663, 0.01812591962516308, -0.0019206271972507238, -0.017138930037617683, 0.03051663190126419, -0.016778811812400818, -0.02484811469912529, -0.003757893806323409, -0.02559502422809601, -0.01880614086985588, -0.02023327350616455, -0.09101638942956924, -0.03427786007523537, 0.004504804499447346, 0.018912842497229576, 0.005385091993957758, 0.013364364393055439, -0.022193914279341698, 0.00553847523406148, -0.010976918041706085, 0.01724563166499138, 0.03625183925032616, 0.006288720294833183, -0.04676193743944168, 0.03638521581888199, -0.020926833152770996, -0.08381403982639313, 0.0050849937833845615, -0.016178617253899574, 0.0005814398755319417, -0.00022215588251128793, 0.01997985877096653, 0.002032330259680748, 0.052283741533756256, 0.0014779826160520315, -0.010423404164612293, 0.008002613671123981, 0.03089008666574955, -0.016472045332193375

In [1]:
pip install pandas numpy -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

# Read the word list and shuffle it: sample(frac=1) returns every row in random
# order. No random_state is given, so the order differs from run to run.
df = pd.read_csv('files/words.csv').sample(frac=1)
df

Unnamed: 0,text
1,opossum
14,hamster
36,cat
16,ferret
32,pasta
29,salad
23,blue
19,lizard
0,fox
10,cappuccino


In [10]:
_shared_client = None  # OpenAI client created on first use and reused by get_embedding


def get_embedding(text, model='text-embedding-3-small', client=None):
    """Return the embedding vector for ``text``.

    Newlines in ``text`` are replaced with spaces before the request is sent.

    Parameters:
    text (str): Text to embed.
    model (str): Name of the embedding model to request.
    client: Optional object exposing ``embeddings.create(input=..., model=...)``.
        When omitted, a single ``OpenAI()`` client is created on the first call
        and reused afterwards instead of building a new client per call.
        Re-running this cell resets the cached client (useful after changing
        the API key in the environment).

    Returns:
    The ``embedding`` attribute of the first item in the response's ``data``.
    """
    global _shared_client
    if client is None:
        if _shared_client is None:
            _shared_client = OpenAI()
        client = _shared_client

    text = text.replace('\n', ' ')

    response = client.embeddings.create(
        input=text,
        model=model
    )
    return response.data[0].embedding


In [10]:
# One embedding request per row; the helper is passed directly, no wrapper needed.
df['embedding'] = df['text'].apply(get_embedding)

In [11]:
# Show the frame, which now carries the 'embedding' column.
df

Unnamed: 0,text,embedding
26,fish,"[-0.0026927476283162832, -0.021741770207881927..."
5,coffee,"[-0.01013763528317213, 0.0037400354631245136, ..."
3,purple,"[0.02476203814148903, -0.03475233167409897, -0..."
32,pasta,"[-0.050148531794548035, -0.04081052914261818, ..."
34,brown,"[-0.0307017732411623, -0.007146547082811594, 0..."
39,gray,"[0.004851989448070526, -0.02022680640220642, -..."
15,burger,"[-0.03547876328229904, -0.01602969318628311, 0..."
21,milk,"[0.039659809321165085, -0.004168314393609762, ..."
9,yellow,"[-0.014253244735300541, -0.017989395186305046,..."
18,squirrel,"[0.03277624770998955, -0.04038501903414726, -0..."


In [12]:
# Save the words and their embeddings; index=False leaves the row index out of the file.
df.to_csv('files/words-embeddings.csv', index=False)

## Estimating Embedding Costs with tiktoken


In [13]:
pip install tiktoken -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
import tiktoken
import pandas as pd
# Reload the saved words and embeddings so this section starts from the CSV file.
df = pd.read_csv('files/words-embeddings.csv')
df

Unnamed: 0,text,embedding
0,fox,"[-0.03349510580301285, 0.0016073230654001236, ..."
1,opossum,"[0.03201136365532875, 0.032397374510765076, -0..."
2,black,"[0.011193670332431793, -0.012095698155462742, ..."
3,purple,"[0.02476203814148903, -0.03475233167409897, -0..."
4,badger,"[0.021669577807188034, -0.022312505170702934, ..."
5,coffee,"[-0.01013763528317213, 0.0037400354631245136, ..."
6,rabbit,"[0.006238460540771484, -0.01577582396566868, 0..."
7,hare,"[0.018302442505955696, -0.022041788324713707, ..."
8,soda,"[0.01846451684832573, -0.025069599971175194, -..."
9,yellow,"[-0.014283397234976292, -0.018006160855293274,..."


In [4]:
# Tokenizer that tiktoken associates with the embedding model.
enc = tiktoken.encoding_for_model('text-embedding-3-small')
words = df['text'].tolist()

# Quick look at what an encoded string looks like.
print(enc.encode('Hello world!'))

# Add up the token count of every word without building an intermediate list.
total_tokens = sum(len(enc.encode(word)) for word in words)
print(f'Total tokens: {total_tokens}')

[9906, 1917, 0]
Total tokens: 62


In [5]:
# Price per token, assuming USD 0.02 per one million tokens
# (NOTE(review): confirm against the provider's current pricing).
cost_per_token = 0.02 / 1_000_000
estimated_cost = cost_per_token * total_tokens
print(f'Estimated cost in USD: {estimated_cost:.10f}')

Estimated cost in USD: 0.0000012400


## Performing Semantic Searches

In [6]:
import pandas as pd
import numpy as np
# The 'embedding' column is read back from the CSV as strings; it is parsed
# into arrays in the next step.
df = pd.read_csv('files/words-embeddings.csv')

In [7]:
from ast import literal_eval

# Each embedding was written to the CSV as the text of a Python list. Parse it
# back and convert it to a NumPy array. literal_eval replaces the original
# eval(): it accepts only Python literals, so nothing stored in the file can
# be executed as code.
df['embedding'] = df['embedding'].apply(literal_eval).apply(np.array)

In [16]:
# Embed the query with the same helper (and default model) used for the word list.
search_term = 'red'
search_term_vector = get_embedding(search_term)

AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-proj-***********************************************************************************************************************************************T_0A. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'code': 'invalid_api_key', 'param': None}, 'status': 401}

In [8]:
import numpy as np

def cosine_similarity(vector_x, vector_y):
    """
    Compute the cosine similarity between two vectors.

    Integer and boolean inputs are promoted to floating point before any
    arithmetic, so narrow integer dtypes (e.g. int8) cannot overflow in the
    dot product.

    Parameters:
    vector_x (array-like): First input vector.
    vector_y (array-like): Second input vector.

    Returns:
    float: Cosine similarity between vector_x and vector_y.

    Raises:
    ValueError: If either input is not one-dimensional, if the vectors have
        different lengths, or if either vector has zero magnitude.
    """
    # Convert inputs to NumPy arrays (no copy when they already are arrays)
    x = np.asarray(vector_x)
    y = np.asarray(vector_y)

    # np.linalg.norm promotes non-float input to float but np.dot does not, so
    # an int8/int16 dot product could wrap around and yield a wrong similarity.
    # Promote up front so both computations use the same precision.
    if not np.issubdtype(x.dtype, np.inexact):
        x = x.astype(float)
    if not np.issubdtype(y.dtype, np.inexact):
        y = y.astype(float)

    # Ensure the vectors are one-dimensional
    if x.ndim != 1 or y.ndim != 1:
        raise ValueError("Both vectors must be one-dimensional.")

    # Check if vectors have the same dimensions
    if x.shape[0] != y.shape[0]:
        raise ValueError("Vectors must be of the same dimensions.")

    # Compute the norm (magnitude) of each vector
    norm_x = np.linalg.norm(x)
    norm_y = np.linalg.norm(y)

    # Check for zero vectors to avoid division by zero
    if norm_x == 0 or norm_y == 0:
        raise ValueError("One of the vectors is zero; cosine similarity is not defined.")

    # Cosine similarity: dot product divided by the product of the magnitudes
    return np.dot(x, y) / (norm_x * norm_y)



In [17]:
# Score every word against the query vector and list the ten closest matches.
# Presumably a fallback that reuses a stored embedding when the API call is unavailable:
# search_term_vector = df['embedding'].iloc[20]
df['similarities'] = df['embedding'].apply(
    lambda vec: cosine_similarity(vec, search_term_vector)
)
df.sort_values(by='similarities', ascending=False).head(10)

Unnamed: 0,text,embedding,similarities
20,red,"[-0.02211996167898178, -0.010933708399534225, ...",1.0
23,blue,"[-0.0011275681899860501, -0.016529185697436333...",0.641048
3,purple,"[0.02476203814148903, -0.03475233167409897, -0...",0.565335
35,white,"[0.0032487320713698864, -0.02826567552983761, ...",0.547585
41,orange,"[-0.02592204324901104, -0.00554656470194459, -...",0.544626
11,deer,"[0.060789555311203, -0.03292562812566757, 0.00...",0.504658
39,gray,"[0.004851989448070526, -0.02022680640220642, -...",0.504009
2,black,"[0.011193670332431793, -0.012095698155462742, ...",0.485274
34,brown,"[-0.0307017732411623, -0.007146547082811594, 0...",0.481929
0,fox,"[-0.03349510580301285, 0.0016073230654001236, ...",0.473394


In [18]:
# Show the frame in its stored row order, including the 'similarities' column.
df

Unnamed: 0,text,embedding,similarities
0,fox,"[-0.03349510580301285, 0.0016073230654001236, ...",0.473394
1,opossum,"[0.03201136365532875, 0.032397374510765076, -0...",0.210644
2,black,"[0.011193670332431793, -0.012095698155462742, ...",0.485274
3,purple,"[0.02476203814148903, -0.03475233167409897, -0...",0.565335
4,badger,"[0.021669577807188034, -0.022312505170702934, ...",0.335412
5,coffee,"[-0.01013763528317213, 0.0037400354631245136, ...",0.300724
6,rabbit,"[0.006238460540771484, -0.01577582396566868, 0...",0.393808
7,hare,"[0.018302442505955696, -0.022041788324713707, ...",0.388736
8,soda,"[0.01846451684832573, -0.025069599971175194, -...",0.291013
9,yellow,"[-0.014283397234976292, -0.018006160855293274,...",0.438403


In [19]:
# Select the two words by their text rather than by row position: the word list
# is shuffled before the embeddings are saved, so fixed positions such as
# iloc[21] / iloc[10] can silently point at different words after a fresh run.
# An IndexError here means the word is missing from the data.
v1 = df.loc[df['text'] == 'milk', 'embedding'].iloc[0]
v2 = df.loc[df['text'] == 'cappuccino', 'embedding'].iloc[0]
# Element-wise sum of the two embedding arrays gives the combined query vector.
v = v1 + v2

df['similarities'] = df['embedding'].apply(lambda x: cosine_similarity(x, v))
df.sort_values('similarities', ascending=False).head(10)

Unnamed: 0,text,embedding,similarities
21,milk,"[0.039659809321165085, -0.004168314393609762, ...",0.829871
10,cappuccino,"[-0.02482164278626442, -0.030885322019457817, ...",0.829871
5,coffee,"[-0.01013763528317213, 0.0037400354631245136, ...",0.507647
32,pasta,"[-0.05015381798148155, -0.04086881875991821, 0...",0.451762
8,soda,"[0.01846451684832573, -0.025069599971175194, -...",0.443202
22,tea,"[-0.016386866569519043, -0.030908560380339622,...",0.409031
24,sandwich,"[-0.005681606009602547, -0.04887460917234421, ...",0.378767
29,salad,"[-0.0007213581702671945, -0.037118617445230484...",0.34307
40,water,"[0.0030297308694571257, 0.017433997243642807, ...",0.335281
35,white,"[0.0032487320713698864, -0.02826567552983761, ...",0.334519
