In [3]:
from dotenv import load_dotenv
import os

load_dotenv()

# Fail fast when the key is missing, but never echo the secret itself: cell
# outputs are saved inside the notebook and travel with it when it is shared
# or committed.
# NOTE(review): the key this cell used to display should be treated as
# compromised and rotated.
if 'OPENAI_API_KEY' not in os.environ:
    raise KeyError('OPENAI_API_KEY')

In [4]:
import json
import numpy as np
from typing import List, Union

def cosine_similarity(v1: Union[List[float], np.ndarray],
                     v2: Union[List[float], np.ndarray]) -> float:
    """
    Calculate the cosine similarity between two vectors.

    Both inputs are converted to float64 before any arithmetic, so
    integer-typed arrays (for example int8-quantized embeddings) cannot
    overflow inside the dot product.

    Args:
        v1: First vector (list or numpy array of numbers)
        v2: Second vector (list or numpy array of numbers)
    Returns:
        float: Cosine similarity score between -1 and 1
    Raises:
        ValueError: If the vectors differ in shape, or if either one has
            zero magnitude (all zeros, or empty)
    """
    # Force float64: np.dot keeps the input dtype, and narrow integer dtypes
    # wrap around silently, which would yield a wrong similarity.
    v1_array = np.asarray(v1, dtype=np.float64)
    v2_array = np.asarray(v2, dtype=np.float64)
    # Check if vectors are of equal length
    if v1_array.shape != v2_array.shape:
        raise ValueError("Vectors must be of equal length")
    # Calculate dot product and magnitudes
    dot_product = np.dot(v1_array, v2_array)
    magnitude1 = np.linalg.norm(v1_array)
    magnitude2 = np.linalg.norm(v2_array)
    # Check for zero vectors (an empty vector also has norm 0 and lands here)
    if magnitude1 == 0 or magnitude2 == 0:
        raise ValueError("Vectors must not be zero vectors")
    # Calculate cosine similarity
    similarity = dot_product / (magnitude1 * magnitude2)
    # Due to floating point precision, we might get values slightly outside
    # [-1, 1]; clamp, and hand back a plain Python float as annotated.
    return float(max(min(similarity, 1.0), -1.0))

In [5]:
from openai import OpenAI

# No arguments are passed, so the client presumably falls back to its default
# configuration (the OPENAI_API_KEY environment variable) — confirm against
# the SDK documentation.
client = OpenAI()

In [6]:
# Request an embedding vector for the first sentence.
response = client.embeddings.create(
    model="text-embedding-3-small",
    input="The cat is blue",
)

# Keep the embedding of the first (index 0) entry in the response data.
embedding_1 = response.data[0].embedding

In [7]:
# Request an embedding vector for the second sentence.
response = client.embeddings.create(
    model="text-embedding-3-small",
    input="The universe is very large",
)

# Keep the embedding of the first (index 0) entry in the response data.
embedding_2 = response.data[0].embedding

In [8]:
# Compare "The cat is blue" (embedding_1) with "The universe is very large"
# (embedding_2).
cosine = cosine_similarity(embedding_1, embedding_2)
print(cosine)

0.15815272156163315


In [9]:
# Request an embedding vector for the third sentence.
response = client.embeddings.create(
    model="text-embedding-3-small",
    input="The cat is green",
)

# Keep the embedding of the first (index 0) entry in the response data.
embedding_3 = response.data[0].embedding

In [10]:
# Compare "The cat is blue" (embedding_1) with "The cat is green"
# (embedding_3); the two sentences differ in a single word.
cosine = cosine_similarity(embedding_1, embedding_3)
print(cosine)

0.7479201266341614
