# Vector embeddings with OpenAI

## Setup OpenAI API

## See aka.ms/aitour/vectors

In [1]:
import os
import dotenv
import openai
from openai import AzureOpenAI

# Read the Azure OpenAI credentials from the environment, after loading any .env file
dotenv.load_dotenv()
AZURE_API_KEY = os.environ.get("AZURE_API_KEY")
AZURE_ENDPOINT = os.environ.get("AZURE_ENDPOINT")

# API version handed to the client
# (alternative kept for reference: "2023-05-15")
api_version = "2024-06-01"

# Single client instance used by the cells below
client = AzureOpenAI(api_key=AZURE_API_KEY, azure_endpoint=AZURE_ENDPOINT, api_version=api_version)

In [2]:
# Available embedding models: https://platform.openai.com/docs/models/embeddings
# (text-embedding-ada-002 is another example of an embedding model name)
# This value is passed as `model=` on every embeddings request in this notebook.
# NOTE(review): with AzureOpenAI this presumably has to match the deployment name — confirm.

embedding_model = "text-embedding-3-small"

## Vector representations

In [3]:
%%time

# Text to embed
sentence = "A dog just walked past my house and yipped yipped like a Martian"

# Request the embedding and keep the vector from the first entry of the response
response = client.embeddings.create(
    input=sentence,
    model=embedding_model,
)
vector = response.data[0].embedding

CPU times: user 149 ms, sys: 1.16 s, total: 1.31 s
Wall time: 498 ms


In [4]:
# Display the raw embedding values returned for `sentence`
vector

[-0.017961615696549416,
 -0.006984414998441935,
 -0.056301847100257874,
 0.019762517884373665,
 0.010645456612110138,
 0.031349893659353256,
 0.024572817608714104,
 -0.01263000164180994,
 -0.03317448869347572,
 -0.008127749897539616,
 -0.02073405683040619,
 -0.012511521577835083,
 0.01396882999688387,
 -0.01033740770071745,
 -0.029359422624111176,
 0.015544617548584938,
 -0.016563547775149345,
 0.006907402537763119,
 -0.033885370939970016,
 0.03824544697999954,
 0.007262843661010265,
 0.059287551790475845,
 -0.05251047760248184,
 8.830486331135035e-05,
 -0.004072761163115501,
 0.02200179547071457,
 -0.01759432815015316,
 -0.010544748045504093,
 0.0537426732480526,
 0.02793765999376774,
 0.0026080480311065912,
 -0.026966121047735214,
 -0.024501729756593704,
 -0.034027546644210815,
 -0.018577713519334793,
 -0.010906113311648369,
 -0.014904824085533619,
 -0.007754536811262369,
 0.023944873362779617,
 0.03665781021118164,
 -0.030141392722725868,
 -0.0238500889390707,
 0.050899144262075424,

In [5]:
# Number of dimensions in the embedding
len(vector)

1536

### Document similarity modeled as cosine similarity

In [6]:
%%time

import numpy as np
import pandas as pd


def cosine_similarity(a, b):
    """Return the cosine similarity between two vectors.

    Args:
        a: First vector (a sequence or array accepted by ``np.dot``).
        b: Second vector, with the same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``. For non-zero vectors this is
        1.0 when they point the same way, 0.0 when they are orthogonal and
        -1.0 when they point in opposite directions. Returns 0.0 when either
        vector has zero norm, where the ratio is undefined.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # Dividing here would emit a RuntimeWarning and produce nan
        return 0.0
    return np.dot(a, b) / norm_product

# Left-hand side of each comparison: the same reference sentence, three times
sentences1 = ['The new movie is awesome'] * 3

# Right-hand side: an exact copy, a paraphrase, and a meaningless string
sentences2 = [
    'The new movie is awesome',
    'This recent movie is so good',
    'djkshsjdkhfsjdfkhsd',
]

def get_embeddings(sentences):
    """Embed a batch of sentences with one API request.

    Args:
        sentences: The input passed to the embeddings endpoint (a list of
            strings at the call sites in this notebook).

    Returns:
        A list with the ``embedding`` of each entry in the response's
        ``data``, in the order the response lists them.
    """
    result = client.embeddings.create(input=sentences, model=embedding_model)
    vectors = []
    for item in result.data:
        vectors.append(item.embedding)
    return vectors

# Embed both sentence lists (one request per list)
embeddings1 = get_embeddings(sentences1)
embeddings2 = get_embeddings(sentences2)

# Print each sentence pair next to its similarity score
for left, right, left_vec, right_vec in zip(sentences1, sentences2, embeddings1, embeddings2):
    score = cosine_similarity(left_vec, right_vec)
    print(f"{left} \t\t {right} \t\t Score: {score:.4f}")

The new movie is awesome 		 The new movie is awesome 		 Score: 1.0000
The new movie is awesome 		 This recent movie is so good 		 Score: 0.5637
The new movie is awesome 		 djkshsjdkhfsjdfkhsd 		 Score: 0.1299
CPU times: user 137 ms, sys: 36 ms, total: 173 ms
Wall time: 701 ms


### Vector search

In [7]:
import json

# Load the precomputed vectors for movie titles.
# The search cell iterates this object's keys as titles and looks up each key's vector.
# Decode explicitly as UTF-8 so the file is read the same way on every platform,
# instead of relying on the platform's default text encoding.
with open('./openai_movies.json', encoding='utf-8') as json_file:
    movie_vectors = json.load(json_file)

In [8]:
# Embed the search query (passed as a one-element list)
# NOTE(review): "Dalmations" is a misspelling of "Dalmatians" — possibly deliberate
# to show that the search tolerates typos; confirm before changing.
query = "101 Dalmations"
embeddings_response = client.embeddings.create(input=[query], model=embedding_model)
vector = embeddings_response.data[0].embedding

# Score every stored title against the query vector
# NOTE(review): the recorded scores for this cell are all below 0.09 and the titles
# look unrelated to the query. Cosine scores are only comparable when the stored
# vectors and the query vector come from the same embedding model — confirm that
# openai_movies.json was generated with `embedding_model`.
scores = [
    (title, cosine_similarity(vector, title_vector))
    for title, title_vector in movie_vectors.items()
]

# Rank best-first and show the ten closest titles
df = pd.DataFrame(scores, columns=['Movie', 'Score']).sort_values('Score', ascending=False)
df.head(10)

Unnamed: 0,Movie,Score
275,Kundun,0.086887
316,Deuce Bigalow: Male Gigolo,0.078912
402,The Village,0.077904
70,The Good Mother,0.077118
133,1492: Conquest of Paradise,0.076437
389,Calendar Girls,0.075668
308,The 13th Warrior,0.075653
219,Feast of July,0.074937
284,Armageddon,0.074414
29,Condorman,0.074038
