In [None]:
# Install required libraries
!pip install openai faiss-cpu numpy python-dotenv

In [None]:
# Importing libraries
import openai
import numpy as np
import os
from dotenv import load_dotenv

# Importing faiss library (Facebook AI Similarity Search)
import faiss

load_dotenv()  # Load variables from the .env file into the environment so os.environ can read them

True

In [2]:
# Set your OpenAI API key.
# The key is read from the API_KEY environment variable; os.environ[...] raises
# KeyError if that variable is not set.
openai.api_key = os.environ["API_KEY"]

In [None]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Convert text into its embedding vector using the OpenAI embeddings API.

    Works with both the legacy (<1.0) and the current (>=1.0) openai package.

    Args:
        text: The text to embed.
        model: Name of the OpenAI embedding model to use.

    Returns:
        A NumPy array holding the embedding of ``text``.
    """
    if hasattr(openai, "OpenAI"):
        # openai>=1.0: openai.Embedding was removed; use the module-level
        # client, which still honours openai.api_key.
        response = openai.embeddings.create(input=text, model=model)
        vector = response.data[0].embedding
    else:
        # openai<1.0: legacy resource API returning a dict-like response.
        response = openai.Embedding.create(input=text, model=model)
        vector = response["data"][0]["embedding"]
    return np.array(vector)

In [4]:
# Example sentences: the small corpus that is embedded and indexed for
# similarity search
sentences = [
    "I love programming.",
    "Machine learning is fascinating.",
    "Deep learning is a subset of machine learning.",
    "I enjoy writing code in Python."
]

In [5]:
# Embed every example sentence (get_embedding is called once per sentence)
embeddings = list(map(get_embedding, sentences))

In [7]:
# Inspect the result: at this point it is a plain Python list of NumPy arrays
print(type(embeddings))
embeddings

<class 'list'>


[array([-0.00505038, -0.00910243, -0.01105342, ..., -0.00687739,
        -0.00086538, -0.02711806]),
 array([-0.02653796,  0.00856252,  0.01694327, ..., -0.01533333,
        -0.02314931, -0.01226277]),
 array([-0.02140337, -0.00273163,  0.02488495, ..., -0.0095069 ,
        -0.02813528, -0.00824145]),
 array([ 0.01494298, -0.00142405,  0.01142543, ...,  0.00628465,
         0.01327677, -0.04165521])]

In [8]:
# Stack the list of per-sentence vectors into a single 2-D NumPy array
# (one row per sentence) for efficient processing
embeddings = np.array(embeddings)

In [9]:
# Inspect the result again: it is now a single NumPy array
print(type(embeddings))
embeddings

<class 'numpy.ndarray'>


array([[-0.00505038, -0.00910243, -0.01105342, ..., -0.00687739,
        -0.00086538, -0.02711806],
       [-0.02653796,  0.00856252,  0.01694327, ..., -0.01533333,
        -0.02314931, -0.01226277],
       [-0.02140337, -0.00273163,  0.02488495, ..., -0.0095069 ,
        -0.02813528, -0.00824145],
       [ 0.01494298, -0.00142405,  0.01142543, ...,  0.00628465,
         0.01327677, -0.04165521]])

In [10]:
# Shape is (number of sentences, embedding dimension)
embeddings.shape

(4, 1536)

In [11]:
# Create a FAISS flat index for exact nearest-neighbour search.
# IndexFlatL2 ranks by L2 (Euclidean) distance only; it is not a cosine index.
d = embeddings.shape[1]  # Vector dimension
index = faiss.IndexFlatL2(d)
# FAISS works on C-contiguous float32 matrices, while the embeddings built
# from the API output are float64. Convert explicitly rather than relying on
# the Python wrapper to do it, which older faiss releases do not.
index.add(np.ascontiguousarray(embeddings, dtype=np.float32))  # Store embeddings in FAISS

In [14]:
# Embedding dimension used to build the index
d

1536

In [16]:
# Embed the query that will be matched against the indexed sentences
# Alternative query to try: "I like coding in Python."
query = "I'm interested in neural networks."
# Shape the embedding into a single-row 2-D array (one row per query)
query_embedding = np.reshape(get_embedding(query), (1, -1))

In [19]:
# Inspect the query embedding: a NumPy array with a single row
print(type(query_embedding))
query_embedding

<class 'numpy.ndarray'>


array([[-0.04367673,  0.00459294,  0.00884181, ..., -0.00616395,
        -0.01594383, -0.03006995]])

In [26]:
# Perform search
k = 2  # Get top 2 closest matches
# index.search returns, for each query row, the squared L2 distances to the k
# nearest stored vectors and their row positions in the index, closest first.
# The query is converted to C-contiguous float32 because that is the layout
# FAISS operates on; older faiss wrappers reject float64 input.
distances, indices = index.search(
    np.ascontiguousarray(query_embedding, dtype=np.float32), k
)

In [22]:
# Squared L2 distances from the query to its k nearest sentences
distances

array([[0.27444547, 0.34549877]], dtype=float32)

In [23]:
# Positions (into the sentences list) of the k nearest sentences
indices

array([[1, 2]])

In [24]:
def cosine_similarity(sentence, query):
    """Print and return the cosine similarity between two embedding vectors.

    Args:
        sentence: 1-D embedding vector of a stored sentence.
        query: 1-D embedding vector of the query.

    Returns:
        The cosine similarity: the dot product of the two vectors divided by
        the product of their Euclidean norms.
    """
    similarity = np.dot(sentence, query) / (np.linalg.norm(sentence) * np.linalg.norm(query))
    print(f"Similarity Score: {similarity:.4f}")
    # Returned as well as printed so callers can reuse the score.
    return similarity

In [25]:
# Print most similar sentences with distance
print("\n🔍 Most Similar Sentences with Distance:")
for distance, sentence_idx in zip(distances[0], indices[0]):
    print(f"- {sentences[sentence_idx]} (L2 distance: {distance:.4f})")
    # Reuse the vectors computed earlier instead of requesting both embeddings
    # again from the API on every iteration.
    cosine_similarity(embeddings[sentence_idx], query_embedding[0])


🔍 Most Similar Sentences with Distance:
- Machine learning is fascinating. (L2 distance: 0.2744)
Similarity Score: 0.8628
- Deep learning is a subset of machine learning. (L2 distance: 0.3455)
Similarity Score: 0.8273


### A smaller L2 distance and a higher cosine similarity score both indicate a closer match
