In [5]:
from sentence_transformers import SentenceTransformer
import numpy as np

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


In [6]:
# Load a small, fast embedding model
print("Loading embedding model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("✅ Model loaded!")
print(f"Model produces {model.get_sentence_embedding_dimension()} dimensional embeddings")

Loading embedding model...
✅ Model loaded!
Model produces 384 dimensional embeddings


In [25]:
# Simple example
sentences = [
    "The dog is playing in the park",
    "A puppy is running outside",
    "The cat is sleeping on the couch",
    "Python is a programming language",
    "Machine learning models need data",
    "I love coding in Python"
]

# Generate embedding
embeddings = model.encode(sentences)

print(f"Original text: {sentences}")
print(f"Embedding shape: {embeddings.shape}")
print(f"Embedding type: {type(embeddings)}")
print(f"\nFirst 10 values: {embeddings[:10]}")

Original text: ['The dog is playing in the park', 'A puppy is running outside', 'The cat is sleeping on the couch', 'Python is a programming language', 'Machine learning models need data', 'I love coding in Python']
Embedding shape: (6, 384)
Embedding type: <class 'numpy.ndarray'>

First 10 values: [[ 0.04757566 -0.07015255  0.06429745 ...  0.07358797  0.01249519
   0.01645608]
 [-0.0252566   0.03054359  0.05249952 ...  0.01063144  0.01476685
   0.10832038]
 [ 0.12203883 -0.04751379 -0.00115909 ...  0.08472569  0.06573964
   0.0092331 ]
 [-0.03537083  0.03816499 -0.04126014 ...  0.11130316  0.19625439
  -0.0289743 ]
 [ 0.0166593  -0.04558858  0.02346507 ...  0.02717924 -0.03379643
  -0.05370044]
 [-0.06430852  0.01564189 -0.0467849  ...  0.15115134  0.10791418
  -0.04270949]]


In [None]:
#3 helper function to calculate similarity scores 

# def cosine_similarity(a, b):
#     return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def cosine_similarity(vec1, vec2):
    """
    Calculate cosine similarity between two vectors.
    
    Returns a score between -1 and 1 (higher = more similar)
    """
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    return dot_product / (norm1 * norm2)

print("Similarity function ready!")

Similarity function ready!


In [26]:
def similarity_report(query_index):
    query_embedding = embeddings[query_index].reshape(1, -1)
    scores = cosine_similarity(query_embedding, embeddings)[0]
    
    results = list(zip(sentences, scores))
    results.sort(key=lambda x: x[1], reverse=True)
    return results

print("helper function ready")


helper function ready


In [27]:
# calculate similarity

# 4. Calculate similarities

print("Similarity Analysis:\n")

# Query 1: "The dog is playing in the park" (index 0)
query_idx1 = 0
similarities1 = [cosine_similarity(embeddings[query_idx1], embeddings[i]) for i in range(len(sentences))]

print(f"Query: \"{sentences[query_idx1]}\"")
print("Similarity scores:")
for i, score in enumerate(similarities1):
    print(f"  -> {sentences[i]:<40} : {score:.4f}")

max_idx1 = np.argmax(similarities1)
min_idx1 = np.argmin(similarities1)

print(f"\nMost similar: \"{sentences[max_idx1]}\" (score: {similarities1[max_idx1]:.4f})")
print(f"Least similar: \"{sentences[min_idx1]}\" (score: {similarities1[min_idx1]:.4f})")
print("Observations: The model correctly identifies that 'A puppy is running outside' is most similar "
      "because both describe young dogs being active outdoors. 'The cat is sleeping' is least similar "
      "due to different animal and action.\n")


Similarity Analysis:

Query: "The dog is playing in the park"
Similarity scores:
  -> The dog is playing in the park           : 1.0000
  -> A puppy is running outside               : 0.3984
  -> The cat is sleeping on the couch         : 0.0714
  -> Python is a programming language         : 0.0987
  -> Machine learning models need data        : -0.0052
  -> I love coding in Python                  : 0.0902

Most similar: "The dog is playing in the park" (score: 1.0000)
Least similar: "Machine learning models need data" (score: -0.0052)
Observations: The model correctly identifies that 'A puppy is running outside' is most similar because both describe young dogs being active outdoors. 'The cat is sleeping' is least similar due to different animal and action.



In [28]:
# Query 2: "Python is a programming language" (index 3)
query_idx2 = 3
similarities2 = [cosine_similarity(embeddings[query_idx2], embeddings[i]) for i in range(len(sentences))]

print(f"Query: \"{sentences[query_idx2]}\"")
print("Similarity scores:")
for i, score in enumerate(similarities2):
    print(f"  -> {sentences[i]:<40} : {score:.4f}")

max_idx2 = np.argmax(similarities2)
min_idx2 = np.argmin(similarities2)

print(f"\nMost similar: \"{sentences[max_idx2]}\" (score: {similarities2[max_idx2]:.4f})")
print(f"Least similar: \"{sentences[min_idx2]}\" (score: {similarities2[min_idx2]:.4f})")
print("Observations: The model finds 'I love coding in Python' most similar because both are directly "
      "about Python programming. Sentences about dogs and cats score very low, showing strong topic separation.\n")



Query: "Python is a programming language"
Similarity scores:
  -> The dog is playing in the park           : 0.0987
  -> A puppy is running outside               : 0.0395
  -> The cat is sleeping on the couch         : 0.0199
  -> Python is a programming language         : 1.0000
  -> Machine learning models need data        : 0.1133
  -> I love coding in Python                  : 0.7304

Most similar: "Python is a programming language" (score: 1.0000)
Least similar: "The cat is sleeping on the couch" (score: 0.0199)
Observations: The model finds 'I love coding in Python' most similar because both are directly about Python programming. Sentences about dogs and cats score very low, showing strong topic separation.



In [29]:
# Recommended threshold
print("Recommended similarity threshold: 0.4 - 0.5")
print("Reasoning:")
print("   • Scores above ~0.5: clearly related (same topic)")
print("   • Scores 0.3–0.5: somewhat related (might be useful with more context)")
print("   • Scores below ~0.3: unrelated (should be filtered out in RAG retrieval)")

Recommended similarity threshold: 0.4 - 0.5
Reasoning:
   • Scores above ~0.5: clearly related (same topic)
   • Scores 0.3–0.5: somewhat related (might be useful with more context)
   • Scores below ~0.3: unrelated (should be filtered out in RAG retrieval)
