### Semantic Search

Semantic search is an advanced search method that understands the intent and context behind a query rather than just its keywords. It uses AI techniques (NLP and machine learning) to find conceptually similar results, which makes searches more relevant and natural than traditional keyword matching, which relies on exact word matches.

In [7]:
import numpy as np
from sentence_transformers import SentenceTransformer

In [8]:
import numpy as np
import torch # You need this import if you check for tensor type

def _to_numpy(vec):
    """Return ``vec`` as a NumPy array, converting PyTorch tensors if needed."""
    if isinstance(vec, torch.Tensor):
        # detach() drops autograd tracking (Tensor.numpy() refuses tensors that
        # require grad); cpu() makes sure the data lives in host memory.
        return vec.detach().cpu().numpy()
    return np.asarray(vec)


def cosine_similarity(vec1, vec2):
    """
    Compute the cosine similarity between two vectors.

    Each argument is converted independently, so PyTorch tensors (on any
    device, with or without gradients), NumPy arrays and plain sequences
    can be mixed freely.

    Args:
        vec1: First vector (``torch.Tensor``, ``np.ndarray`` or sequence).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. If either vector has
        zero norm the similarity is undefined; ``0.0`` is returned instead
        of NaN so that callers can still sort the scores.
    """
    vec1 = _to_numpy(vec1)
    vec2 = _to_numpy(vec2)

    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)

    # Guard against division by zero for all-zero vectors.
    if norm_product == 0:
        return 0.0
    return dot_product / norm_product

In [9]:
# Small demo corpus: each entry is one self-contained sentence to search over.
docs = [
    "Langchain is a framework for developing applications powered by language models.",
    (
        "python is a programming language that lets you work quickly "
        "and integrate systems more effectively."
    ),
    (
        "Machine learning is a field of artificial intelligence that uses "
        "statistical techniques to give computer systems the ability to "
        "'learn' from data."
    ),
    (
        "Embeddings are a way to represent words or phrases as vectors "
        "in a continuous vector space."
    ),
    (
        "APIs (Application Programming Interfaces) allow different software "
        "applications to communicate with each other."
    ),
]

# Natural-language question matched against the corpus above.
query = "What is Langchain?"


In [12]:
def semantic_search(query, docs, embedding_model, top_k=3):
    """Rank ``docs`` by cosine similarity to ``query`` and return the best matches.

    Args:
        query: Query text to embed.
        docs: Sized collection of document texts to search.
        embedding_model: Object exposing ``encode(texts, convert_to_tensor=True)``,
            e.g. a ``SentenceTransformer``.
        top_k: Maximum number of results to return.

    Returns:
        A list of at most ``top_k`` ``(similarity, document)`` tuples ordered
        from most to least similar. Documents with equal scores keep their
        relative input order. An empty ``docs`` yields an empty list.
    """
    # Nothing to rank: skip the (potentially expensive) embedding calls.
    if len(docs) == 0:
        return []

    query_embedding = embedding_model.encode(query, convert_to_tensor=True)
    doc_embeddings = embedding_model.encode(docs, convert_to_tensor=True)

    scored = [
        (cosine_similarity(query_embedding, doc_embedding), doc)
        for doc_embedding, doc in zip(doc_embeddings, docs)
    ]

    # Sort once, after every score is known, and compare on the score only so
    # that ties never fall back to comparing the document text.
    scored.sort(key=lambda pair: pair[0], reverse=True)

    return scored[:top_k]
        
    

In [13]:


# Instantiate the sentence-transformers model identified by this name.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Rank the corpus against the query; the bare name on the last line makes the
# notebook display the ranked (similarity, document) pairs.
result = semantic_search(query=query, docs=docs, embedding_model=embedding_model)
result

[(np.float32(0.708697),
  'Langchain is a framework for developing applications powered by language models.'),
 (np.float32(0.1372443),
  'APIs (Application Programming Interfaces) allow different software applications to communicate with each other.'),
 (np.float32(0.13393477),
  'python is a programming language that lets you work quickly and integrate systems more effectively.')]