# Comparison of keyword search vs semantic search

In [1]:
# pip install numpy scikit-learn sentence-transformers

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Example corpus: each string is one chunk that the query is scored against.
chunks = [
    "I really enjoyed the film we watched last night",
    "The movie was excellent",
    "I didn't like the documentary",
    "The cinematic experience was remarkable",
]

# Search query compared with every chunk by both approaches below.
query = "The film was great"



In [3]:

# Keyword-based approach (Bag of Words)
# CountVectorizer is used with its default settings, so no stop-word list is
# applied: function words shared with the query (e.g. "the", "was") count as
# keyword matches exactly like content words such as "film".
vectorizer = CountVectorizer()
keyword_vectors = vectorizer.fit_transform(chunks)
keyword_matrix = keyword_vectors.toarray()

# Query: encoded with the vocabulary fitted on the chunks, so query words that
# never occur in any chunk (here "great") contribute nothing to the vector.
query_keywords = vectorizer.transform([query]).toarray()

# Score the query against every chunk with a single pairwise call instead of
# one call per chunk; row 0 holds the scores for the (only) query row.
keyword_similarities = cosine_similarity(query_keywords, keyword_matrix)[0]

print("Similarity based on keywords:")
for i, (doc, similarity) in enumerate(zip(chunks, keyword_similarities)):
    print(f"Chunk {i+1}: {similarity:.4f} - {doc}")



Similarity based on keywords:
Chunk 1: 0.4082 - I really enjoyed the film we watched last night
Chunk 2: 0.5774 - The movie was excellent
Chunk 3: 0.2887 - I didn't like the documentary
Chunk 4: 0.5164 - The cinematic experience was remarkable


In [4]:
# Embedding-based approach
# Encode every chunk with the 'all-MiniLM-L6-v2' SentenceTransformer model.
model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_vectors = model.encode(chunks)

# Query: encoded as a batch of one; keep the single resulting vector.
query_embedding = model.encode([query])[0]

# Score the query against every chunk with a single pairwise call instead of
# one call per chunk; row 0 holds the scores for the (only) query row.
embedding_similarities = cosine_similarity([query_embedding], embedding_vectors)[0]

print("\nSimilarity based on embeddings:")
for i, (doc, similarity) in enumerate(zip(chunks, embedding_similarities)):
    print(f"Chunk {i+1}: {similarity:.4f} - {doc}")


Similarity based on embeddings:
Chunk 1: 0.6905 - I really enjoyed the film we watched last night
Chunk 2: 0.9167 - The movie was excellent
Chunk 3: 0.4964 - I didn't like the documentary
Chunk 4: 0.6170 - The cinematic experience was remarkable
