In [None]:
# Install the third-party packages this notebook imports (whoosh, sentence-transformers,
# scikit-learn, pandas) into the running kernel's environment.
# NOTE(review): versions are unpinned, so results may drift between runs — consider pinning.
# NOTE(review): numpy is imported later but not listed here; presumably it arrives as a
# dependency of the packages above — confirm.
%pip install whoosh sentence-transformers scikit-learn pandas

In [9]:
# Toy corpus: each entry pairs a short title with the body text that gets
# indexed and embedded. Keeping title and text side by side makes it hard for
# the two lists below to drift out of alignment.
corpus = [
    (
        "Engine Troubleshooting 101",
        "A beginner's guide to engine troubleshooting: check fuel lines, spark, and air intake before replacing parts.",
    ),
    (
        "Piston Wear & Compression",
        "Piston wear can cause loss of compression; regular maintenance and proper lubrication extend engine life.",
    ),
    (
        "Valve Timing Problems",
        "Valve timing issues often masquerade as rough idle—inspect the timing belt and camshaft alignment.",
    ),
    (
        "Motor Diagnostics Checklist",
        "Motor diagnostics for intermittent power loss: scan error codes, inspect sensors, and test the ignition coil.",
    ),
    (
        "Airflow & Intake Issues",
        "Understanding airflow: clogged filters, intake leaks, and MAF sensor failures reduce performance.",
    ),
    (
        "Oil System Basics",
        "Basic oil system checks: pressure light warnings, pump failures, and choosing the right viscosity.",
    ),
]

# Parallel lists consumed by the later cells: docs[i] is the body for titles[i].
docs = [text for _, text in corpus]
titles = [title for title, _ in corpus]

In [10]:
import os, shutil, tempfile
from whoosh import index
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser


In [11]:
# Describe the index layout: both fields are stored so they can be read back
# from search hits; `content` is the analysed full-text field that queries run
# against, `title` is indexed as a single untokenised value.
schema = Schema(
    title=ID(stored=True),
    content=TEXT(stored=True),
)

# Create the on-disk index in a fresh temporary directory. mkdtemp() does not
# delete the directory on its own; it stays until removed explicitly.
index_dir = tempfile.mkdtemp()
ix = index.create_in(index_dir, schema)

In [12]:
# Add documents: one index entry per (title, body) pair.
# The two lists are matched by position, and zip() would silently drop the
# unmatched tail if they ever differed in length, so fail loudly instead.
assert len(titles) == len(docs), "titles and docs must have the same length"

# The writer is used as a context manager so that it commits on a clean exit
# and cancels (releasing the index write lock) if add_document raises; a
# failed cell can then simply be re-run instead of hitting a stale lock.
# NOTE: running this cell again on the same index appends the documents a
# second time; re-create the index first to avoid duplicate hits.
with ix.writer() as writer:
    for t, d in zip(titles, docs):
        writer.add_document(title=t, content=d)

In [13]:
# Run a keyword query against the "content" field.
# The searcher is opened with its default weighting, which in Whoosh is BM25F
# rather than classic TF-IDF, so the scores below are BM25F scores.
query_text = "engine troubleshooting"

# Parsing only needs the schema, so it is done before the searcher is opened.
parser = QueryParser("content", ix.schema)
q = parser.parse(query_text)

# Hits must be read while the searcher is still open; copy out the stored
# title and the score so the list remains usable after the block exits.
with ix.searcher() as searcher:
    results = searcher.search(q, limit=10)
    kw_hits = [(hit["title"], hit.score) for hit in results]

# Cleanup index dir when you’re truly done (leave for now if you want to re-run)
# shutil.rmtree(index_dir)

kw_hits

[('Engine Troubleshooting 101', 3.688410483089193)]

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [15]:
# Imported here because no earlier cell brings SentenceTransformer into scope;
# without it this cell fails with NameError after a kernel restart.
from sentence_transformers import SentenceTransformer

# Load the pretrained MiniLM sentence-embedding model used for the semantic
# side of the comparison.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [16]:
# Encode the query and the corpus with the same model and the same settings.
# The query is wrapped in a one-element list so it is encoded as a batch of
# one, like the documents. normalize_embeddings=True asks the model for
# normalised vectors.
query_embedding = model.encode([query_text], normalize_embeddings=True)
doc_embeddings = model.encode(docs, normalize_embeddings=True)

In [17]:
# Score the query against every document, then flatten the result to a 1-D
# array so that sims[i] is the similarity for docs[i].
similarity_matrix = cosine_similarity(query_embedding, doc_embeddings)
sims = similarity_matrix.flatten()

In [18]:
# Rank documents by semantic similarity, best match first.
# argsort is ascending, so sort the negated scores to get descending order.
sem_hits_idx = np.argsort(np.negative(sims))

# Pair each ranked document's title with its score as a plain Python float.
sem_hits = [
    (titles[position], float(score))
    for position, score in zip(sem_hits_idx, sims[sem_hits_idx])
]

sem_hits[:5]

[('Engine Troubleshooting 101', 0.6778184771537781),
 ('Valve Timing Problems', 0.4540880024433136),
 ('Motor Diagnostics Checklist', 0.45055025815963745),
 ('Oil System Basics', 0.32170265913009644),
 ('Piston Wear & Compression', 0.23983070254325867)]

In [19]:
import pandas as pd

# Tabulate each hit list and attach a 1-based rank column; both lists are
# already in ranked order, so the rank is simply the row position.
kw_df = pd.DataFrame(kw_hits, columns=["Title", "TFIDF_Score"]).assign(
    KW_Rank=lambda frame: range(1, len(frame) + 1)
)
sem_df = pd.DataFrame(sem_hits, columns=["Title", "CosineSim"]).assign(
    SEM_Rank=lambda frame: range(1, len(frame) + 1)
)

# Outer-join on title so every document appears once; documents the keyword
# search did not return get NaN in the keyword columns (which is also why
# KW_Rank is displayed as a float).
comparison = sem_df.merge(kw_df, on="Title", how="outer")

# Present the rows in semantic-rank order to highlight the "meaning" ordering.
comparison_sorted = comparison.sort_values("SEM_Rank", na_position="last")

display_columns = ["Title", "SEM_Rank", "CosineSim", "KW_Rank", "TFIDF_Score"]
comparison_sorted[display_columns]

Unnamed: 0,Title,SEM_Rank,CosineSim,KW_Rank,TFIDF_Score
1,Engine Troubleshooting 101,1,0.677818,1.0,3.68841
5,Valve Timing Problems,2,0.454088,,
2,Motor Diagnostics Checklist,3,0.45055,,
3,Oil System Basics,4,0.321703,,
4,Piston Wear & Compression,5,0.239831,,
0,Airflow & Intake Issues,6,0.225038,,
