In [8]:
!pip install whoosh sentence-transformers scikit-learn pandas matplotlib

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting matplotlib
  Downloading matplotlib-3.10.7-cp313-cp313-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.3-cp313-cp313-macosx_11_0_arm64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.60.1-cp313-cp313-macosx_10_13_universal2.whl.metadata (112 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.9-cp313-cp313-macosx_11_0_arm64.whl.metadata (6.3 kB)
Collecting pyparsing>=3 (from matplotlib)
  Downloading pyparsing-3.2.5-py3-none-any.whl.metadata (5.0 kB)
Downloading matplotlib-3.10.7-cp313-cp313-macosx_11_0_arm64.whl (8.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m10.8 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hDownloading contourpy-1.3.3-cp313-cp313-macosx_11_0_arm64.whl (274 kB)
Usi

In [2]:
docs = [
    "A beginner's guide to engine troubleshooting: check fuel lines, spark, and air intake before replacing parts.",
    "Piston wear can cause loss of compression; regular maintenance and proper lubrication extend engine life.",
    "Valve timing issues often masquerade as rough idle—inspect the timing belt and camshaft alignment.",
    "Motor diagnostics for intermittent power loss: scan error codes, inspect sensors, and test the ignition coil.",
    "Understanding airflow: clogged filters, intake leaks, and MAF sensor failures reduce performance.",
    "Basic oil system checks: pressure light warnings, pump failures, and choosing the right viscosity."
]
titles = [
    "Engine Troubleshooting 101",
    "Piston Wear & Compression",
    "Valve Timing Problems",
    "Motor Diagnostics Checklist",
    "Airflow & Intake Issues",
    "Oil System Basics"
]

In [3]:
import os, shutil, tempfile
from whoosh import index
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser

# Create temp index
index_dir = tempfile.mkdtemp()
schema = Schema(title=ID(stored=True), content=TEXT(stored=True))
ix = index.create_in(index_dir, schema)

# Add documents
writer = ix.writer()
for t, d in zip(titles, docs):
    writer.add_document(title=t, content=d)
writer.commit()

# Run a keyword (TF-IDF) query
query_text = "engine troubleshooting"

with ix.searcher() as searcher:
    parser = QueryParser("content", ix.schema)
    q = parser.parse(query_text)
    results = searcher.search(q, limit=10)
    kw_hits = [(r['title'], r.score) for r in results]

# Cleanup index dir when you’re truly done (leave for now if you want to re-run)
# shutil.rmtree(index_dir)

kw_hits


[('Engine Troubleshooting 101', 3.688410483089193)]

In [5]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Embed documents and query
doc_embeddings = model.encode(docs, normalize_embeddings=True)
query_embedding = model.encode([query_text], normalize_embeddings=True)

# Cosine similarity scores
sims = cosine_similarity(query_embedding, doc_embeddings).flatten()

# Rank by semantic similarity
sem_hits_idx = np.argsort(-sims)  # descending
sem_hits = [(titles[i], float(sims[i])) for i in sem_hits_idx]

sem_hits[:5]

[('Engine Troubleshooting 101', 0.7078356742858887),
 ('Valve Timing Problems', 0.4606196880340576),
 ('Motor Diagnostics Checklist', 0.39236342906951904),
 ('Piston Wear & Compression', 0.3693113625049591),
 ('Oil System Basics', 0.2519925832748413)]

In [6]:
import pandas as pd

# Normalize/pretty print both lists with rank
kw_df = pd.DataFrame(kw_hits, columns=["Title", "TFIDF_Score"])
kw_df["KW_Rank"] = range(1, len(kw_df) + 1)

sem_df = pd.DataFrame(sem_hits, columns=["Title", "CosineSim"])
sem_df["SEM_Rank"] = range(1, len(sem_df) + 1)

# Merge on title to show both ranks together
comparison = pd.merge(sem_df, kw_df, on="Title", how="outer")

# Sort by semantic rank to highlight the “meaning” ordering
comparison_sorted = comparison.sort_values(by="SEM_Rank", na_position="last")

comparison_sorted[["Title", "SEM_Rank", "CosineSim", "KW_Rank", "TFIDF_Score"]]

Unnamed: 0,Title,SEM_Rank,CosineSim,KW_Rank,TFIDF_Score
1,Engine Troubleshooting 101,1,0.707836,1.0,3.68841
5,Valve Timing Problems,2,0.46062,,
2,Motor Diagnostics Checklist,3,0.392363,,
4,Piston Wear & Compression,4,0.369311,,
3,Oil System Basics,5,0.251993,,
0,Airflow & Intake Issues,6,0.197247,,
