In [1]:
# ✅ COMPLETE HUGGING FACE SEMANTIC SEARCH SCRIPT (NO WIDGETS)
# --------------------------------------------------------------
# Works in Jupyter, VS Code, or Colab. No ipywidgets needed.

# --------------------------------------------------------------
# 1. INSTALL DEPENDENCIES (ONLY ONCE)
# --------------------------------------------------------------
# Uncomment the line below if running in Colab or fresh setup:
# !pip install transformers torch pandas scikit-learn

# --------------------------------------------------------------
# 2. IMPORTS AND SETUP
# --------------------------------------------------------------
import os
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# Optional: Avoid Hugging Face warnings on Windows
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

# --------------------------------------------------------------
# 3. LOAD MODEL AND TOKENIZER
# --------------------------------------------------------------
model_name = "thenlper/gte-base"  # You can change to gte-small, bge-small, etc.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(f"\u2705 Model loaded on {device}")

# --------------------------------------------------------------
# 4. EMBEDDING FUNCTION (MEAN POOLING)
# --------------------------------------------------------------
def embed(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.cpu().numpy()

# --------------------------------------------------------------
# 5. LOAD DATASET
# --------------------------------------------------------------
df = pd.read_csv(r"C:\\Users\\Admin\\Downloads\\aggregate_and_limits_1000_rows.csv")

# Combine multiple columns into one searchable text
df["combined_text"] = (
    df["limits"].fillna('') + " " +
    df["aggregate"].fillna('') + " " +
    df["aggregate_and_limits"].fillna('') + " " +
    df["remaining"].fillna('')
)

# --------------------------------------------------------------
# 6. EMBED ALL DOCUMENTS
# --------------------------------------------------------------
print("\U0001F4E6 Generating embeddings...")
all_embeddings = []
batch_size = 16

for i in range(0, len(df), batch_size):
    batch = df["combined_text"].iloc[i:i+batch_size].tolist()
    batch_embeddings = embed(batch)
    all_embeddings.extend(batch_embeddings)

df["embedding"] = all_embeddings
print("\u2705 All embeddings ready!")

# --------------------------------------------------------------
# 7. SEMANTIC SEARCH FUNCTION
# --------------------------------------------------------------
def semantic_search(query, df, top_n=5):
    query_embedding = embed([query])
    doc_embeddings = np.vstack(df["embedding"].values)
    similarities = cosine_similarity(query_embedding, doc_embeddings)[0]
    top_indices = similarities.argsort()[-top_n:][::-1]
    results = df.iloc[top_indices].copy()
    results["similarity"] = similarities[top_indices]
    return results

# --------------------------------------------------------------
# 8. TEST QUERY
# --------------------------------------------------------------
query = "total coverage limit"
results = semantic_search(query, df, top_n=5)

# --------------------------------------------------------------
# 9. DISPLAY RESULTS
# --------------------------------------------------------------
for _, row in results.iterrows():
    print(f"\nScore: {row['similarity']:.4f}")
    print(f"Combined Text: {row['combined_text']}")
    print(f"Limits: {row.get('limits', '')}")
    print(f"Aggregate: {row.get('aggregate', '')}")
    print(f"Aggregate and Limits: {row.get('aggregate_and_limits', '')}")
    print("-" * 80)

# --------------------------------------------------------------
# DONE! You now have GPU/CPU-compatible semantic search with Hugging Face.
# --------------------------------------------------------------


✅ Model loaded on cpu
📦 Generating embeddings...
✅ All embeddings ready!

Score: 0.8770
Combined Text: Limited, Sudden, and Accidental Pollution liability Coverage-Environmental Property Damage Limit Additional Conditions And Exclusions- Contractors Subcontracted Work - General Aggregate Limit Aggregate Limits of Insurance - Per Location and Total Policy Coverage Home Alteration and Vehicle Modification Benefit Maximum Amount
Limits: Limited, Sudden, and Accidental Pollution liability Coverage-Environmental Property Damage Limit
Aggregate: Additional Conditions And Exclusions- Contractors Subcontracted Work - General Aggregate Limit
Aggregate and Limits: Aggregate Limits of Insurance - Per Location and Total Policy Coverage
--------------------------------------------------------------------------------

Score: 0.8762
Combined Text: Computer Attack Limit. Public Relations Violent Event Response Coverage for Schools; Each Person Limit Aggregate Limits of Insurance - Per Location and Tot