In [1]:
# Install the packages this notebook imports into the running kernel's environment.
# NOTE(review): versions are unpinned, so a fresh install may pull newer releases
# than the ones that produced the saved outputs — consider pinning.
%pip install faiss-cpu numpy sentence-transformers python-dotenv




In [2]:
import os
import warnings
import zlib

import faiss
import numpy as np
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Silence ALL Python warnings from this point on: filterwarnings('ignore') with no
# category/module arguments matches every warning, not just a selected few.
warnings.filterwarnings('ignore')

In [4]:
# Load environment variables from a .env file, if one is found.
# Not needed for local embeddings (nothing in the visible cells reads them); kept for structure.
load_dotenv()

class LocalEmbedder:
    """Text embedder backed by a locally loaded SentenceTransformer model.

    If the model fails to encode a text, a deterministic hashed bag-of-words
    vector of the same width is returned instead, so callers always receive
    an array of length ``self.dimensions``.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """
        Initialize with a local embedding model.

        Args:
            model_name: Name handed to ``SentenceTransformer``.
                Options: 'all-MiniLM-L6-v2' (fast, 384-dim),
                'all-mpnet-base-v2' (slower, 768-dim)
        """
        self.model = SentenceTransformer(model_name)
        # Embedding width as reported by the model; also sizes the fallback vectors.
        self.dimensions = self.model.get_sentence_embedding_dimension()

    def get_embedding(self, text):
        """Get a unit-normalized embedding for ``text`` from the local model.

        Args:
            text: The string to embed.

        Returns:
            A NumPy vector from the model (``normalize_embeddings=True``), or the
            result of ``_fallback_embedding`` if encoding raises.
        """
        try:
            return self.model.encode(text, convert_to_numpy=True, normalize_embeddings=True)
        except Exception as e:
            # Deliberate best-effort: report the failure and degrade instead of aborting.
            print(f"⚠️ Embedding error: {str(e)}")
            return self._fallback_embedding(text)

    def _fallback_embedding(self, text):
        """Simpler fallback if the main model fails: hashed word-presence vector.

        Each lower-cased, whitespace-separated word sets one bucket to 1.0; the
        vector is then L2-normalized (an input without words yields all zeros).

        The bucket is chosen with CRC-32 rather than the built-in ``hash()``:
        ``hash()`` of a ``str`` is salted per interpreter process, so the same
        text would map to different vectors after every kernel restart.

        NOTE(review): these vectors are word-presence hashes, not model output,
        so similarity scores between a fallback vector and a model vector are
        presumably not meaningful — confirm before mixing them in one index.

        Args:
            text: The string to embed.

        Returns:
            A float32 NumPy vector of length ``self.dimensions``.
        """
        print("⚠️ Using fallback embedding method")
        embedding = np.zeros(self.dimensions, dtype=np.float32)

        # Basic word presence embedding with a process-independent hash
        for word in text.lower().split():
            bucket = zlib.crc32(word.encode("utf-8")) % self.dimensions
            embedding[bucket] = 1.0

        # Normalize (skip when no bucket was set to avoid dividing by zero)
        norm = np.linalg.norm(embedding)
        return embedding / norm if norm > 0 else embedding

In [5]:
# Build the shared embedder, naming the model explicitly ('all-MiniLM-L6-v2' → 384 dimensions)
embedder = LocalEmbedder(model_name='all-MiniLM-L6-v2')


In [6]:
# Tiny demo corpus: each sentence becomes one vector in the index
texts = [
    "FAISS is a library for efficient similarity search.",
    "It is developed by Facebook AI Research.",
    "It supports cosine and L2 distance search.",
    "You can use FAISS with local embeddings.",
]

In [7]:
# Embed every sample text and collect the vectors into a single array (one row per text)
print("Generating embeddings...")
embeddings = np.array(list(map(embedder.get_embedding, texts)))
print(f"✅ Generated {len(embeddings)} embeddings of dimension {embeddings.shape[1]}")

Generating embeddings...
✅ Generated 4 embeddings of dimension 384


In [8]:
# Display the embedding matrix as the cell output (NumPy abbreviates large arrays with "...")
embeddings

array([[-0.05062099, -0.07261783, -0.08909272, ...,  0.00162049,
         0.10721246,  0.01419312],
       [-0.07286035, -0.00755991, -0.08941661, ...,  0.10490253,
         0.0536476 ,  0.01168776],
       [-0.04875184,  0.00328672, -0.09100699, ...,  0.0371596 ,
         0.01309524, -0.0072599 ],
       [-0.01144552, -0.04123598, -0.0329535 , ..., -0.00409432,
         0.08605338,  0.04142921]], dtype=float32)

In [9]:
# Create FAISS index
index = faiss.IndexFlatIP(embeddings.shape[1])  # Inner product = cosine similarity
index.add(embeddings)

In [10]:
# Map each text's position in `texts` back to the text itself
text_id_map = dict(enumerate(texts))

def search(query, k=2):
    """Search the index for the texts most similar to ``query``.

    Args:
        query: Query string; embedded with the module-level ``embedder``.
        k: Maximum number of results. It is capped at the number of indexed
            vectors; ``k <= 0`` or an empty index yields an empty list.

    Returns:
        A list of ``(text, score)`` tuples in the order returned by
        ``index.search``, where ``score`` is a Python float.
    """
    if k <= 0 or index.ntotal == 0:
        return []

    # Asking for more neighbours than exist makes FAISS pad the result with
    # label -1, which has no entry in text_id_map.
    k = min(k, index.ntotal)

    # FAISS expects a contiguous float32 matrix of shape (n_queries, dim).
    query_vector = np.ascontiguousarray(
        embedder.get_embedding(query), dtype=np.float32
    ).reshape(1, -1)
    scores, labels = index.search(query_vector, k)

    # Keep the negative-label guard: it also covers any padding that remains after clamping.
    return [
        (text_id_map[int(label)], float(score))
        for label, score in zip(labels[0], scores[0])
        if label >= 0
    ]

In [11]:
# Example query: run it through search() with the default k
query = "What is FAISS?"
results = search(query)

# Print the hits as a 1-based ranked list, scores shown to 4 decimal places
print("\nSearch results:")
for i, (text, score) in enumerate(results):
    print(f"{i+1}. {text} (Score: {score:.4f})")


Search results:
1. You can use FAISS with local embeddings. (Score: 0.4261)
2. FAISS is a library for efficient similarity search. (Score: 0.3947)
