In [9]:
import datasets
import faiss
import numpy as np
from langchain.docstore.document import Document as LangchainDocument
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer

In [4]:
ds = datasets.load_dataset("openvega-simon/investopedia", split="train")
RAW_KNOWLEDGE_BASE = [
    LangchainDocument(page_content=doc["md_content"], metadata={"title": doc["title"], "source": doc["url"]})
    for doc in ds
]

In [10]:
embeddings_model = SentenceTransformer("all-MiniLM-L6-v2")



In [13]:
from tqdm import tqdm

In [14]:
texts = [doc.page_content for doc in tqdm(RAW_KNOWLEDGE_BASE)]
embeddings = embeddings_model.encode(texts, normalize_embeddings=True)

100%|██████████| 4723/4723 [00:00<00:00, 1682209.39it/s]


In [17]:
embeddings_np = np.array(embeddings).astype('float32')

In [18]:
index = faiss.IndexFlatIP(embeddings_np.shape[1])  # Using inner product (cosine similarity)
index.add(embeddings_np)

In [19]:
faiss.write_index(index, "faiss_index.bin")

In [21]:
import json

In [22]:
metadata = [doc.metadata for doc in RAW_KNOWLEDGE_BASE]
with open("metadata.json", "w") as f:
    json.dump(metadata, f)

In [23]:
index = faiss.read_index("faiss_index.bin")

In [24]:
with open("metadata.json", "r") as f:
    loaded_metadata = json.load(f)

In [25]:
def search_faiss(query, k=5):
    query_embedding = embeddings_model.encode([query], normalize_embeddings=True).astype('float32')
    D, I = index.search(query_embedding, k)
    results = [
        {
            "doc": RAW_KNOWLEDGE_BASE[i].page_content,
            "metadata": loaded_metadata[i],
            "score": D[0][j]
        }
        for j, i in enumerate(I[0])
    ]
    return results

In [28]:
results = search_faiss("How to save for a house")
for result in results:
    print(f"Title: {result['metadata']['title']}, Score: {result['score']}, Source: {result['metadata']['source']}")


Title: 6 Tips for Getting Approved for a Mortgage, Score: 0.6068446636199951, Source: https://www.investopedia.com/articles/mortgage-real-estate/08/mortgage-application-rejected.asp
Title: Should You Save for a Home or Retirement?, Score: 0.5993292331695557, Source: https://www.investopedia.com/save-for-a-home-or-retirement-8599275
Title: How To Save for a House: A Step-by-Step Guide, Score: 0.5908926725387573, Source: https://www.investopedia.com/articles/investing/092815/where-should-i-keep-my-down-payment-savings.asp
Title: Do You Need a Savings Plan? And How Do You Make One?, Score: 0.5228768587112427, Source: https://www.investopedia.com/make-savings-plan-5208028
Title: Mortgage Calculator, Score: 0.5062568187713623, Source: https://www.investopedia.com/mortgage-calculator-5084794
