In [1]:
# Install the sentence-embedding library and the Qdrant client (the client is pinned to 1.3.2).
%pip install sentence-transformers qdrant-client==1.3.2

Note: you may need to restart the kernel to use updated packages.


In [2]:
import csv
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient, models, conversions
from tqdm.notebook import tqdm

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Open Qdrant through a filesystem path rather than a server URL.
# NOTE(review): path= is presumably qdrant-client's embedded/local mode, persisting under "investopedia.db" — confirm.
client = QdrantClient(path="investopedia.db")

In [4]:
# Sentence-embedding model; the same encoder is used for the indexed documents and for the search query.
encoder = SentenceTransformer("all-MiniLM-L6-v2")



In [5]:
# (Re)create the "investopedia" collection. The vector width is taken from the
# encoder itself so the collection always matches the model in use.
client.recreate_collection(
    vectors_config=models.VectorParams(
        distance=models.Distance.COSINE,  # compare vectors by cosine similarity
        size=encoder.get_sentence_embedding_dimension(),  # width dictated by the embedding model
    ),
    collection_name="investopedia",
)

True

In [6]:
# ipywidgets renders the notebook_login() widget; datasets provides load_dataset().
%pip install ipywidgets datasets

Note: you may need to restart the kernel to use updated packages.


In [7]:
from huggingface_hub import notebook_login

In [8]:
# Authenticate with the Hugging Face Hub through the interactive login widget.
# NOTE(review): presumably required to download the dataset loaded below — confirm it is actually gated.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
import datasets

# Load the "train" split of the Investopedia dataset.
# Rows are read further down for their "md_content", "title" and "url" fields.
ds = datasets.load_dataset("openvega-simon/investopedia", split="train")

In [10]:
# langchain supplies the Document class used to wrap each article.
%pip install langchain

Note: you may need to restart the kernel to use updated packages.


In [11]:
from langchain.docstore.document import Document as LangchainDocument

# Wrap every dataset row in a LangChain document: the markdown body becomes
# the page content, while the title and URL travel along as metadata.
RAW_KNOWLEDGE_BASE = [
    LangchainDocument(
        page_content=row["md_content"],
        metadata={
            "title": row["title"],
            "source": row["url"],
        },
    )
    for row in ds
]

In [12]:
# Build one Qdrant point per document.
#
# The embedded text is "<title>: <body>". All texts are passed to the encoder
# in a single call so it can batch them internally, instead of running one
# separate encode() call per document; the progress bar makes the long-running
# step observable.
# NOTE(review): the encoder presumably truncates inputs longer than its maximum
# sequence length, so only the beginning of long articles would influence the
# vector — confirm, and consider chunking the articles if that matters.
contents = [f'{doc.metadata["title"]}: {doc.page_content}' for doc in RAW_KNOWLEDGE_BASE]
vectors = encoder.encode(contents, show_progress_bar=True)

# The point id is the document's position in RAW_KNOWLEDGE_BASE. The payload
# stores title, source URL and full text so each search hit is self-describing.
points = [
    models.PointStruct(
        id=idx,
        vector=vector.tolist(),
        payload={
            "title": doc.metadata["title"],
            "source": doc.metadata["source"],
            "page_content": doc.page_content,
        },
    )
    for idx, (doc, vector) in enumerate(zip(RAW_KNOWLEDGE_BASE, vectors))
]

In [13]:
# Sanity-check the first prepared point.
# NOTE(review): the repr prints the whole embedding vector and payload; consider showing only the id and title.
points[0]

PointStruct(id=0, vector=[-0.061122678220272064, -0.04948068782687187, 0.03331484645605087, 0.015764832496643066, -0.032936662435531616, 0.07784141600131989, 0.0634869635105133, 0.002088918350636959, 0.018205704167485237, -0.0263025164604187, 0.027163399383425713, 0.1352146565914154, 0.012547994032502174, -0.03340467810630798, -0.03246687725186348, -0.033541351556777954, -0.025679931044578552, 0.024640290066599846, -0.07919353246688843, -0.04568867012858391, 0.0008832374005578458, -0.018578944727778435, -0.005226372741162777, -0.04436135292053223, 0.05813624709844589, 0.020838670432567596, 0.047265391796827316, 0.0498414970934391, -0.04496615380048752, -0.021625259891152382, -0.06375158578157425, 0.010397330857813358, 0.032535307109355927, -0.005954468622803688, -0.05003182217478752, 0.025810308754444122, -0.0428733304142952, 0.03517916426062584, 0.046001650393009186, -0.030650993809103966, -0.026887275278568268, -0.0381648913025856, -0.01992914453148842, 0.01794719696044922, -0.009150

In [14]:
# Write all prepared points into the "investopedia" collection.
# NOTE(review): the client was opened with path= (no server); parallel=4 presumably
# has no effect in that mode — confirm against the qdrant-client 1.3.2 implementation.
client.upload_records(
    collection_name="investopedia",
    records=points,
    parallel=4
)

In [15]:
# Embed the free-text query with the same encoder that produced the stored
# vectors, then ask the collection for its three best matches.
hits = client.search(
    collection_name="investopedia",
    limit=3,
    query_vector=encoder.encode("J Term").tolist(),
)

In [16]:
# Display the raw search results (id, version, score and stored payload for each match).
hits

[ScoredPoint(id=0, version=0, score=0.6032048648035324, payload={'title': 'J', 'source': 'https://www.investopedia.com/terms/j/j.asp', 'page_content': "## What Is J?\n\n\n The term J refers to a designation for [Nasdaq](https://www.investopedia.com/terms/n/nasdaq.asp)-listed stocks that specifies that the stock has voting rights. The designation appears as the fifth letter following a dot after a stock's four-letter [ticker symbol](https://www.investopedia.com/terms/s/stocksymbol.asp). It is added to denote a shareholder vote situation. The letter J is a temporary suffix that is removed once the shareholder vote situation is resolved. Other letter designations are used to describe [share classes](https://www.investopedia.com/terms/s/share_class.asp), foreign issues, preferred issues, and a company's financial status.\n\n\n\n\n### Key Takeaways\n\n\n* J is a fifth-letter designation used to illustrate that a Nasdaq-listed security has voting rights.\n* Nasdaq-listed stocks have four cha