In [1]:
from qdrant_client import QdrantClient, models
from app.core.config import settings as config
from app.utils.utils import split_docs
from app.web_loader.bs_loader import load_web_docs
from app.web_loader.bs_utils import urls
from app.db.vector_db import VectorDB
import os
import pickle
from tenacity import (
    retry,
    wait_exponential,
    stop_after_attempt,
    retry_if_exception_type,
)
import time
import requests
from fastembed import SparseTextEmbedding

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
client = QdrantClient(config.qdrant_url, api_key=config.qdrant_api_key)

In [3]:
client.create_collection(
    collection_name=config.qdrant_collection_name,
    vectors_config={
        "dense": models.VectorParams(
            size=config.embeddings_dim, distance=models.Distance.COSINE
        )
    },
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(modifier=models.Modifier.IDF)
    },
)

True

In [4]:
# client.delete_collection(collection_name=config.qdrant_collection_name)

In [5]:
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='sutd')])

In [6]:
CACHE_FILE = "all_docs.pkl"

if os.path.exists(CACHE_FILE):
    print("Loading cached documents...")
    with open(CACHE_FILE, "rb") as f:
        all_docs = pickle.load(f)
else:
    all_docs = load_web_docs(urls)
    print("Saving documents to cache...")
    with open(CACHE_FILE, "wb") as f:
        pickle.dump(all_docs, f)

Loading cached documents...


In [7]:
chunks = split_docs(all_docs)

In [8]:
print(len(chunks))
print(chunks[20])

1013
Document(text='Global experience and exchange Career development Student well-being CAMPUS Academic facilities Housing Sports and recreation centre F&B and services Library About About OVERVIEW Design·AI At a glance Diversity and inclusion Sustainability PEOPLE Leadership Board of Trustees President Emeritus Professorships Faculty MORE Happenings Careers at SUTD Partnering with SUTD On this page Popular searches quick links Design·AI Education Go beyond traditional learning here at SUTD, where we emphasise', metadata={'source': 'https://www.sutd.edu.sg/education'})


In [9]:
vector_db = VectorDB(config)

sparse_model = SparseTextEmbedding("Qdrant/bm25")

dense_embeddings = []
sparse_embeddings = []

# Embeddings Generation

In [10]:
@retry(
    retry=retry_if_exception_type((requests.exceptions.RequestException, Exception)),
    wait=wait_exponential(multiplier=1, min=4, max=60),  # Exponential backoff
    stop=stop_after_attempt(5),
)
def get_embedding_with_retry(text):
    # time.sleep(60 / 4500)  # Rate limit of 4500 requests per minute

    dense_embedding = vector_db.get_embeddings(text)
    sparse_embedding = list(sparse_model.embed([text]))[0]

    return dense_embedding, sparse_embedding


for i, chunk in enumerate(chunks):
    try:
        dense_embedding, sparse_embedding = get_embedding_with_retry(chunk.text)
        dense_embeddings.append(dense_embedding)
        sparse_embeddings.append(sparse_embedding)

        if i % 100 == 0:
            print(f"Processed {i}/{len(chunks)} chunks")

    except Exception as e:
        print(f"Failed after retries on chunk {i}: {str(e)}")

Processed 0/1013 chunks
Processed 100/1013 chunks
Processed 200/1013 chunks
Processed 300/1013 chunks
Processed 400/1013 chunks
Processed 500/1013 chunks
Processed 600/1013 chunks
Processed 700/1013 chunks
Processed 800/1013 chunks
Processed 900/1013 chunks
Processed 1000/1013 chunks


In [11]:
print(dense_embeddings[4])

[ 4.17368600e-02 -4.90867500e-02 -5.51766530e-02 -6.50333800e-03
  1.40829100e-02  2.47139970e-02 -2.06715580e-02 -2.76145770e-02
 -2.98982910e-02  1.33872945e-02 -2.73258310e-02  3.04232840e-02
 -9.63360200e-02 -2.95832960e-02 -2.87695580e-02  6.91676900e-03
 -1.80137870e-03  1.80794100e-03  3.02362560e-03 -5.28273170e-03
  2.87695580e-02 -3.07645300e-02 -8.66892900e-03 -1.29709920e-04
 -1.33544820e-03 -3.36782340e-02 -3.97943900e-02  5.88647230e-03
 -1.79416000e-02  1.50803940e-02  2.81133190e-02 -9.44985500e-02
 -1.03160920e-02  6.69692880e-03  2.27321520e-02  1.82565950e-02
 -5.90944060e-03  2.02121900e-02  1.15695100e-02 -3.82981640e-02
 -1.06770240e-02 -6.17390540e-02  3.25232520e-02  1.40829100e-02
  1.52641410e-02  2.85858120e-02 -2.74308300e-02 -9.30548300e-03
  1.07885850e-02 -8.75424100e-03 -8.30012260e-02  4.92704960e-02
  3.14732680e-02 -1.53428900e-02 -1.07951470e-02 -1.26326190e-02
  9.83539600e-04  1.37022900e-02 -4.65142880e-02  1.13529510e-02
 -2.14918590e-03 -2.19348

In [12]:
print(sparse_embeddings)

[SparseEmbedding(values=array([1.47666492, 1.89120215, 1.76717917, 1.76717917, 2.03394727,
       1.95997912, 1.47666492, 1.47666492, 1.47666492, 1.47666492,
       1.47666492, 1.47666492, 1.76717917, 1.89120215, 1.47666492,
       1.76717917, 1.47666492, 1.47666492, 1.47666492, 1.47666492,
       1.47666492, 1.47666492, 1.47666492, 1.47666492, 1.47666492,
       1.47666492, 1.47666492, 1.47666492, 1.47666492, 1.47666492,
       1.47666492, 1.47666492, 1.47666492, 1.47666492, 1.47666492,
       1.47666492, 1.47666492, 1.47666492]), indices=array([1150284091, 1559241040,  482863870,  575755316, 1598346136,
       1203407763,  605660245, 2138561093,  714169958, 1412287001,
       1928848833,  976863851, 1653355191,  618043554,  150760872,
        804292855, 1771402836, 1354311854,  928204015, 1135041482,
       1096988414,  796135479, 1891304650, 1885148710, 1644170059,
       1816947699,  192565064, 1731613229, 1566116842,  458859209,
        341249135,  794129062, 1264823322,  26794617

# Cache and Load Embeddings

In [13]:
DENSE_EMBEDDINGS_CACHE_FILE = "dense_embeddings.pkl"

with open(DENSE_EMBEDDINGS_CACHE_FILE, "wb") as f:
    pickle.dump(dense_embeddings, f)

In [14]:
if os.path.exists(DENSE_EMBEDDINGS_CACHE_FILE):
    print("Loading cached dense embeddings...")
    with open(DENSE_EMBEDDINGS_CACHE_FILE, "rb") as f:
        dense_embeddings = pickle.load(f)

    # Regenerate sparse embeddings quickly
    print("Regenerating sparse embeddings...")
    sparse_embeddings = []
    for chunk in chunks:
        sparse_embedding = list(sparse_model.embed([chunk.text]))[0]
        sparse_embeddings.append(sparse_embedding)

Loading cached dense embeddings...
Regenerating sparse embeddings...


In [15]:
print(len(dense_embeddings))
print(len(sparse_embeddings))

1013
1013


# Add Embeddings to VectorDB

In [16]:
document_ids = vector_db.add_documents(
    docs=chunks,
    dense_embeddings=dense_embeddings,
    sparse_embeddings=sparse_embeddings,
)

In [17]:
collection_info = vector_db.client.get_collection(vector_db.collection_name)
print(f"Collection info: {collection_info}")

# Count the number of points in the collection
point_count = vector_db.client.count(vector_db.collection_name)
print(f"Number of documents in vector store: {point_count}")

Collection info: status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=1013 points_count=1013 segments_count=2 config=CollectionConfig(params=CollectionParams(vectors={'dense': VectorParams(size=768, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None)}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors={'bm25': SparseVectorParams(index=None, modifier=<Modifier.IDF: 'idf'>)}), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=10000, flush_interval_sec=5, max_optimization_thr

# Test Query against VectorDB

In [18]:
query = "what is sutd?"
query_embedding = next(sparse_model.query_embed(query))
results = vector_db.client.query_points(
    collection_name=vector_db.collection_name,
    query=models.SparseVector(**query_embedding.as_object()),
    limit=3,
    using="bm25",
)
print(results)

points=[ScoredPoint(id=851, version=0, score=1.2887435, payload={'text': 'Kewalram Chanrai Group — SUTD Scholarship for Women Through the generosity of the Kewalram Chanrai Group, the Kewalram Chanrai Group – SUTD Scholarship for Women is available to outstanding full-time female undergraduates pursuing the SUTD Honours and Research Programme (SHARP), the SUTD-Duke-NUS Special Track or the SUTD Technology Entrepreneurship Programme (STEP)', 'metadata': {'source': 'https://www.sutd.edu.sg/about/diversity-inclusion/building-gender-diversity/'}}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=49, version=0, score=1.2870779, payload={'text': 'See all SUTD-Duke-NUS Special Track SUTD Honours And Research Programme (SHARP) SUTD-Tianjin Eco-City (SUTD-TEC) Programme Undergraduate SUTD Technology and Entrepreneurship Programme (USTEP) Outside the classroom Seek challenges beyond traditional learning Expand your horizons, enhance your communication skills, and push your limits b

In [19]:
query = "what is sutd?"
query_embedding = vector_db.get_embeddings(query)
results = vector_db.client.query_points(
    collection_name=vector_db.collection_name,
    query=query_embedding,
    limit=3,
    using="dense",
)
print(results)

points=[ScoredPoint(id=122, version=0, score=0.7566401, payload={'text': 'SUTD is education and research partner to almost 20 leading healthcare institutions. Courses 30.203 Topics in Biomedical and Healthcare Engineering 01.101 Global Health Technologies 30.123 Healthcare Product Design 02.230 Health Communication & Behaviour Change Serious Games for Healthcare Artificial Intelligence/Data Science Smart Nation is Singapore’s vision to be an economically competitive global city and a liveable home', 'metadata': {'source': 'https://www.sutd.edu.sg/education/undergraduate/sectors/'}}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=432, version=0, score=0.73423386, payload={'text': 'Intellectually curious Resilient Hands-on Risk-taking Collaborative Innovative & Nonconforming Regardless of programme, we seek students who embody our core values of creativity, collaboration, and a commitment to making a positive impact on society. SUTD is a small, tight-knit community of lik