In [None]:
from qdrant_client import QdrantClient, models
from app.core.config import settings as config
from app.utils.utils import split_docs
from app.web_loader.bs_loader import load_web_docs
from app.web_loader.bs_utils import urls
from app.db.vector_store import VectorStore
import os
import pickle
from tenacity import (
    retry,
    wait_exponential,
    stop_after_attempt,
    retry_if_exception_type,
)
import time
import requests

In [2]:
# Connect to Qdrant using the URL and API key taken from the app settings
# object (imported above as `config`).
client = QdrantClient(config.qdrant_url, api_key=config.qdrant_api_key)

In [3]:
# Create the "sutd" collection only when it is missing, so this cell can be
# re-run (e.g. Restart & Run All) without failing on an existing collection.
# Vector size comes from the app settings; similarity metric is cosine.
if not client.collection_exists(collection_name="sutd"):
    client.create_collection(
        collection_name="sutd",
        vectors_config=models.VectorParams(
            size=config.embeddings_dim, distance=models.Distance.COSINE
        ),
    )

True

In [None]:
# client.delete_collection(collection_name="sutd")

In [4]:
# Bare expression: the notebook displays the list of collections on the server.
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='sutd')])

In [9]:
CACHE_FILE = "all_docs.pkl"

# Only hit the web when no pickle cache exists on disk; otherwise reuse the
# cached copy. The cache is a local pickle, so only load files you created.
if not os.path.exists(CACHE_FILE):
    all_docs = load_web_docs(urls)
    print("Saving documents to cache...")
    with open(CACHE_FILE, "wb") as f:
        pickle.dump(all_docs, f)
else:
    print("Loading cached documents...")
    with open(CACHE_FILE, "rb") as f:
        all_docs = pickle.load(f)

Loading cached documents...


In [10]:
# Split the loaded documents into chunks using the project's split_docs helper.
chunks = split_docs(all_docs)

In [11]:
# Sanity check: total number of chunks, then one sample chunk (index 4).
print(len(chunks))
print(chunks[4])

1008
Document(text='ousing Sports and recreation centre F&B and services Library About About OVERVIEW Design AI At a glance Diversity and inclusion Sustainability PEOPLE Leadership Board of Trustees President Emeritus Professorships Faculty MORE Happenings Careers at SUTD Partnering with SUTD On this page Popular searches quick links Discover SUTD The world’s first Design AI university Innovation @ SUTD AirXeed radiosonde AI @ SUTD Mustango – DALL-E for music Discover SUTD The world’s first Design AI university Pla', metadata={'source': 'https://www.sutd.edu.sg/'})


In [12]:
# Build the project's VectorStore wrapper from the app settings. Later cells
# use its .client, .collection_name, .get_embeddings and .add_documents.
vector_store = VectorStore(config)

In [None]:
@retry(
    # Every Exception subclass is retried; listing RequestException as well
    # was redundant because Exception already covers it.
    retry=retry_if_exception_type(Exception),
    wait=wait_exponential(multiplier=1, min=4, max=60),  # Exponential backoff
    stop=stop_after_attempt(5),  # Max 5 attempts
    # Surface the last underlying error instead of tenacity's RetryError so
    # the caller's log message shows what actually went wrong.
    reraise=True,
)
def get_embedding_with_retry(text, requests_per_minute=1500):
    """Embed ``text`` via ``vector_store.get_embeddings`` with throttling and retries.

    Args:
        text: The string to embed.
        requests_per_minute: Request budget used to pace calls; the function
            sleeps ``60 / requests_per_minute`` seconds before every attempt.

    Returns:
        Whatever ``vector_store.get_embeddings(text)`` returns.

    Raises:
        Exception: The error from the final attempt, once all 5 attempts
            have failed.
    """
    # The pause runs on every attempt (including retries) because it sits
    # inside the decorated function.
    time.sleep(60 / requests_per_minute)
    return vector_store.get_embeddings(text)


# Embed every chunk in order. A chunk that still fails after retries keeps a
# ``None`` placeholder so that ``embeddings[i]`` always belongs to ``chunks[i]``.
embeddings = []
failed_indices = []
for i, chunk in enumerate(chunks):
    try:
        embeddings.append(get_embedding_with_retry(chunk.text))
    except Exception as e:
        print(f"Failed after retries on chunk {i}: {str(e)}")
        embeddings.append(None)
        failed_indices.append(i)
        continue

    if i % 100 == 0:
        print(f"Processed {i}/{len(chunks)} chunks")

# Make failures impossible to miss: individual error lines scroll away in a
# long run, and the placeholders would otherwise flow silently into later cells.
if failed_indices:
    print(
        f"WARNING: {len(failed_indices)} of {len(chunks)} chunks have no "
        f"embedding (indices: {failed_indices})"
    )

# Guard the preview so an empty chunk list does not raise IndexError.
if embeddings:
    print(embeddings[0])

In [None]:
EMBEDDINGS_CACHE_FILE = "embeddings.pkl"

# Persist the in-memory `embeddings` list as a pickle so it can be reloaded
# later. Opening with "wb" overwrites any existing cache file.
with open(EMBEDDINGS_CACHE_FILE, "wb") as f:
    pickle.dump(embeddings, f)

In [5]:
EMBEDDINGS_CACHE_FILE = "embeddings.pkl"

# Default to an empty list so `embeddings` is always defined after this cell.
embeddings = []

if os.path.exists(EMBEDDINGS_CACHE_FILE):
    print("Loading cached embeddings...")
    # NOTE: pickle.load can execute arbitrary code; only load a cache file
    # that this notebook wrote itself.
    with open(EMBEDDINGS_CACHE_FILE, "rb") as f:
        embeddings = pickle.load(f)
else:
    # Say so explicitly: an empty list here would otherwise go unnoticed
    # until the upsert step.
    print(
        f"No embeddings cache found at {EMBEDDINGS_CACHE_FILE!r}; "
        "`embeddings` is empty. Run the embedding cell first."
    )

Loading cached embeddings...


In [13]:
# The two lists are passed separately, so they must line up one-to-one
# (presumably paired by position — confirm in VectorStore.add_documents).
# A cache produced by a different chunking run would otherwise attach
# vectors to the wrong text.
if len(embeddings) != len(chunks):
    raise ValueError(
        f"Got {len(embeddings)} embeddings for {len(chunks)} chunks; "
        "re-run the embedding cell or refresh the embeddings cache."
    )

document_ids = vector_store.add_documents(docs=chunks, embeddings=embeddings)

Upserting points 0 to 1000...
Upserting points 1000 to 1008...


In [14]:
# Fetch and show the collection's status/configuration as reported by Qdrant.
collection_info = vector_store.client.get_collection(vector_store.collection_name)
print(f"Collection info: {collection_info}")

# Count the number of points in the collection.
# `count()` returns a result object; read its `.count` field so the message
# shows the plain number rather than the object's repr ("count=...").
point_count = vector_store.client.count(vector_store.collection_name)
print(f"Number of documents in vector store: {point_count.count}")

Collection info: status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=0 points_count=1008 segments_count=2 config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=768, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quanti

In [None]:
# Smoke test: embed a sample question with the same embedder used for the
# chunks, then ask Qdrant for up to 5 matching points.
query = "what is sutd?"
query_embedding = vector_store.get_embeddings(query)
results = vector_store.client.query_points(
    collection_name=vector_store.collection_name,
    query=query_embedding,
    limit=5,
)
# Prints the raw response object, including every hit's full payload.
print(results)

points=[ScoredPoint(id=325, version=0, score=0.67123914, payload={'text': 't SUTD, and discover the best fit for your interests. See more Useful information Careers at SUTD Connect with us Take a virtual campus tour 8 Somapah Rd, Singapore 487372 +65 6303 6600 +65 6303 6600 Email HASS Directory Social Media Facebook X Youtube Linkedin TikTok Instagram Contact SUTD Whistleblowing Policy Terms & Conditions Privacy Statement Copyright © 2011 – 2025 We use cookies to ensure you get the best experience on our website. By selecting “I agree”, you consent to our use of cookie', 'metadata': {'source': 'https://www.sutd.edu.sg/hass/education/undergraduate/minors/digital-humanities-minor/'}}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=556, version=0, score=0.6282536, payload={'text': 't SUTD Connect with us Take a virtual campus tour 8 Somapah Rd, Singapore 487372 +65 6303 6600 +65 6303 6600 Email us Directory Social Media Facebook X YouTube LinkedIn TikTok Instagram Contact 