In [None]:
from qdrant_client import QdrantClient, models
from app.core.config import settings as config
from app.utils.utils import split_docs
from app.web_loader.bs_loader import load_web_docs
from app.web_loader.bs_utils import urls
from app.db.vector_store import VectorStore
import os
import pickle
from tenacity import (
    retry,
    wait_exponential,
    stop_after_attempt,
    retry_if_exception_type,
)
import time
import requests
import numpy as np

In [None]:
# Connect to Qdrant using the URL and API key held in the application settings.
client = QdrantClient(
    config.qdrant_url,
    api_key=config.qdrant_api_key,
)

In [None]:
# Create the "sutd" collection only when it is missing, so this cell can be
# re-run (e.g. Restart & Run All) without failing because the collection
# already exists. Vector size comes from the settings; similarity is cosine.
if not client.collection_exists(collection_name="sutd"):
    client.create_collection(
        collection_name="sutd",
        vectors_config=models.VectorParams(
            size=config.embeddings_dim, distance=models.Distance.COSINE
        ),
    )

In [None]:
# Manual reset: uncomment the next line to drop the "sutd" collection.
# client.delete_collection(collection_name="sutd")

In [None]:
# Ask the server for its collections. This is the cell's last expression, so
# the notebook displays the returned value as the cell output.
client.get_collections()

In [None]:
# Pickle file used to avoid re-loading the web documents on every run.
CACHE_FILE = "all_docs.pkl"

if not os.path.exists(CACHE_FILE):
    # Cache miss: load the documents from the configured URLs, then persist them.
    all_docs = load_web_docs(urls)
    print("Saving documents to cache...")
    with open(CACHE_FILE, "wb") as cache_out:
        pickle.dump(all_docs, cache_out)
else:
    # Cache hit: restore the previously pickled documents.
    # NOTE: pickle executes code on load; only use a cache file this notebook wrote.
    print("Loading cached documents...")
    with open(CACHE_FILE, "rb") as cache_in:
        all_docs = pickle.load(cache_in)

In [None]:
# Run the project's `split_docs` helper over the loaded documents. Later cells
# iterate over the result and read each item's `.text` attribute.
chunks = split_docs(all_docs)

In [None]:
# Sanity check: report how many chunks were produced and show one of them.
print(len(chunks))
if chunks:
    # Prefer index 20, but clamp it so a small corpus does not raise IndexError.
    sample_index = min(20, len(chunks) - 1)
    print(chunks[sample_index])

In [None]:
# Accumulator for the per-chunk embeddings; filled by the embedding loop.
embeddings = list()

# Project wrapper around the vector database, built from the application settings.
vector_store = VectorStore(config)

In [None]:
# Request budget used to pace calls to the embedding service (requests/minute).
EMBEDDING_REQUESTS_PER_MINUTE = 4500


@retry(
    # `Exception` already includes `requests.exceptions.RequestException`, so
    # listing both was redundant: every exception triggers a retry.
    retry=retry_if_exception_type(Exception),
    wait=wait_exponential(multiplier=1, min=4, max=60),  # Exponential backoff
    stop=stop_after_attempt(5),  # Max 5 attempts
    # Surface the last underlying error instead of tenacity's RetryError
    # wrapper, so callers that log the exception see the real cause.
    reraise=True,
)
def get_embedding_with_retry(text):
    """Fetch the embedding for ``text``, retrying on failure.

    Each attempt first sleeps for ``60 / EMBEDDING_REQUESTS_PER_MINUTE``
    seconds to pace requests, then delegates to
    ``vector_store.get_embeddings``. Failed attempts are retried with
    exponential backoff (4-60 s) for at most 5 attempts.

    Args:
        text: The input passed through to ``vector_store.get_embeddings``.

    Returns:
        Whatever ``vector_store.get_embeddings(text)`` returns.

    Raises:
        Exception: The exception from the final failed attempt, once all
            attempts are exhausted.
    """
    time.sleep(60 / EMBEDDING_REQUESTS_PER_MINUTE)
    return vector_store.get_embeddings(text)


# Start from an empty list so re-running this cell does not append a second
# copy of every embedding and break the index alignment with `chunks`.
embeddings = []
failed_chunk_indices = []

for i, chunk in enumerate(chunks):
    try:
        embeddings.append(get_embedding_with_retry(chunk.text))
    except Exception as e:
        # Keep a placeholder so embeddings[i] still corresponds to chunks[i].
        print(f"Failed after retries on chunk {i}: {str(e)}")
        embeddings.append(None)
        failed_chunk_indices.append(i)
        continue

    if i % 100 == 0:
        print(f"Processed {i}/{len(chunks)} chunks")

# Make failures visible at the end instead of leaving them buried in the log.
if failed_chunk_indices:
    print(
        f"{len(failed_chunk_indices)} chunk(s) have no embedding: "
        f"{failed_chunk_indices}"
    )

In [None]:
# Spot-check one embedding. Prefer index 4, but clamp it so a short or empty
# list does not raise IndexError.
if embeddings:
    print(embeddings[min(4, len(embeddings) - 1)])

In [None]:
# Pickle file that stores the computed embeddings between sessions.
EMBEDDINGS_CACHE_FILE = "embeddings.pkl"

# Write the current in-memory embeddings to disk, replacing any previous file.
with open(EMBEDDINGS_CACHE_FILE, mode="wb") as cache_out:
    pickle.dump(embeddings, cache_out)

In [None]:
# Same cache path as the save cell, repeated here so this cell also works
# when the save cell has not been run in the current kernel.
EMBEDDINGS_CACHE_FILE = "embeddings.pkl"

cache_available = os.path.exists(EMBEDDINGS_CACHE_FILE)
if cache_available:
    # Restore embeddings from disk. When no cache file exists, `embeddings`
    # is left untouched.
    print("Loading cached embeddings...")
    with open(EMBEDDINGS_CACHE_FILE, mode="rb") as cache_in:
        embeddings = pickle.load(cache_in)

In [None]:
# `embeddings` is expected to be index-aligned with `chunks`; a length mismatch
# (e.g. a stale cache file) would pair chunks with the wrong vectors.
if len(embeddings) != len(chunks):
    raise ValueError(
        f"Expected one embedding per chunk, got {len(embeddings)} embeddings "
        f"for {len(chunks)} chunks"
    )

# The embedding loop stores None for chunks whose request failed; drop those
# pairs so only chunks with a real vector are uploaded.
embedded_pairs = [
    (chunk, embedding)
    for chunk, embedding in zip(chunks, embeddings)
    if embedding is not None
]
skipped_count = len(chunks) - len(embedded_pairs)
if skipped_count:
    print(f"Skipping {skipped_count} chunk(s) without an embedding")

document_ids = vector_store.add_documents(
    docs=[chunk for chunk, _ in embedded_pairs],
    embeddings=[embedding for _, embedding in embedded_pairs],
)

In [None]:
# Fetch and show the collection's metadata as reported by the server.
collection_info = vector_store.client.get_collection(vector_store.collection_name)
print(f"Collection info: {collection_info}")

# Count the number of points in the collection. `count()` returns a result
# object, so print its `.count` field to show the plain number rather than
# the object's repr.
point_count = vector_store.client.count(vector_store.collection_name)
print(f"Number of documents in vector store: {point_count.count}")

In [None]:
# Smoke test: embed a sample question and fetch the 5 closest points.
query = "what is sutd?"
query_embedding = vector_store.get_embeddings(query)

search_args = {
    "collection_name": vector_store.collection_name,
    "query": query_embedding,
    "limit": 5,
}
results = vector_store.client.query_points(**search_args)
print(results)