In [None]:
!pip install -qU datasets ollama openai "semantic-router[local]" qdrant-client

In [1]:
import re
import time
import uuid

from datasets import load_dataset
from qdrant_client import QdrantClient
from qdrant_client.http import models
from semantic_router.encoders import HuggingFaceEncoder
from tqdm.auto import tqdm


In [2]:
# Load the first 10,000 rows of the chunked arXiv dataset and reshape each row
# into an id plus a nested metadata record (title + content).
UNUSED_COLUMNS = ["title", "content", "prechunk_id", "postchunk_id", "arxiv_id", "references"]


def _to_record(row):
    """Repack one raw row as ``{"id": ..., "metadata": {"title", "content"}}``."""
    return {
        "id": row["id"],
        "metadata": {"title": row["title"], "content": row["content"]},
    }


data = load_dataset("jamescalam/ai-arxiv2-semantic-chunks", split="train[:10000]")
data = data.map(_to_record)
# The flat columns are dropped once their values live inside "metadata".
data = data.remove_columns(UNUSED_COLUMNS)

In [3]:
# Embedding model. A throwaway sentence is encoded once so that the vector
# width (`dims`) can be read off the result; `dims` later sizes the Qdrant
# collection.
encoder = HuggingFaceEncoder(name="dwzhu/e5-base-4k")
_probe_vector = encoder(["this is a test"])[0]
dims = len(_probe_vector)

In [5]:
# Create embeddings
def create_embeddings(data, batch_size=128):
    """Encode every record of ``data`` and return the vectors in order.

    Args:
        data: Sliceable dataset whose slices expose a ``"metadata"`` column of
            dicts with ``"title"`` and ``"content"`` keys.
        batch_size: Number of records sent to the encoder per call.

    Returns:
        A flat list with one embedding per record, in dataset order.
    """
    vectors = []
    total = len(data)
    for start in tqdm(range(0, total, batch_size)):
        stop = start + batch_size
        texts = []
        for meta in data[start:stop]["metadata"]:
            # The title is prepended so each chunk carries its paper's context.
            texts.append(f'{meta["title"]}: {meta["content"]}')
        vectors.extend(encoder(texts))
    return vectors

# Embed the whole dataset once; the resulting list is reused by the Qdrant
# upload and by the clustering cells further down.
embeddings = create_embeddings(data)

  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Qdrant setup: connect to the local server and create the collection on first
# use. Re-running the cell is a no-op when the collection already exists.
index_name = "qdrant-llama-3-rag-1"
client = QdrantClient("http://localhost:6333")

existing_collections = {item.name for item in client.get_collections().collections}
if index_name not in existing_collections:
    vector_config = models.VectorParams(size=dims, distance=models.Distance.COSINE)
    client.create_collection(collection_name=index_name, vectors_config=vector_config)
    # Brief pause after creation (presumably to let the collection settle).
    time.sleep(1)

In [8]:
# Populate Qdrant with embeddings
def make_point_id(chunk_id):
    """Map a dataset chunk id to a deterministic Qdrant point id.

    Qdrant only accepts unsigned integers or UUIDs as point ids. The chunk ids
    are assumed to look like ``"<paper>#<chunk>"``. Converting only the part
    before ``#`` to an integer gives every chunk of a paper the same id, so
    upserts overwrite each other. Hashing the *full* id into a UUIDv5 keeps
    ids unique per chunk and stable across re-runs.

    Args:
        chunk_id: The dataset's string id for one chunk.

    Returns:
        The UUID (as a string) derived from ``chunk_id``.
    """
    return str(uuid.uuid5(uuid.NAMESPACE_URL, chunk_id))


def populate_qdrant(data, embeddings, batch_size=128):
    """Upsert every record of ``data`` with its embedding into Qdrant.

    Args:
        data: Sliceable dataset whose slices expose ``"id"`` and ``"metadata"``
            columns.
        embeddings: One vector per record of ``data``, in the same order.
        batch_size: Number of points sent per ``upsert`` call.

    Raises:
        ValueError: If ``embeddings`` and ``data`` differ in length (``zip``
            would otherwise silently drop the unmatched tail).
    """
    if len(embeddings) != len(data):
        raise ValueError(
            f"got {len(embeddings)} embeddings for {len(data)} records"
        )
    for start in tqdm(range(0, len(data), batch_size)):
        stop = start + batch_size
        batch = data[start:stop]
        points = [
            models.PointStruct(
                id=make_point_id(chunk_id),  # unique per chunk, not per paper
                vector=vector,
                payload=metadata,
            )
            for chunk_id, vector, metadata in zip(
                batch["id"], embeddings[start:stop], batch["metadata"]
            )
        ]
        client.upsert(collection_name=index_name, points=points)

# Upload every chunk together with its embedding into the Qdrant collection.
populate_qdrant(data, embeddings)

  0%|          | 0/2 [00:00<?, ?it/s]

In [9]:

from openai import OpenAI
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

In [10]:
# Chat model configuration. The OpenAI client is pointed at a local endpoint
# (presumably an Ollama server exposing an OpenAI-compatible API; the api_key
# looks like a placeholder rather than a real secret — confirm).
MODEL = 'llama3'
llm_client = OpenAI(
    api_key='ollama',
    base_url='http://localhost:11434/v1',
)

In [11]:
def suggest_clustering_params():
    """Ask the LLM for hierarchical-clustering hyperparameters.

    The prompt is fixed and carries no statistics about the actual dataset, so
    the model answers from general knowledge only.

    Returns:
        The raw text content of the model's first choice.
    """
    prompt = "Based on the dataset characteristics, suggest optimal hyperparameters for hierarchical clustering, including the number of clusters and linkage type."
    completion = llm_client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Fallback values used whenever the LLM reply does not state a usable setting.
_DEFAULT_CLUSTERING_PARAMS = {"n_clusters": 5, "linkage": "ward"}

# Tried in order of explicitness: keyword form, "number of clusters ...", "N clusters".
_N_CLUSTERS_PATTERNS = (
    r"n_clusters\W{0,10}(\d+)",
    r"number of clusters\D{0,20}(\d+)",
    r"\b(\d+)\s+clusters\b",
)

# A linkage name only counts when it sits next to the word "linkage", because
# "complete", "average" and "single" are also ordinary English words.
_LINKAGE_PATTERN = (
    r"\b(ward|complete|average|single)(?:'s)?[\s-]+linkage"
    r"|linkage[^.\n]{0,30}?\b(ward|complete|average|single)\b"
)


def parse_params(suggestion):
    """Extract clustering hyperparameters from the LLM's free-text suggestion.

    Args:
        suggestion: Text returned by the LLM (prose or JSON-like). Non-string
            or empty input yields the defaults.

    Returns:
        A new dict with keys ``"n_clusters"`` (int >= 2) and ``"linkage"``
        (one of ward/complete/average/single). Any value that cannot be found
        falls back to ``{"n_clusters": 5, "linkage": "ward"}``.
    """
    params = dict(_DEFAULT_CLUSTERING_PARAMS)
    if not isinstance(suggestion, str):
        return params

    text = suggestion.lower()

    for pattern in _N_CLUSTERS_PATTERNS:
        found = re.search(pattern, text)
        # Fewer than two clusters is rejected: a silhouette score is undefined there.
        if found and int(found.group(1)) >= 2:
            params["n_clusters"] = int(found.group(1))
            break

    found = re.search(_LINKAGE_PATTERN, text)
    if found:
        params["linkage"] = found.group(1) or found.group(2)

    return params

# Fetch the LLM's raw suggestion, then obtain the parameter dict (keys
# "n_clusters" and "linkage") that the clustering cell consumes.
suggested_params = suggest_clustering_params()
params = parse_params(suggested_params)

In [12]:
def perform_clustering(embeddings, n_clusters, linkage):
    """Run agglomerative clustering and score the result.

    Args:
        embeddings: The vectors to cluster.
        n_clusters: Number of clusters to form.
        linkage: Linkage criterion passed to ``AgglomerativeClustering``.

    Returns:
        A ``(labels, score)`` tuple: the cluster label per embedding and the
        silhouette score of that labelling over the same embeddings.
    """
    clusterer = AgglomerativeClustering(linkage=linkage, n_clusters=n_clusters)
    assignments = clusterer.fit_predict(embeddings)
    return assignments, silhouette_score(embeddings, assignments)

In [13]:

# Cluster once with the current parameter dict and report the cluster quality.
labels, score = perform_clustering(
    embeddings,
    n_clusters=params["n_clusters"],
    linkage=params["linkage"],
)
print(f"Silhouette Score: {score}")

Silhouette Score: 0.13468380270731145


In [None]:
# Re-query the LLM a few times and keep the best-scoring clustering.
# NOTE: suggest_clustering_params() takes no arguments, so earlier scores are
# not actually fed back to the model; every round is an independent suggestion.
N_ROUNDS = 3
best_run = None  # (score, params, labels) of the best round seen so far
for _ in range(N_ROUNDS):
    suggested_params = suggest_clustering_params()
    round_params = parse_params(suggested_params)
    round_labels, round_score = perform_clustering(
        embeddings, round_params['n_clusters'], round_params['linkage']
    )
    print(f"Silhouette Score: {round_score}")
    if best_run is None or round_score > best_run[0]:
        best_run = (round_score, round_params, round_labels)

# Leave the notebook-level names pointing at the best round rather than at
# whichever round happened to run last.
if best_run is not None:
    score, params, labels = best_run


In [None]:
def get_docs(query: str, top_k: int) -> list[str]:
    """Return the ``content`` of the ``top_k`` stored chunks closest to ``query``.

    Args:
        query: Natural-language question to embed and search with.
        top_k: Maximum number of chunks to return.

    Returns:
        The ``"content"`` payload field of each hit, in the order Qdrant
        returned them.
    """
    query_vector = encoder([query])[0]
    if hasattr(client, "query_points"):
        # `search` is deprecated in current qdrant-client releases in favour of
        # `query_points`, which wraps the hits in a response object.
        hits = client.query_points(
            collection_name=index_name,
            query=query_vector,
            limit=top_k,
            with_payload=True,
        ).points
    else:
        # Older clients that predate `query_points`.
        hits = client.search(
            collection_name=index_name,
            query_vector=query_vector,
            limit=top_k,
            with_payload=True,
        )
    return [hit.payload['content'] for hit in hits]

def build_system_message(docs: list[str]) -> str:
    """Build the system prompt: a fixed preamble followed by the context docs.

    The preamble is concatenated *before* joining. Writing the preamble and
    ``"\\n---\\n".join(docs)`` as adjacent literals makes the whole preamble the
    join separator, so it is repeated between docs and missing entirely when
    there is only one doc.

    Args:
        docs: Context passages, in the order they should appear.

    Returns:
        The preamble followed by the passages separated by ``"\\n---\\n"``.
    """
    preamble = (
        "You are a helpful assistant that answers questions about AI using the "
        "context provided below.\n\n"
        "CONTEXT:\n"
    )
    return preamble + "\n---\n".join(docs)


def generate(query: str, docs: list[str]):
    """Answer ``query`` with the chat model, grounded in ``docs``.

    Args:
        query: The user's question.
        docs: Retrieved context passages placed in the system prompt.

    Returns:
        The text content of the model's first choice.
    """
    messages = [
        {"role": "system", "content": build_system_message(docs)},
        {"role": "user", "content": query}
    ]
    chat_response = llm_client.chat.completions.create(model=MODEL, messages=messages)
    return chat_response.choices[0].message.content

In [None]:
# End-to-end run: retrieve the five closest chunks, then let the LLM answer
# the question from them.
query = "can you tell me about the Llama LLMs?"
docs = get_docs(query, top_k=5)
answer = generate(query=query, docs=docs)
print(answer)