# Get the embedding model

Reference: https://qdrant.tech/documentation/fastembed/fastembed-colbert/

In [5]:
from fastembed import LateInteractionTextEmbedding

In [6]:
# Print the identifier of every late-interaction model fastembed supports
supported_models = LateInteractionTextEmbedding.list_supported_models()
for model_info in supported_models:
    print(model_info['model'])

colbert-ir/colbertv2.0
answerdotai/answerai-colbert-small-v1
jinaai/jina-colbert-v2


In [7]:
# Late-interaction (ColBERT) checkpoint used for the multi-vector embeddings
model_name = "colbert-ir/colbertv2.0"
embedding_model = LateInteractionTextEmbedding(model_name=model_name)

# Prepare the data

### Convert PDFs to Markdown using Docling

In [8]:
from docling.document_converter import DocumentConverter

# arXiv PDFs to ingest; the order matters because later cells pair each
# document with its metadata entry by position
sources = [
    "https://arxiv.org/pdf/2408.09869",
    "https://www.arxiv.org/pdf/2509.04664",
    "https://arxiv.org/pdf/2505.09388",
    "https://arxiv.org/pdf/2506.05176"
]

converter = DocumentConverter()

# Convert every PDF and keep only its Markdown export, one string per source
results_parsed = [
    converter.convert(pdf_url).document.export_to_markdown()
    for pdf_url in sources
]

2025-09-14 01:17:09,798 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-14 01:17:09,874 - INFO - Going to convert document batch...
2025-09-14 01:17:09,876 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e647edf348883bed75367b22fbe60347
2025-09-14 01:17:09,892 - INFO - Loading plugin 'docling_defaults'
2025-09-14 01:17:09,895 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-09-14 01:17:09,912 - INFO - Loading plugin 'docling_defaults'
2025-09-14 01:17:09,917 - INFO - Registered ocr engines: ['easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-09-14 01:17:10,094 - INFO - Accelerator device: 'cuda:0'
    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
2025-09-14 01:17:12,798 - INFO - Accelerator device: 'cuda:

### Chunking using chonkie

In [9]:
from chonkie import SemanticChunker

# Semantic chunker configuration, gathered in one place for easy tuning
chunker_settings = {
    "embedding_model": "minishlab/potion-base-32M",  # Default model
    "threshold": 0.8,                                # Similarity threshold (0-1)
    "chunk_size": 1024,                              # Maximum tokens per chunk
    "similarity_window": 3,                          # Window for similarity calculation
    "skip_window": 0,                                # Skip-and-merge window (0=disabled)
}
chunker = SemanticChunker(**chunker_settings)

# One list of chunks per parsed document (a list of lists, in document order)
chunks = [chunker.chunk(doc) for doc in results_parsed]

2025-09-14 01:19:03,409 - INFO - Folder does not exist locally, attempting to use huggingface hub.


## Connect to Qdrant

In [10]:
from qdrant_client import QdrantClient

# Address of the Qdrant instance this notebook talks to
qdrant_url = "http://localhost:6333"
client = QdrantClient(url=qdrant_url)

2025-09-14 01:19:07,330 - INFO - HTTP Request: GET http://localhost:6333 "HTTP/1.1 200 OK"


### Create a collection if it does not exist

In [11]:
# Display the collections that currently exist on the Qdrant server
client.get_collections().collections

2025-09-14 01:19:07,344 - INFO - HTTP Request: GET http://localhost:6333/collections "HTTP/1.1 200 OK"


[]

In [12]:
from qdrant_client.http import models

collection_name = "test_multivector"

# Create the collection only when it is missing.
# `client.get_collections().collections` is a list of collection description
# objects, not of names, so a `str in list` membership test never matches.
# That made the old guard always attempt creation, which fails when the cell
# is re-run against an existing collection. Ask the server directly instead.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            # Dimension of each individual vector inside a multi-vector;
            # it has to match the output size of the embedding model
            size=128,
            distance=models.Distance.COSINE,
            # Store several vectors per point and compare them with MaxSim
            multivector_config=models.MultiVectorConfig(
                comparator=models.MultiVectorComparator.MAX_SIM
            )
        ),
    )

2025-09-14 01:19:07,355 - INFO - HTTP Request: GET http://localhost:6333/collections "HTTP/1.1 200 OK"
2025-09-14 01:19:07,404 - INFO - HTTP Request: PUT http://localhost:6333/collections/test_multivector "HTTP/1.1 200 OK"


### Embed data using multi-vector

In [13]:
# Human-readable titles of the ingested papers, one per document
document_names = (
    "Docling Technical Report",
    "Why Language Models Hallucinate",
    "Qwen 3 Technical Report",
    "Qwen 3 Embedding Technical Report",
)

# One payload skeleton per document, in the same order as the titles above
metadata = [{"document_name": name} for name in document_names]

In [14]:
# Flatten to one payload per chunk: the title of the chunk's parent document
# plus the chunk text, in document order and then chunk order
new_metadata = [
    {
        "document_name": metadata[doc_idx]["document_name"],
        "text": piece.text,
    }
    for doc_idx, doc_chunks in enumerate(chunks)
    for piece in doc_chunks
]

In [15]:
# Flatten the per-document chunk lists into a single list of chunk texts
chunk_texts = []
for doc_chunks in chunks:
    chunk_texts.extend(chunk.text for chunk in doc_chunks)

# Materialise the embeddings so they can be enumerated alongside the payloads
chunks_embedding = [*embedding_model.embed(chunk_texts)]

In [16]:
# Upload one point per chunk. The point id is the chunk's position in
# `chunks_embedding`, which also indexes its payload in `new_metadata`.
client.upload_points(
    collection_name=collection_name,
    points=[
        models.PointStruct(
            id=idx,
            payload=new_metadata[idx],
            vector=vector
        )
        for idx, vector in enumerate(chunks_embedding)
    ],
    # Block until the server has applied every batch. The default is
    # wait=False, so on "Run All" the search cell could run against a
    # partially populated collection.
    wait=True,
)

2025-09-14 01:24:24,831 - INFO - HTTP Request: PUT http://localhost:6333/collections/test_multivector/points?wait=false "HTTP/1.1 200 OK"
2025-09-14 01:24:25,070 - INFO - HTTP Request: PUT http://localhost:6333/collections/test_multivector/points?wait=false "HTTP/1.1 200 OK"
2025-09-14 01:24:25,412 - INFO - HTTP Request: PUT http://localhost:6333/collections/test_multivector/points?wait=false "HTTP/1.1 200 OK"
2025-09-14 01:24:25,733 - INFO - HTTP Request: PUT http://localhost:6333/collections/test_multivector/points?wait=false "HTTP/1.1 200 OK"
2025-09-14 01:24:25,904 - INFO - HTTP Request: PUT http://localhost:6333/collections/test_multivector/points?wait=false "HTTP/1.1 200 OK"
2025-09-14 01:24:26,064 - INFO - HTTP Request: PUT http://localhost:6333/collections/test_multivector/points?wait=false "HTTP/1.1 200 OK"
2025-09-14 01:24:26,281 - INFO - HTTP Request: PUT http://localhost:6333/collections/test_multivector/points?wait=false "HTTP/1.1 200 OK"
2025-09-14 01:24:26,569 - INFO - H

## Search

In [17]:
# query_embed returns a generator; materialise it and take the first item,
# which is the embedding of the single query string
query_multivector = list(embedding_model.query_embed("why llm hallucinate"))[0]

client.query_points(
    collection_name=collection_name,
    query=query_multivector,
    limit=10,  # number of top-scoring points to return
    # with_vectors=True would also return the stored vectors
    with_payload=True,  # return the stored metadata with each hit
)

2025-09-14 01:24:29,116 - INFO - HTTP Request: POST http://localhost:6333/collections/test_multivector/points/query "HTTP/1.1 200 OK"


QueryResponse(points=[ScoredPoint(id=373, version=5, score=23.378906, payload={'document_name': 'Why Language Models Hallucinate', 'text': '- Yiyou Sun, Yu Gai, Lijie Chen, Abhilasha Ravichander, Yejin Choi, and Dawn Song. 2025. Why and How LLMs Hallucinate: Connecting the Dots with Subsequence Associations. https: //doi.org/10.48550/arXiv.2504.12691 arXiv:2504.12691 [cs.CL]\n- Mirac Suzgun, Nathan Scales, Nathanael Sch¨ arli, Sebastian Gehrmann, Yi Tay, Hyung Won Chung, Aakanksha Chowdhery, Quoc V. Le, Ed H. Chi, Denny Zhou, and Jason Wei. '}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=272, version=4, score=18.774275, payload={'document_name': 'Why Language Models Hallucinate', 'text': '2024. Does Fine-Tuning LLMs on New Knowledge Encourage Hallucinations?. '}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=109, version=1, score=18.587532, payload={'document_name': 'Why Language Models Hallucinate', 'text': "\nLike students facing hard exam question

## Embed data as single vectors

In [23]:
# Requires transformers>=4.51.0
# Requires sentence-transformers>=2.7.0

from sentence_transformers import SentenceTransformer

# Load the single-vector embedding model
model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")

# Encode every chunk text with the model's 'document' prompt
document_embeddings = model.encode(
    sentences=chunk_texts,
    prompt_name='document',
)

2025-09-14 01:36:17,311 - INFO - Use pytorch device_name: cuda:0
2025-09-14 01:36:17,312 - INFO - Load pretrained SentenceTransformer: Qwen/Qwen3-Embedding-0.6B
2025-09-14 01:36:24,644 - INFO - 1 prompt is loaded, with the key: query
Batches: 100%|██████████| 32/32 [00:07<00:00,  4.20it/s]


In [22]:
# Display the prompt templates configured on the loaded model, keyed by prompt name
model.prompts

{'query': 'Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery:',
 'document': ''}

In [24]:
from qdrant_client.http import models

collection_name = "test_single_vector"

# Create the collection only when it is missing.
# `client.get_collections().collections` is a list of collection description
# objects, not of names, so a `str in list` membership test never matches.
# That made the old guard always attempt creation, which fails when the cell
# is re-run against an existing collection. Ask the server directly instead.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            # Has to match the dimension of the vectors uploaded to this collection
            size=1024,
            distance=models.Distance.COSINE,
        ),
    )

2025-09-14 01:36:48,396 - INFO - HTTP Request: GET http://localhost:6333/collections "HTTP/1.1 200 OK"
2025-09-14 01:36:48,443 - INFO - HTTP Request: PUT http://localhost:6333/collections/test_single_vector "HTTP/1.1 200 OK"


In [25]:
# Upload one point per chunk. The point id is the chunk's position in
# `document_embeddings`, which also indexes its payload in `new_metadata`.
client.upload_points(
    collection_name=collection_name,
    points=[
        models.PointStruct(
            id=idx,
            payload=new_metadata[idx],
            vector=vector
        )
        for idx, vector in enumerate(document_embeddings)
    ],
    # Block until the server has applied every batch. The default is
    # wait=False, so on "Run All" the search cell could run against a
    # partially populated collection.
    wait=True,
)

2025-09-14 01:36:54,215 - INFO - HTTP Request: PUT http://localhost:6333/collections/test_single_vector/points?wait=false "HTTP/1.1 200 OK"
2025-09-14 01:36:54,262 - INFO - HTTP Request: PUT http://localhost:6333/collections/test_single_vector/points?wait=false "HTTP/1.1 200 OK"
2025-09-14 01:36:54,314 - INFO - HTTP Request: PUT http://localhost:6333/collections/test_single_vector/points?wait=false "HTTP/1.1 200 OK"
2025-09-14 01:36:54,376 - INFO - HTTP Request: PUT http://localhost:6333/collections/test_single_vector/points?wait=false "HTTP/1.1 200 OK"
2025-09-14 01:36:54,438 - INFO - HTTP Request: PUT http://localhost:6333/collections/test_single_vector/points?wait=false "HTTP/1.1 200 OK"
2025-09-14 01:36:54,509 - INFO - HTTP Request: PUT http://localhost:6333/collections/test_single_vector/points?wait=false "HTTP/1.1 200 OK"
2025-09-14 01:36:54,579 - INFO - HTTP Request: PUT http://localhost:6333/collections/test_single_vector/points?wait=false "HTTP/1.1 200 OK"
2025-09-14 01:36:54,

In [28]:
# Encode the question with the model's 'query' prompt
query_vector = model.encode("why llm hallucinate", prompt_name='query')

retrieved = client.query_points(
    collection_name=collection_name,
    query=query_vector,
    limit=10,  # number of top-scoring points to return
    # with_vectors=True would also return the stored vectors
    with_payload=True,  # return the stored metadata with each hit
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 12.29it/s]
2025-09-14 01:38:23,136 - INFO - HTTP Request: POST http://localhost:6333/collections/test_single_vector/points/query "HTTP/1.1 200 OK"


In [31]:
# Inspect the type of the object returned by query_points
type(retrieved)

qdrant_client.http.models.models.QueryResponse

In [39]:
# Print the stored chunk text of every hit, each followed by a separator line
for hit in retrieved.points:
    print(hit.payload['text'], "-------------", sep="\n")

## Why Language Models Hallucinate

Adam Tauman Kalai ∗ OpenAI

Ofir Nachum OpenAI

Santosh S. Vempala † Georgia Tech


-------------

Like students facing hard exam questions, large language models sometimes guess when uncertain, producing plausible yet incorrect statements instead of admitting uncertainty. Such 'hallucinations' persist even in state-of-the-art systems and undermine trust. We argue that language models hallucinate because the training and evaluation procedures reward guessing over acknowledging uncertainty, and we analyze the statistical causes of hallucinations in the modern training pipeline. Hallucinations need not be mysterious-they originate simply as errors in binary classification. If incorrect statements cannot be distinguished from facts, then hallucinations in pretrained language models will arise through natural statistical pressures. We then argue that hallucinations persist due to the way most evaluations are graded-language models are optimized to be goo

In [36]:
# query_embed returns a generator; materialise it and take the first item,
# which is the embedding of the single query string
multivec_query = list(embedding_model.query_embed("why llm hallucinate"))[0]

retrieved_multivec = client.query_points(
    collection_name="test_multivector",
    query=multivec_query,
    limit=10,  # number of top-scoring points to return
    # with_vectors=True would also return the stored vectors
    with_payload=True,  # return the stored metadata with each hit
)

2025-09-14 01:40:59,164 - INFO - HTTP Request: POST http://localhost:6333/collections/test_multivector/points/query "HTTP/1.1 200 OK"


In [38]:
# Print the stored chunk text of every multi-vector hit, with a separator after each
separator = "-------------"
for scored_point in retrieved_multivec.points:
    print(scored_point.payload['text'])
    print(separator)

- Yiyou Sun, Yu Gai, Lijie Chen, Abhilasha Ravichander, Yejin Choi, and Dawn Song. 2025. Why and How LLMs Hallucinate: Connecting the Dots with Subsequence Associations. https: //doi.org/10.48550/arXiv.2504.12691 arXiv:2504.12691 [cs.CL]
- Mirac Suzgun, Nathan Scales, Nathanael Sch¨ arli, Sebastian Gehrmann, Yi Tay, Hyung Won Chung, Aakanksha Chowdhery, Quoc V. Le, Ed H. Chi, Denny Zhou, and Jason Wei. 
-------------
2024. Does Fine-Tuning LLMs on New Knowledge Encourage Hallucinations?. 
-------------

Like students facing hard exam questions, large language models sometimes guess when uncertain, producing plausible yet incorrect statements instead of admitting uncertainty. Such 'hallucinations' persist even in state-of-the-art systems and undermine trust. We argue that language models hallucinate because the training and evaluation procedures reward guessing over acknowledging uncertainty, and we analyze the statistical causes of hallucinations in the modern training pipeline. Halluc

## Check the size of the two collections

In [40]:
# Fetch the server-side description of the single-vector collection and print it
info = client.get_collection(collection_name="test_single_vector")
print(info)

2025-09-14 01:43:18,630 - INFO - HTTP Request: GET http://localhost:6333/collections/test_single_vector "HTTP/1.1 200 OK"


status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=0 points_count=1000 segments_count=8 config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=1024, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=10000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=No

In [41]:
# Fetch the server-side description of the multi-vector collection and print it
info = client.get_collection(collection_name="test_multivector")
print(info)

2025-09-14 01:43:28,559 - INFO - HTTP Request: GET http://localhost:6333/collections/test_multivector "HTTP/1.1 200 OK"


status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=0 points_count=1000 segments_count=8 config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=128, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=MultiVectorConfig(comparator=<MultiVectorComparator.MAX_SIM: 'max_sim'>)), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=10000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfi