## Saving the AG News dataset

In [54]:
import pandas as pd
from datasets import load_dataset

In [55]:
# Load the "train" split of the AG News dataset through the `datasets` library.
news_data = load_dataset(
    "ag_news",
    split="train",
)

In [56]:
# Shuffle with a fixed seed so the sample is reproducible, then keep the
# first 2000 rows as the working subset.
shuffled_news = news_data.shuffle(seed=42)
sample_data = shuffled_news.select(range(2000))

In [57]:
# Turn the sampled rows into a pandas DataFrame for inspection and iteration.
news_df = pd.DataFrame(data=sample_data)

In [58]:
# Class balance check: number of sampled rows per numeric label.
news_df["label"].value_counts()

label
3    522
0    509
1    508
2    461
Name: count, dtype: int64

In [59]:
# Maps the dataset's numeric label ids (0-3) to human-readable category names;
# the position of each name in the tuple is its label id.
label_map = dict(
    enumerate(
        (
            "World",
            "Sports",
            "Business",
            "Sci/Tech",
        )
    )
)

## Create Qdrant collection

In [60]:
from qdrant_client import QdrantClient, models

In [61]:
# Client for the Qdrant instance at localhost:6333.
client = QdrantClient(location="http://localhost:6333")

In [62]:
from fastembed import TextEmbedding

# Collect the distinct embedding sizes offered by the fastembed text models.
dimensions = {
    supported_model['dim']
    for supported_model in TextEmbedding.list_supported_models()
}

print(f"Supported dimensions: {sorted(dimensions)}")

Supported dimensions: [384, 512, 768, 1024]


In [63]:
# List the 512-dimensional models whose description marks them as English
# text-embedding models.
for model in TextEmbedding.list_supported_models():
    if model['dim'] != 512:
        continue

    description = model['description']
    if "Text embeddings" in description and "English" in description:
        print(f"Found suitable model: {model['model']} : {model['description']}")

Found suitable model: Qdrant/clip-ViT-B-32-text : Text embeddings, Multimodal (text&image), English, 77 input tokens truncation, Prefixes for queries/documents: not necessary, 2021 year
Found suitable model: jinaai/jina-embeddings-v2-small-en : Text embeddings, Unimodal (text), English, 8192 input tokens truncation, Prefixes for queries/documents: not necessary, 2023 year.


In [64]:
# Embedding model used for every document: one of the 512-dimensional English
# text models listed by the search above.
embedding_model = "jinaai/jina-embeddings-v2-small-en"

In [66]:
collection_name_news = "ag_news"

# `recreate_collection` is deprecated in qdrant-client; the supported way to
# get the same "start from an empty collection" behaviour is to drop the
# collection if it already exists and then create it. This keeps the cell
# idempotent, so re-running the notebook always begins with a clean collection.
if client.collection_exists(collection_name=collection_name_news):
    client.delete_collection(collection_name=collection_name_news)

client.create_collection(
    collection_name=collection_name_news,
    vectors_config=models.VectorParams(
        # Must equal the output dimension of `embedding_model` (512 for the
        # model selected above).
        size=512,
        distance=models.Distance.COSINE,
    ),
)

  client.recreate_collection(


True

## Inserting news articles into the vector DB

In [67]:
# Build one Qdrant point per sampled news article.
#
# * Point ids are consecutive integers starting at 0, generated by `enumerate`
#   instead of a hand-maintained counter named `id`, which would shadow the
#   builtin `id()` for the rest of the kernel session.
# * The vector is given as a `models.Document`, so the raw text is turned into
#   an embedding with `embedding_model` when the points are upserted rather
#   than being embedded here.
# * The payload keeps the original text and the human-readable category name.
points = [
    models.PointStruct(
        id=point_id,
        vector=models.Document(text=news['text'], model=embedding_model),
        payload={
            "news": news['text'],
            "label": label_map[news['label']],
        },
    )
    for point_id, (_, news) in enumerate(news_df.iterrows())
]

In [68]:
# Write all prepared points into the news collection in a single request;
# the returned UpdateResult is shown as the cell output.
client.upsert(collection_name=collection_name_news, points=points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

Qdrant visualization parameters:
{
  "limit" : 1000,
  "color_by": {
    "payload": "label"
  }
}