In [4]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

## Load data

In [5]:
# Wczytaj dane z pliku CSV
csv_file_path = "../data/interim//articles_with_score_df.csv"
df = pd.read_csv(csv_file_path)

### Prepare embedding

In [7]:
from sentence_transformers import SentenceTransformer, util
import torch

if torch.cuda.is_available():
    print("CUDA is available: ", torch.cuda.is_available())   
    print("Number of CUDA devices: ", torch.cuda.device_count())
    print("CUDA current device: ", torch.cuda.current_device())
    print("CUDA device name: ", torch.cuda.get_device_name(0))

    # Utwórz instancję modelu SentenceTransformer
    model = SentenceTransformer("all-MiniLM-L6-v2", device='cuda')
else:
    model = SentenceTransformer("all-MiniLM-L6-v2", device='cpu')

# Zanurzenie tytułów
titles = df['title'].tolist()

titles_embeddings = []

for title in tqdm(titles):
    embedding = model.encode([title])
    titles_embeddings.append(embedding[0])

df['embedding'] = titles_embeddings

100%|███████████████████████████████████████████████████████████████████████| 850406/850406 [1:27:03<00:00, 162.79it/s]


## Load data to Vector DB (chroma)

In [37]:
import chromadb
from chromadb.config import Settings

chroma_client = chromadb.HttpClient(host="localhost", port = 8000, settings=Settings(allow_reset=True, anonymized_telemetry=False))

collection_status = False
while collection_status != True:
    try:
        document_collection = chroma_client.get_or_create_collection(name="articles_with_score")
        collection_status = True
    except Exception as e:
        pass

batch_size = 10000
last_confirmed_id = 0

# Dzielimy DataFrame na tablice po 10 000 elementów
for batch_start in tqdm(range(0, df.shape[0], batch_size), desc='Batches', unit='batch'):
    batch_df = df.iloc[batch_start:batch_start + batch_size]

    # Pobieramy listę embeddingów, dokumentów, metadanych i id dla bieżącej partii
    batch_embeddings = batch_df['embedding'].apply(lambda x: x.tolist()).tolist()
    batch_documents = batch_df['title'].tolist()
    batch_metadatas = [{'year': row['year'], 'n_citation': row['n_citation'], 'gov_score': row['gov_score']} for index, row in batch_df.iterrows()]
    batch_ids = [str(index + 1) for index in batch_df.index]
    
    # Dodajemy partię danych do kolekcji
    document_collection.add(
        embeddings=batch_embeddings,
        documents=batch_documents,
        metadatas=batch_metadatas,
        ids=batch_ids
    )

    # Aktualizujemy last_confirmed_id
    last_confirmed_id = batch_df.index[-1] + 1

# Wypisz ostatnio potwierdzone ID
print("Last confirmed ID:", last_confirmed_id)


# Check the size of the collection
try:
    collection_size = document_collection.count()
    print("Size of the collection:", collection_size)
except Exception as e:
    print("Failed to get collection size:", e)

Batches: 100%|██████████████████████████████████████████████████████████████████████| 86/86 [27:16<00:00, 19.03s/batch]


Last confirmed ID: 850406
Size of the collection: 850406


In [42]:
# delete local dataset to clear RAM
del df

In [38]:
# CLEAR
document_collection.count()

#ids = document_collection.get()['ids']
#document_collection.delete(ids)
#document_collection.count()


850406