In [1]:
from qdrant_client import QdrantClient, models

HOST = "localhost"
PORT = 6333
COLLECTION_NAME = "RAG_evaluation_test"

DENSE_VECTOR = "dense"
SPARSE_VECTOR = "sparse"
LATE_INTERACTION_VECTOR = "late_interaction"

client = QdrantClient(host=HOST, port=PORT)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# 콜렉션 생성(한 번만)
client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config={
        DENSE_VECTOR: models.VectorParams(
            size=1024,  # size of each vector produced by ColBERT
            distance=models.Distance.COSINE,  # similarity metric between each vector
        ),
        LATE_INTERACTION_VECTOR: models.VectorParams(
            size=128,  # size of each vector produced by ColBERT
            distance=models.Distance.COSINE,  # similarity metric between each vector
            multivector_config=models.MultiVectorConfig(
                comparator=models.MultiVectorComparator.MAX_SIM  # similarity metric between multivectors (matrices)
            ),
        ),
    },
    sparse_vectors_config={
        SPARSE_VECTOR: models.SparseVectorParams(),
    },
)

True

In [7]:
import os
import uuid
from datetime import datetime


import pandas as pd
from qdrant_client.models import PointStruct

from embedding import DenseEmbedding, LateInteractionEmbedding, SparseEmbedding

dense_model = DenseEmbedding()
sparse_model = SparseEmbedding()
late_interaction_model = LateInteractionEmbedding()


def embed_and_load_to_vectordb(data: pd.DataFrame):
    file_name = data.file_name[0]
    for idx, (page,content,) in enumerate(zip(data.page,data.content,)):
        start_time = datetime.now()
        point = PointStruct(
            id=str(uuid.uuid4()),
            vector={
                DENSE_VECTOR: dense_model.embed(content),
                SPARSE_VECTOR: sparse_model.embed(content),
                LATE_INTERACTION_VECTOR: late_interaction_model.embed(content),
            },
            payload={"file_name": file_name, "page": page, "content": content},
        )

        client.upsert(collection_name=COLLECTION_NAME, points=[point])
        print(f">> {file_name}_{idx}: {datetime.now() - start_time}")


base_file_path = os.path.join(os.path.pardir, "data/outputs")
input_files = [file_name for file_name in os.listdir(base_file_path) if file_name.endswith((".csv"))]

total_start_time = datetime.now()
for file_name in input_files:
    try:
        file_path = os.path.join(base_file_path, file_name)
        data = pd.read_csv(file_path)
        embed_and_load_to_vectordb(data)
        print(f"::::::::: Complete: {file_name} :::::::::", end="\n\n")
    except Exception as e:
        print(f"::::::::: Error: {file_name} :::::::::", end="\n\n")
        print(e)
print(f"Total Running Time: {datetime.now() - total_start_time}")

Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 325982.18it/s]
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


>> 2024년 행정안전부 업무계획.pdf_0: 0:00:03.647759
>> 2024년 행정안전부 업무계획.pdf_1: 0:00:00.153555
>> 2024년 행정안전부 업무계획.pdf_2: 0:00:01.525049
>> 2024년 행정안전부 업무계획.pdf_3: 0:00:00.794644
>> 2024년 행정안전부 업무계획.pdf_4: 0:00:00.625109
>> 2024년 행정안전부 업무계획.pdf_5: 0:00:00.972132
>> 2024년 행정안전부 업무계획.pdf_6: 0:00:00.215579
>> 2024년 행정안전부 업무계획.pdf_7: 0:00:00.756548
>> 2024년 행정안전부 업무계획.pdf_8: 0:00:01.115532
>> 2024년 행정안전부 업무계획.pdf_9: 0:00:00.729267
>> 2024년 행정안전부 업무계획.pdf_10: 0:00:00.800236
>> 2024년 행정안전부 업무계획.pdf_11: 0:00:00.143990
>> 2024년 행정안전부 업무계획.pdf_12: 0:00:00.726808
>> 2024년 행정안전부 업무계획.pdf_13: 0:00:00.319330
>> 2024년 행정안전부 업무계획.pdf_14: 0:00:00.800613
>> 2024년 행정안전부 업무계획.pdf_15: 0:00:00.341377
>> 2024년 행정안전부 업무계획.pdf_16: 0:00:00.775451
>> 2024년 행정안전부 업무계획.pdf_17: 0:00:00.314679
>> 2024년 행정안전부 업무계획.pdf_18: 0:00:00.730546
>> 2024년 행정안전부 업무계획.pdf_19: 0:00:00.256887
>> 2024년 행정안전부 업무계획.pdf_20: 0:00:00.712374
>> 2024년 행정안전부 업무계획.pdf_21: 0:00:00.686734
>> 2024년 행정안전부 업무계획.pdf_22: 0:00:00.230568
>> 2024년 행정안전부 업무계획.p

In [19]:
from pathlib import Path 
import pandas as pd
from glob import glob

data_path = Path('.').resolve().parent / "data/outputs"
data_list = data_path.glob('*.csv')
docs_name = [doc.name for doc in data_list]
temp_df_list = [pd.read_csv(doc) for doc in data_list ]
temp_df_list[1]

IndexError: list index out of range