In [1]:
import pandas as pd
import tiktoken

from embeddings_utils import get_embedding

In [2]:
embedding_model = "text-embedding-3-small"
embedding_encoding = "cl100k_base"
max_tokens = 8000  # the maximum for text-embedding-3-small is 8191

In [12]:
# load & inspect dataset
input_datapath = "filtered_book_info.csv"  # to save space, we provide a pre-filtered dataset
df = pd.read_csv(input_datapath, index_col=0)
df = df[[ "TITLE_NM", "AUTHR_NM", "IMAGE_URL", "BOOK_INTRCN_CN", "TWO_PBLICTE_DE"]]
df = df.dropna()
df["combined"] = (
    "Title: " + df.TITLE_NM.str.strip() + "; Content: " + df.BOOK_INTRCN_CN.str.strip()
)
df.head(2)

Unnamed: 0_level_0,TITLE_NM,AUTHR_NM,IMAGE_URL,BOOK_INTRCN_CN,TWO_PBLICTE_DE,combined
ISBN_THIRTEEN_NO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9791156759270,너에게 목소리를 보낼게 - &lt;달빛천사&gt; 성우 이용신의 첫 번째 에세이,이용신 (지은이),https://image.aladin.co.kr/product/28415/8/cov...,2004년 방영한 애니메이션 &lt;달빛천사&gt;에서 주인공 루나(풀문) 역을 맡...,2021-12-03,Title: 너에게 목소리를 보낼게 - &lt;달빛천사&gt; 성우 이용신의 첫 번...
9791168120877,일기에도 거짓말을 쓰는 사람 - 99년생 시인의 자의식 과잉 에세이,차도하 (지은이),https://image.aladin.co.kr/product/28414/66/co...,“그러니 나는 말하고 싶은 것을 말하겠다”「침착하게 사랑하기」 차도하 시인 첫 에세...,2021-12-06,Title: 일기에도 거짓말을 쓰는 사람 - 99년생 시인의 자의식 과잉 에세이; ...


In [14]:
# Ensur["ISBN_THIRTEEN_NO", "TITLE_NM", "AUTHOR_NM", "IMAGE_URL, "BOOK_INTRCN_CN", "TWO_PBLICTE_DE"]e you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage

# This may take a few minutes
df["embedding"] = df.combined.apply(lambda x: get_embedding(x, model=embedding_model))
df.to_csv("filtered_book_info_embedded.csv")

In [6]:
import qdrant_client
qdrant = qdrant_client.QdrantClient(host="localhost", port=6333)
qdrant.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='Books'), CollectionDescription(name='Articles')])

In [3]:
from qdrant_client.http import models as rest

In [22]:
df["embedding"]

ISBN_THIRTEEN_NO
9791156759270    [0.07661158591508865, -0.014539572410285473, -...
9791168120877    [0.07918298244476318, -0.02614027075469494, -0...
9791168120839    [0.04609770327806473, 0.08594229072332382, -0....
9791168120846    [0.05182841047644615, 0.0358114056289196, -0.0...
9791168120747    [0.044340625405311584, -0.001190714887343347, ...
                                       ...                        
9791197569708    [0.05330181121826172, 0.036431942135095596, 0....
9791138804486    [0.04476550221443176, 0.049679797142744064, 0....
9791138804523    [0.05430883169174194, 0.032628659158945084, -0...
9791197549335    [0.07586272060871124, 0.02601887285709381, -0....
9788970936222    [-0.001166055561043322, 0.03201407194137573, 0...
Name: embedding, Length: 82525, dtype: object

In [10]:


import ast

def convert_embedding(embedding):
    if isinstance(embedding, str):
        # Convert string representation of list to actual list
        embedding = ast.literal_eval(embedding)
    return embedding

# Apply the conversion to the 'embedding' column
df['embedding'] = df['embedding'].apply(convert_embedding)


In [12]:

# Get the vector size from the first row to set up the collection
vector_size = len(df.iloc[0]['embedding'])

# Set up the collection with the vector configuration. You need to declare the vector size and distance metric for the collection. Distance metric enables vector database to index and search vectors efficiently.
qdrant.recreate_collection(
    collection_name='Book2',
    vectors_config={
        'combined': rest.VectorParams(
            distance=rest.Distance.COSINE,
            size=vector_size,
        ),
    }
)

  qdrant.recreate_collection(


True

In [13]:


from qdrant_client.models import PointStruct # Import the PointStruct to store the vector and payload
from tqdm import tqdm # Library to show the progress bar 

# Populate collection with vectors using tqdm to show progress
for k, v in tqdm(df.iterrows(), desc="Upserting articles", total=len(df)):
    try:
        qdrant.upsert(
            collection_name='Book2',
            points=[
                PointStruct(
                    id=k,
                    vector={'combined': v['embedding']},
                    payload={
                        'id': v['AUTHR_NM'],
                        'title': v['TITLE_NM'],
                        'url': v["IMAGE_URL"],
                        'intro': v["BOOK_INTRCN_CN"],
                        "date": v["TWO_PBLICTE_DE"]
                    }
                )
            ]
        )
    except Exception as e:
        print(f"Failed to upsert row {k}: {v}")
        print(f"Exception: {e}")

Upserting articles: 100%|██████████| 82525/82525 [03:51<00:00, 355.79it/s]


In [8]:
# Check the collection size to make sure all the points have been stored
qdrant.count(collection_name='Articles')

NameError: name 'qdrant' is not defined