In [28]:
import pandas as pd
from dotenv import load_dotenv
from langchain_community.document_loaders import DataFrameLoader
from langchain_openai import OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_milvus import Milvus
from langchain_community.vectorstores import FAISS
import torch
from typing import List
import math
import os


In [29]:
# Environment diagnostics: report the PyTorch build and the GPU it can see.
print(torch.version.cuda)  # CUDA version PyTorch was built with
print(f"CUDA available: {torch.cuda.is_available()}")
print(torch.backends.cudnn.version())  # cuDNN version
# Only query device 0 when CUDA is usable; asking for its name on a CPU-only
# machine raises and would stop a Restart & Run All at this cell.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # GPU name
else:
    print("No CUDA device available")
print(torch.__version__)  # PyTorch version


11.8
CUDA available: True
90100
NVIDIA GeForce RTX 3050 Laptop GPU
2.7.1+cu118


In [30]:
# Load variables from a .env file into the process environment; later cells read
# their settings (collection name, cloud URI, API token) through os.getenv().
load_dotenv()

True

In [31]:
# Pick the compute device: the GPU when PyTorch reports CUDA as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [42]:
# Read the article catalogue. article_id is forced to str so it is never parsed as a number
# (presumably to keep leading zeros — the ids shown further down look like '0808866006').
articles = pd.read_csv('data/articles_full_desc.csv', dtype={'article_id': str})

In [44]:
# Dump only the full_description column to a text file: no index, no header row, UTF-8.
# A single column is written, so the separator is never emitted between fields.
# NOTE(review): sep='\n' looks like a way to keep comma-containing descriptions unquoted — confirm,
# and confirm the csv writer in the installed Python/pandas accepts a newline delimiter.
articles['full_description'].to_csv('data/full_desc.txt', index=False, sep='\n', header=False, encoding='utf-8')

In [51]:
# Turn each row of `articles` into a LangChain document: full_description becomes the page
# content and the other selected column (article_id) is carried along as metadata.
loader = DataFrameLoader(articles[['article_id', 'full_description']], page_content_column='full_description')
docs = loader.load()


In [52]:
# Spot-check one loaded document (arbitrary index; needs at least 2001 documents to succeed).
docs[2000]

Document(metadata={'article_id': '0808866006'}, page_content='0808866006 Sarah Low BB of type Sneakers from group Shoes and section Kids & Baby Shoes. Product color is White and perceived color is Light. Trainers in imitation leather with a padded edge, lacing at the front and a loop at the back. Mesh linings and insoles and patterned soles.')

In [53]:
# Alternative backend (disabled): a Hugging Face model named by the HUGGING_FACE_EMBEDDING
# environment variable, placed on `device`. Uncomment to use it instead of OpenAI.
# embeddings = HuggingFaceEmbeddings(
#     model_name=os.getenv('HUGGING_FACE_EMBEDDING'),
#     model_kwargs={
#         'device': device
#     }
# )

# Active backend: OpenAI embeddings constructed with the library's default settings.
# NOTE(review): presumably picks up its API key from the environment loaded earlier — confirm.
embeddings = OpenAIEmbeddings()

In [54]:
def batch_import_to_milvus(documents: List, embeddings, batch_size: int = 100, **milvus_args):
    """Embed ``documents`` and insert them into a Milvus vector store in batches.

    The first batch is passed to ``Milvus.from_documents`` together with
    ``milvus_args``; every following batch is appended to the returned store
    with ``add_documents``. A progress line is printed before each batch.

    Args:
        documents: Sliceable sequence of documents to import. An empty sequence
            still results in one ``Milvus.from_documents`` call with an empty batch.
        embeddings: Embedding object forwarded as ``embedding=`` to
            ``Milvus.from_documents``.
        batch_size: Maximum number of documents sent per call; must be >= 1.
        **milvus_args: Extra keyword arguments forwarded unchanged to
            ``Milvus.from_documents``.

    Returns:
        The store object returned by ``Milvus.from_documents``, after all
        remaining batches have been added to it.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A zero size used to surface as ZeroDivisionError and a negative size
    # silently sliced documents away, so reject both up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    doc_count = len(documents)
    # At least one batch is always reported, even when there is nothing to import.
    total_batches = max(1, math.ceil(doc_count / batch_size))

    # The first batch goes through from_documents, which builds the store from milvus_args.
    print(f"Processing batch 1/{total_batches}")
    db = Milvus.from_documents(
        documents=documents[:batch_size],
        embedding=embeddings,
        **milvus_args
    )

    # Remaining batches start right after the first one; numbering continues at 2.
    remaining_starts = range(batch_size, doc_count, batch_size)
    for batch_number, start_idx in enumerate(remaining_starts, start=2):
        print(f"Processing batch {batch_number}/{total_batches}")
        db.add_documents(documents=documents[start_idx:start_idx + batch_size])

    return db

In [55]:
# Import every loaded document into the cloud-hosted collection.
# All connection settings come from environment variables; os.getenv() yields None
# for any variable that is not set.
# NOTE(review): drop_old=True is forwarded to Milvus.from_documents — presumably it replaces an
# existing collection of the same name; confirm before re-running this cell.
db_articles = batch_import_to_milvus(
    docs,
    embeddings,
    batch_size=300,  # documents per insert call; tune to memory constraints
    connection_args=dict(
        uri=os.getenv('ZILLIZ_CLOUD_URI'),
        token=os.getenv('ZILLIZ_CLOUD_API_KEY'),
        secure=True,
    ),
    collection_name=os.getenv('MILVIUS_COLLECTION'),
    auto_id=True,
    drop_old=True,
)


Processing batch 1/84


2025-06-17 17:46:11,708 [DEBUG][_create_connection]: Created new connection using: e70470a41d194f84b221d6afc7bf0b4a (async_milvus_client.py:599)


Processing batch 2/84
Processing batch 3/84
Processing batch 4/84
Processing batch 5/84
Processing batch 6/84
Processing batch 7/84
Processing batch 8/84
Processing batch 9/84
Processing batch 10/84
Processing batch 11/84
Processing batch 12/84
Processing batch 13/84
Processing batch 14/84
Processing batch 15/84
Processing batch 16/84
Processing batch 17/84
Processing batch 18/84
Processing batch 19/84
Processing batch 20/84
Processing batch 21/84
Processing batch 22/84
Processing batch 23/84
Processing batch 24/84
Processing batch 25/84
Processing batch 26/84
Processing batch 27/84
Processing batch 28/84
Processing batch 29/84
Processing batch 30/84
Processing batch 31/84
Processing batch 32/84
Processing batch 33/84
Processing batch 34/84
Processing batch 35/84
Processing batch 36/84
Processing batch 37/84
Processing batch 38/84
Processing batch 39/84
Processing batch 40/84
Processing batch 41/84
Processing batch 42/84
Processing batch 43/84
Processing batch 44/84
Processing batch 45

In [56]:
# Fetch the 10 stored articles closest to a free-text query.
# Each hit is indexed further down as hit[0] = document; presumably hit[1] is its score — confirm.
recs = db_articles.similarity_search_with_score('white shoes for women', k=10)

In [66]:
# Show the document part of the first hit.
recs[0][0]

Document(metadata={'article_id': '0805095005', 'pk': 458312174353380081}, page_content='0805095005 Theresa Loafer of type Ballerinas from group Shoes and section Womens Shoes. Product color is White and perceived color is Light. Loafers in imitation leather with a decorative seam at the front and metal buckle at the top. Satin linings and imitation leather insoles. Heel approx. 1.8 cm.')