In [68]:
import pandas as pd
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_milvus import Milvus
from langchain_community.vectorstores import FAISS
import torch
from typing import List
import math
import os


In [42]:
# Report the PyTorch / CUDA environment this notebook is running in.
print(torch.version.cuda)  # CUDA version PyTorch was built with
print(f"CUDA available: {torch.cuda.is_available()}")
print(torch.backends.cudnn.version())  # cuDNN version
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # GPU name
else:
    # get_device_name(0) raises when no CUDA device exists; keep the cell runnable on CPU.
    print("No CUDA device detected")
print(torch.__version__)  # PyTorch version


11.8
CUDA available: True
90100
NVIDIA GeForce RTX 3050 Laptop GPU
2.7.1+cu118


In [66]:
# Load variables from a .env file into the process environment; later cells read them via os.getenv.
load_dotenv()

True

In [44]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [56]:
# Read article ids as strings so that leading zeros are not lost.
articles = pd.read_csv(
    'data/articles_full_desc.csv',
    dtype={'article_id': str},
)

In [57]:
# Display the loaded frame (pandas abbreviates the rendering to the first and last rows).
articles

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,full_description
0,0681107007,681107,Dahlia,265,Dress,Garment Full body,1010017,Stripe,72,Blue,...,H,Children Sizes 92-140,4,Baby/Children,76,Kids Girl,1014,Dresses/Skirts girls,Sleeveless dress in a patterned cotton weave w...,0681107007 Dahlia of type Dress from group Gar...
1,0686284001,686284,Santa sweater,252,Sweater,Garment Upper body,1010001,All over pattern,42,Red,...,G,Baby Sizes 50-98,4,Baby/Children,41,Baby Boy,1003,Knitwear,Long-sleeved jumper in jacquard-knit cotton wi...,0686284001 Santa sweater of type Sweater from ...
2,0544254002,544254,NEVADA,252,Sweater,Garment Upper body,1010001,All over pattern,9,Black,...,F,Menswear,3,Menswear,20,Contemporary Smart,1005,Jersey Fancy,"Jumper in a soft, jacquard-knit cotton blend w...",0544254002 NEVADA of type Sweater from group G...
3,0820021001,820021,Pat ls bd oxford gingham,259,Shirt,Garment Upper body,1010004,Check,19,Greenish Khaki,...,F,Menswear,3,Menswear,21,Contemporary Casual,1011,Shirts,Shirt in soft Oxford cotton with a button-down...,0820021001 Pat ls bd oxford gingham of type Sh...
4,0754256001,754256,GREENVILLE high support bra,306,Bra,Underwear,1010016,Solid,9,Black,...,S,Sport,26,Sport,5,Ladies H&M Sport,1005,Jersey Fancy,Sports bra in fast-drying functional fabric. V...,0754256001 GREENVILLE high support bra of type...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0569374002,569374,CARINA molded high support bra,306,Bra,Underwear,1010005,Colour blocking,73,Dark Blue,...,S,Sport,26,Sport,5,Ladies H&M Sport,1005,Jersey Fancy,Sports bra in fast-drying functional fabric wi...,0569374002 CARINA molded high support bra of t...
19996,0661365001,661365,Mitchell dress,265,Dress,Garment Full body,1010001,All over pattern,93,Dark Green,...,A,Ladieswear,1,Ladieswear,18,Womens Trend,1010,Blouses,"Long, sleeveless dress in a patterned weave wi...",0661365001 Mitchell dress of type Dress from g...
19997,0592991003,592991,Ari,252,Sweater,Garment Upper body,1010008,Front print,9,Black,...,A,Ladieswear,1,Ladieswear,8,Mama,1005,Jersey Fancy,Top in sweatshirt fabric with appliqués on the...,0592991003 Ari of type Sweater from group Garm...
19998,0618287004,618287,Malena,265,Dress,Garment Full body,1010016,Solid,9,Black,...,A,Ladieswear,1,Ladieswear,11,Womens Tailoring,1013,Dresses Ladies,"V-neck dress in an airy weave with short, doub...",0618287004 Malena of type Dress from group Gar...


In [58]:
# Write one description per line as plain text.
# Going through the CSV writer would wrap any description that contains a double
# quote or a line break in quotes (and double the embedded quotes), corrupting the
# text that is embedded later. Missing values become empty lines, as before.
descriptions = articles['full_description'].fillna('').astype(str)
with open('data/full_desc.txt', 'w', encoding='utf-8') as desc_file:
    for description in descriptions:
        # Collapse embedded line breaks so each article stays on exactly one line.
        desc_file.write(' '.join(description.splitlines()) + '\n')

In [59]:
# Read the descriptions file back in through TextLoader.
raw_docs = TextLoader(
    'data/full_desc.txt',
    encoding='utf-8',
).load()

In [60]:
# Split on newlines only. chunk_size=0 is used so that lines are not merged
# together; the splitter logs a "longer than the specified 0" warning per chunk.
text_splitter = CharacterTextSplitter(
    separator='\n',
    chunk_size=0,
    chunk_overlap=0,
)
docs = text_splitter.split_documents(raw_docs)

Created a chunk of size 308, which is longer than the specified 0
Created a chunk of size 260, which is longer than the specified 0
Created a chunk of size 258, which is longer than the specified 0
Created a chunk of size 545, which is longer than the specified 0
Created a chunk of size 336, which is longer than the specified 0
Created a chunk of size 281, which is longer than the specified 0
Created a chunk of size 282, which is longer than the specified 0
Created a chunk of size 234, which is longer than the specified 0
Created a chunk of size 316, which is longer than the specified 0
Created a chunk of size 462, which is longer than the specified 0
Created a chunk of size 425, which is longer than the specified 0
Created a chunk of size 336, which is longer than the specified 0
Created a chunk of size 333, which is longer than the specified 0
Created a chunk of size 216, which is longer than the specified 0
Created a chunk of size 314, which is longer than the specified 0
Created a 

In [70]:
# Alternative backend (currently disabled): a Hugging Face model whose name is
# read from the HUGGING_FACE_EMBEDDING environment variable, placed on `device`.
# embeddings = HuggingFaceEmbeddings(
#     model_name=os.getenv('HUGGING_FACE_EMBEDDING'),
#     model_kwargs={
#         'device': device
#     }
# )

# Active backend: OpenAI embeddings constructed with default arguments.
# NOTE(review): presumably reads its API key from the environment populated by load_dotenv() — confirm.
embeddings = OpenAIEmbeddings()

In [71]:
def batch_import_to_milvus(documents: List, embeddings, batch_size: int = 100, **milvus_args):
    """Embed ``documents`` and insert them into a Milvus collection in batches.

    The first batch is handed to ``Milvus.from_documents`` (together with
    ``milvus_args``) to obtain the vector store; every following batch is
    appended to that store with ``add_documents``. Progress is printed as
    ``Processing batch i/total``.

    Args:
        documents: Sliceable sequence of documents to import.
        embeddings: Embedding object forwarded as ``embedding=``.
        batch_size: Maximum number of documents sent per call; must be >= 1.
        **milvus_args: Extra keyword arguments forwarded unchanged to
            ``Milvus.from_documents``.

    Returns:
        The vector store object returned by ``Milvus.from_documents``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # The first batch is always attempted (even for empty input), so report at least one.
    total = max(1, math.ceil(len(documents) / batch_size))

    # Create the store from the first batch.
    print(f"Processing batch 1/{total}")
    db = Milvus.from_documents(
        documents=documents[:batch_size],
        embedding=embeddings,
        **milvus_args
    )

    # Append the remaining documents; batch numbering continues at 2.
    for batch_no, start in enumerate(range(batch_size, len(documents), batch_size), start=2):
        print(f"Processing batch {batch_no}/{total}")
        db.add_documents(documents=documents[start:start + batch_size])

    return db

In [73]:
# Import every description into the collection, 100 documents per request.
# Collection name and connection settings come from environment variables.
# NOTE(review): drop_old=True presumably replaces an existing collection — confirm before re-running.
db_articles = batch_import_to_milvus(
    docs,
    embeddings,
    100,  # tune to your needs and memory constraints
    drop_old=True,
    auto_id=True,
    connection_args=dict(
        uri=os.getenv('ZILLIZ_CLOUD_URI'),
        token=os.getenv('ZILLIZ_CLOUD_API_KEY'),
        secure=True,
    ),
    collection_name=os.getenv('MILVIUS_COLLECTION'),
)


Processing batch 1/200


2025-06-16 15:48:19,339 [DEBUG][_create_connection]: Created new connection using: a5cb9bb1a8a6424082f7950fc8c63337 (async_milvus_client.py:599)


Processing batch 2/200
Processing batch 3/200
Processing batch 4/200
Processing batch 5/200
Processing batch 6/200
Processing batch 7/200
Processing batch 8/200
Processing batch 9/200
Processing batch 10/200
Processing batch 11/200
Processing batch 12/200
Processing batch 13/200
Processing batch 14/200
Processing batch 15/200
Processing batch 16/200
Processing batch 17/200
Processing batch 18/200
Processing batch 19/200
Processing batch 20/200
Processing batch 21/200
Processing batch 22/200
Processing batch 23/200
Processing batch 24/200
Processing batch 25/200
Processing batch 26/200
Processing batch 27/200
Processing batch 28/200
Processing batch 29/200
Processing batch 30/200
Processing batch 31/200
Processing batch 32/200
Processing batch 33/200
Processing batch 34/200
Processing batch 35/200
Processing batch 36/200
Processing batch 37/200
Processing batch 38/200
Processing batch 39/200
Processing batch 40/200
Processing batch 41/200
Processing batch 42/200
Processing batch 43/200


In [64]:
# Fetch the ten stored descriptions that best match a free-text query, with their scores.
recs = db_articles.similarity_search_with_score(
    'white shoes for women',
    k=10,
)

In [65]:
# Show the (Document, score) pairs returned by the search.
recs

[(Document(metadata={'source': 'data/full_desc.txt', 'pk': 458312174227128262}, page_content='0600920001 BEACH SLIDERS of type Sandals from group Shoes and section Ladies H&M Sport. Product color is White and perceived color is Light. Plastic slides with a grained sole.'),
  13.592419624328613),
 (Document(metadata={'source': 'data/full_desc.txt', 'pk': 458312174227132974}, page_content='0831650001 Hypatia sandalette of type Heeled sandals from group Shoes and section Womens Shoes. Product color is White and perceived color is Light. Sandals in imitation leather with narrow straps that tie around the ankle and a braided jute trim around the soles and heels. Imitation leather insoles and gently fluted soles.'),
  14.205944061279297),
 (Document(metadata={'source': 'data/full_desc.txt', 'pk': 458312174224648193}, page_content='0622966014 Laura sneaker of type Sneakers from group Shoes and section Womens Shoes. Product color is White and perceived color is Light. Trainers with lacing at t