In [1]:
import pandas as pd
from dotenv import load_dotenv
from langchain_community.document_loaders import DataFrameLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant
import torch
from typing import List
import math
import os


In [2]:
# Environment diagnostics: report the CUDA / cuDNN / PyTorch stack this kernel sees.
print(torch.version.cuda)  # CUDA version PyTorch was built with
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
print(torch.backends.cudnn.version())  # cuDNN version
# Querying the device name fails when no CUDA device is usable, and the notebook
# supports a CPU fallback, so only ask for the name when CUDA is available.
gpu_name = torch.cuda.get_device_name(0) if cuda_available else None
print(gpu_name if gpu_name is not None else "No CUDA device available (running on CPU)")  # GPU name
print(torch.__version__)  # PyTorch version


11.8
CUDA available: True
90100
NVIDIA GeForce RTX 3050 Laptop GPU
2.7.1+cu118


In [3]:
# Load variables from a .env file into the process environment; later cells read
# HF_EMBEDDING_MODEL, DB_URL and COLLECTION through os.getenv.
load_dotenv()

True

In [5]:
# Prefer the GPU whenever PyTorch can use CUDA; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [6]:
# Read the article table. article_id is forced to str so identifiers keep their
# leading zeros (e.g. '0854184001') instead of being parsed as integers.
articles = pd.read_csv(
    'data/articles_full_desc.csv',
    dtype={'article_id': str},
)

In [7]:
# Build the embedding model: the model name comes from the HF_EMBEDDING_MODEL
# environment variable and the model is placed on the device selected earlier.
embeddings = HuggingFaceEmbeddings(
    model_name=os.getenv('HF_EMBEDDING_MODEL'),
    model_kwargs=dict(device=device),
)

In [8]:
# Dump only the description column to a UTF-8 text file, without index or header row.
# NOTE(review): sep='\n' on a single-column export looks like a way to get one
# description per line — confirm intent.
articles['full_description'].to_csv(
    'data/full_desc.txt',
    sep='\n',
    header=False,
    index=False,
    encoding='utf-8',
)

In [9]:
# Turn each article row into a document: full_description is the page content,
# and the other selected column (article_id) travels along as metadata.
loader = DataFrameLoader(
    articles[['article_id', 'full_description']],
    page_content_column='full_description',
)
docs = loader.load()


In [10]:
def batch_import(documents: List, embeddings, batch_size: int = 100, **db_args):
    """Embed ``documents`` and load them into a Qdrant vector store in batches.

    The first batch is sent through ``Qdrant.from_documents``, which sets up
    the collection; every following batch is appended with ``add_documents``.
    A progress line is printed before each batch.

    Args:
        documents: Non-empty list of documents to index.
        embeddings: Embedding model handed to ``Qdrant.from_documents``.
        batch_size: Maximum number of documents per call; must be positive.
        **db_args: Extra keyword arguments forwarded to
            ``Qdrant.from_documents`` (for example ``collection_name``).
            ``url`` falls back to the ``DB_URL`` environment variable when it
            is not supplied here.

    Returns:
        The vector store object returned by ``Qdrant.from_documents``.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``documents`` is empty.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    if len(documents) == 0:
        raise ValueError("documents must contain at least one item to create the collection")

    # Let an explicit url in db_args win instead of clashing with the env default.
    db_args.setdefault('url', os.getenv('DB_URL'))

    total_batches = math.ceil(len(documents) / batch_size)

    # Initialize with the first batch to create the collection.
    print(f"Processing batch 1/{total_batches}")
    db = Qdrant.from_documents(
        documents=documents[:batch_size],
        embedding=embeddings,
        **db_args
    )

    # Remaining batches start right after the first one; numbering continues at 2.
    remaining_starts = range(batch_size, len(documents), batch_size)
    for batch_number, start_idx in enumerate(remaining_starts, start=2):
        print(f"Processing batch {batch_number}/{total_batches}")
        db.add_documents(documents=documents[start_idx:start_idx + batch_size])

    return db

In [12]:
# Index every article document in the vector store.
# batch_size is the number of documents handed over per call; adjust it to
# your needs and memory constraints.
db_articles = batch_import(
    docs,
    embeddings,
    batch_size=500,
    collection_name=os.getenv('COLLECTION'),
)


Processing batch 1/167
Processing batch 2/167
Processing batch 3/167
Processing batch 4/167
Processing batch 5/167
Processing batch 6/167
Processing batch 7/167
Processing batch 8/167
Processing batch 9/167
Processing batch 10/167
Processing batch 11/167
Processing batch 12/167
Processing batch 13/167
Processing batch 14/167
Processing batch 15/167
Processing batch 16/167
Processing batch 17/167
Processing batch 18/167
Processing batch 19/167
Processing batch 20/167
Processing batch 21/167
Processing batch 22/167
Processing batch 23/167
Processing batch 24/167
Processing batch 25/167
Processing batch 26/167
Processing batch 27/167
Processing batch 28/167
Processing batch 29/167
Processing batch 30/167
Processing batch 31/167
Processing batch 32/167
Processing batch 33/167
Processing batch 34/167
Processing batch 35/167
Processing batch 36/167
Processing batch 37/167
Processing batch 38/167
Processing batch 39/167
Processing batch 40/167
Processing batch 41/167
Processing batch 42/167
P

In [13]:
# Smoke-test retrieval: fetch the 10 closest articles for a free-text query,
# together with their similarity scores.
recs = db_articles.similarity_search_with_score(
    'white shoes for women',
    k=10,
)

In [14]:
# Show the document of the first search result; the second element of each
# entry is presumably its score — TODO confirm.
recs[0][0]

Document(metadata={'article_id': '0854184001', '_id': 'bedf9213-d5de-4c73-aa30-349fcd671570', '_collection_name': 'hm_articles'}, page_content='0854184001 Zoe Highlift of type Sneakers from group Shoes, section Womens Shoes and garment group Shoes. Product color is White and perceived color is Light. Trainers in canvas with lacing at the front and a loop at the back. Fabric linings and insoles and patterned soles. Platform 3 cm.')