In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_chroma import Chroma
from sqlalchemy import create_engine
from transformers import AutoModel, AutoTokenizer, pipeline
import pandas as pd

In [None]:
# SQLAlchemy engine for the SQLite database file that holds the book data.
# The URL points at an absolute, machine-specific path, so this cell only
# works where that file exists.
engine = create_engine("sqlite:////home/karysoares/Documents/book-reviews/notebooks/books.db")

In [None]:
# Load every column and row of the `books_data` table into a DataFrame.
books_data = pd.read_sql('SELECT * FROM books_data', engine)

In [None]:
# Preview the first rows of the loaded table (shown as the cell's output).
books_data.head()

In [None]:
# Replace missing values in every column with an empty string.
# After this, a missing text field is '' (a str), not NaN/None, so later
# type checks on these columns cannot tell "missing" from "present".
# Presumably done so that row metadata never contains nulls -- TODO confirm.
books_data = books_data.fillna('')

In [None]:
# Hugging Face model name used to build the embedding function.
# NOTE(review): the E5 model card describes "query: " / "passage: " input
# prefixes; this notebook adds none -- confirm whether that is intended.
model = "intfloat/multilingual-e5-large"

In [None]:
# Embedding function backed by the Hugging Face model named in `model`.
embeddings = HuggingFaceEmbeddings(model_name=model)

# Chroma collection "books" that embeds texts with `embeddings` and uses an
# absolute, machine-specific directory as its persist location.
# NOTE(review): presumably this reopens an existing collection when the
# directory already holds one and creates it otherwise -- confirm.
vector_store = Chroma(
    collection_name="books",
    embedding_function=embeddings,
    persist_directory="/home/karysoares/Documents/book-reviews/notebooks"
)

print("Loaded vector store")
# Build the documents to index: one per book description, followed by one per
# book title. Each document carries the whole row as metadata so a hit can be
# traced back to its book.
#
# Missing values in `books_data` were replaced with '' earlier in the
# notebook, so a bare `isinstance(..., str)` check would accept them. The
# filters therefore also require non-blank text, which keeps empty
# descriptions and titles out of the vector store.
documents = [
    Document(page_content=row['description'], metadata=row.to_dict())
    for _, row in books_data.iterrows()
    if isinstance(row['description'], str) and row['description'].strip()
]
documents.extend([
    Document(page_content=row['Title'], metadata=row.to_dict())
    for _, row in books_data.iterrows()
    if isinstance(row['Title'], str) and row['Title'].strip()
])
print("Finished with formatting docs")

# Insert the documents in fixed-size batches so that progress is visible
# while the vector store is being filled.

chunk_size = 100

for i in range(0, len(documents), chunk_size):
    # The number printed is the batch's starting offset into `documents`,
    # not a running batch counter.
    print(f"Adding chunk {i} to vector store")
    batch = documents[i:i + chunk_size]
    vector_store.add_documents(batch)