In [None]:
import os
import shutil
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import NotionDBLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# Notion credentials are read from the environment so they never live in the notebook.
integration_token = os.environ.get("NOTION_INTEGRATION_TOKEN")
database_id = os.environ.get("NOTION_DATABASE_ID")

# Settings shared by the cells below.
chroma_dir = "./chroma_data"
model_name = "all-MiniLM-L6-v2"
collection_name = "documents_collection"

# Fail fast: both credentials must be present and non-empty before anything else runs.
if not (integration_token and database_id):
    raise ValueError("Missing Notion API token or Database ID in environment variables.")

In [1]:
# Build the Notion loader and the embedding objects from the configured values.
loader = NotionDBLoader(
    integration_token=integration_token,
    database_id=database_id,
)
embedding_function = HuggingFaceEmbeddings(model_name=model_name)

# NOTE(review): `model` is not referenced by any other cell visible in this
# notebook. It is built from the same `model_name` as `embedding_function`, so
# this presumably loads the same model a second time -- confirm whether it is
# still needed.
model = SentenceTransformer(model_name)

  from .autonotebook import tqdm as notebook_tqdm
  embedding_function = HuggingFaceEmbeddings(model_name=model_name)


Loading documents from Notion...
Loaded 14 documents from Notion.
Splitting documents into smaller chunks...
Split into 14 chunks.
Sanitizing metadata...
Creating Chroma vectorstore...
Chroma vectorstore created and persisted successfully.
Process completed successfully.


  vectorstore.persist()


In [None]:
# Step 1: Load documents from Notion
print("Loading documents from Notion...")
documents = loader.load()

# An empty result is treated as an error instead of silently continuing.
if documents:
    print(f"Loaded {len(documents)} documents from Notion.")
else:
    raise ValueError("No documents were loaded from Notion.")

In [None]:
# Step 2: Split documents into chunks
print("Splitting documents into smaller chunks...")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50,
    separators=["\n\n", ".", " "],
)

# One plain dict per chunk. Every chunk of a document refers to that document's
# own metadata dict (the dict is not copied here).
chunked_documents = [
    {"content": piece, "metadata": source_doc.metadata}
    for source_doc in documents
    for piece in text_splitter.split_text(source_doc.page_content)
]

# Stop here if splitting produced nothing.
if chunked_documents:
    print(f"Split into {len(chunked_documents)} chunks.")
else:
    raise ValueError("No chunks were created from the documents.")

In [None]:
# Step 3: Preprocess metadata
def preprocess_metadata(chunked_documents):
    """Return copies of the chunks whose metadata has no list values at top level.

    For every metadata entry:
      * a non-empty list is replaced by its first element (kept as is,
        whatever its type);
      * an empty list has no element to keep, so the key is left out;
      * any other value is copied through unchanged.

    The input chunks and their metadata dicts are not modified.

    Args:
        chunked_documents: Iterable of dicts, each with a "content" entry and a
            "metadata" dict.

    Returns:
        A new list of {"content": ..., "metadata": ...} dicts in the same order
        as the input.

    Raises:
        KeyError: If a chunk lacks the "content" or "metadata" key.
    """
    sanitized_documents = []
    for chunk in chunked_documents:
        sanitized_metadata = {}
        for key, value in chunk["metadata"].items():
            if isinstance(value, list):
                if not value:
                    # Previously an empty list was passed through untouched, so
                    # a list value survived the sanitizing step.
                    # NOTE(review): the vector store is expected to reject
                    # list-valued metadata -- confirm for the installed version.
                    continue
                value = value[0]
            sanitized_metadata[key] = value
        sanitized_documents.append({
            "content": chunk["content"],
            "metadata": sanitized_metadata,
        })
    return sanitized_documents

# Run the sanitizer over every chunk; the result feeds the vector store step.
print("Sanitizing metadata...")
sanitized_documents = preprocess_metadata(chunked_documents=chunked_documents)

In [None]:
# Step 4: Create and persist Chroma vectorstore
print("Creating Chroma vectorstore...")

# Chroma.from_texts takes the texts and their metadata as two parallel lists.
texts = [doc["content"] for doc in sanitized_documents]
metadatas = [doc["metadata"] for doc in sanitized_documents]

# NOTE(review): nothing here clears `chroma_dir` or passes explicit ids, so
# re-running this cell presumably adds the same texts to the existing
# collection again -- confirm, and consider clearing the directory or passing
# stable ids if the cell must be re-runnable.
try:
    vectorstore = Chroma.from_texts(
        texts=texts,
        embedding=embedding_function,
        metadatas=metadatas,
        persist_directory=chroma_dir,
        collection_name=collection_name,
    )
    # NOTE(review): the notebook's recorded output echoes this line the way a
    # warning does, presumably a deprecation notice -- confirm whether the call
    # is still needed with the installed Chroma version.
    vectorstore.persist()
    print("Chroma vectorstore created and persisted successfully.")
except Exception as e:
    # Chain explicitly so the traceback reports the original failure as the
    # direct cause of this RuntimeError.
    raise RuntimeError(f"Error creating Chroma vectorstore: {e}") from e

print("Process completed successfully.")