In [2]:
import os
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import NotionDBLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings


In [3]:
class NotionSemanticSearch:
    """
    Semantic search over a Notion database using locally computed embeddings.

    Typical flow: load_documents -> split_documents -> generate_local_embeddings
    -> save_embeddings / load_embeddings -> search_documents.
    """

    def __init__(self, integration_token, database_id, model_name='all-MiniLM-L6-v2', embedding_dir='./embeddings'):
        """
        Initialize the NotionSemanticSearch class.

        Args:
            integration_token (str): Notion integration token handed to NotionDBLoader.
            database_id (str): ID of the Notion database to read.
            model_name (str): SentenceTransformer model used for all embeddings.
            embedding_dir (str): Directory where pickled embeddings are written/read.
        """
        self.integration_token = integration_token
        self.database_id = database_id
        self.loader = NotionDBLoader(
            integration_token=self.integration_token,
            database_id=self.database_id
        )
        self.model = SentenceTransformer(model_name)
        self.embedding_dir = embedding_dir
        self.embedded_chunks = []
        self.chunked_documents = []

    def load_documents(self):
        """
        Load documents from the Notion database.

        Returns:
            list: Whatever the loader's `load()` returns (one entry per document).
        """
        documents = self.loader.load()
        print(f"Loaded {len(documents)} documents from Notion.")
        return documents

    def split_documents(self, documents, chunk_size=300, chunk_overlap=50):
        """
        Split documents into smaller chunks for embedding.

        Args:
            documents (list): Objects exposing `page_content` and `metadata`.
            chunk_size (int): Maximum chunk size passed to the text splitter.
            chunk_overlap (int): Overlap between consecutive chunks.

        Returns:
            list: Dictionaries with `content` (chunk text) and `metadata`.
                The result is also stored on `self.chunked_documents`.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", ".", " "]
        )
        chunked_documents = []
        for doc in documents:
            for chunk in text_splitter.split_text(doc.page_content):
                chunked_documents.append({
                    "content": chunk,
                    # Shallow copy: chunks of the same document must not share one
                    # mutable dict, otherwise editing one chunk's metadata edits all.
                    "metadata": dict(doc.metadata)
                })
        print(f"Split into {len(chunked_documents)} chunks.")
        self.chunked_documents = chunked_documents
        return chunked_documents

    def generate_local_embeddings(self):
        """
        Generate embeddings for document chunks.

        Encodes every entry of `self.chunked_documents` in a single batched
        `encode` call and stores the result on `self.embedded_chunks`.

        Returns:
            list: Dictionaries with `embedding`, `content` and `metadata`.
                `content` is kept so that search hits can show the matched text.
        """
        texts = [chunk['content'] for chunk in self.chunked_documents]
        # Skip the model call entirely when there is nothing to encode.
        vectors = self.model.encode(texts) if texts else []
        embedded_chunks = [
            {
                'embedding': vector,
                'content': chunk['content'],
                'metadata': chunk['metadata'],
            }
            for chunk, vector in zip(self.chunked_documents, vectors)
        ]
        print(f"Generated embeddings for {len(embedded_chunks)} chunks.")
        self.embedded_chunks = embedded_chunks
        return embedded_chunks

    def save_embeddings(self, filename='embedded_chunks.pkl'):
        """
        Save the embeddings to a file.

        Creates `self.embedding_dir` if needed and pickles `self.embedded_chunks`
        into `filename` inside it.
        """
        os.makedirs(self.embedding_dir, exist_ok=True)
        filepath = os.path.join(self.embedding_dir, filename)
        with open(filepath, 'wb') as f:
            pickle.dump(self.embedded_chunks, f)
        print(f"Saved embeddings to {filepath}.")

    def load_embeddings(self, filename='embedded_chunks.pkl'):
        """
        Load embeddings from a file.

        Returns:
            list: The unpickled chunks, also stored on `self.embedded_chunks`.

        Raises:
            FileNotFoundError: If the file does not exist in `self.embedding_dir`.
        """
        filepath = os.path.join(self.embedding_dir, filename)
        # SECURITY: pickle.load can execute arbitrary code. Only load files that
        # this notebook wrote itself; never point this at an untrusted file.
        with open(filepath, 'rb') as f:
            self.embedded_chunks = pickle.load(f)
        print(f"Loaded embeddings from {filepath}.")
        return self.embedded_chunks

    def search_documents(self, query, top_k=3):
        """
        Search for relevant documents based on a query.

        Chunks are scored by the raw dot product between their embedding and the
        query embedding (this equals cosine similarity only when the model emits
        unit-length vectors).

        Args:
            query (str): The search query.
            top_k (int): Maximum number of hits to return.

        Returns:
            list: `(chunk, score)` tuples, highest score first. Empty when no
                embeddings are loaded or `top_k` is not positive.
        """
        # Without this guard an empty index produces a shape-mismatch error in
        # np.dot, and a negative top_k would slice off an arbitrary tail.
        if not self.embedded_chunks or top_k <= 0:
            return []

        query_embedding = self.model.encode(query).reshape(1, -1)
        embeddings = np.array([chunk['embedding'] for chunk in self.embedded_chunks])
        similarities = np.dot(embeddings, query_embedding.T).flatten()
        ranked_indices = similarities.argsort()[::-1][:top_k]
        return [(self.embedded_chunks[i], similarities[i]) for i in ranked_indices]

    def fetch_full_document(self, doc_id):
        """
        Fetch the full document details by its ID.

        Note: this reloads every document through the loader on each call and
        then scans for the first one whose metadata `id` matches.

        Args:
            doc_id (str): Value to match against each document's metadata `id`.

        Returns:
            dict | None: `{"title", "content"}` for the match, or None if absent.
                `title` falls back to "No Title" when the metadata has no
                `title` key.
        """
        documents = self.loader.load()
        for document in documents:
            if document.metadata.get("id") == doc_id:
                return {
                    "title": document.metadata.get("title", "No Title"),
                    "content": document.page_content
                }
        return None


In [4]:
# Build the searcher; both credentials are read from the environment.
notion_search = NotionSemanticSearch(
    database_id=os.environ.get("NOTION_DATABASE_ID"),
    integration_token=os.environ.get("NOTION_INTEGRATION_TOKEN"),
)

# Pull the pages, chunk them, embed the chunks, then write the embeddings to disk.
documents = notion_search.load_documents()
chunked_documents = notion_search.split_documents(documents=documents)
notion_search.generate_local_embeddings()
notion_search.save_embeddings()


Loaded 14 documents from Notion.
Split into 14 chunks.
Generated embeddings for 14 chunks.
Saved embeddings to ./embeddings/embedded_chunks.pkl.


In [5]:
# Read the pickled embeddings back in, then rank them against a sample query.
notion_search.load_embeddings()

query = "What are the guidelines for remote work?"
results = notion_search.search_documents(query, top_k=3)

# Each hit is a (chunk record, score) pair; the record's metadata is what gets shown.
for i, (result, score) in enumerate(results, start=1):
    for line in (f"Result {i}:", f"Score: {score}", f"Content: {result['metadata']}"):
        print(line)


Loaded embeddings from ./embeddings/embedded_chunks.pkl.
Result 1:
Score: 0.5359396934509277
Content: {'details': 'Guidelines for remote work, including eligibility, expectations, and communication protocols.', 'category': ['HR'], 'policy name': 'Remote Work Policy', 'id': '147d4f32-0aa2-80c9-8ede-fb4b96209c97'}
Result 2:
Score: 0.3378417491912842
Content: {'details': 'Measures to protect company data, employee responsibilities, and procedures for reporting security incidents.', 'category': ['IT'], 'policy name': 'Data Security Policy', 'id': '147d4f32-0aa2-80f6-945d-eb9cd9434c0c'}
Result 3:
Score: 0.3314809799194336
Content: {'details': 'Rules regarding the use of company IT resources, including internet usage, software installations, and personal device policies.', 'category': ['IT'], 'policy name': 'IT Acceptable Use Policy', 'id': '147d4f32-0aa2-805d-980f-f799e21b2800'}


In [6]:
# Take the id of the best-scoring hit and look that page up again in Notion.
best_chunk = results[0][0]
document_id = best_chunk['metadata']['id']
full_document = notion_search.fetch_full_document(document_id)
print(f"Content: {full_document['content']}" if full_document else "Document not found.")


Content: At NextGen Enterprises, employees can work remotely for up to three days a week. Eligible roles require manager approval and a signed Remote Work Agreement. Daily stand-up meetings are conducted via Zoom, and employees must update project status on the internal tracker.



In [7]:
class ChromaSemanticSearch:
    """
    Semantic search backed by a Chroma vectorstore persisted on disk.

    The store is either built from chunked documents (`create_vectorstore`) or
    opened from `chroma_dir` (`load_vectorstore`); `search_documents` opens it
    lazily when neither has been called yet.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2', chroma_dir='./chroma_data'):
        """
        Initialize the ChromaSemanticSearch class.

        Args:
            model_name (str): Model name handed to HuggingFaceEmbeddings.
            chroma_dir (str): Directory used as Chroma's persist directory.
        """
        self.model_name = model_name
        self.chroma_dir = chroma_dir
        self.vectorstore = None  # set by create_vectorstore / load_vectorstore
        self.embedding_function = HuggingFaceEmbeddings(model_name=model_name)

    def create_vectorstore(self, chunked_documents):
        """
        Create a Chroma vectorstore and store document embeddings.

        Args:
            chunked_documents (list): List of dictionaries with `content` and `metadata`.
        """
        texts = [chunk["content"] for chunk in chunked_documents]
        metadatas = [chunk["metadata"] for chunk in chunked_documents]

        # Create the Chroma vectorstore inside the persist directory
        self.vectorstore = Chroma.from_texts(
            texts=texts,
            embedding=self.embedding_function,
            metadatas=metadatas,
            persist_directory=self.chroma_dir,
        )
        # Manual persist() is deprecated (the call emits a deprecation warning)
        # and is presumably absent from newer wrappers, so only call it when the
        # store still exposes it instead of failing with AttributeError.
        persist = getattr(self.vectorstore, "persist", None)
        if callable(persist):
            persist()
        print("Created and persisted the Chroma vectorstore.")

    def load_vectorstore(self):
        """
        Load an existing Chroma vectorstore from `self.chroma_dir`.
        """
        self.vectorstore = Chroma(
            persist_directory=self.chroma_dir,
            embedding_function=self.embedding_function,
        )
        print("Loaded the Chroma vectorstore.")

    def search_documents(self, query, top_k=3):
        """
        Search for relevant documents in the Chroma vectorstore based on a query.

        Args:
            query (str): The search query.
            top_k (int): The number of top results to return.

        Returns:
            list: List of search results with metadata.
        """
        # Ensure the vectorstore is loaded. Compare against None explicitly:
        # a truthiness test would treat a store that defines __len__ and is
        # currently empty as "not loaded" and rebuild it on every search.
        if self.vectorstore is None:
            self.load_vectorstore()

        return self.vectorstore.similarity_search(query, k=top_k)


In [8]:
# Value types that are passed through to Chroma unchanged.
_CHROMA_SCALARS = (str, int, float, bool)


def _to_chroma_value(value):
    """Coerce one metadata value to a scalar; return None when there is nothing to store."""
    if value is None or isinstance(value, _CHROMA_SCALARS):
        return value
    if isinstance(value, (list, tuple)):
        items = [item for item in value if item is not None]
        if not items:
            return None
        # A single scalar keeps its own type (e.g. ['HR'] -> 'HR').
        if len(items) == 1 and isinstance(items[0], _CHROMA_SCALARS):
            return items[0]
        # Several values: keep all of them rather than silently dropping the tail.
        return ", ".join(str(item) for item in items)
    return str(value)


def preprocess_metadata(chunked_documents):
    """
    Preprocess metadata to ensure all values are of valid types for Chroma.

    Rules applied to every metadata value:
      * str / int / float / bool are kept as they are;
      * a list with one scalar element becomes that element;
      * a list with several elements becomes one comma-separated string;
      * None and empty lists are dropped (the key is omitted);
      * anything else is converted with `str()`.

    The input documents and their metadata dicts are not modified.

    Args:
        chunked_documents (list): List of dictionaries with `content` and `metadata`.

    Returns:
        list: A new list of documents with sanitized metadata.
    """
    sanitized_documents = []
    for chunk in chunked_documents:
        sanitized_metadata = {}
        for key, value in chunk["metadata"].items():
            cleaned = _to_chroma_value(value)
            if cleaned is not None:
                sanitized_metadata[key] = cleaned
        sanitized_documents.append({
            "content": chunk["content"],
            "metadata": sanitized_metadata
        })
    return sanitized_documents


In [9]:
# Sanitize the chunk metadata first; the raw Notion metadata is not handed to Chroma directly.
sanitized_chunked_documents = preprocess_metadata(chunked_documents=chunked_documents)

# Embed the sanitized chunks and write them into the on-disk Chroma store.
chroma_search = ChromaSemanticSearch()
chroma_search.create_vectorstore(chunked_documents=sanitized_chunked_documents)

  self.embedding_function = HuggingFaceEmbeddings(model_name=model_name)


Created and persisted the Chroma vectorstore.


  self.vectorstore.persist()


In [10]:
# A fresh instance holds no store in memory, so the first search opens the persisted one.
chroma_search = ChromaSemanticSearch()

query = "I want to work from home, what should I know about it?"
results = chroma_search.search_documents(query, top_k=3)

# Show the matched text and metadata of every hit.
for i, result in enumerate(results, start=1):
    for line in (f"Result {i}:", f"Content: {result.page_content}", f"Metadata: {result.metadata}"):
        print(line)


Result 1:
Content: At NextGen Enterprises, employees can work remotely for up to three days a week. Eligible roles require manager approval and a signed Remote Work Agreement. Daily stand-up meetings are conducted via Zoom, and employees must update project status on the internal tracker.
Metadata: {'category': 'HR', 'details': 'Guidelines for remote work, including eligibility, expectations, and communication protocols.', 'id': '147d4f32-0aa2-80c9-8ede-fb4b96209c97', 'policy name': 'Remote Work Policy'}
Result 2:
Content: Employees at NextGen Enterprises are encouraged to promote company achievements but must avoid sharing confidential projects. Approved templates for LinkedIn and Twitter posts are available in the "Employee Toolkit." Social media training is mandatory for marketing teams.
Metadata: {'category': 'Marketing', 'details': 'Best practices for employees when representing the company on social media platforms, including confidentiality and brand representation.', 'id': '147