In [29]:
import os
from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.document_loaders import WikipediaLoader
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_cohere import CohereEmbeddings
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_cohere import ChatCohere
from langchain.retrievers.self_query.chroma import ChromaTranslator
from typing import List, Optional
from typing import Sequence

In [8]:
os.environ["COHERE_API_KEY"] = "Bl5kPeaI9IELAiPwj3B72o5Ub95xoJE9teskkKdI"

In [14]:
embeddings = CohereEmbeddings(model="embed-multilingual-light-v3.0", max_retries=5, request_timeout=20)

In [9]:

class WikipediaDocumentProcessor:
    """
    A class to load and recursively split documents from Wikipedia.
    """
    
    def __init__(self, queries: List[str], load_max_docs: int = 1, doc_content_chars_max=100000, lang: str = 'fa'):
        """
        Initialize the Wikipedia document processor.
        
        Args:
            queries: List of search queries for Wikipedia
            load_max_docs: Maximum number of documents to load per query (default: 1)
            lang: Language code for Wikipedia (default: 'fa' for Persian)
        """
        self.queries = queries
        self.load_max_docs = load_max_docs
        self.lang = lang
        self.documents: List[Document] = []
        self.split_documents: List[Document] = []
        self.doc_content_chars_max = doc_content_chars_max
        
        # Initialize text splitter for recursive splitting
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )
        
        # Load documents immediately upon initialization
        self.load_documents()
    
    def load_documents(self) -> None:
        """
        Load documents from Wikipedia for each query.
        """
        print(f"Loading Wikipedia documents in '{self.lang}' language...")
        
        for query in self.queries:
            try:
                print(f"Loading document for query: '{query}'")
                
                # Load document from Wikipedia
                loader = WikipediaLoader(
                    query=query,
                    load_max_docs=self.load_max_docs,
                    lang=self.lang,
                    doc_content_chars_max=self.doc_content_chars_max,
                )
                docs = loader.load()
                self.documents.extend(docs)
                print(f"‚úì Successfully loaded {len(docs)} document(s) for '{query}'")
                
            except Exception as e:
                print(f"‚úó Error loading document for '{query}': {e}")
        for i, doc in enumerate(self.documents):
            metadata = doc.metadata
            new_metadata = {
                'title': metadata['title'],
                'language': self.lang,
            }
            self.documents[i].metadata = new_metadata
    
    def split_documents_recursively(self, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[str]:
        """
        Recursively split the loaded documents into smaller chunks.
        
        Args:
            chunk_size: Size of each chunk (default: 1000)
            chunk_overlap: Overlap between chunks (default: 200)
            
        Returns:
            List of split document chunks
        """
        if not self.documents:
            print("No documents loaded to split.")
            return []
        
        # Update text splitter parameters
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )
        
        print("Splitting documents recursively...")
        
        # Split all documents
        self.split_documents = self.text_splitter.split_documents(self.documents)
        
        print(f"‚úì Successfully split {len(self.documents)} documents into {len(self.split_documents)} chunks")
    
    def get_loaded_documents(self) -> List[str]:
        """
        Get the content of loaded documents.
        
        Returns:
            List of document contents
        """
        return [doc.page_content for doc in self.documents]
    
    def get_split_documents(self) -> List[str]:
        """
        Get the content of split documents.
        
        Returns:
            List of split document chunks
        """
        return [doc.page_content for doc in self.split_documents]
    
    def get_document_metadata(self) -> List[dict]:
        """
        Get metadata of loaded documents.
        
        Returns:
            List of document metadata
        """
        return [doc.metadata for doc in self.documents]
    
    def print_summary(self) -> None:
        """
        Print a summary of loaded and split documents.
        """
        print("\n" + "="*50)
        print("DOCUMENT PROCESSING SUMMARY")
        print("="*50)
        print(f"Queries processed: {len(self.queries)}")
        print(f"Documents loaded: {len(self.documents)}")
        print(f"Chunks created: {len(self.split_documents)}")
        
        if self.documents:
            print(f"\nOriginal document sizes:")
            for i, doc in enumerate(self.documents):
                print(f"  Document {i+1}: {len(doc.page_content)} characters")
        
        if self.split_documents:
            print(f"\nChunk sizes:")
            for i, chunk in enumerate(self.split_documents):
                print(f"  Chunk {i+1}: {len(chunk.page_content)} characters")

In [10]:
titles = ['ÿ±€å⁄Üÿßÿ±ÿØ ÿßÿ≥ÿ™ÿßŸÑŸÖŸÜ']
wikiloader = WikipediaDocumentProcessor(queries=titles)
wikiloader.split_documents_recursively()

Loading Wikipedia documents in 'fa' language...
Loading document for query: 'ÿ±€å⁄Üÿßÿ±ÿØ ÿßÿ≥ÿ™ÿßŸÑŸÖŸÜ'
‚úì Successfully loaded 1 document(s) for 'ÿ±€å⁄Üÿßÿ±ÿØ ÿßÿ≥ÿ™ÿßŸÑŸÖŸÜ'
Splitting documents recursively...
‚úì Successfully split 1 documents into 25 chunks


In [None]:
vector_store = Chroma.from_documents(
    documents = wikiloader.split_documents, # The documents to embed
    embedding = embeddings, # The embeddings to use
    collection_name = "librechat_wiki", # The name of the collection
    persist_directory = "./collections", # Where to save the collection
)

In [20]:
query = '⁄ÜŸá ⁄©ÿ≥€å ÿ®ŸÜ€åÿßÿØ ŸÜÿ±ŸÖ‚ÄåÿßŸÅÿ≤ÿßÿ±Ÿáÿß€å ÿ¢ÿ≤ÿßÿØ ÿ±ÿß ÿ®ŸÜÿß ŸÜŸáÿßÿØÿü'

results = vector_store.similarity_search(query, k = 3)

for result in results:
    print('Documnt Title:', result.metadata['title'])
    print('Document Content:', result.page_content)
    print('-'*100)

Documnt Title: ÿ±€å⁄Üÿßÿ±ÿØ ÿßÿ≥ÿ™ÿßŸÑŸÖŸÜ
Document Content: €±€π€∏€µ: ÿßŸà ÿ®ŸÜ€åÿßÿØ ŸÜÿ±ŸÖ‚ÄåÿßŸÅÿ≤ÿßÿ± ÿ¢ÿ≤ÿßÿØ€å ÿ±ÿß ÿ¢ÿ∫ÿßÿ≤ ⁄©ÿ±ÿØ ⁄©Ÿá ÿÆŸàÿØÿ¥ ÿ®Ÿá ÿπŸÜŸàÿßŸÜ ÿ±ÿ¶€åÿ≥ ÿØÿßŸàÿ∑ŸÑÿ®ÿßŸÜŸáŸî ÿ™ŸÖÿßŸÖ ŸàŸÇÿ™ ÿ¢ŸÜ ÿ®ŸàÿØ.
ÿ≥€åÿ≥ÿ™ŸÖ ⁄ØŸÜŸà/ ŸÑ€åŸÜŸà⁄©ÿ≥ ⁄©Ÿá ŸáŸÖ⁄ÜŸÜ€åŸÜ Ÿáÿ≥ÿ™ŸáŸî ŸÑ€åŸÜŸà⁄©ÿ≥ ÿ™Ÿàÿ≥ÿπŸá ÿØÿßÿØŸá ÿ¥ÿØŸá ÿ™Ÿàÿ≥ÿ∑ ŸÑ€åŸÜŸàÿ≥ ÿ™Ÿàÿ±ŸàÿßŸÑÿØÿ≤ (ÿ®Ÿá ÿßŸÜ⁄ØŸÑ€åÿ≥€å: Linus Torvalds) ÿ±ÿß ÿ®Ÿá ⁄©ÿßÿ± ŸÖ€å‚Äåÿ®ÿ±ÿØÿå ÿØÿ± ÿ±ÿß€åÿßŸÜŸá‚ÄåŸáÿß€å ÿ®ÿ≥€åÿßÿ±€å ÿßÿ≥ÿ™ŸÅÿßÿØŸá ŸÖ€å‚Äåÿ¥ÿØ Ÿà ŸáŸÖ‚Äåÿß⁄©ŸÜŸàŸÜ ÿØÿ± ŸÅÿ±Ÿàÿ¥⁄ØÿßŸá‚ÄåŸáÿß€å ⁄©Ÿà⁄Ü⁄© ÿ±ÿß€åÿßŸÜŸá ŸÜ€åÿ≤ ÿ®Ÿá ÿµŸàÿ±ÿ™ ÿßÿ≤ Ÿæ€åÿ¥ ŸÜÿµÿ® ÿ¥ÿØŸá ŸÅÿ±ÿßŸáŸÖ ŸÖ€å‚Äåÿ®ÿßÿ¥ŸÜÿØ. Ÿáÿ± ⁄ÜŸÜÿØ ÿ™Ÿàÿ≤€åÿπ‚Äå⁄©ŸÜŸÜÿØ⁄ØÿßŸÜ ÿß€åŸÜ ÿ≥€åÿ≥ÿ™ŸÖ‚ÄåŸáÿß ⁄ØÿßŸá€å ÿ™ŸÅ⁄©ÿ± ÿ¢ÿ≤ÿßÿØ€å ÿ±ÿß ⁄©Ÿá ŸÜÿ±ŸÖ‚ÄåÿßŸÅÿ≤ÿßÿ± ÿ¢ÿ≤ÿßÿØ ÿ±ÿß ŸÖŸáŸÖ ŸÖ€å‚Äåÿ≥ÿßÿ≤ÿØÿå ŸÜÿßÿØ€åÿØŸá ⁄Øÿ±ŸÅÿ™ŸÜÿØÿå Ÿà ÿ≠ÿ™€å ŸÜÿ±ŸÖ‚ÄåÿßŸÅÿ≤ÿßÿ±Ÿáÿß€å ÿ∫€åÿ± ÿ¢ÿ≤ÿßÿØ ÿ±ÿß ÿØÿ± ÿ≥€åÿ≥ÿ™ŸÖ‚ÄåŸáÿß€åÿ¥ÿßŸÜ ŸÇÿ±ÿßÿ± ÿØÿßÿØŸÜÿØ. ÿ®Ÿá ŸáŸÖ€åŸÜ ÿØŸÑ€åŸÑ ÿßÿ≥ÿ™ ⁄©Ÿá ÿßÿ≥ÿ™ÿßŸÑŸÖ

In [21]:
query = "ÿØÿ± ÿ®€åÿßŸÜ€åŸá‚Äå€å Ÿá⁄©ÿ±Ÿáÿß ⁄ØŸÅÿ™Ÿá ÿ¥ÿØŸá ⁄©Ÿá ÿ¨ÿ±ŸÖ ÿ¢ŸÜ‚ÄåŸáÿß ÿØÿ± €å⁄© ⁄©ŸÑŸÖŸá ⁄Ü€åÿ≥ÿ™ÿü"

results = vector_store.similarity_search(query, k = 3)

for result in results:
    print('Documnt Title:', result.metadata['title'])
    print('Document Content:', result.page_content)
    print('-'*100)

Documnt Title: ÿ±€å⁄Üÿßÿ±ÿØ ÿßÿ≥ÿ™ÿßŸÑŸÖŸÜ
Document Content: == ÿ±Ÿà€åÿØÿßÿØŸáÿß€å ŸÖŸÜÿ™Ÿá€å ÿ®Ÿá Ÿæÿ±Ÿà⁄òŸá ⁄ØŸÜŸà ==
ÿØÿ± ÿßŸàÿßÿÆÿ± ÿØŸáŸá €±€π€∑€∞ Ÿà ÿßŸàÿß€åŸÑ ÿØŸáŸá €±€π€∏€∞ÿå ŸÅÿ±ŸáŸÜ⁄Ø Ÿá⁄©ÿ±Ÿáÿß ⁄©Ÿá ÿßÿ≥ÿ™ÿßŸÑŸÖŸÜ ÿ®Ÿá ÿ¢ŸÜ ÿ±ŸàŸÜŸÇ ÿØÿßÿØŸá ÿ®ŸàÿØ ÿ¥ÿ±Ÿàÿπ ÿ®Ÿá ŸÖÿ™ŸÑÿßÿ¥€å ÿ¥ÿØŸÜ ⁄©ÿ±ÿØ. ÿÆ€åŸÑ€å ÿßÿ≤ ÿ™ŸàŸÑ€åÿØ⁄©ŸÜŸÜÿØ⁄ØÿßŸÜ ÿ®ÿ±ÿß€å ÿ¨ŸÑŸà⁄Ø€åÿ±€å ⁄©ÿ±ÿØŸÜ ŸÜÿ±ŸÖ‚ÄåÿßŸÅÿ≤ÿßÿ± ÿßÿ≤ ÿßÿ≥ÿ™ŸÅÿßÿØŸá ÿ¥ÿØŸÜ ÿØÿ± ÿ±ÿß€åÿßŸÜŸá‚ÄåŸáÿß€å ÿ±ŸÇÿ®ÿß€åÿ¥ÿßŸÜÿå ÿ™Ÿàÿ≤€åÿπ ⁄©ÿ±ÿØŸÜ ⁄©ÿØ ŸÖŸÜÿ®ÿπ ÿ±ÿß ŸÖÿ™ŸàŸÇŸÅ ⁄©ÿ±ÿØŸÜÿØ Ÿà ÿ®ÿ±ÿß€å ŸÖÿ≠ÿØŸàÿØ ⁄©ÿ±ÿØŸÜ Ÿà ŸÖŸÖŸÜŸàÿπ ÿ≥ÿßÿÆÿ™ŸÜ ⁄©Ÿæ€å Ÿà ÿ™Ÿàÿ≤€åÿπ ŸÖÿ¨ÿØÿØÿå ÿ¥ÿ±Ÿàÿπ ÿ®Ÿá ÿßÿ≥ÿ™ŸÅÿßÿØŸá ÿßÿ≤ ⁄©Ÿæ€å ÿ±ÿß€åÿ™ Ÿà ŸÖÿ¨Ÿàÿ≤Ÿáÿß€å ŸÜÿ±ŸÖ‚ÄåÿßŸÅÿ≤ÿßÿ± ŸÖÿ≠ÿØŸàÿØ ⁄©ÿ±ÿØŸÜÿØ. ÿß€åŸÜ⁄ÜŸÜ€åŸÜ ŸÜÿ±ŸÖ‚ÄåÿßŸÅÿ≤ÿßÿ±Ÿáÿß€å ÿßÿÆÿ™ÿµÿßÿµ€å ÿßÿ≤ ŸÇÿ®ŸÑ Ÿàÿ¨ŸàÿØ ÿØÿßÿ¥ÿ™ŸÜÿØÿå Ÿà ÿß€åŸÜ ÿßŸÖÿ± ÿß€åŸÜÿ∑Ÿàÿ± ÿ∏ÿßŸáÿ± ÿ¥ÿØ ⁄©Ÿá ÿ®ÿ™ŸàÿßŸÜÿØ ŸáŸÜÿ¨ÿßÿ±€å ÿØÿ± ÿß€åŸÜ ÿ±ÿßÿ≥ÿ™ÿß ÿ¥ŸàÿØ.
ÿ∂ÿ±Ÿàÿ±ÿ™ Ÿàÿ¨ŸàÿØ ÿ®ÿ±ŸÜÿßŸÖŸá‚ÄåŸáÿß€å€å ÿ®ÿß ŸÇÿßÿ®ŸÑ€åÿ™ ÿßŸÜÿ™ŸÇÿßŸ

In [22]:
query = "ÿ±€å⁄Üÿßÿ±ÿØ ÿßÿ≥ÿ™ÿßŸÑŸÖŸÜ ÿØÿ± €≤€± ÿ≥ÿßŸÑ⁄Ø€å ÿØÿ± ⁄©ÿØÿßŸÖ ÿ¥ÿ±⁄©ÿ™ ⁄©ÿßÿ± ŸÖ€å‚Äå⁄©ÿ±ÿØÿü"

results = vector_store.similarity_search(query, k = 3)

for result in results:
    print('Documnt Title:', result.metadata['title'])
    print('Document Content:', result.page_content)
    print('-'*100)

Documnt Title: ÿ±€å⁄Üÿßÿ±ÿØ ÿßÿ≥ÿ™ÿßŸÑŸÖŸÜ
Document Content: == ÿ≤ŸÜÿØ⁄Ø€å ==
ÿ±€å⁄Üÿßÿ±ÿØ ÿßÿ≥ÿ™ÿßŸÑŸÖŸÜ ÿØÿ± ÿ≥ÿßŸÑ €±€π€µ€≥ ÿØÿ± ÿ¥Ÿáÿ± ŸÜ€åŸà€åŸàÿ±⁄© ÿ®Ÿá ÿØŸÜ€åÿß ÿ¢ŸÖÿØ. Alice Lippman Ÿà Daniel Stallman ŸæÿØÿ± Ÿà ŸÖÿßÿØÿ± ÿßŸà ÿ®ŸàÿØŸÜÿØ. ÿßŸàŸÑ€åŸÜ ÿ™ÿ¨ÿ±ÿ®Ÿá ÿßÿ≥ÿ™ÿßŸÑŸÖŸÜ ÿ®ÿß ⁄©ÿßŸÖŸæ€åŸàÿ™ÿ±Ÿáÿß ÿØÿ± ÿØŸàÿ±ÿßŸÜ ÿØÿ®€åÿ±ÿ≥ÿ™ÿßŸÜ ÿØÿ± ŸÖÿ±⁄©ÿ≤ ÿπŸÑŸÖ€å IBM ŸÜ€åŸà€åŸàÿ±⁄© ÿ®ŸàÿØ. ÿßŸà ÿ™ÿßÿ®ÿ≥ÿ™ÿßŸÜ ÿ±ÿß ÿ®ÿ±ÿß€å ŸÜŸàÿ¥ÿ™ŸÜ €å⁄© ÿ®ÿ±ŸÜÿßŸÖŸá ÿ¢ŸÜÿßŸÑ€åÿ≤ ÿπÿØÿØ€å ÿØÿ± Fortran ÿßÿ≥ÿ™ÿÆÿØÿßŸÖ ÿ¥ÿØŸá ÿ®ŸàÿØ. ÿßŸà ⁄©ÿßÿ± ÿÆŸàÿØ ÿ±ÿß ÿ®ÿπÿØ ÿßÿ≤ ÿØŸà ŸáŸÅÿ™Ÿá ÿ™ŸÖÿßŸÖ ⁄©ÿ±ÿØ Ÿà ÿ®ŸÇ€åŸá ÿ™ÿßÿ®ÿ≥ÿ™ÿßŸÜ ÿÆŸàÿØ ÿ±ÿß ÿ®Ÿá ŸÜŸàÿ¥ÿ™ŸÜ €å⁄© Ÿà€åÿ±ÿß€åÿ¥⁄Øÿ± ŸÖÿ™ŸÜ ÿØÿ± APL ÿµÿ±ŸÅ ⁄©ÿ±ÿØ. ÿßÿ≥ÿ™ÿßŸÑŸÖŸÜ ÿ™ÿßÿ®ÿ≥ÿ™ÿßŸÜ ÿ®ÿπÿØ ÿßÿ≤ ŸÅÿßÿ±ÿ∫‚ÄåÿßŸÑÿ™ÿ≠ÿµ€åŸÑ€å‚Äåÿßÿ¥ ÿ±ÿß ÿµÿ±ŸÅ ŸÜŸàÿ¥ÿ™ŸÜ €å⁄© Ÿæÿ±ÿØÿßÿ≤ÿ¥⁄Øÿ± ÿ®ÿ±ÿß€å ÿ≤ÿ®ÿßŸÜ ÿ®ÿ±ŸÜÿßŸÖŸá‚ÄåŸÜŸà€åÿ≥€å PL/I ÿØÿ± IBM System/360 ⁄©ÿ±ÿØ.
ÿØÿ± ÿØŸÜ€åÿß€å ÿ®ÿ±ŸÜÿßŸÖŸá‚ÄåŸÜŸà€åÿ≥€å Ÿà ⁄©ÿßÿ±Ÿáÿß€å€å ⁄©Ÿá ÿßŸà ÿßŸÜÿ¨ÿßŸÖ ÿØÿßÿØ ÿ®Ÿá ¬´RMS¬ª ÿ¥Ÿáÿ±ÿ

In [25]:
query = 'ÿ±€å⁄Üÿßÿ±ÿØ ÿßÿ≥ÿ™ÿßŸÑŸÖŸÜ ÿØÿ± €≤€± ÿ≥ÿßŸÑ⁄Ø€å ÿØÿ± ⁄©ÿØÿßŸÖ ÿ¥ÿ±⁄©ÿ™ ⁄©ÿßÿ± ŸÖ€å‚Äå⁄©ÿ±ÿØÿü'

results = vector_store.max_marginal_relevance_search(query, fetch_k=8, k=3)

for result in results:
    print('Document Title:', result.metadata['title'])
    print('Document Content:', result.page_content)
    print('-'*100)

Document Title: ÿ±€å⁄Üÿßÿ±ÿØ ÿßÿ≥ÿ™ÿßŸÑŸÖŸÜ
Document Content: == ÿ≤ŸÜÿØ⁄Ø€å ==
ÿ±€å⁄Üÿßÿ±ÿØ ÿßÿ≥ÿ™ÿßŸÑŸÖŸÜ ÿØÿ± ÿ≥ÿßŸÑ €±€π€µ€≥ ÿØÿ± ÿ¥Ÿáÿ± ŸÜ€åŸà€åŸàÿ±⁄© ÿ®Ÿá ÿØŸÜ€åÿß ÿ¢ŸÖÿØ. Alice Lippman Ÿà Daniel Stallman ŸæÿØÿ± Ÿà ŸÖÿßÿØÿ± ÿßŸà ÿ®ŸàÿØŸÜÿØ. ÿßŸàŸÑ€åŸÜ ÿ™ÿ¨ÿ±ÿ®Ÿá ÿßÿ≥ÿ™ÿßŸÑŸÖŸÜ ÿ®ÿß ⁄©ÿßŸÖŸæ€åŸàÿ™ÿ±Ÿáÿß ÿØÿ± ÿØŸàÿ±ÿßŸÜ ÿØÿ®€åÿ±ÿ≥ÿ™ÿßŸÜ ÿØÿ± ŸÖÿ±⁄©ÿ≤ ÿπŸÑŸÖ€å IBM ŸÜ€åŸà€åŸàÿ±⁄© ÿ®ŸàÿØ. ÿßŸà ÿ™ÿßÿ®ÿ≥ÿ™ÿßŸÜ ÿ±ÿß ÿ®ÿ±ÿß€å ŸÜŸàÿ¥ÿ™ŸÜ €å⁄© ÿ®ÿ±ŸÜÿßŸÖŸá ÿ¢ŸÜÿßŸÑ€åÿ≤ ÿπÿØÿØ€å ÿØÿ± Fortran ÿßÿ≥ÿ™ÿÆÿØÿßŸÖ ÿ¥ÿØŸá ÿ®ŸàÿØ. ÿßŸà ⁄©ÿßÿ± ÿÆŸàÿØ ÿ±ÿß ÿ®ÿπÿØ ÿßÿ≤ ÿØŸà ŸáŸÅÿ™Ÿá ÿ™ŸÖÿßŸÖ ⁄©ÿ±ÿØ Ÿà ÿ®ŸÇ€åŸá ÿ™ÿßÿ®ÿ≥ÿ™ÿßŸÜ ÿÆŸàÿØ ÿ±ÿß ÿ®Ÿá ŸÜŸàÿ¥ÿ™ŸÜ €å⁄© Ÿà€åÿ±ÿß€åÿ¥⁄Øÿ± ŸÖÿ™ŸÜ ÿØÿ± APL ÿµÿ±ŸÅ ⁄©ÿ±ÿØ. ÿßÿ≥ÿ™ÿßŸÑŸÖŸÜ ÿ™ÿßÿ®ÿ≥ÿ™ÿßŸÜ ÿ®ÿπÿØ ÿßÿ≤ ŸÅÿßÿ±ÿ∫‚ÄåÿßŸÑÿ™ÿ≠ÿµ€åŸÑ€å‚Äåÿßÿ¥ ÿ±ÿß ÿµÿ±ŸÅ ŸÜŸàÿ¥ÿ™ŸÜ €å⁄© Ÿæÿ±ÿØÿßÿ≤ÿ¥⁄Øÿ± ÿ®ÿ±ÿß€å ÿ≤ÿ®ÿßŸÜ ÿ®ÿ±ŸÜÿßŸÖŸá‚ÄåŸÜŸà€åÿ≥€å PL/I ÿØÿ± IBM System/360 ⁄©ÿ±ÿØ.
ÿØÿ± ÿØŸÜ€åÿß€å ÿ®ÿ±ŸÜÿßŸÖŸá‚ÄåŸÜŸà€åÿ≥€å Ÿà ⁄©ÿßÿ±Ÿáÿß€å€å ⁄©Ÿá ÿßŸà ÿßŸÜÿ¨ÿßŸÖ ÿØÿßÿØ ÿ®Ÿá ¬´RMS¬ª ÿ¥Ÿáÿ±

In [None]:


metadata_field_info = [
    AttributeInfo(
        name = "Released_Year",
        description = "The year the movie was released. You must put Released_Year in quotes like \"Released_Year\"",
        type = "integer",
    ),
    AttributeInfo(
        name = "IMDB_Rating",
        description = "A 1-10 rating for the movie. You must put IMDB_Rating in quotes like \"IMDB_Rating\"",
        type = "float"
    ),
]

document_content_description = "Brief summary of a movie"
llm = ChatCohere(temperature=0)

retriever = SelfQueryRetriever.from_llm(
    llm,
    vector_store,
    document_content_description,
    metadata_field_info,
    verbose = True
)

In [None]:
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List, Dict, Any
import re

class WikipediaHeaderSplitter:
    """
    A class to load and split Wikipedia documents by their headers.
    """
    
    def __init__(self, queries: List[str], load_max_docs: int = 1, lang: str = 'fa'):
        """
        Initialize the Wikipedia header splitter.
        
        Args:
            queries: List of search queries for Wikipedia
            load_max_docs: Maximum number of documents to load per query
            lang: Language code for Wikipedia (default: 'fa' for Persian)
        """
        self.queries = queries
        self.load_max_docs = load_max_docs
        self.lang = lang
        self.raw_documents = []
        self.header_documents = []
        
        # Load documents
        self.load_documents()
    
    def load_documents(self) -> None:
        """Load documents from Wikipedia for each query."""
        print(f"Loading Wikipedia documents in '{self.lang}' language...")
        
        for query in self.queries:
            try:
                print(f"Loading document for query: '{query}'")
                
                loader = WikipediaLoader(
                    query=query,
                    load_max_docs=self.load_max_docs,
                    lang=self.lang
                )
                
                docs = loader.load()
                self.raw_documents.extend(docs)
                print(f"‚úì Successfully loaded {len(docs)} document(s) for '{query}'")
                
            except Exception as e:
                print(f"‚úó Error loading document for '{query}': {e}")
    
    def split_by_headers(self) -> List[Dict[str, Any]]:
        """
        Split Wikipedia documents by headers.
        
        Returns:
            List of dictionaries containing header, content, and metadata
        """
        self.header_documents = []
        
        for doc in self.raw_documents:
            sections = self._parse_wikipedia_sections(doc.page_content)
            self.header_documents.extend(sections)
        
        print(f"‚úì Split {len(self.raw_documents)} documents into {len(self.header_documents)} sections by headers")
        return self.header_documents
    
    def _parse_wikipedia_sections(self, content: str) -> List[Dict[str, Any]]:
        """
        Parse Wikipedia content and split by headers/sections.
        
        Args:
            content: Raw Wikipedia content
            
        Returns:
            List of section dictionaries
        """
        sections = []
        
        # Pattern for Wikipedia headers (== Header ==, === Subheader ===, etc.)
        # This works for both English and Persian Wikipedia
        header_pattern = r'(\n|^)(={2,})\s*(.*?)\s*\2(\n|$)'
        
        # Find all headers and their positions
        matches = list(re.finditer(header_pattern, content))
        
        if not matches:
            # If no headers found, treat entire content as one section
            sections.append({
                'header': 'Introduction',
                'content': content.strip(),
                'level': 1,
                'word_count': len(content.split())
            })
            return sections
        
        # Process each section
        for i, match in enumerate(matches):
            header_level = len(match.group(2)) - 1  # == is level 1, === is level 2, etc.
            header_text = match.group(3).strip()
            
            # Determine content start and end positions
            content_start = match.end()
            
            if i < len(matches) - 1:
                content_end = matches[i + 1].start()
            else:
                content_end = len(content)
            
            section_content = content[content_start:content_end].strip()
            
            # Only add if content is not empty
            if section_content:
                sections.append({
                    'header': header_text,
                    'content': section_content,
                    'level': header_level,
                    'word_count': len(section_content.split()),
                    'char_count': len(section_content)
                })
        
        return sections
    
    def get_hierarchical_structure(self) -> Dict[str, Any]:
        """
        Get hierarchical structure of the document.
        
        Returns:
            Nested dictionary representing document hierarchy
        """
        if not self.header_documents:
            self.split_by_headers()
        
        hierarchy = {
            'title': self.raw_documents[0].metadata['title'] if self.raw_documents else 'Unknown',
            'sections': []
        }
        
        current_level_1 = None
        current_level_2 = None
        
        for section in self.header_documents:
            section_data = {
                'header': section['header'],
                'content_preview': section['content'][:100] + '...',
                'word_count': section['word_count'],
                'level': section['level']
            }
            
            if section['level'] == 1:
                current_level_1 = {'section': section_data, 'subsections': []}
                hierarchy['sections'].append(current_level_1)
                current_level_2 = None
            elif section['level'] == 2 and current_level_1:
                current_level_2 = {'section': section_data, 'subsections': []}
                current_level_1['subsections'].append(current_level_2)
            elif section['level'] == 3 and current_level_2:
                current_level_2['subsections'].append(section_data)
        
        return hierarchy
    
    def split_large_sections(self, max_chunk_size: int = 1000, chunk_overlap: int = 200) -> List[Dict[str, Any]]:
        """
        Further split large sections using recursive text splitting.
        
        Args:
            max_chunk_size: Maximum chunk size for large sections
            chunk_overlap: Overlap between chunks
            
        Returns:
            List of all chunks (both header-based and split large sections)
        """
        if not self.header_documents:
            self.split_by_headers()
        
        all_chunks = []
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=max_chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )
        
        for section in self.header_documents:
            # If section is small, keep it as is
            if len(section['content']) <= max_chunk_size:
                all_chunks.append({
                    'header': section['header'],
                    'content': section['content'],
                    'level': section['level'],
                    'is_split': False,
                    'chunk_index': 0,
                    'total_chunks': 1
                })
            else:
                # Split large sections
                chunks = text_splitter.split_text(section['content'])
                for i, chunk in enumerate(chunks):
                    all_chunks.append({
                        'header': section['header'],
                        'content': chunk,
                        'level': section['level'],
                        'is_split': True,
                        'chunk_index': i,
                        'total_chunks': len(chunks)
                    })
        
        print(f"‚úì Created {len(all_chunks)} total chunks (including split large sections)")
        return all_chunks
    
    def print_document_structure(self) -> None:
        """Print the hierarchical structure of the document."""
        hierarchy = self.get_hierarchical_structure()
        
        print(f"\nüìë DOCUMENT STRUCTURE: {hierarchy['title']}")
        print("=" * 60)
        
        for section in hierarchy['sections']:
            level1 = section['section']
            print(f"\n# {level1['header']} ({level1['word_count']} words)")
            
            for subsection in section['subsections']:
                level2 = subsection['section']
                print(f"  ## {level2['header']} ({level2['word_count']} words)")
                
                for subsubsection in subsection['subsections']:
                    level3 = subsubsection
                    print(f"    ### {level3['header']} ({level3['word_count']} words)")
    
    def export_to_langchain_documents(self) -> List[Any]:
        """
        Convert header-based chunks to LangChain Document format.
        
        Returns:
            List of LangChain Document objects
        """
        from langchain.schema import Document
        
        if not self.header_documents:
            self.split_by_headers()
        
        langchain_docs = []
        
        for i, section in enumerate(self.header_documents):
            # Create enhanced metadata
            metadata = {
                'source': 'wikipedia',
                'title': self.raw_documents[0].metadata['title'] if self.raw_documents else 'Unknown',
                'header': section['header'],
                'level': section['level'],
                'section_index': i,
                'word_count': section['word_count'],
                'char_count': section['char_count'],
                'language': self.lang
            }
            
            # Combine header and content for better context
            enhanced_content = f"Section: {section['header']}\n\n{section['content']}"
            
            doc = Document(
                page_content=enhanced_content,
                metadata=metadata
            )
            langchain_docs.append(doc)
        
        return langchain_docs

# Example usage and demonstration
if __name__ == "__main__":
    # Example queries for Persian Wikipedia
    queries = []
    
    # Create header splitter
    splitter = WikipediaHeaderSplitter(
        queries=queries,
        load_max_docs=1,
        lang='fa'
    )
    
    print("üîç ANALYZING WIKIPEDIA DOCUMENT STRUCTURE")
    print("=" * 50)
    
    # Split by headers
    sections = splitter.split_by_headers()
    
    # Print document structure
    splitter.print_document_structure()
    
    # Show section details
    print(f"\nüìä SECTION DETAILS:")
    print(f"Total sections found: {len(sections)}")
    
    for i, section in enumerate(sections[:5]):  # Show first 5 sections
        print(f"\n--- Section {i+1} ---")
        print(f"Header: {section['header']} (Level {section['level']})")
        print(f"Word count: {section['word_count']}")
        print(f"Content preview: {section['content'][:150]}...")
    
    # Further split large sections if needed
    print(f"\nüîÑ SPLITTING LARGE SECTIONS:")
    all_chunks = splitter.split_large_sections(max_chunk_size=800, chunk_overlap=100)
    
    print(f"Total chunks after splitting: {len(all_chunks)}")
    
    # Show split chunks
    split_chunks = [chunk for chunk in all_chunks if chunk['is_split']]
    if split_chunks:
        print(f"Split chunks: {len(split_chunks)}")
        for chunk in split_chunks[:3]:
            print(f"  - {chunk['header']} [Part {chunk['chunk_index'] + 1}/{chunk['total_chunks']}]")
    
    # Export to LangChain documents
    print(f"\nüìÑ EXPORTING TO LANGCHAIN DOCUMENTS:")
    langchain_docs = splitter.export_to_langchain_documents()
    print(f"Created {len(langchain_docs)} LangChain Document objects")
    
    # Show first document as example
    if langchain_docs:
        first_doc = langchain_docs[0]
        print(f"\nFirst document preview:")
        print(f"Content: {first_doc.page_content[:200]}...")
        print(f"Metadata: {first_doc.metadata}")

Loading Wikipedia documents in 'fa' language...
Loading document for query: 'ŸáŸàÿ¥ ŸÖÿµŸÜŸàÿπ€å'
‚úì Successfully loaded 1 document(s) for 'ŸáŸàÿ¥ ŸÖÿµŸÜŸàÿπ€å'
üîç ANALYZING WIKIPEDIA DOCUMENT STRUCTURE
‚úì Split 1 documents into 2 sections by headers

üìë DOCUMENT STRUCTURE: ŸáŸàÿ¥ ŸÖÿµŸÜŸàÿπ€å

# ÿßŸáÿØÿßŸÅ (50 words)
  ## ÿßÿ≥ÿ™ÿØŸÑÿßŸÑ Ÿà ÿ≠ŸÑ ŸÖÿ≥ÿ¶ŸÑŸá (45 words)

üìä SECTION DETAILS:
Total sections found: 2

--- Section 1 ---
Header: ÿßŸáÿØÿßŸÅ (Level 1)
Word count: 50
Content preview: ŸÖÿ≥ÿ¶ŸÑŸá ⁄©ŸÑ€å ÿ¥ÿ®€åŸá‚Äåÿ≥ÿßÿ≤€å (€åÿß ÿß€åÿ¨ÿßÿØ) ŸáŸàÿ¥ ÿ®Ÿá ÿ≤€åÿ±ŸÖÿ≥ÿ¶ŸÑŸá‚ÄåŸáÿß€å€å ÿ™ŸÇÿ≥€åŸÖ ÿ¥ÿØŸá ÿßÿ≥ÿ™. ÿß€åŸÜ ÿ≤€åÿ±ŸÖÿ≥ÿ¶ŸÑŸá‚ÄåŸáÿß ÿ¥ÿßŸÖŸÑ Ÿà€å⁄ò⁄Ø€å‚ÄåŸáÿß €åÿß ŸÇÿßÿ®ŸÑ€åÿ™‚ÄåŸáÿß€å ÿÆÿßÿµ€å Ÿáÿ≥ÿ™ŸÜÿØ ⁄©Ÿá Ÿæ⁄òŸàŸáÿ¥⁄Øÿ±ÿßŸÜ ÿßŸÜÿ™ÿ∏ÿßÿ± ÿØÿßÿ±ŸÜÿØ €å⁄©...

--- Section 2 ---
Header: ÿßÿ≥ÿ™ÿØŸÑÿßŸÑ Ÿà ÿ≠ŸÑ ŸÖÿ≥ÿ¶ŸÑŸá (Level 2)
Word count: 45
Content preview: Ÿæ⁄òŸàŸáÿ¥⁄Øÿ±ÿßŸÜ ÿßŸàŸÑ€åŸá ÿßŸÑ⁄ØŸàÿ±€åÿ™ŸÖ‚ÄåŸáÿß€å€å ÿ±ÿß ÿ™Ÿàÿ≥ÿπŸá ÿØÿßÿØŸÜÿØ ⁄©Ÿá ÿßÿ≤ ÿßÿ≥ÿ™ÿØŸÑÿßŸÑ ⁄

In [None]:
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List, Dict, Any
import re

class WikipediaHeaderSplitter:
    """
    A class to load and split Wikipedia documents by their headers.
    """
    
    def __init__(self, queries: List[str], load_max_docs: int = 1, lang: str = 'fa'):
        """
        Initialize the Wikipedia header splitter.
        
        Args:
            queries: List of search queries for Wikipedia
            load_max_docs: Maximum number of documents to load per query
            lang: Language code for Wikipedia (default: 'fa' for Persian)
        """
        self.queries = queries
        self.load_max_docs = load_max_docs
        self.lang = lang
        self.raw_documents = []
        self.header_documents = []
        
        # Load documents
        self.load_documents()
    
    def load_documents(self) -> None:
        """Load documents from Wikipedia for each query."""
        print(f"Loading Wikipedia documents in '{self.lang}' language...")
        
        for query in self.queries:
            try:
                print(f"Loading document for query: '{query}'")
                
                loader = WikipediaLoader(
                    query=query,
                    load_max_docs=self.load_max_docs,
                    lang=self.lang
                )
                
                docs = loader.load()
                self.raw_documents.extend(docs)
                print(f"‚úì Successfully loaded {len(docs)} document(s) for '{query}'")
                
            except Exception as e:
                print(f"‚úó Error loading document for '{query}': {e}")
    
    def split_by_headers(self) -> List[Dict[str, Any]]:
        """
        Split Wikipedia documents by headers.
        
        Returns:
            List of dictionaries containing header, content, and metadata
        """
        self.header_documents = []
        
        for doc in self.raw_documents:
            sections = self._parse_wikipedia_sections(doc.page_content)
            self.header_documents.extend(sections)
        
        print(f"‚úì Split {len(self.raw_documents)} documents into {len(self.header_documents)} sections by headers")
        return self.header_documents
    
    def _parse_wikipedia_sections(self, content: str) -> List[Dict[str, Any]]:
        """
        Parse Wikipedia content and split by headers/sections.
        
        Args:
            content: Raw Wikipedia content
            
        Returns:
            List of section dictionaries
        """
        sections = []
        
        # Pattern for Wikipedia headers (== Header ==, === Subheader ===, etc.)
        # This works for both English and Persian Wikipedia
        header_pattern = r'(\n|^)(={2,})\s*(.*?)\s*\2(\n|$)'
        
        # Find all headers and their positions
        matches = list(re.finditer(header_pattern, content))
        
        if not matches:
            # If no headers found, treat entire content as one section
            sections.append({
                'header': 'Introduction',
                'content': content.strip(),
                'level': 1,
                'word_count': len(content.split())
            })
            return sections
        
        # Process each section
        for i, match in enumerate(matches):
            header_level = len(match.group(2)) - 1  # == is level 1, === is level 2, etc.
            header_text = match.group(3).strip()
            
            # Determine content start and end positions
            content_start = match.end()
            
            if i < len(matches) - 1:
                content_end = matches[i + 1].start()
            else:
                content_end = len(content)
            
            section_content = content[content_start:content_end].strip()
            
            # Only add if content is not empty
            if section_content:
                sections.append({
                    'header': header_text,
                    'content': section_content,
                    'level': header_level,
                    'word_count': len(section_content.split()),
                    'char_count': len(section_content)
                })
        
        return sections
    
    def get_hierarchical_structure(self) -> Dict[str, Any]:
        """
        Get hierarchical structure of the document.
        
        Returns:
            Nested dictionary representing document hierarchy
        """
        if not self.header_documents:
            self.split_by_headers()
        
        hierarchy = {
            'title': self.raw_documents[0].metadata['title'] if self.raw_documents else 'Unknown',
            'sections': []
        }
        
        current_level_1 = None
        current_level_2 = None
        
        for section in self.header_documents:
            section_data = {
                'header': section['header'],
                'content_preview': section['content'][:100] + '...',
                'word_count': section['word_count'],
                'level': section['level']
            }
            
            if section['level'] == 1:
                current_level_1 = {'section': section_data, 'subsections': []}
                hierarchy['sections'].append(current_level_1)
                current_level_2 = None
            elif section['level'] == 2 and current_level_1:
                current_level_2 = {'section': section_data, 'subsections': []}
                current_level_1['subsections'].append(current_level_2)
            elif section['level'] == 3 and current_level_2:
                current_level_2['subsections'].append(section_data)
        
        return hierarchy
    
    def split_large_sections(self, max_chunk_size: int = 1000, chunk_overlap: int = 200) -> List[Dict[str, Any]]:
        """
        Further split large sections using recursive text splitting.
        
        Args:
            max_chunk_size: Maximum chunk size for large sections
            chunk_overlap: Overlap between chunks
            
        Returns:
            List of all chunks (both header-based and split large sections)
        """
        if not self.header_documents:
            self.split_by_headers()
        
        all_chunks = []
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=max_chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )
        
        for section in self.header_documents:
            # If section is small, keep it as is
            if len(section['content']) <= max_chunk_size:
                all_chunks.append({
                    'header': section['header'],
                    'content': section['content'],
                    'level': section['level'],
                    'is_split': False,
                    'chunk_index': 0,
                    'total_chunks': 1
                })
            else:
                # Split large sections
                chunks = text_splitter.split_text(section['content'])
                for i, chunk in enumerate(chunks):
                    all_chunks.append({
                        'header': section['header'],
                        'content': chunk,
                        'level': section['level'],
                        'is_split': True,
                        'chunk_index': i,
                        'total_chunks': len(chunks)
                    })
        
        print(f"‚úì Created {len(all_chunks)} total chunks (including split large sections)")
        return all_chunks
    
    def print_document_structure(self) -> None:
        """Print the hierarchical structure of the document."""
        hierarchy = self.get_hierarchical_structure()
        
        print(f"\nüìë DOCUMENT STRUCTURE: {hierarchy['title']}")
        print("=" * 60)
        
        for section in hierarchy['sections']:
            level1 = section['section']
            print(f"\n# {level1['header']} ({level1['word_count']} words)")
            
            for subsection in section['subsections']:
                level2 = subsection['section']
                print(f"  ## {level2['header']} ({level2['word_count']} words)")
                
                for subsubsection in subsection['subsections']:
                    level3 = subsubsection
                    print(f"    ### {level3['header']} ({level3['word_count']} words)")
    
    def export_to_langchain_documents(self) -> List[Any]:
        """
        Convert header-based chunks to LangChain Document format.
        
        Returns:
            List of LangChain Document objects
        """
        from langchain.schema import Document
        
        if not self.header_documents:
            self.split_by_headers()
        
        langchain_docs = []
        
        for i, section in enumerate(self.header_documents):
            # Create enhanced metadata
            metadata = {
                'source': 'wikipedia',
                'title': self.raw_documents[0].metadata['title'] if self.raw_documents else 'Unknown',
                'header': section['header'],
                'level': section['level'],
                'section_index': i,
                'word_count': section['word_count'],
                'char_count': section['char_count'],
                'language': self.lang
            }
            
            # Combine header and content for better context
            enhanced_content = f"Section: {section['header']}\n\n{section['content']}"
            
            doc = Document(
                page_content=enhanced_content,
                metadata=metadata
            )
            langchain_docs.append(doc)
        
        return langchain_docs

# Example usage and demonstration
if __name__ == "__main__":
    # Example queries for Persian Wikipedia
    queries = ["ŸáŸàÿ¥ ŸÖÿµŸÜŸàÿπ€å"]
    
    # Create header splitter
    splitter = WikipediaHeaderSplitter(
        queries=queries,
        load_max_docs=1,
        lang='fa'
    )
    
    print("üîç ANALYZING WIKIPEDIA DOCUMENT STRUCTURE")
    print("=" * 50)
    
    # Split by headers
    sections = splitter.split_by_headers()
    
    # Print document structure
    splitter.print_document_structure()
    
    # Show section details
    print(f"\nüìä SECTION DETAILS:")
    print(f"Total sections found: {len(sections)}")
    
    for i, section in enumerate(sections[:5]):  # Show first 5 sections
        print(f"\n--- Section {i+1} ---")
        print(f"Header: {section['header']} (Level {section['level']})")
        print(f"Word count: {section['word_count']}")
        print(f"Content preview: {section['content'][:150]}...")
    
    # Further split large sections if needed
    print(f"\nüîÑ SPLITTING LARGE SECTIONS:")
    all_chunks = splitter.split_large_sections(max_chunk_size=800, chunk_overlap=100)
    
    print(f"Total chunks after splitting: {len(all_chunks)}")
    
    # Show split chunks
    split_chunks = [chunk for chunk in all_chunks if chunk['is_split']]
    if split_chunks:
        print(f"Split chunks: {len(split_chunks)}")
        for chunk in split_chunks[:3]:
            print(f"  - {chunk['header']} [Part {chunk['chunk_index'] + 1}/{chunk['total_chunks']}]")
    
    # Export to LangChain documents
    print(f"\nüìÑ EXPORTING TO LANGCHAIN DOCUMENTS:")
    langchain_docs = splitter.export_to_langchain_documents()
    print(f"Created {len(langchain_docs)} LangChain Document objects")
    
    # Show first document as example
    if langchain_docs:
        first_doc = langchain_docs[0]
        print(f"\nFirst document preview:")
        print(f"Content: {first_doc.page_content[:200]}...")
        print(f"Metadata: {first_doc.metadata}")