In [1]:
%pip install -U langchain-text-splitters langchain-community langgraph langchain-ollama langchain-chroma pandas markdown

Note: you may need to restart the kernel to use updated packages.


In [None]:
import os

# Not needed until LLM is invoked
# os.environ["LANGSMITH_TRACING"] = "true"
# os.environ["LANGSMITH_API_KEY"] = "YOUR_API_KEY"

In [None]:
from langchain_ollama import ChatOllama


llm = ChatOllama(
    model="llama3.1",
    temperature=0.7,
)

In [2]:
from langchain_ollama import OllamaEmbeddings

embeddings = OllamaEmbeddings(model="nomic-embed-text")

In [3]:
from langchain_chroma import Chroma

vector_store = Chroma(embedding_function=embeddings, persist_directory="./chroma_db-nomic")

### Vectorizing Documents

In [50]:
# This is a small test
import os
import glob
import re
import json
from typing import List
import pandas as pd

from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_core.documents import Document

def process_markdown_file(filepath: str, textbooks_df, special_chapters_df) -> List[Document]:

    # Get folder name from file path
    folder_name = os.path.basename(os.path.dirname(filepath))
    
    # Parse folder name to extract subject code, grade, and chapter code
    parts = folder_name.split('_')
    
    if len(parts) < 3:
        print(f"Warning: Folder name '{folder_name}' doesn't match expected format.")
        parsed_info = None
    
    subject_code = parts[0]
    grade = parts[1]
    chapter_code = '_'.join(parts[2:]) if len(parts) > 3 else parts[2]

    parsed_info = {
        "grade": grade,
        "subject_code": subject_code,
        "chapter_code": chapter_code
    }

    if not parsed_info:
        return []
    
    grade = parsed_info["grade"]
    subject_code = parsed_info["subject_code"]
    chapter_code = parsed_info["chapter_code"]
    
    # Look up subject name and textbook
    subject_info = textbooks_df[
        (textbooks_df['grade'] == int(grade)) & 
        (textbooks_df['subjectcode'] == subject_code)
    ]
    
    if subject_info.empty:
        print(f"Warning: No subject info found for grade {grade}, subject code {subject_code}")
        subject_name = subject_code
        textbook_name = "Unknown"
    else:
        subject_name = subject_info['subjectname'].iloc[0]
        textbook_name = subject_info['textbook'].iloc[0]
    
    # Format chapter title
    # Check if this is a special chapter
    chapter_title = chapter_code
    
    match = re.match(r'([a-zA-Z]+)(\d+)', chapter_code)
    if match:
        letter_code = match.group(1)
        number = match.group(2)
        
        # Look up special chapter in dataframe
        special_chapter = special_chapters_df[
            (special_chapters_df['grade'] == int(grade)) & 
            (special_chapters_df['subjectcode'] == subject_code) & 
            (special_chapters_df['special_chaptercode'] == letter_code)
        ]
        
        if not special_chapter.empty:
            chapter_type = special_chapter['chaptertype'].iloc[0]
            chapter_title = f"{chapter_type} {number}"
    
    # If it's just a number or didn't match any special chapter
    if chapter_code.isdigit():
        chapter_title = f"chapter {chapter_code}"
    
    # Load the markdown file
    loader = TextLoader(filepath)
    documents = loader.load()

    meta_filepath = os.path.join(os.path.dirname(filepath), os.path.basename(filepath).replace('.md', '_meta.json'))
    section_titles = []

    if os.path.exists(meta_filepath):
        with open(meta_filepath, 'r') as f:
            metadata = json.load(f)

            for section in metadata.get('table_of_contents', []):
                title = section.get('title', '')
                section_titles.append(title)

    headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3"), ("####", "Header 4")]

    text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False)
    chunks = text_splitter.split_text(documents[0].page_content)
    print(len(chunks), len(section_titles))
    
    # Add metadata to each chunk
    for i, chunk in enumerate(chunks):
        section_title = section_titles[i] if i < len(section_titles) else ""

        chunk.metadata.update({
            "grade": grade,
            "subject_code": subject_code,
            "chapter_code": chapter_code,
            "subject_name": subject_name,
            "textbook_name": textbook_name,
            "chapter_title": chapter_title,
            "section_title": section_title,
            "split_number": i + 1,
            "total_splits": len(chunks),
            "source": filepath
        })
    
    if len(chunks) != len(section_titles):
        print(f"Warning: Number of chunks ({len(chunks)}) doesn't match number of section titles ({len(section_titles)}) for {filepath}")

    return chunks

textbooks_df = pd.read_csv("textbooks.csv")
special_chapters_df = pd.read_csv("special_chapters.csv")

markdown_files = glob.glob(os.path.join("../marker-output", "**/*.md"), recursive=True)
print(f"Found {len(markdown_files)} markdown files to process")

for file_path in markdown_files:
        print(f"Processing file: {file_path}")
        try:
            chunks = process_markdown_file(file_path, textbooks_df, special_chapters_df)
            vector_store.add_documents(chunks)
            print(f"Successfully processed {file_path}")
        except Exception as e:
            print(f"Error processing {file_path}: {str(e)}")
    
print("Finished processing all markdown files")

Found 783 markdown files to process
Processing file: ../marker-output/soc2_11_1/soc2_11_1.md
24 24
Successfully processed ../marker-output/soc2_11_1/soc2_11_1.md
Processing file: ../marker-output/bio_11_3/bio_11_3.md
16 18
Successfully processed ../marker-output/bio_11_3/bio_11_3.md
Processing file: ../marker-output/english_11_s8/english_11_s8.md
7 6
Successfully processed ../marker-output/english_11_s8/english_11_s8.md
Processing file: ../marker-output/bus_11_3/bus_11_3.md
37 39
Successfully processed ../marker-output/bus_11_3/bus_11_3.md
Processing file: ../marker-output/eco_10_4/eco_10_4.md
38 40
Successfully processed ../marker-output/eco_10_4/eco_10_4.md
Processing file: ../marker-output/eco_9_3/eco_9_3.md
25 25
Successfully processed ../marker-output/eco_9_3/eco_9_3.md
Processing file: ../marker-output/soc2_12_6/soc2_12_6.md
32 39
Successfully processed ../marker-output/soc2_12_6/soc2_12_6.md
Processing file: ../marker-output/english3_12_4/english3_12_4.md
5 5
Successfully proces

### Process Query

#### Custom Retriever Class

In [4]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.retrievers import BaseRetriever
from typing import List, Dict, Any, Optional
from langchain_core.documents import Document
from pydantic import Field

# Define your custom retriever
class ContextExpandedFilteredRetriever(BaseRetriever):
    """Retriever that fetches top result plus surrounding context chunks with filtering."""
    
    # Explicitly define fields using Pydantic Field
    vectorstore: Any = Field(description="Vector store for retrievals")
    grade: str = Field(description="Grade level for filtering")
    subjects: List[str] = Field(description="List of subject codes for filtering")
    context_window: int = Field(default=5, description="Number of context chunks to retrieve")
    
    def _get_relevant_documents(self, query: str) -> List[Document]:
        # Create a filter for grade and subjects
        filter_dict = {
            "$and": [
                {"grade": self.grade},
                {"subject_code": {"$in": self.subjects}}
            ]
        }
        
        # Get the top result with filters
        results = self.vectorstore.similarity_search(
            query, 
            k=1,
            filter=filter_dict
        )
        if not results:
            return []
            
        top_result = results[0]
        
        # Extract metadata to identify position
        split_number = top_result.metadata.get("split_number", 0)
        total_splits = top_result.metadata.get("total_splits", 0)
        chapter_code = top_result.metadata.get("chapter_code")
        subject_code = top_result.metadata.get("subject_code")
        
        # Calculate range of splits to retrieve
        start_split = max(1, split_number - self.context_window)
        end_split = min(total_splits, split_number + self.context_window)
        
        # Retrieve all the relevant splits
        expanded_results = [top_result]  # Start with the top result
        
        # Get context before the top result
        for i in range(start_split, split_number):
            # Create a filter for this specific split
            split_filter = {
                "$and": [
                    {"grade": self.grade},
                    {"subject_code": subject_code},
                    {"chapter_code": chapter_code},
                    {"split_number": i}
                ]
            }
            
            context_docs = self.vectorstore.similarity_search(
                "", # Empty query to avoid biasing results
                k=1,
                filter=split_filter
            )
            
            if context_docs:
                expanded_results.insert(0, context_docs[0])  # Add before top result
        
        # Get context after the top result
        for i in range(split_number + 1, end_split + 1):
            # Create a filter for this specific split
            split_filter = {
                "$and": [
                    {"grade": self.grade},
                    {"subject_code": subject_code},
                    {"chapter_code": chapter_code},
                    {"split_number": i}
                ]
            }
            
            context_docs = self.vectorstore.similarity_search(
                "", # Empty query to avoid biasing results
                k=1,
                filter=split_filter
            )
            
            if context_docs:
                expanded_results.append(context_docs[0])  # Add after top result
        
        return expanded_results

#### Stream Generator Function

In [5]:
def generate(template, grade, subjects, query):
    # Create the custom retriever
    
    retriever = ContextExpandedFilteredRetriever(
        vectorstore=vector_store,  # Your vector store instance
        grade=grade,
        subjects=subjects,
        context_window=5  # Retrieve 5 chunks before and after
    )

    # Create the prompt template
    prompt = ChatPromptTemplate.from_template(template)

    # Build the chain
    chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

    # Run the chain
    for chunk in chain.stream(query):
        yield chunk

#### Testcases

Prompt Template

In [None]:
template = """Answer the question based only on the following context:
{context}
Question: {question}
"""

In [None]:
grade = "11"
subjects = ["bio", "chem", "phy", "math"]
query = "What is differentiation"

for chunk in generate(template, grade, subjects, query):
    print(chunk, end="", flush=True)

Differentiation refers to the process of cells becoming specialized in structure and function to perform specific tasks within an organism. In the context of plant growth, differentiation involves the transformation of undifferentiated cells into specialized cells with distinct characteristics, such as root cells, leaf cells, or vascular tissue cells.

In the provided text, it is mentioned that "Auxins like IAA and indole butyric acid (IBA) have been isolated from plants. NAA (naphthalene acetic acid) and 2, 4-D (2, 4-dichlorophenoxyacetic) are synthetic auxins." This implies that auxins play a role in promoting differentiation in plant cells.

However, the text does not explicitly define differentiation.


In [None]:
grade = "11"
subjects = ["chem", "phy", "math", "comp"]
query = "What is differentiation"

for chunk in generate(template, grade, subjects, query):
    print(chunk, end="", flush=True)

Differentiation is a fundamental concept in calculus that deals with the study of rates of change and slopes of curves. It is a mathematical operation that finds the derivative of a function, which represents the rate at which the function changes as its input (or independent variable) changes.

In other words, differentiation measures how fast a function changes when its input changes. It is used to find the slope of a curve at a given point, and it can be thought of as the limit of the ratio of the change in the output of a function to the change in its input as the input changes.

There are several types of differentiation, including:

1. **First derivative**: This measures the rate of change of a function with respect to its input.
2. **Second derivative**: This measures the rate of change of the first derivative with respect to the input.
3. **Higher-order derivatives**: These measure the rate of change of higher-order derivatives with respect to the input.

Differentiation has nu

In [None]:
grade = "10"
subjects = ["english", "english2", "history", "math"]
query = "What did Lencho ask God"

for chunk in generate(template, grade, subjects, query):
    print(chunk, end="", flush=True)

According to the story, Lencho asked God for 100 pesos (money) in his first letter, but only received 70 pesos in response. In his second letter, he asked God to send him the remaining 30 pesos, and also requested that it not be sent through the mail because the post office employees are "crooks".
