In [42]:
%pip install -U langchain-text-splitters langchain-community langgraph langchain-ollama langchain-chroma unstructured pandas markdown unstructured[md] faiss-cpu faiss-gpu-cu12

Collecting faiss-gpu-cu12
  Using cached faiss_gpu_cu12-1.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting nvidia-cuda-runtime-cu12>=12.1.105 (from faiss-gpu-cu12)
  Downloading nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cublas-cu12>=12.1.3.1 (from faiss-gpu-cu12)
  Downloading nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl.metadata (1.7 kB)
Using cached faiss_gpu_cu12-1.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (47.9 MB)
Downloading nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl (594.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m594.3/594.3 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:03[0m
[?25hDownloading nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (954 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m954.8/954

In [None]:
import os

# Not needed until LLM is invoked
# os.environ["LANGSMITH_TRACING"] = "true"
# os.environ["LANGSMITH_API_KEY"] = "YOUR_API_KEY"

In [2]:
from langchain_ollama import ChatOllama


llm = ChatOllama(
    model="llama3.1",
    temperature=0,
)

In [11]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-m3", model_kwargs={"device": "cpu"})

In [12]:
from langchain_chroma import Chroma

vector_store = Chroma(embedding_function=embeddings, persist_directory="./chroma_db_bge_m3")

### Vectorizing Documents

In [None]:
# This is a small test
import os
import glob
import re
import json
from typing import List
import pandas as pd

from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_core.documents import Document

def process_markdown_file(filepath: str, textbooks_df, special_chapters_df) -> List[Document]:

    # Get folder name from file path
    folder_name = os.path.basename(os.path.dirname(filepath))
    
    # Parse folder name to extract subject code, grade, and chapter code
    parts = folder_name.split('_')
    
    if len(parts) < 3:
        print(f"Warning: Folder name '{folder_name}' doesn't match expected format.")
        parsed_info = None
    
    subject_code = parts[0]
    grade = parts[1]
    chapter_code = '_'.join(parts[2:]) if len(parts) > 3 else parts[2]

    parsed_info = {
        "grade": grade,
        "subject_code": subject_code,
        "chapter_code": chapter_code
    }

    if not parsed_info:
        return []
    
    grade = parsed_info["grade"]
    subject_code = parsed_info["subject_code"]
    chapter_code = parsed_info["chapter_code"]
    
    # Look up subject name and textbook
    subject_info = textbooks_df[
        (textbooks_df['grade'] == int(grade)) & 
        (textbooks_df['subjectcode'] == subject_code)
    ]
    
    if subject_info.empty:
        print(f"Warning: No subject info found for grade {grade}, subject code {subject_code}")
        subject_name = subject_code
        textbook_name = "Unknown"
    else:
        subject_name = subject_info['subjectname'].iloc[0]
        textbook_name = subject_info['textbook'].iloc[0]
    
    # Format chapter title
    # Check if this is a special chapter
    chapter_title = chapter_code
    
    match = re.match(r'([a-zA-Z]+)(\d+)', chapter_code)
    if match:
        letter_code = match.group(1)
        number = match.group(2)
        
        # Look up special chapter in dataframe
        special_chapter = special_chapters_df[
            (special_chapters_df['grade'] == int(grade)) & 
            (special_chapters_df['subjectcode'] == subject_code) & 
            (special_chapters_df['special_chaptercode'] == letter_code)
        ]
        
        if not special_chapter.empty:
            chapter_type = special_chapter['chaptertype'].iloc[0]
            chapter_title = f"{chapter_type} {number}"
    
    # If it's just a number or didn't match any special chapter
    if chapter_code.isdigit():
        chapter_title = f"chapter {chapter_code}"
    
    # Load the markdown file
    loader = TextLoader(filepath)
    documents = loader.load()

    meta_filepath = os.path.join(os.path.dirname(filepath), os.path.basename(filepath).replace('.md', '_meta.json'))
    section_titles = []

    if os.path.exists(meta_filepath):
        with open(meta_filepath, 'r') as f:
            metadata = json.load(f)

            for section in metadata.get('table_of_contents', []):
                title = section.get('title', '')
                section_titles.append(title)

    headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3"), ("####", "Header 4")]

    text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False)
    chunks = text_splitter.split_text(documents[0].page_content)
    print(len(chunks), len(section_titles))
    
    # Add metadata to each chunk
    for i, chunk in enumerate(chunks):
        section_title = section_titles[i] if i < len(section_titles) else ""

        chunk.metadata.update({
            "grade": grade,
            "subject_code": subject_code,
            "chapter_code": chapter_code,
            "subject_name": subject_name,
            "textbook_name": textbook_name,
            "chapter_title": chapter_title,
            "section_title": section_title,
            "split_number": i + 1,
            "total_splits": len(chunks),
            "source": filepath
        })
    
    if len(chunks) != len(section_titles):
        print(f"Warning: Number of chunks ({len(chunks)}) doesn't match number of section titles ({len(section_titles)}) for {filepath}")

    return chunks

textbooks_df = pd.read_csv("textbooks.csv")
special_chapters_df = pd.read_csv("special_chapters.csv")

markdown_files = glob.glob(os.path.join("../marker-output", "**/*.md"), recursive=True)
print(f"Found {len(markdown_files)} markdown files to process")

for file_path in markdown_files:
        print(f"Processing file: {file_path}")
        try:
            chunks = process_markdown_file(file_path, textbooks_df, special_chapters_df)
            vector_store.add_documents(chunks)
            print(f"Successfully processed {file_path}")
        except Exception as e:
            print(f"Error processing {file_path}: {str(e)}")
    
print("Finished processing all markdown files")

Found 783 markdown files to process
Processing file: ../marker-output/soc2_11_1/soc2_11_1.md
24 24
Successfully processed ../marker-output/soc2_11_1/soc2_11_1.md
Processing file: ../marker-output/bio_11_3/bio_11_3.md
16 18
Successfully processed ../marker-output/bio_11_3/bio_11_3.md
Processing file: ../marker-output/english_11_s8/english_11_s8.md
7 6
Successfully processed ../marker-output/english_11_s8/english_11_s8.md
Processing file: ../marker-output/bus_11_3/bus_11_3.md
37 39
Successfully processed ../marker-output/bus_11_3/bus_11_3.md
Processing file: ../marker-output/eco_10_4/eco_10_4.md
38 40
Successfully processed ../marker-output/eco_10_4/eco_10_4.md
Processing file: ../marker-output/eco_9_3/eco_9_3.md
25 25
Successfully processed ../marker-output/eco_9_3/eco_9_3.md
Processing file: ../marker-output/soc2_12_6/soc2_12_6.md
32 39
Successfully processed ../marker-output/soc2_12_6/soc2_12_6.md
Processing file: ../marker-output/english3_12_4/english3_12_4.md
5 5
Successfully proces

### Load Vector Database

In [6]:
retrieved_docs = vector_store.similarity_search_with_relevance_scores("what is differentiation in math?", k=100)

print(retrieved_docs[0][0].page_content)

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [37]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader, TextLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter

loader = TextLoader("../marker-output/bio_11_11/bio_11_11.md")
documents = loader.load()
# for doc in documents:
#     print(doc.page_content)

text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3"), ("####", "Header 4")], strip_headers=False)
chunks = text_splitter.split_text(documents[0].page_content)
for chunk in chunks:
    print(chunk.metadata)
    print(len(chunk.page_content))
    print()

{}
27

{'Header 1': 'PLANT PHYSIOLOGY', 'Header 4': 'Chapter 11'}
2178

{'Header 1': 'PLANT PHYSIOLOGY', 'Header 2': 'PHOTOSYNTHESIS IN HIGHER PLANTS CHAPTER 11'}
1699

{'Header 1': 'PLANT PHYSIOLOGY', 'Header 2': '11.1 WHAT DO WE KNOW?'}
1228

{'Header 1': 'PLANT PHYSIOLOGY', 'Header 2': '11.1 WHAT DO WE KNOW?', 'Header 4': '11.2 EARLY EXPERIMENTS'}
4558

{'Header 1': 'PLANT PHYSIOLOGY', 'Header 2': '11.1 WHAT DO WE KNOW?', 'Header 4': '11.3 WHERE DOES PHOTOSYNTHESIS TAKE PLACE?'}
1814

{'Header 1': 'PLANT PHYSIOLOGY', 'Header 2': '11.1 WHAT DO WE KNOW?', 'Header 4': '11.4 HOW MANY TYPES OF PIGMENTS ARE INVOLVED IN PHOTOSYNTHESIS?'}
2833

{'Header 1': 'PLANT PHYSIOLOGY', 'Header 2': '11.1 WHAT DO WE KNOW?', 'Header 4': '11.5 WHAT IS LIGHT REACTION?'}
1235

{'Header 1': 'PLANT PHYSIOLOGY', 'Header 2': '11.1 WHAT DO WE KNOW?', 'Header 4': '11.6 THE ELECTRON TRANSPORT'}
1452

{'Header 1': 'PLANT PHYSIOLOGY', 'Header 2': '11.1 WHAT DO WE KNOW?', 'Header 4': '11.6.1 Splitting of Water'}
83