In [1]:
# Install/upgrade the notebook's dependencies into the running kernel's environment.
# NOTE(review): both faiss-cpu and faiss-gpu-cu12 are requested on this line, while the
# notebook only ever does `import faiss` — confirm that installing both is intended.
%pip install -U langchain-text-splitters langchain-community langgraph langchain-ollama langchain-chroma unstructured pandas markdown unstructured[md] faiss-cpu faiss-gpu-cu12 langchain_huggingface


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

# Configure PyTorch's CUDA caching allocator before any later cell imports torch
# or builds a GPU-backed model.  The allocator reads this variable when it is
# initialised, so setting it only after the embedding model already exists is too
# late to help with the fragmentation-related out-of-memory errors seen below.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [8]:
from langchain_huggingface import HuggingFaceEmbeddings
# Embedding model shared by indexing and querying in the cells below.
embeddings = HuggingFaceEmbeddings(
    model_name="Snowflake/snowflake-arctic-embed-l-v2.0",
)

In [9]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

# Size the flat L2 index from the model itself by embedding a probe string.
probe_vector = embeddings.embed_query("hello world")
index = faiss.IndexFlatL2(len(probe_vector))

# Start from an empty in-memory store; documents are added by the cells below.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

### Vectorizing Documents

In [13]:
# This is a small test
import os
import glob
import re
import json
from typing import List
import torch
import pandas as pd

from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_core.documents import Document

# Ask PyTorch's CUDA allocator to use expandable segments — the setting the CUDA
# out-of-memory message itself suggests for reducing fragmentation.
# NOTE(review): this assignment runs after `import torch` above and after the
# embedding model was created in an earlier cell — confirm the allocator still
# picks it up this late; it may need to be set before CUDA is first used.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

def process_markdown_file(filepath: str, textbooks_df, special_chapters_df) -> List[Document]:
    """Split one chapter markdown file into header-based chunks with metadata.

    The parent folder of ``filepath`` must be named
    ``<subject_code>_<grade>_<chapter_code>`` (for example ``bio_11_7``); any
    further underscore-separated parts are kept as part of the chapter code.

    Args:
        filepath: Path to the ``.md`` file to process.  A sibling file with the
            same stem and the suffix ``_meta.json`` is read for section titles
            when it exists.
        textbooks_df: DataFrame with ``grade``, ``subjectcode``, ``subjectname``
            and ``textbook`` columns.
        special_chapters_df: DataFrame with ``grade``, ``subjectcode``,
            ``special_chaptercode`` and ``chaptertype`` columns.

    Returns:
        The chunks produced by ``MarkdownHeaderTextSplitter``, each with
        ``grade``, ``subject_code``, ``chapter_code``, ``subject_name``,
        ``textbook_name``, ``chapter_title``, ``section_title``,
        ``split_number``, ``total_splits`` and ``source`` added to its metadata.
        An empty list is returned when the folder name cannot be parsed.
    """
    folder_name = os.path.basename(os.path.dirname(filepath))
    parts = folder_name.split('_')

    # Return before indexing into `parts`: a short folder name would otherwise
    # raise IndexError instead of being skipped with a warning.
    if len(parts) < 3:
        print(f"Warning: Folder name '{folder_name}' doesn't match expected format.")
        return []

    subject_code = parts[0]
    grade = parts[1]
    chapter_code = '_'.join(parts[2:])

    # The grade is compared against integer-converted values in both lookup
    # tables, so a non-numeric grade means the folder name is malformed too.
    try:
        grade_number = int(grade)
    except ValueError:
        print(f"Warning: Grade '{grade}' in folder name '{folder_name}' is not a number.")
        return []

    # Look up subject name and textbook, falling back to placeholders.
    subject_info = textbooks_df[
        (textbooks_df['grade'] == grade_number) &
        (textbooks_df['subjectcode'] == subject_code)
    ]

    if subject_info.empty:
        print(f"Warning: No subject info found for grade {grade}, subject code {subject_code}")
        subject_name = subject_code
        textbook_name = "Unknown"
    else:
        subject_name = subject_info['subjectname'].iloc[0]
        textbook_name = subject_info['textbook'].iloc[0]

    # Chapter title: plain numbers become "chapter N"; letter+number codes are
    # resolved through the special-chapter table; anything else keeps the raw code.
    chapter_title = chapter_code
    if chapter_code.isdigit():
        chapter_title = f"chapter {chapter_code}"
    else:
        match = re.match(r'([a-zA-Z]+)(\d+)', chapter_code)
        if match:
            letter_code, number = match.groups()
            special_chapter = special_chapters_df[
                (special_chapters_df['grade'] == grade_number) &
                (special_chapters_df['subjectcode'] == subject_code) &
                (special_chapters_df['special_chaptercode'] == letter_code)
            ]
            if not special_chapter.empty:
                chapter_type = special_chapter['chaptertype'].iloc[0]
                chapter_title = f"{chapter_type} {number}"

    # Load the markdown file
    loader = TextLoader(filepath)
    documents = loader.load()

    # Swap only the extension; a plain str.replace would also rewrite any other
    # ".md" occurring inside the file name.
    meta_filepath = os.path.splitext(filepath)[0] + '_meta.json'
    section_titles = []

    if os.path.exists(meta_filepath):
        with open(meta_filepath, 'r', encoding='utf-8') as f:
            metadata = json.load(f)
        section_titles = [
            section.get('title', '')
            for section in metadata.get('table_of_contents', [])
        ]

    headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3"), ("####", "Header 4")]

    text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False)
    chunks = text_splitter.split_text(documents[0].page_content)
    print(len(chunks), len(section_titles))

    # Section titles are matched to chunks purely by position, so they are only
    # trustworthy when both counts agree (see the warning below).
    total_splits = len(chunks)
    for i, chunk in enumerate(chunks):
        section_title = section_titles[i] if i < len(section_titles) else ""

        chunk.metadata.update({
            "grade": grade,
            "subject_code": subject_code,
            "chapter_code": chapter_code,
            "subject_name": subject_name,
            "textbook_name": textbook_name,
            "chapter_title": chapter_title,
            "section_title": section_title,
            "split_number": i + 1,
            "total_splits": total_splits,
            "source": filepath
        })

    if len(chunks) != len(section_titles):
        print(f"Warning: Number of chunks ({len(chunks)}) doesn't match number of section titles ({len(section_titles)}) for {filepath}")

    return chunks

# Lookup tables used to turn folder-name codes into readable metadata.
textbooks_df = pd.read_csv("textbooks.csv")
special_chapters_df = pd.read_csv("special_chapters.csv")

markdown_files = glob.glob(os.path.join("../extracted_text", "**/*.md"), recursive=True)
print(f"Found {len(markdown_files)} markdown files to process")

for file_path in markdown_files:
    print(f"Processing file: {file_path}")
    try:
        torch.cuda.empty_cache()
        chunks = process_markdown_file(file_path, textbooks_df, special_chapters_df)
        if not chunks:
            # Nothing to embed: skip instead of handing an empty batch to the
            # vector store and rewriting the index on disk for no change.
            print(f"Skipping {file_path}: no chunks produced")
            continue
        vector_store.add_documents(chunks)
        # Save after every file so progress survives a crash part-way through.
        vector_store.save_local("faiss-index")
        print(f"Successfully processed {file_path}")
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        # Record the failing path so a later cell can retry it.
        with open("error_files.txt", "a") as f:
            f.write(file_path + "\n")

print("Finished processing all markdown files")

Found 790 markdown files to process
Processing file: ../extracted_text/bio_11_7/bio_11_7.md
8 7
Successfully processed ../extracted_text/bio_11_7/bio_11_7.md
Processing file: ../extracted_text/inf_11_8/inf_11_8.md
68 68
Successfully processed ../extracted_text/inf_11_8/inf_11_8.md
Processing file: ../extracted_text/geo_9_1/geo_9_1.md
12 14
Successfully processed ../extracted_text/geo_9_1/geo_9_1.md
Processing file: ../extracted_text/pol_12_1/pol_12_1.md
20 28
Successfully processed ../extracted_text/pol_12_1/pol_12_1.md
Processing file: ../extracted_text/sci_9_3/sci_9_3.md
34 34
Successfully processed ../extracted_text/sci_9_3/sci_9_3.md
Processing file: ../extracted_text/math_11_3/math_11_3.md
20 23
Error processing ../extracted_text/math_11_3/math_11_3.md: CUDA out of memory. Tried to allocate 762.00 MiB. GPU 0 has a total capacity of 5.79 GiB of which 765.69 MiB is free. Including non-PyTorch memory, this process has 5.03 GiB memory in use. Of the allocated memory 4.03 GiB is alloca

In [17]:
# Reload the paths that failed earlier.  error_files.txt is only written by the
# processing cells of this notebook, so when it is absent there is nothing to retry.
remaining_files = []
if os.path.exists("error_files.txt"):
    with open("error_files.txt", "r") as f:
        for line in f:
            path = line.strip()
            # Skip blank lines and repeated entries: the first pass appends to
            # this file, so re-running it can list the same path more than once.
            if path and path not in remaining_files:
                remaining_files.append(path)

def add_chunks_in_batches(chunks, batch_size=1):
    """Embed ``chunks`` in small batches, then add them to the index in one step.

    Embedding is the memory-hungry part, so it is done ``batch_size`` chunks at
    a time with the CUDA cache cleared before each batch.  The vectors are only
    written to the vector store once every chunk has been embedded, so a failure
    half-way through a file leaves the index untouched and a later retry cannot
    insert the same chunks twice.

    Args:
        chunks: Documents (with ``page_content`` and ``metadata``) to index.
        batch_size: Number of chunks embedded per call; must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if not chunks:
        return

    texts = [chunk.page_content for chunk in chunks]
    vectors = []
    for start in range(0, len(texts), batch_size):
        torch.cuda.empty_cache()
        vectors.extend(embeddings.embed_documents(texts[start:start + batch_size]))

    vector_store.add_embeddings(
        text_embeddings=list(zip(texts, vectors)),
        metadatas=[chunk.metadata for chunk in chunks],
    )
    # One save per file rather than one per chunk.
    vector_store.save_local("faiss-index")

# Retry the failed files until they all succeed or a full pass makes no progress.
while remaining_files:
    new_remaining_files = []
    print(f"Processing {len(remaining_files)} remaining files")
    for file_path in remaining_files:
        print(f"Processing file: {file_path}")
        try:
            torch.cuda.empty_cache()
            chunks = process_markdown_file(file_path, textbooks_df, special_chapters_df)
            add_chunks_in_batches(chunks)
            print(f"Successfully processed {file_path}")
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            new_remaining_files.append(file_path)

    # Persist what is still outstanding so an interrupted run can resume.
    with open("error_files.txt", "w") as f:
        f.writelines(f"{path}\n" for path in new_remaining_files)

    made_progress = len(new_remaining_files) < len(remaining_files)
    remaining_files = new_remaining_files

    # A file that fails on every attempt would otherwise keep this loop
    # running forever; stop once a whole pass fixes nothing.
    if remaining_files and not made_progress:
        print(f"No progress in this pass; {len(remaining_files)} files still failing (see error_files.txt)")
        break

Processing 32 remaining files
Processing file: ../extracted_text/english2_12_pr5/english2_12_pr5.md
14 19
Successfully processed ../extracted_text/english2_12_pr5/english2_12_pr5.md
Processing file: ../extracted_text/english2_10_9/english2_10_9.md
11 12
Successfully processed ../extracted_text/english2_10_9/english2_10_9.md
Processing file: ../extracted_text/math_12_5/math_12_5.md
24 35
Successfully processed ../extracted_text/math_12_5/math_12_5.md
Processing file: ../extracted_text/english2_12_pr8/english2_12_pr8.md
16 17
Successfully processed ../extracted_text/english2_12_pr8/english2_12_pr8.md
Processing file: ../extracted_text/eco2_12_6/eco2_12_6.md
28 30
Successfully processed ../extracted_text/eco2_12_6/eco2_12_6.md
Processing file: ../extracted_text/chem_11_a1/chem_11_a1.md
19 23
Successfully processed ../extracted_text/chem_11_a1/chem_11_a1.md
Processing file: ../extracted_text/math_11_6/math_11_6.md
14 20
Successfully processed ../extracted_text/math_11_6/math_11_6.md
Proces

### Process Query

In [19]:
# Each hit is a (document, relevance score) pair; only the metadata is shown.
retrieved_docs = vector_store.similarity_search_with_relevance_scores(
    "what is integration?",
    k=100,
)

for doc, _score in retrieved_docs:
    print(doc.metadata)


{'Header 4': 'Introduction IntroductionIntroduction', 'grade': '12', 'subject_code': 'eco', 'chapter_code': '1', 'subject_name': 'Economics', 'textbook_name': 'Introductory Microeconomics', 'chapter_title': 'chapter 1', 'section_title': 'Introduction\n \nIntroductionIntroduction', 'split_number': 1, 'total_splits': 15, 'source': '../extracted_text/eco_12_1/eco_12_1.md'}
{'Header 2': '**7.3 Methods of Integration**', 'grade': '12', 'subject_code': 'math', 'chapter_code': '7', 'subject_name': 'Mathematics', 'textbook_name': 'Mathematics', 'chapter_title': 'chapter 7', 'section_title': 'Reprint 2024-25', 'split_number': 9, 'total_splits': 42, 'source': '../extracted_text/math_12_7/math_12_7.md'}
{'Header 2': '**7.3 Methods of Integration**', 'grade': '12', 'subject_code': 'math', 'chapter_code': '7', 'subject_name': 'Mathematics', 'textbook_name': 'Mathematics', 'chapter_title': 'chapter 7', 'section_title': 'Reprint 2024-25', 'split_number': 9, 'total_splits': 42, 'source': '../extracted

# LLM

In [34]:
!pip install 'accelerate>=0.26.0'

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting accelerate>=0.26.0
  Downloading accelerate-1.4.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.4.0-py3-none-any.whl (342 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.4.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load the tokenizer and causal-LM weights for the chosen checkpoint and keep
# the model on the CPU.
model_version = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
device = torch.device("cpu")
tokenizer = AutoTokenizer.from_pretrained(model_version)
model = AutoModelForCausalLM.from_pretrained(model_version).to(device)

ImportError: Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install 'accelerate>=0.26.0'`

In [29]:
from langchain_huggingface import HuggingFacePipeline
from transformers import pipeline
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
# Prompt: retrieved context followed by the user's question.
template = """
Use the following context to answer the question. If unsure, say "I don't know."
Context:
{context}
Question: {question}
Answer:
"""
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

# Text-generation pipeline around the model and tokenizer loaded above, on the CPU.
device = "cpu"
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    max_new_tokens=256,
    temperature=0.7,
)

# LangChain-compatible wrapper around the Hugging Face pipeline.
llm = HuggingFacePipeline(pipeline=pipe)

# Retriever that returns the top 3 chunks from the vector store.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# Retrieval-augmented QA chain that also returns the source documents.
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt},
)

Device set to use cpu


In [30]:
query = "What is differentiation?"
result = rag_chain.invoke({"query": query})
# Keep only what follows the last "Answer:" marker.  rpartition returns the
# whole text when the marker is missing, whereas split(...)[1] would raise
# IndexError in that case and pick the wrong span when the retrieved context
# itself contains "Answer:".
answer = result["result"].rpartition("Answer:")[2].strip()
print(answer)

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 5.79 GiB of which 5.69 MiB is free. Including non-PyTorch memory, this process has 5.77 GiB memory in use. Of the allocated memory 5.61 GiB is allocated by PyTorch, and 65.07 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFacePipeline, ChatHuggingFace

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    """Join the ``page_content`` of every document, separated by blank lines."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

embeddings = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-l-v2.0")

# FAISS.load_local is a classmethod that RETURNS the loaded store.  Calling it on
# a freshly built empty store and discarding the result leaves that store empty,
# so the return value has to be kept.
# allow_dangerous_deserialization unpickles the docstore; acceptable only because
# "faiss-index" is written by this notebook's own save_local calls.
vector_store = FAISS.load_local(
    "faiss-index",
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)
index = vector_store.index

retriever = vector_store.as_retriever(search_kwargs={"k": 3})  # Retrieve top 3 chunks

result = retriever.invoke("what is integration?")
print(result)

# Prompt that wraps the retrieved context and the user's question.
RAG_TEMPLATE = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

<context>
{context}
</context>

Answer the following question:

{question}"""

rag_prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)

# Local text-generation pipeline built from the model id (device=-1).
llm = HuggingFacePipeline.from_model_id(
    model_id="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    task="text-generation",
    device=-1,
    pipeline_kwargs={"max_new_tokens": 256, "temperature": 0.7},
)

# Chat-style wrapper around the pipeline.
chat_model = ChatHuggingFace(llm=llm)

# The question is passed through unchanged and is also used to fetch and
# format the context documents.
chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
qa_chain = chain_inputs | rag_prompt | chat_model | StrOutputParser()
 

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFacePipeline, ChatHuggingFace

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    """Join the ``page_content`` of every document, separated by blank lines."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

embeddings = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-l-v2.0")

# FAISS.load_local is a classmethod that RETURNS the loaded store.  Calling it on
# a freshly built empty store and discarding the result leaves that store empty,
# so the return value has to be kept.
# allow_dangerous_deserialization unpickles the docstore; acceptable only because
# "faiss-index" is written by this notebook's own save_local calls.
vector_store = FAISS.load_local(
    "faiss-index",
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)
index = vector_store.index

retriever = vector_store.as_retriever(search_kwargs={"k": 3})  # Retrieve top 3 chunks

result = retriever.invoke("what is integration?")
print(result)