In [1]:
!pwd

/Users/hieu/Projects/tiny-llm-agent/rag-pipeline/notebooks


### Install Packages
Make sure we have an activated virtual environment running Python 3.10

In [None]:
%pip install -r ../requirements.txt

In [3]:
import torch
torch.cuda.is_available()
torch.backends.mps.is_available()

hardware = "cpu"
if torch.cuda.is_available():
    print("CUDA is available")
    hardware = "cuda"
else:
    if torch.backends.mps.is_available():
        print("MPS is available")
        hardware = "mps"
        
print("Hardware is set to: ", hardware)

MPS is available
Hardware is set to:  mps


### Download LLM Model

In [4]:
from huggingface_hub import snapshot_download
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# We use Qwen2.5-0.5B-Instruct as our local LLM model (~1GB)
# Link to download the model: https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e
model_path = "../models/Qwen2.5-0.5B-Instruct"  # local directory to save the model
snapshot_download(repo_id="Qwen/Qwen2.5-0.5B-Instruct", local_dir=model_path)

if hardware == "mps":
    model = AutoModelForCausalLM.from_pretrained(
        model_path, 
        device_map="auto"
        )
else:
    # Quantize the model
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_path, 
        device_map="auto",
        quantization_config=quantization_config
        )

# Compile the model for faster execution
model = torch.compile(model)
print("model:", model)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 5183.91it/s]
Some parameters are on the meta device because they were offloaded to the disk.


model: OptimizedModule(
  (_orig_mod): Qwen2ForCausalLM(
    (model): Qwen2Model(
      (embed_tokens): Embedding(151936, 896)
      (layers): ModuleList(
        (0-23): 24 x Qwen2DecoderLayer(
          (self_attn): Qwen2Attention(
            (q_proj): Linear(in_features=896, out_features=896, bias=True)
            (k_proj): Linear(in_features=896, out_features=128, bias=True)
            (v_proj): Linear(in_features=896, out_features=128, bias=True)
            (o_proj): Linear(in_features=896, out_features=896, bias=False)
          )
          (mlp): Qwen2MLP(
            (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
            (up_proj): Linear(in_features=896, out_features=4864, bias=False)
            (down_proj): Linear(in_features=4864, out_features=896, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
          (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        )
      

### Parse PDF Document

In [5]:
from langchain_community.document_loaders import PyPDFLoader

# Load a PDF document using PyMuPDF
loader = PyPDFLoader("../examples/example.pdf")
docs = loader.load()  # a list of Document objects
print("Number of pages:", len(docs))
docs[0]  # first page

Number of pages: 8


Document(metadata={'producer': 'GPL Ghostscript 10.01.2', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-12-01T21:32:19-05:00', 'moddate': '2024-12-01T21:32:19-05:00', 'title': 'DeMo: Decoupled Momentum Optimization', 'subject': '', 'author': 'Bowen Peng, Jeffrey Quesnelle, Diederik P. Kingma', 'keywords': '', 'source': '../examples/example.pdf', 'total_pages': 8, 'page': 0, 'page_label': '1'}, page_content='arXiv:2411.19870v1  [cs.LG]  29 Nov 2024\nDeMo: Decoupled Momentum Optimization\nBowen Peng 1∗ Jeffrey Quesnelle 1† Diederik P . Kingma ‡\n1Nous Research\nAbstract\nTraining large neural networks typically requires sharing gradients between ac-\ncelerators through specialized high-speed interconnects . Drawing from the sig-\nnal processing principles of frequency decomposition and e nergy compaction,\nwe demonstrate that synchronizing full optimizer states an d model parameters\nduring training is unnecessary. By decoupling momentum upd ates and allow-\ning controlled dive

### Create Text Chunks

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # maximum number of characters per chunk
    chunk_overlap=100,  # number of characters to overlap between chunks
)

chunks = []
for doc in docs:
    chunk_list = text_splitter.split_text(doc.page_content)
    for chunk in chunk_list:
        chunks.append(chunk)

print("Number of chunks:", len(chunks))
chunks[:3]

Number of chunks: 30


['arXiv:2411.19870v1  [cs.LG]  29 Nov 2024\nDeMo: Decoupled Momentum Optimization\nBowen Peng 1∗ Jeffrey Quesnelle 1† Diederik P . Kingma ‡\n1Nous Research\nAbstract\nTraining large neural networks typically requires sharing gradients between ac-\ncelerators through specialized high-speed interconnects . Drawing from the sig-\nnal processing principles of frequency decomposition and e nergy compaction,\nwe demonstrate that synchronizing full optimizer states an d model parameters\nduring training is unnecessary. By decoupling momentum upd ates and allow-\ning controlled divergence in optimizer states across accel erators, we achieve\nimproved convergence compared to state-of-the-art optimi zers. W e introduce\nDecoupled Momentum (DeMo), a fused optimizer and data parallel algorith m\nthat reduces inter-accelerator communication requiremen ts by several orders of\nmagnitude. This enables training of large neural networks e ven with limited',
 'magnitude. This enables training of large n

### Create Vector Database

#### Caching Setup

In [7]:
import os
import hashlib
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS 

def compute_content_hash(chunks: list, embedding_model_name: str) -> str:
    """Compute a hash based on document content and embedding model name.

    Args:
        chunks (list): List of text chunks.
        embedding_model_name (str): Model name.
        
    Returns:
        Hash string.
    """
    content = "".join(chunks) + embedding_model_name
    return hashlib.md5(content.encode()).hexdigest()[:8]

def get_vector_store(chunks: list, embeddings: HuggingFaceEmbeddings, cache_dir: str) -> FAISS:
    """Retrieves a vector store from a list of text chunks using the given embeddings.

    Args:
        chunks (list): List of text chunks.
        embeddings (HuggingFaceEmbeddings): Embeddings object.
        cache_dir (str): Directory to save the vector store.
        
    Returns:
        FAISS object.
    """
    # Compute content hash
    embedding_model_name = embeddings.model_name
    current_hash = compute_content_hash(chunks, embedding_model_name)
    
    # Check if cached index exists and contains a valid hash
    hash_file = os.path.join(cache_dir, "content_hash.txt")
    if os.path.exists(cache_dir) and os.path.exists(hash_file):
        with open(hash_file, "r") as f:
            cached_hash = f.read().strip()
            
        if current_hash == cached_hash:
            print("Loading cached FAISS index ...")
            return FAISS.load_local(
                folder_path=cache_dir,
                embeddings=embeddings,
                allow_dangerous_deserialization=True
            )
        else:
            print("Cache invalidated due to changes in documents or embedding model.")
            import shutil
            shutil.rmtree(cache_dir)
            
    # Create a new vector store
    print("Creating a new FAISS index ...")
    vector_store = FAISS.from_texts(chunks, embedding=embeddings)    
    
    # Save the new index and hash
    os.makedirs(cache_dir, exist_ok=True)
    vector_store.save_local(cache_dir)
    with open(hash_file, "w") as f:
        f.write(current_hash)    
        
    return vector_store

#### Retriever Setup

In [8]:
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": hardware}
)

vector_store = get_vector_store(chunks, embeddings, "../vector_store")

# Test a simple similarity search
query = "The goal of the transformer model is"
results = vector_store.search(query, k=2, search_type="similarity")
print(results)

retriever = vector_store.as_retriever(
    search_type="similarity",  # search type: "similarity" or "mmr"
    search_kwargs={"k": 2}
)

  embeddings = HuggingFaceEmbeddings(


Cache invalidated due to changes in documents or embedding model.
Creating a new FAISS index ...
[Document(id='31441a37-7443-4720-9b50-ba59700a85c0', metadata={}, page_content='W e evaluated the signum variant of DeMo using OLMo [4], a highly reproducible large language\nmodel pre-training framework. Adapting OLMo to use DeMo req uired only including the DeMo\noptimizer class and disabling gradient synchronization in PyT orch Distributed Data Parallelism [5].\nW e provide the modiﬁed OLMo code as well as the conﬁguration ﬁ les for all experiments in the\nsupplementary material.\nOur experiments used the Dolma v1.55 dataset for pre-training. As a baseline we used the publicly\nreleased OLMo-1B 6, a standard decoder-only Transformer model consisting of 1 .18 billion param-\neters using the AdamW optimizer ( β1 = 0.9, β2 = 0.95, weight decay = 0.1) as compared to\nusing the DeMo optimizer ( β = 0.999). The learning rate and the AdamW hyperparameters were\nuntouched and set with the sugges

### Setup RAG Pipeline

In [9]:
from transformers import AutoTokenizer, pipeline
from langchain_huggingface import HuggingFacePipeline

tokenizer = AutoTokenizer.from_pretrained(model_path)

# RAG pipeline with a maximum length of max_new_tokens tokens
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    repetition_penalty=1.2,
    max_new_tokens=256
)

# Wrap the HuggingFace pipeline in a LangChain object
local_llm = HuggingFacePipeline(pipeline=pipe)

Device set to use mps
The model 'OptimizedModule' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCau

### Prompt Engineering

In [10]:
from langchain.prompts import PromptTemplate

# Define the improved prompt template
prompt_template = """Answer based on context:\n{context}\nQuestion: {question}\nAnswer:"""

prompt = PromptTemplate(
    input_variables=["contenxt", "question"],
    template=prompt_template
)

### Question-Answer Chain

In [11]:
from langchain.chains.retrieval_qa.base import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm=local_llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=False,
    chain_type_kwargs={"prompt": prompt}
)


### User Query

In [12]:
question = "Describe the main idea of the paper."
response = qa_chain(question)
answer = response["result"].split("Answer:")[-1].strip()
print(answer)

  response = qa_chain(question)


The paper describes an approach that uses deep learning models to improve natural language processing tasks such as machine translation by training them with large amounts of annotated data from different languages. It also discusses how these models can be integrated into existing systems like those found in Google Translate or Microsoft Translator, which are widely recognized as essential tools for human-machine interaction in international communication. Additionally, it highlights the challenges involved when developing efficient architectures capable of handling high volumes of text at scale while maintaining accuracy and performance. Furthermore, the authors emphasize the importance of reproducibility and comparison between their methods and other approaches in order to facilitate future improvements and comparisons. They provide detailed descriptions of each component of the architecture including encoder-decoder networks, attention mechanisms, cross-attention layers, pooling te