# Retrieval-Augmented Generation (RAG) Development

This notebook builds a retrieval-augmented generation (RAG) pipeline around a locally loaded, 4-bit-quantized LLM (`Qwen/Qwen2.5-1.5B-Instruct`). Korean Wikipedia articles are embedded with `BAAI/bge-m3`, stored in a Chroma vector database, and retrieved at query time so that the model can ground its answers in the retrieved context.

In [None]:
# Install required packages
%pip install --upgrade pip
%pip install sentence-transformers flagembedding langchain langchain-huggingface transformers bitsandbytes accelerate huggingface_hub langchain-chroma langchain-community wikipedia --upgrade

In [None]:
# HuggingFace Login
from huggingface_hub import login
import os

# Authenticate against the Hugging Face Hub.
# Option 1 (recommended): the token is read from the HUGGINGFACE_TOKEN environment variable.
token = os.getenv('HUGGINGFACE_TOKEN')
if not token:
    # Option 2: Manual login (uncomment and add your token)
    # login(token='your_huggingface_token_here')
    print('Please set HUGGINGFACE_TOKEN environment variable or manually add your token')
else:
    login(token=token)
    print("Logged in using environment variable")

In [None]:
# Load LLM Model
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import pipeline

# Hugging Face Hub repository of the checkpoint to load
model_id = 'Qwen/Qwen2.5-1.5B-Instruct'

# bitsandbytes settings: 4-bit NF4 weights with double quantization,
# using bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Tokenizer that matches the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Quantized causal LM; the device map places the whole model on device 0
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    quantization_config=bnb_config,
    torch_dtype='auto',
)

print("Model loaded successfully!")

In [None]:
# Setup Pipeline and Chat Model
from langchain_huggingface import HuggingFacePipeline, ChatHuggingFace

# Sampling parameters forwarded to the text-generation pipeline
gen_config = {
    "do_sample": True,
    "max_new_tokens": 512,
    "repetition_penalty": 1.1,
    "temperature": 0.7,
    "top_p": 0.8,
    "top_k": 20,
}

# transformers pipeline around the already-loaded model and tokenizer;
# return_full_text=False keeps the prompt out of the generated output.
pipe = pipeline(
    "text-generation",
    return_full_text=False,
    tokenizer=tokenizer,
    model=model,
    **gen_config,
)

# Wrap the pipeline for LangChain: first as an LLM, then as a chat model
llm = HuggingFacePipeline(pipeline=pipe)
chat_model = ChatHuggingFace(tokenizer=tokenizer, llm=llm)

print("Pipeline and chat model setup complete!")

In [None]:
# Setup Embedding Model
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
import gc

from pathlib import Path

model_name = "BAAI/bge-m3"
local_model_dir = './bge-m3'

# Export the embedding model to a local directory once. Re-running the cell
# reuses the existing export instead of loading the model and rewriting it to
# disk every time. `modules.json` is used as the marker of a completed save;
# delete the directory to force a fresh export.
if not (Path(local_model_dir) / 'modules.json').is_file():
    emb_model = SentenceTransformer(model_name, device='cpu')
    emb_model.save(local_model_dir)
    # Drop the temporary instance so only the LangChain wrapper below stays in memory
    del emb_model
    gc.collect()

# Load embeddings for LangChain from the local export (CPU inference)
embeddings = HuggingFaceEmbeddings(
    model_name=local_model_dir,
    model_kwargs={'device': 'cpu'}
)

print("Embedding model setup complete!")

In [None]:
# Load Documents and Create Vector Database
from langchain_community.document_loaders import WikipediaLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
import uuid

# Korean Wikipedia queries that make up the knowledge base
# (at most one article per query, each truncated to 1000 characters)
examples = ['챗GPT', '인공지능', '트랜스포머_(기계_학습)']
docs = [
    page
    for query in examples
    for page in WikipediaLoader(
        query=query, lang='ko', load_max_docs=1, doc_content_chars_max=1000
    ).load()
]

print(f"Loaded {len(docs)} documents")

# Break the articles into overlapping chunks for embedding
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=80, chunk_size=800)
chunks = text_splitter.split_documents(docs)
print(f"Created {len(chunks)} chunks")

# Persist the vectors in a directory with a random suffix
db_suffix = str(uuid.uuid4())[:8]
random_dir = f"./RAG_db_{db_suffix}"
print(f"Creating vector database in: {random_dir}")

db = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=random_dir,
    collection_metadata={'hnsw:space': 'l2'},
)

# Retriever returning the 2 closest chunks per query
retriever = db.as_retriever(search_kwargs={'k': 2})

print("Vector database and retriever setup complete!")

In [None]:
# LangChain building blocks for the RAG chain. RunnablePassthrough is imported
# from langchain_core.runnables rather than the legacy `langchain.schema.runnable`
# shim, which is not available in newer `langchain` releases.
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Helper that turns retrieved documents into the prompt's context string
def format_docs(docs):
    """Render documents as one context string.

    Each document becomes a '주제: <title>' header line followed by its
    page content; documents are separated by a '---' line.

    Args:
        docs: Iterable of objects exposing ``metadata['title']`` and
            ``page_content``.

    Returns:
        The joined context string (empty when ``docs`` is empty).
    """
    sections = []
    for doc in docs:
        header = '주제: ' + doc.metadata['title']
        sections.append(header + '\n' + doc.page_content)
    return "\n---\n".join(sections)

# Prompt: a system instruction plus a user turn carrying context and question
RAG_prompt = ChatPromptTemplate.from_messages(
    [
        ('system', 'Answer the following Question using the Context.'),
        ('user', 'Context: {context}\n---\nQuestion: {question}'),
    ]
)

# The incoming question is used twice: to retrieve and format the context,
# and unchanged as the question itself.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Full chain: gather inputs -> fill prompt -> generate -> plain string
rag_chain = chain_inputs | RAG_prompt | chat_model | StrOutputParser()

print("RAG chain setup complete!")

# Smoke-test the chain with a single question
try:
    response = rag_chain.invoke("트랜스포머가 뭐예요?")
    print(f"Response: {response}")
except Exception as exc:
    print(f"Error occurred: {exc}")
    import traceback
    traceback.print_exc()


## Additional Examples

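The cells below run the RAG chain on several questions so that you can judge how well the retriever surfaces relevant context and how well the model answers from it. They also show how to return the retrieved sources alongside the answer, and end with a diagnostics cell for troubleshooting.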

In [None]:
# Run the RAG chain over a batch of sample questions
questions = [
    "인공지능의 위험은 없나요?",
    "챗GPT는 무엇인가요?",
    "트랜스포머 모델의 특징은?",
    "인공지능은 어떤 분야인가요?",
]

separator = "-" * 50

for question in questions:
    try:
        print(f"Question: {question}")
        response = rag_chain.invoke(question)
        print(f"Answer: {response}\n")
        print(separator)
    except Exception as exc:
        # Report the failure and continue with the next question
        print(f"Error for question '{question}': {exc}")
        import traceback
        traceback.print_exc()
        print(separator)


In [None]:
# Advanced RAG with Source Information
from langchain_core.runnables import RunnableParallel

def format_docs_with_source(docs):
    """Render documents as numbered source entries.

    Each entry has the form '[출처 N] 주제: <title>' followed by a
    '내용: <page content>' line, with N counting from 1; entries are
    separated by a blank line.

    Args:
        docs: Iterable of objects exposing ``metadata['title']`` and
            ``page_content``.

    Returns:
        The joined string (empty when ``docs`` is empty).
    """
    return "\n\n".join(
        f"[출처 {rank}] 주제: {doc.metadata['title']}\n내용: {doc.page_content}"
        for rank, doc in enumerate(docs, start=1)
    )

# Answer-generation part: expects a dict with 'context' and 'question'
rag_chain_from_docs = RAG_prompt | chat_model | StrOutputParser()

# First gather the formatted sources and the untouched question in parallel,
# then add an 'answer' key computed from them, so the result contains
# context, question and answer.
retrieve_step = RunnableParallel(
    {"context": retriever | format_docs_with_source, "question": RunnablePassthrough()}
)
rag_chain_with_source = retrieve_step.assign(answer=rag_chain_from_docs)

# Try it out and show the retrieved context next to the answer
question = "인공지능은 어떤 분야인가요?"
try:
    result = rag_chain_with_source.invoke(question)
    print(f"Question: {question}")
    print(f"Context: {result['context']}")
    print(f"Answer: {result['answer']}")
except Exception as exc:
    print(f"Error: {exc}")
    import traceback
    traceback.print_exc()

In [None]:
# Diagnostics: print environment, model and database details, then exercise
# the retriever and the chat model on their own.
print("=== System Information ===")
print(f"PyTorch version: {torch.__version__}")
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"CUDA memory: {total_gib:.1f} GB")

print("\n=== Model Information ===")
print(f"Model ID: {model_id}")
print(f"Model device: {model.device}")
print(f"Tokenizer vocab size: {len(tokenizer)}")

print("\n=== Database Information ===")
print(f"Vector database directory: {random_dir}")
# Entry count is read from the underlying collection object
print(f"Number of documents in database: {db._collection.count()}")

print("\n=== Testing Components ===")
# Retriever on its own
test_query = "인공지능"
try:
    retrieved_docs = retriever.invoke(test_query)
    print(f"Retrieved {len(retrieved_docs)} documents for query: '{test_query}'")
    for position, doc in enumerate(retrieved_docs, start=1):
        print(f"  Doc {position}: {doc.metadata['title'][:50]}...")
except Exception as exc:
    print(f"Retriever error: {exc}")

# Chat model on its own
try:
    test_response = chat_model.invoke([("user", "안녕하세요")])
    print(f"Chat model test successful: {test_response.content[:100]}...")
except Exception as exc:
    print(f"Chat model error: {exc}")

print("\n=== Ready for RAG! ===")

## Conclusion

This notebook demonstrates how to set up and use a retrieval-augmented generation (RAG) system with a locally hosted LLM. You can extend it by adding more sophisticated retrieval mechanisms or by fine-tuning the LLM on domain-specific datasets.