# RAG SYSTEM LLMs

In [1]:
# Import libraries
import os
import glob
import numpy as np
import pandas as pd

from dotenv import load_dotenv
import gradio as gr
from openai import OpenAI
from sentence_transformers import SentenceTransformer

In [None]:
# Unzip the data files
import zipfile
def unzip_file(path, data_dir, delete=True):
    """Extract a ``.zip`` archive into ``data_dir`` and optionally remove the archive.

    Args:
        path: Location of the archive on disk.
        data_dir: Folder the archive content is extracted into.
        delete: When true, the archive is removed after it has been extracted.

    Returns:
        None. Progress and problems are reported with ``print``.
    """
    # Guard: nothing to do when the archive is not on disk.
    if not os.path.exists(path):
        print("Cannot find this zipfile.")
        return

    # Guard: only files whose name ends in ".zip" are handled.
    if not path.endswith(".zip"):
        print(f"This format file is not accepted: {path}")
        return

    with zipfile.ZipFile(path, "r") as archive:
        archive.extractall(data_dir)
        print("Unzip succesfully to:", data_dir)

    if not delete:
        return

    os.remove(path)
    print("Deleted zipfile.")

# Data folder and the zipped knowledge base expected inside it.
data_dir = "../data"
path = f"{data_dir}/knowledge-base.zip"

# Extract the archive into data_dir. `delete` keeps its default (True), so the zip
# is removed after extraction and a later re-run prints "Cannot find this zipfile.".
unzip_file(path=path,data_dir=data_dir)

Cannot find this zipfile.


In [3]:
# Folder of the extracted knowledge base; the later cells build their paths from it.
data_dir = "../data"
data_path = os.path.join(data_dir, "knowledge-base")
print(data_path)

../data/knowledge-base


In [32]:
# Hyperparameters
load_dotenv()
# No placeholder default here: with a fallback string the "key missing" check
# below could never trigger and the failure would only surface at request time.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
MODEL_GPT = "gpt-4o-mini"
MODEL_EMBEDDING = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
TEMPERATURE = 0.1

# Fail fast with a clear error when the key is absent or blank.
if not OPENAI_API_KEY or not OPENAI_API_KEY.strip():
    raise ValueError("🔴 ERROR: The OPENAI_API_KEY environment variable is not set!")

print("OpenAI API Key loaded successfully.")
# Hand the key to the client explicitly; assigning `OpenAI.api_key` on the class
# does not configure the client instances created afterwards.
openai = OpenAI(api_key=OPENAI_API_KEY)

OpenAI API Key loaded successfully.


## Add data to the context

### Add employees data to the context

In [5]:
# Keyword-style context: map the last word of each employee file name
# (the last name for "First Last.md") to the full text of that file.
context = {}
employees_dir = os.path.join(data_path, "employees/*")
employees = glob.glob(employees_dir)

for employee in employees:
    # Drop the directory and the extension before picking the last word, so a
    # one-word file name or a longer extension cannot leak path or suffix
    # characters into the key.
    stem = os.path.splitext(os.path.basename(employee))[0]
    name = stem.split(' ')[-1]
    with open(employee, "r", encoding="utf-8") as f:
        doc = f.read()
    context[name] = doc

In [6]:
# Show which keys were loaded into the context dictionary.
context.keys()

dict_keys(['Carter', 'Greene', 'Thomson', 'Tran', 'Thompson', 'Chen', 'Trenton', 'Harper', 'Blake', 'Lancaster', 'Bishop', 'Spencer'])

In [7]:
# Inspect the document stored under one key.
context["Lancaster"]

"# Avery Lancaster\n\n## Summary\n- **Date of Birth**: March 15, 1985  \n- **Job Title**: Co-Founder & Chief Executive Officer (CEO)  \n- **Location**: San Francisco, California  \n\n## Insurellm Career Progression\n- **2015 - Present**: Co-Founder & CEO  \n  Avery Lancaster co-founded Insurellm in 2015 and has since guided the company to its current position as a leading Insurance Tech provider. Avery is known for her innovative leadership strategies and risk management expertise that have catapulted the company into the mainstream insurance market.  \n\n- **2013 - 2015**: Senior Product Manager at Innovate Insurance Solutions  \n  Before launching Insurellm, Avery was a leading Senior Product Manager at Innovate Insurance Solutions, where she developed groundbreaking insurance products aimed at the tech sector.  \n\n- **2010 - 2013**: Business Analyst at Edge Analytics  \n  Prior to joining Innovate, Avery worked as a Business Analyst, focusing on market trends and consumer preferenc

In [21]:
class UnifiedContextRetriever:
    """Embedding-based retriever over text documents gathered from glob patterns.

    Every file matched by the patterns is read, embedded once with a
    SentenceTransformer model and kept in ``self.documents``; ``retrieve``
    ranks those documents against a query by cosine similarity. A separate
    keyword-matched store (``self.context``) is filled through ``add_context``
    and queried with ``get_relevant_context``.
    """

    def __init__(self, paths_list, model_name=None):
        """Load the embedding model and index every document matched by ``paths_list``.

        Args:
            paths_list: Iterable of glob patterns, one per document folder.
            model_name: SentenceTransformer model identifier. ``None`` (the
                default) selects the notebook-level ``MODEL_EMBEDDING``; the
                chat model name is not an embedding model, so it must not be
                the fallback.
        """
        if model_name is None:
            model_name = MODEL_EMBEDDING
        print(f"Loading embedding model '{model_name}'...")
        self.model = SentenceTransformer(model_name)
        # Keyword store used by add_context / get_relevant_context; it has to
        # exist before either method is called.
        self.context = {}
        self.documents = self._load_index_documents(paths_list)

    def _load_index_documents(self, paths_list):
        """Read every file matched by ``paths_list`` and attach an embedding to each.

        Args:
            paths_list: Iterable of glob patterns.

        Returns:
            A list of dicts with the keys ``name`` (file name without
            extension), ``content`` (file text), ``source`` (name of the
            pattern's parent folder) and ``embedding`` (that document's row of
            the model output). The list is empty when no file matched.
        """
        all_docs = []
        for path_pattern in paths_list:
            source_type = os.path.basename(os.path.dirname(path_pattern))

            for file_path in glob.glob(path_pattern):
                name = os.path.splitext(os.path.basename(file_path))[0]
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()
                all_docs.append({
                    "name": name,
                    "content": content,
                    "source": source_type
                })

        # Create embeddings for all documents in a single batch.
        print(f"Creating embeddings for {len(all_docs)} documents...")
        if not all_docs:
            # Nothing matched: skip the model call instead of encoding an empty batch.
            return all_docs

        contents = [doc['content'] for doc in all_docs]
        embeddings = self.model.encode(contents, show_progress_bar=True)

        # Attach each embedding back onto its document.
        for doc, embedding in zip(all_docs, embeddings):
            doc['embedding'] = embedding

        return all_docs

    def retrieve(self, query, top_k=5):
        """Return the ``top_k`` indexed documents most similar to ``query``.

        Args:
            query: Text that is embedded and compared with the indexed documents.
            top_k: Maximum number of results; values <= 0 yield an empty list.

        Returns:
            A list of dicts (``name``, ``content``, ``source``, ``score``)
            ordered by descending cosine similarity. Empty when nothing is
            indexed.
        """
        # `not` covers both None and an empty index; an empty index would
        # otherwise fail in the axis=1 norm below.
        if not self.documents or top_k <= 0:
            return []

        # Embed the query with the same model as the documents.
        query_embedding = self.model.encode(query)

        # Stack the stored document embeddings into one matrix.
        doc_embeddings = np.array([doc['embedding'] for doc in self.documents])

        # Cosine similarity. A zero-length vector also has a zero dot product,
        # so dividing by 1 there scores it 0 instead of producing NaN.
        norms = np.linalg.norm(doc_embeddings, axis=1) * np.linalg.norm(query_embedding)
        norms = np.where(norms == 0, 1.0, norms)
        cosine_scores = np.dot(doc_embeddings, query_embedding) / norms

        # Reverse first, then cut: slicing with `[-top_k:]` returns everything
        # when top_k is 0.
        top_k_indices = np.argsort(cosine_scores)[::-1][:top_k]

        res = []
        for i in top_k_indices:
            doc = self.documents[i]
            res.append({
                "name": doc['name'],
                "content": doc['content'],
                "source": doc['source'],
                "score": float(cosine_scores[i])
            })

        return res

    def add_context(self, title, details):
        """Store ``details`` under ``title`` in the keyword-matched context store."""
        self.context[title] = details

    def get_relevant_context(self, message):
        """Return the stored details whose title shares a word with ``message``.

        A title matches when any of its words (lower-cased, underscores read as
        spaces) occurs as a substring of the lower-cased message.

        Args:
            message: Text to match the stored titles against.

        Returns:
            A list of the matching ``details`` values, in insertion order.
        """
        relevant_context = []
        msg_lower = message.lower()
        for context_title, context_details in self.context.items():
            title_processed = context_title.lower().replace("_", " ")
            if any(word in msg_lower for word in title_processed.split()):
                relevant_context.append(context_details)
        return relevant_context

### Define a glob pattern for each data folder and collect them in a list

In [13]:
# One "*.md" glob pattern per knowledge-base sub-folder.
employees_dir, products_dir, contracts_dir, company_dir = (
    os.path.join(data_path, folder, "*.md")
    for folder in ("employees", "products", "contracts", "company")
)

all_data_paths = [employees_dir, products_dir, contracts_dir, company_dir]

# Report how many markdown files each pattern matches.
for path_pattern in all_data_paths:
    matching_files = glob.glob(path_pattern)
    if not matching_files:
        print(f"Do not find any markdown file of '{path_pattern}'")
        continue
    print(f"{len(matching_files)} of '{path_pattern}'")

12 of '../data/knowledge-base/employees/*.md'
4 of '../data/knowledge-base/products/*.md'
12 of '../data/knowledge-base/contracts/*.md'
3 of '../data/knowledge-base/company/*.md'


## Initialize the embeddings model

In [22]:
# Build the retriever: loads the embedding model and embeds every file matched by the patterns.
retrieve = UnifiedContextRetriever(paths_list=all_data_paths,model_name=MODEL_EMBEDDING)

Loading embedding model 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'...
Creating embeddings for 31 documents...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

## Query the data

In [24]:
# Smoke-test the retriever: fetch the 3 best matches for a sample question and
# preview the first 200 characters of each, with its source folder and score.
msg = "Tell me everything about Lancaster"
results = retrieve.retrieve(msg, top_k=3)
for res in results:
    print(f"Found in source '{res['source']}' (File: {res['name']}) with score {res['score']:.4f}:\n{res['content'][:200]}...\n")

Found in source 'employees' (File: Avery Lancaster) with score 0.3093:
# Avery Lancaster

## Summary
- **Date of Birth**: March 15, 1985  
- **Job Title**: Co-Founder & Chief Executive Officer (CEO)  
- **Location**: San Francisco, California  

## Insurellm Career Progr...

Found in source 'company' (File: about) with score 0.1740:
# About Insurellm

Insurellm was founded by Avery Lancaster in 2015 as an insurance tech startup designed to disrupt an industry in need of innovative products. It's first product was Markellm, the ma...

Found in source 'employees' (File: Samantha Greene) with score 0.1390:
# Samantha Greene

## Summary
- **Date of Birth:** October 14, 1990
- **Job Title:** HR Generalist
- **Location:** Denver, Colorado

## Insurellm Career Progression
- **2020** - Joined Insurellm as a ...



## Transfer the knowledge to the LLM

### Define the system prompt

In [33]:
# System prompt sent as the first message of every chat request; it tells the
# model to stay brief and to admit when the provided context has no answer.
system_message = "You are an expert in answering accurate questions about Insurellm, the Insurance Tech company. Give brief, accurate answers. If you don't know the answer, say so. Do not make anything up if you haven't been provided with relevant context."

In [34]:
def chat(message, history):
    """Answer ``message`` with retrieval-augmented context, streaming the reply.

    The three best-matching documents are fetched from the retriever and
    prepended to the user's question; the system prompt, the earlier turns and
    the augmented question are then sent to the chat model.

    Args:
        message: The user's latest question.
        history: Earlier turns, either as ``(user, assistant)`` pairs or as
            dicts with ``role`` and ``content`` keys.

    Yields:
        The reply accumulated so far, once for every streamed chunk that
        carries text.
    """
    # retrieve from store
    re_res = retrieve.retrieve(query=message, top_k=3)

    # Augmenting found information into prompt
    formatted_context = ""
    if re_res:
        formatted_context += "Base on this information:\n---\n"
        for result in re_res:
            formatted_context += f"[Source: {result['source']}/{result['name']}]\n{result['content']}\n---\n"
        formatted_context += "Let's answer the questions of user."

    augmented_message = f"{formatted_context}\n\nQuestion: {message}"

    messages = [{"role": "system", "content": system_message}]
    for turn in history:
        if isinstance(turn, dict):
            # Role/content history: forward only these two keys, since the
            # dicts may carry extra UI fields the chat API does not expect.
            messages.append({"role": turn["role"], "content": turn["content"]})
            continue
        # Pair history: (user text, assistant text); skip a side that is empty.
        human, ai = turn
        if human is not None:
            messages.append({"role": "user", "content": human})
        if ai is not None:
            messages.append({"role": "assistant", "content": ai})
    messages.append({"role": "user", "content": augmented_message})

    stream = openai.chat.completions.create(
        model=MODEL_GPT,
        messages=messages,
        # The configured sampling temperature was defined but never sent.
        temperature=TEMPERATURE,
        stream=True
    )

    response = ""
    for chunk in stream:
        # A chunk can arrive with an empty `choices` list (e.g. a usage-only
        # chunk); indexing it would raise IndexError.
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta.content
        if delta is not None:
            response += delta
            yield response

In [36]:
# Build the chat UI around `chat` and start the Gradio server.
# NOTE(review): `view` is bound to whatever `.launch()` returns, not to the
# ChatInterface object itself - confirm that is intended.
print("Launching Gradio Interface...")
view = gr.ChatInterface(
    fn=chat,
    title="🤖 Smart RAG Chatbot",
    description="Ask me anything about the company's internal data.",
    # NOTE(review): check that these example prompts match what the indexed
    # folders (employees, products, contracts, company) actually contain.
    examples=[
        "Tell me about Avery Lancaster",
        "What are the main features of the Lancaster Sofa?",
        "Summarize the company's remote work policy"
    ],
    chatbot=gr.Chatbot(height=500),
).launch()

Launching Gradio Interface...


  chatbot=gr.Chatbot(height=500),


* Running on local URL:  http://127.0.0.1:7863
* To create a public link, set `share=True` in `launch()`.
