# RAG SYSTEM LLMs

In [1]:
# Import libraries
import os
import glob

from dotenv import load_dotenv
import gradio as gr
from openai import OpenAI

In [None]:
# Unzip the data files
import zipfile
def unzip_file(path, data_dir, delete=True):
    """Extract a ``.zip`` archive into ``data_dir`` and optionally delete it.

    Args:
        path: Location of the archive to extract.
        data_dir: Directory the archive contents are extracted into.
        delete: When true, remove the archive after it has been extracted.

    Returns:
        None. Progress and problems are reported with ``print``; a missing
        file or an unsupported extension is reported, not raised.
    """
    if not os.path.exists(path):
        print("Cannot find this zipfile.")
        return

    # Compare the extension case-insensitively so "ARCHIVE.ZIP" is accepted
    # as well; os.fspath also lets callers pass a pathlib.Path.
    if not os.fspath(path).lower().endswith(".zip"):
        print(f"This format file is not accepted: {path}")
        return

    with zipfile.ZipFile(path, "r") as zipref:
        zipref.extractall(data_dir)
    print("Unzip successfully to:", data_dir)

    # Only reached after a successful extraction, so a failed unzip never
    # removes the archive.
    if delete:
        os.remove(path)
        print("Deleted zipfile.")

# Relative locations of the data folder and of the archive stored inside it.
data_dir = "../data"
path = data_dir + "/knowledge-base.zip"

# Extract the archive into the data folder (the archive is deleted afterwards
# because unzip_file defaults to delete=True).
unzip_file(path, data_dir)



Cannot find this zipfile.


In [9]:
# "knowledge-base" folder under data_dir — presumably where the archive
# contents end up after extraction; confirm against the zip layout.
data_path = os.path.join(data_dir, "knowledge-base")
print(data_path)

../data/knowledge-base


In [10]:
# Hyperparameters
# Load settings from a .env file (python-dotenv) before reading the environment.
load_dotenv()
# Falls back to the placeholder string when OPENAI_API_KEY is not set.
# NOTE(review): this value is never passed to OpenAI() below — presumably the
# client reads the environment variable on its own; confirm, and consider
# failing early instead of keeping a placeholder key.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", 'your_api_key_here')
MODEL_NAME = "gpt-4o-mini"  # not used within this cell
TEMPERATURE = 0.1  # not used within this cell

# Client object created with default arguments.
openai = OpenAI()

## Add data to the context

### Add employee data to the context

In [17]:
# Build a mapping from the last word of each employee file name to the text
# of that file.
context = {}
employees_dir = os.path.join(data_path, "employees/*")
employees = glob.glob(employees_dir)

for employee in employees:
    # The "*" pattern matches every entry in the folder; only regular files
    # can be opened and read.
    if not os.path.isfile(employee):
        continue
    # Work on the base name without its extension so the key does not depend
    # on a two-character suffix or on spaces in parent directories. A file
    # named "Avery Lancaster.md", for example, yields the key "Lancaster".
    # NOTE: two files ending in the same word overwrite each other.
    name = os.path.splitext(os.path.basename(employee))[0].split(" ")[-1]
    with open(employee, "r", encoding="utf-8") as f:
        doc = f.read()
    context[name] = doc

In [14]:
# Display the keys collected in the context dict.
context.keys()

dict_keys(['Carter', 'Greene', 'Thomson', 'Tran', 'Thompson', 'Chen', 'Trenton', 'Harper', 'Blake', 'Lancaster', 'Bishop', 'Spencer'])

In [13]:
# Display the document text stored under the "Lancaster" key.
context["Lancaster"]

"# Avery Lancaster\n\n## Summary\n- **Date of Birth**: March 15, 1985  \n- **Job Title**: Co-Founder & Chief Executive Officer (CEO)  \n- **Location**: San Francisco, California  \n\n## Insurellm Career Progression\n- **2015 - Present**: Co-Founder & CEO  \n  Avery Lancaster co-founded Insurellm in 2015 and has since guided the company to its current position as a leading Insurance Tech provider. Avery is known for her innovative leadership strategies and risk management expertise that have catapulted the company into the mainstream insurance market.  \n\n- **2013 - 2015**: Senior Product Manager at Innovate Insurance Solutions  \n  Before launching Insurellm, Avery was a leading Senior Product Manager at Innovate Insurance Solutions, where she developed groundbreaking insurance products aimed at the tech sector.  \n\n- **2010 - 2013**: Business Analyst at Edge Analytics  \n  Prior to joining Innovate, Avery worked as a Business Analyst, focusing on market trends and consumer preferenc

In [36]:
class ContextRetriever:
    """Load text documents from disk and look them up by keyword or by title.

    Attributes (set in ``__init__``):
        docs_path: Glob pattern selecting the files to load.
        documents: Dict of file stem -> file text, searched by ``retrieve``.
        context: Dict of title -> details, searched by
            ``get_relevant_context``. It starts as a copy of ``documents``
            and can be extended with ``add_context``.
    """

    def __init__(self, docs_path="docs/*.txt"):
        """Load every file matching ``docs_path`` (a glob pattern)."""
        self.docs_path = docs_path
        self.documents = self.load_documents()
        # Copy instead of reading every file from disk a second time; the
        # separate dict keeps manual additions out of ``documents``.
        self.context = dict(self.documents)

    def load_documents(self):
        """Read every regular file matched by ``self.docs_path``.

        Returns:
            A dict keyed by file name without directory and extension, whose
            values are the files' contents decoded as UTF-8.
        """
        documents = {}
        for file_path in glob.glob(self.docs_path):
            # A broad pattern such as "folder/*" also matches directories,
            # which cannot be opened as text files.
            if not os.path.isfile(file_path):
                continue
            name = os.path.splitext(os.path.basename(file_path))[0]
            with open(file_path, "r", encoding="utf-8") as file:
                documents[name] = file.read()
        return documents

    def retrieve(self, query: str, top_k: int = 3):
        """Return the documents that mention ``query`` most often.

        Matching is a case-insensitive substring count over each document.

        Args:
            query: Text to look for.
            top_k: Maximum number of documents to return.

        Returns:
            A list of ``(name, text)`` tuples ordered by descending number of
            occurrences. Documents without any occurrence are left out. When
            nothing matches, a message is printed and the list is empty.
        """
        needle = query.lower()
        results = []
        # str.count("") is len + 1, so an empty query would "match" every
        # document; treat a blank query as matching nothing.
        if needle.strip():
            scored_docs = [
                (name, doc, doc.lower().count(needle))
                for name, doc in self.documents.items()
            ]
            scored_docs.sort(key=lambda item: item[2], reverse=True)
            results = [
                (name, doc)
                for name, doc, score in scored_docs[:top_k]
                if score > 0
            ]
        if not results:
            print(f"Not found any document matching: '{query}'")
        return results

    def add_context(self, title, details):
        """Manually add one entry to ``self.context`` (replaces an existing title)."""
        self.context[title] = details

    def get_relevant_context(self, message: str):
        """Return the details of every context entry whose title is in ``message``.

        Titles are compared case-insensitively with underscores replaced by
        spaces, as plain substrings of the message.

        Args:
            message: Text to scan for context titles.

        Returns:
            A list with the details of each matching entry, in the order the
            entries were added to ``self.context``.
        """
        msg_lower = message.lower()
        relevant_context = []
        for context_title, context_details in self.context.items():
            title_processed = context_title.lower().replace("_", " ")
            # An empty title is a substring of every message; skip it so it
            # is not returned for unrelated messages.
            if title_processed and title_processed in msg_lower:
                relevant_context.append(context_details)
        return relevant_context


In [35]:
# No docs_path is given, so the retriever uses its default pattern "docs/*.txt".
retriever = ContextRetriever()
msg = "Tell me about Lancaster and Employee Policy"
# Prints the details of every context entry whose title occurs in msg.
print(retriever.get_relevant_context(msg))

[]


In [33]:
# Same title-based lookup with a different question.
print(retriever.get_relevant_context("Who is Alex Lancaster?"))

[]


### Load product data to the context

In [26]:
### Retrieve the product
# Glob pattern matching every entry in the "products" folder under data_path.
product_dir = os.path.join(data_path, "products", "*")

retriever_product = ContextRetriever(docs_path=product_dir)

# Keep at most the two documents that contain "Lancaster" most often
# (retrieve counts case-insensitive substring occurrences).
results = retriever_product.retrieve("Lancaster", top_k=2)
for name, doc in results:
    # Show only the first 500 characters of each matching document.
    print(f"Found in {name}:\n{doc[:500]}...\n")