In [2]:
import os
import glob
from dotenv import load_dotenv 
import gradio as gr 

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import CharacterTextSplitter

In [4]:
# Notebook-wide settings; neither name is used in the cells shown here,
# presumably later cells read them — confirm.
MODEL = "gpt-4o-mini"
db_name = "vector_db"

# Read a local .env file (if present) into the process environment, then make
# sure OPENAI_API_KEY is defined: an existing value is kept, otherwise the
# placeholder string is stored so the variable is always set.
load_dotenv()
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')



In [7]:
# Every sub-directory of knowledgebase/ is treated as one document type: the
# directory name is stored on each document loaded from it as
# metadata["doc_type"].
# Only directories are kept, because glob("knowledgebase/*") also matches plain
# files placed at the top level and DirectoryLoader raises ValueError when it
# is pointed at something that is not a directory.
folders = [path for path in glob.glob("knowledgebase/*") if os.path.isdir(path)]

# With thanks to CG and Jon R, students on the course, for this fix needed for some users
text_loader_kwargs = {'encoding': 'utf-8'}
# If that doesn't work, some Windows users might need to uncomment the next line instead
# text_loader_kwargs={'autodetect_encoding': True}

documents = []
for folder in folders:
    # The folder's own name (e.g. "products", "employees") becomes the label.
    doc_type = os.path.basename(folder)
    # Recursively pick up every Markdown file below this folder as plain text.
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

In [8]:
# Sanity check: how many documents the loading loop collected.
len(documents)

14

In [9]:
# Break the loaded documents into overlapping chunks. Sizes are in characters:
# a target of 100 per chunk with 50 shared between neighbouring chunks.
# The run below logs a warning for each piece that ends up longer than the
# requested chunk size.
text_splitter = CharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=50,
)
chunks = text_splitter.split_documents(documents)

Created a chunk of size 517, which is longer than the specified 100
Created a chunk of size 273, which is longer than the specified 100
Created a chunk of size 266, which is longer than the specified 100
Created a chunk of size 277, which is longer than the specified 100
Created a chunk of size 267, which is longer than the specified 100
Created a chunk of size 233, which is longer than the specified 100
Created a chunk of size 236, which is longer than the specified 100
Created a chunk of size 367, which is longer than the specified 100
Created a chunk of size 141, which is longer than the specified 100
Created a chunk of size 106, which is longer than the specified 100
Created a chunk of size 175, which is longer than the specified 100
Created a chunk of size 207, which is longer than the specified 100
Created a chunk of size 219, which is longer than the specified 100
Created a chunk of size 633, which is longer than the specified 100
Created a chunk of size 244, which is longer tha

In [10]:
# Inspect the first loaded document (its metadata and page_content).
documents[0]

Document(metadata={'source': 'knowledgebase/products/Rellm.md', 'doc_type': 'products'}, page_content="# Product Summary\n\n# Rellm: AI-Powered Enterprise Reinsurance Solution\n\n## Summary\n\nRellm is an innovative enterprise reinsurance product developed by Insurellm, designed to transform the way reinsurance companies operate. Harnessing the power of artificial intelligence, Rellm offers an advanced platform that redefines risk management, enhances decision-making processes, and optimizes operational efficiencies within the reinsurance industry. With seamless integrations and robust analytics, Rellm enables insurers to proactively manage their portfolios and respond to market dynamics with agility.\n\n## Features\n\n### AI-Driven Analytics\nRellm utilizes cutting-edge AI algorithms to provide predictive insights into risk exposures, enabling users to forecast trends and make informed decisions. Its real-time data analysis empowers reinsurance professionals with actionable intelligen

In [11]:
# Sanity check: how many chunks the splitter produced from the documents.
len(chunks)

183

In [12]:
# Inspect the first chunk; it carries the metadata of the document it came from.
chunks[0]

Document(metadata={'source': 'knowledgebase/products/Rellm.md', 'doc_type': 'products'}, page_content='# Product Summary\n\n# Rellm: AI-Powered Enterprise Reinsurance Solution\n\n## Summary')

In [16]:
# Gather the distinct doc_type labels found in the chunk metadata and list them.
doc_types = {piece.metadata['doc_type'] for piece in chunks}
print(f"Document types found: {', '.join(doc_types)}")

Document types found: products, employees


In [17]:
# Show every chunk whose text mentions 'CEO' (case-sensitive substring match),
# each followed by a separator line.
for chunk in chunks:
    if 'CEO' not in chunk.page_content:
        continue
    print(chunk, "_________", sep="\n")

page_content='## Summary
- **Date of Birth**: March 15, 1985  
- **Job Title**: Co-Founder & Chief Executive Officer (CEO)  
- **Location**: San Francisco, California' metadata={'source': 'knowledgebase/employees/Avery Lancaster.md', 'doc_type': 'employees'}
_________
page_content='## Insurellm Career Progression
- **2015 - Present**: Co-Founder & CEO  
  Avery Lancaster co-founded Insurellm in 2015 and has since guided the company to its current position as a leading Insurance Tech provider. Avery is known for her innovative leadership strategies and risk management expertise that have catapulted the company into the mainstream insurance market.' metadata={'source': 'knowledgebase/employees/Avery Lancaster.md', 'doc_type': 'employees'}
_________
