<a href="https://colab.research.google.com/github/kmalhotra18/RAG/blob/main/Expert_Knowledge_Worker_langchain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Expert Knowledge Worker**

1.   A question-answering agent that is an expert knowledge worker
2.   To be used by employees of Insurellm, an Insurance Tech company
3.   The agent needs to be accurate and the solution should be low cost.


In [None]:
!pip install -q OpenAI
!pip install -q google-generativeai
!pip install -q python-dotenv
!pip install -q anthropic
!pip install -q gradio
!pip install -q langchain-community # Install the langchain-community package

In [None]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr

In [None]:
# imports for langchain

# The loaders come from langchain_community (the package installed at the top of this notebook);
# the older `langchain.document_loaders` import path is deprecated.
from langchain_community.document_loaders import DirectoryLoader, TextLoader      # DirectoryLoader loads in entire folder, TextLoader is to load individual text files
# NOTE(review): newer LangChain releases also expose this class as `langchain_text_splitters.CharacterTextSplitter` - confirm against the installed version
from langchain.text_splitter import CharacterTextSplitter                         # Divides document into chunks of characters

In [None]:
# Notebook-wide settings.
# Cost matters for our company, so MODEL is deliberately a low cost model.

db_name = "vector_db"
MODEL = "gpt-4o-mini"

In [None]:
# Pull settings from a file called .env into the process environment

load_dotenv(override=True)

# Write the key back into os.environ, falling back to the placeholder string when the variable is not set
os.environ['OPENAI_API_KEY'] = os.environ.get('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [None]:
# Read in documents using LangChain's loaders
# Take everything in all the sub-folders of our knowledgebase

# Root of the knowledge base. The default is the Google Drive location used on Colab;
# set it to "knowledge-base" instead when running next to a local copy of the data.
KNOWLEDGE_BASE_DIR = "/content/drive/MyDrive/Llms/llm_engineering/week5/knowledge-base"

# Keep only real sub-folders (the "*" pattern also matches loose files) and sort them:
# glob returns matches in arbitrary order, and later cells index into `documents` by position.
folders = sorted(
    path for path in glob.glob(os.path.join(KNOWLEDGE_BASE_DIR, "*")) if os.path.isdir(path)
)
if not folders:
    # Surface a missing / unmounted knowledge base here rather than as an IndexError further down
    print(f"Warning: no sub-folders found under {KNOWLEDGE_BASE_DIR} - no documents will be loaded")

text_loader_kwargs = {'encoding': 'utf-8'}
# If that doesn't work, some Windows users might need to uncomment the next line instead
# text_loader_kwargs={'autodetect_encoding': True}

documents = []
for folder in folders:
    # The folder name is recorded on every document as its doc type
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()                 # Load every markdown file found under this folder
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type     # Tag the document, then collect it in 'documents'
        documents.append(doc)

In [None]:
# Number of documents collected in 'documents'
len(documents)

In [None]:
# Inspect one loaded document as an example (index 24 needs at least 25 documents in the list)
documents[24]

In [None]:
# Break every document into character-based chunks

# chunk_size=1000 aims for roughly 1000 characters per chunk;
# chunk_overlap=200 lets neighbouring chunks share some common content.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(documents)

In [None]:
# Number of chunks produced by the splitter
len(chunks)

In [None]:
# Inspect one chunk as an example (index 6 needs at least 7 chunks in the list)
chunks[6]

In [None]:
# See doc types across all the chunks

# Collect the distinct 'doc_type' metadata values carried by the chunks
doc_types = {chunk.metadata['doc_type'] for chunk in chunks}
# A set has no guaranteed iteration order, so sort before printing to keep the output stable across runs
print(f"Document types found: {', '.join(sorted(doc_types))}")

In [None]:
# Example search: print every chunk whose text mentions 'CEO', each followed by a separator line

for chunk in chunks:
    if 'CEO' not in chunk.page_content:
        continue                                # Not a match, move on to the next chunk
    print(chunk, "_________", sep="\n")