In [1]:
!pip install -q --progress-bar on langchain_chroma langchain_huggingface langchain_community sentence-transformers langchain_openai

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.5/82.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.4/21.4 MB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m473.0/473.0 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[

In [2]:
import os
import glob
import tiktoken
import numpy as np
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from openai import OpenAI
from langchain_core.messages import SystemMessage, HumanMessage
from google.colab import userdata
from google.colab import drive
from langchain_openai import ChatOpenAI
import gradio as gr

In [3]:
model ="gpt-4.1-nano"
db_name ="vector_db"
openai_api_key = userdata.get('OPENAI_API_KEY')
openai =OpenAI(api_key=openai_api_key)
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
knowledge_base_path = "/content/drive/MyDrive/Upgrad/knowledge-base/**/*.md"
files = glob.glob(knowledge_base_path,recursive=True)
print(f"Found {len(files)} files found in knowledge base")

Found 76 files found in knowledge base


In [5]:
entire_knowledge_base =""
for file in files:
    with open(file,"r",encoding='utf-8') as f:
        entire_knowledge_base += f.read()
        entire_knowledge_base +="\n\n"

print(f"Total characters in knowledge base :{len(entire_knowledge_base)}")

Total characters in knowledge base :304434


In [6]:
encoding = tiktoken.encoding_for_model(model)
tokens = encoding.encode(entire_knowledge_base)
print(f"Total token count: {len(tokens)}")

Total token count: 63555


In [7]:
# Loading in Langchain Loaders
folders = glob.glob("/content/drive/MyDrive/Upgrad/knowledge-base/*")
documents =[]
for folder in folders:
    doc_type =os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md",loader_cls=TextLoader, loader_kwargs={'encoding':'utf-8'})
    folder_docs= loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] =doc_type
        documents.append(doc)

print(f"Loaded {len(documents)} documents")

Loaded 76 documents


In [8]:
documents[1]

Document(metadata={'source': '/content/drive/MyDrive/Upgrad/knowledge-base/products/Bizllm.md', 'doc_type': 'products'}, page_content="# Product Summary\n\n# Bizllm\n\n## Summary\n\nBizllm is Insurellm's enterprise-grade commercial insurance platform designed to revolutionize how insurers serve business customers. From small businesses to large corporations, Bizllm provides comprehensive tools for underwriting, policy administration, and risk management across multiple commercial lines including general liability, professional liability, property, workers' compensation, and cyber insurance. By leveraging AI and industry-specific data analytics, Bizllm enables commercial insurers to assess complex risks accurately, price policies competitively, and deliver exceptional service to business clients.\n\n## Features\n\n### 1. Multi-Line Underwriting Engine\nBizllm's sophisticated underwriting platform handles diverse commercial insurance products within a single unified system. AI-powered ri

In [9]:
# Divide into chunks using RecursiveTextCharacterSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
chunks =text_splitter.split_documents(documents)

In [10]:
print(f"Divided into {len(chunks)} chunks")
print(f"First chunk:\n\n{chunks[0]}")

Divided into 413 chunks
First chunk:

page_content='# Product Summary

# Carllm

## Summary

Carllm is an innovative auto insurance product developed by Insurellm, designed to streamline the way insurance companies offer coverage to their customers. Powered by cutting-edge artificial intelligence, Carllm utilizes advanced algorithms to deliver personalized auto insurance solutions, ensuring optimal coverage while minimizing costs. With a robust infrastructure that supports both B2B and B2C customers, Carllm redefines the auto insurance landscape and empowers insurance providers to enhance customer satisfaction and retention.

## Features

- **AI-Powered Risk Assessment**: Carllm leverages artificial intelligence to analyze driver behavior, vehicle conditions, and historical claims data. This enables insurers to make informed decisions and set competitive premiums that reflect true risk profiles.' metadata={'source': '/content/drive/MyDrive/Upgrad/knowledge-base/products/Carllm.md', 'do

In [11]:
# Make vectors and store in Chroma
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
if os.path.exists(db_name):
    Chroma(persist_directory=db_name,embedding_function=embeddings).delete_collection()
vector_store = Chroma.from_documents(documents=chunks,embedding =embeddings,persist_directory=db_name)
print(f"Vector store created with {vector_store._collection.count()} documents")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Vector store created with 413 documents


In [12]:
collection = vector_store._collection
count = collection.count()

sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

There are 413 vectors with 384 dimensions in the vector store


In [13]:
#Visualise
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
metadatas = result['metadatas']
doc_types = [metadata['doc_type'] for metadata in metadatas]
colors = [['blue', 'green', 'red', 'orange'][['products', 'employees', 'contracts', 'company'].index(t)] for t in doc_types]

In [14]:
# 2-D
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(title='2D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x',yaxis_title='y'),
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [15]:
# 3-D
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    margin=dict(r=10, b=10, l=10, t=40)
)

fig.show()

In [17]:
retriever = vector_store.as_retriever(search_kwargs={"k": 3})
llm = ChatOpenAI(temperature=0, model_name=model, api_key=openai_api_key)

In [18]:
SYSTEM_PROMPT_TEMPLATE = """
You are a knowledgeable, friendly assistant representing the company Insurellm.
You are chatting with a user about Insurellm.
If relevant, use the given context to answer any question.
If you don't know the answer, say so.
Context:
{context}
"""

In [19]:
def answer_question(question:str, history):
  docs =retriever.invoke(question)
  questions ="\n\n".join(doc.page_content for doc in docs)
  system_prompt = SYSTEM_PROMPT_TEMPLATE.format(context=questions)
  response = llm.invoke([SystemMessage(content=system_prompt),HumanMessage(content=question)])
  return response.content

In [20]:
answer_question("Who is Averi Lancaster?", [])

'It seems like there might be a typo in the name. If you are referring to Avery Lancaster, she is the Co-Founder and Chief Executive Officer (CEO) of Insurellm. She co-founded the company in 2015 and has been instrumental in guiding its growth and success in the insurance technology sector. If you meant someone else, please let me know!'

In [21]:
gr.ChatInterface(answer_question).launch()


The 'tuples' format for chatbot messages is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style 'role' and 'content' keys.



It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://6ddb05fdb677a33d92.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


