In [1]:
import os 
import sys
import csv

from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
# from langchain.agents.agent_types import AgentType
from langchain.llms import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma


In [11]:
from langchain.document_loaders.pdf import PyPDFLoader
from langchain.document_loaders.pdf import PDFMinerLoader
from langchain.document_loaders import UnstructuredPDFLoader


In [3]:
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
)
from langchain.vectorstores import Chroma


## Load Data

In [12]:
# Read the sample insurance brochure with the PyPDF-based loader.
loader = PyPDFLoader('data/sample_insurance.pdf')
# Alternative parser kept for reference:
# loader = PDFMinerLoader(file_path='data/sample_insurance.pdf')

# `data` is the list of Document objects produced by the loader.
data = loader.load()

In [13]:
# Display every loaded Document (page text plus 'source'/'page' metadata).
data

[Document(page_content='Sabse bada sach\nde beneﬁts SO MUCH!\n', metadata={'source': 'data/sample_insurance.pdf', 'page': 0}),
 Document(page_content='', metadata={'source': 'data/sample_insurance.pdf', 'page': 1}),
 Document(page_content='Introduction  \nOptima Secure is a testament to the years of trust \n1.5+ crore customers have bestowed upon us.It’s great when you ask for something and get more in return, \nisn’t it? That’s why, HDFC ERGO brings to you a health insurance\nplan that gives you SO MUCH more beneﬁts than you had asked for !\nThe new HDFC ERGO O ptima Secure provides 4X coverage, at no \nadditional cost, thereby redeﬁning the value you get from health \ninsurance. It doesn’t just secure your present, but safeguards your \nfuture as well. \nSo Much\nCoverage\nSo Much\nChoiceSo Much\nTrust\nSo Much\nMore...A policy like no other.', metadata={'source': 'data/sample_insurance.pdf', 'page': 2}),
 Document(page_content='Secure Beneﬁt is a ﬁrst-of-its-kind beneﬁt of Optima Se

## Convert data into chunks

In [25]:
# Stage 1: cut the text of a single page (list index 5) into pieces with the
# plain character splitter, without overlap between pieces.
pages = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000).split_text(
    data[5].page_content
)

# Stage 2: turn those pieces into Document objects with the recursive splitter
# (overlap of 100 between neighbouring chunks).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)
texts = text_splitter.create_documents(pages)

# NOTE(review): only data[5] is chunked here, and the vector-store cell indexes
# `data` rather than `texts` -- confirm whether the whole PDF was meant to be
# chunked and indexed.
len(texts)


1

In [26]:
# Show the chunk Documents produced by the splitting cell.
texts

[Document(page_content='With the Restore Beneﬁt of Optima Secure, if any claim, partial \nor total, is made any time during the year, then 100% of the \nbase cover gets restored in the policy cover automatically, \nat no additional cost!100% restore coverageRestore BeneﬁtGet So Much Coverage* Guaranteed^^', metadata={})]

## Embed the documents in the vector store

In [27]:
# The notebook never assigns OPENAI_API_KEY, so a fresh kernel would fail with
# NameError here. Read it from the environment (never hard-code the key) and
# fail early with a clear message when it is missing.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )

# Embedding model used to vectorise the documents.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model="text-embedding-ada-002")

# Embed the loaded Documents in `data` and store them in a Chroma collection
# persisted under ./chroma_db_pdf.
# NOTE(review): re-running this cell against an existing persist directory
# presumably adds the same documents again -- confirm, and clear the directory
# between runs if duplicates show up in retrieval results.
vectorstore = Chroma.from_documents(data, embedding=embeddings, persist_directory="./chroma_db_pdf")

In [34]:
from langchain.chains import RetrievalQA
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Similarity-search retriever over the vector store; k=2 results per query.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 2},
    search_type="similarity",
)

# Question-answering chain: the retriever supplies documents to the OpenAI
# completion model ("stuff" chain type), and the documents that were used are
# returned together with the answer. verbose=True prints chain start/finish.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(openai_api_key=OPENAI_API_KEY),
    retriever=retriever,
    chain_type="stuff",
    verbose=True,
    return_source_documents=True,
)

In [36]:
def print_res(res):
    """Print a QA result: the answer first, then each source document.

    Args:
        res: Mapping with a 'result' entry (the answer) and a
            'source_documents' entry, an iterable of objects exposing a
            `page_content` attribute.

    Output format: the answer, a "Docs::" separator line, then for every
    source document its zero-based position on one line followed by its
    page content.
    """
    print(res['result'])
    print("Docs::")
    position = 0
    for source in res["source_documents"]:
        # Index and content each end up on their own line.
        print(position, source.page_content, sep="\n")
        position += 1

In [39]:
# Ask a question about the brochure's content and show the answer together
# with the documents the retriever supplied.
query = "What all are covered in hospitalization expenses?"
result = qa(dict(query=query))
print_res(result)



[1m> Entering new  chain...[0m

[1m> Finished chain.[0m
 Hospitalization expenses can include tests, medicine purchases, doctor visits, nursing charges, daily cash of INR 800 per day up to a maximum of INR 4800 on hospitalization, peripheral costs incurred by the patient or their caregiver on travelling, food, lodging, and so on.
Docs::
0
Domiciliary hospitalisation
Secures all medical expenses during Domiciliary hospitalisation. 
Organ donor expenses
Get reimbursements for medical expenses incurred for organ donor’s 
in-patient treatment for harvesting of the organ donated.
Emergency air ambulance
The policy pays for air ambulance transportation services during 
your emergency needs.
This option gives discount on your premiums, by opting to pay an 
initial deductible amount for claims made in any policy year.
25,000 25%
40%
50%15%
30%
40%50,000
100,000Deductible AmountBase Sum Insured
up to 20 lakhsBase Sum Insured
above 20 lakhsAggregate deductible
Value buy
You can choose any 

In [None]:
# Inspect the raw chain output that print_res reads 'result' and
# 'source_documents' from.
result

In [None]:
# Second question, kept in its own result variable so the first answer stays
# available. NOTE(review): presumably an off-topic probe to see how the chain
# answers when the PDF has nothing relevant -- confirm intent.
query = "How to use neural networks to train Alexnet?"
result_02 = qa(dict(query=query))


In [None]:
# Print the answer and retrieved documents for the second query.
print_res(result_02)

In [None]:
# Inspect the raw chain output for the second query.
result_02