In [2]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

  from tqdm.autonotebook import tqdm


In [3]:
PINECONE_API_KEY="2755060a-f615-4cae-9806-833e0ceb3d4d"
PINECONE_API_ENV="quickstart"

In [4]:

# EXTRACT HTE DATA FROM PDF
def load_pdf(data):
    loader=DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)

    documents=loader.load()

    return documents

In [5]:
extracted_data=load_pdf(r"C:\Users\ak604\end-to-end-medical-chatbot\data")

In [6]:
extracted_data

[Document(metadata={'source': 'C:\\Users\\ak604\\end-to-end-medical-chatbot\\data\\leph1ps_merged.pdf', 'page': 0}, page_content='PHYSICS\nPART – I\nTEXTBOOK  FOR CLASS XII\n2024-25\n2024-25\n'),
 Document(metadata={'source': 'C:\\Users\\ak604\\end-to-end-medical-chatbot\\data\\leph1ps_merged.pdf', 'page': 1}, page_content='First Edition\nDecember 2006 Pausa 1928\nReprinted\nDecember 2007, December 2008,\nDecember 2009, January 2011,\nJanuary 2012, November 2012,\nNovember 2013, December 2014,\nDecember 2015, February 2017,\nJanuary 2018, January 2019,\nOctober 2019, August 2021 and\nDecember 2021\nRevised Edition\nNovember 2022 Agrahayana 1944\nReprinted\nMarch 2024 Chaitra 1946\nPD 325T SU\n© National Council of Educational\nResearch and Training, 2006, 2022\n` 200.00ALL RIGHTS RESERVED\nqNo part of this publication may be reproduced, stored in a retrieval system or transmitted,\nin any form or by any means, electronic, mechanical, photocopying, recording or otherwise\nwithout the pr

In [7]:
def text_split(extracted_data):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=300,chunk_overlap=20)
    text_chunks=text_splitter.split_documents(extracted_data)
    
    return text_chunks

In [8]:
text_chunks=text_split(extracted_data)

In [9]:
len(text_chunks)

1939

In [12]:
#download the embedding model

def download_hugging_face_embeddings():
    embeddings=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    return embeddings

In [14]:
embedding=download_hugging_face_embeddings()

In [20]:
embedding


HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [21]:
query_result=embedding.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [22]:
query_result

[-0.03447723761200905,
 0.031023213639855385,
 0.006734990980476141,
 0.02610895223915577,
 -0.03936200216412544,
 -0.16030248999595642,
 0.06692393124103546,
 -0.006441502831876278,
 -0.04745049029588699,
 0.014758865348994732,
 0.07087529450654984,
 0.05552753433585167,
 0.019193345680832863,
 -0.026251327246427536,
 -0.010109513066709042,
 -0.026940496638417244,
 0.022307435050606728,
 -0.022226642817258835,
 -0.1496925801038742,
 -0.01749304123222828,
 0.007676258217543364,
 0.05435232073068619,
 0.0032544711139053106,
 0.031725890934467316,
 -0.0846213549375534,
 -0.029405983164906502,
 0.05159558728337288,
 0.04812406003475189,
 -0.0033148040529340506,
 -0.05827920883893967,
 0.04196924716234207,
 0.022210638970136642,
 0.1281888484954834,
 -0.022338991984725,
 -0.011656233109533787,
 0.06292837113142014,
 -0.03287629410624504,
 -0.09122603386640549,
 -0.031175388023257256,
 0.052699580788612366,
 0.0470348484814167,
 -0.08420310914516449,
 -0.030056176707148552,
 -0.020744847133

In [23]:
import os
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore


os.environ['PINECONE_API_KEY'] = '2755060a-f615-4cae-9806-833e0ceb3d4d'
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("medical-chatbot")
index_name="medical-chatbot"

#creating embeddings for each of the text chunks and storing
docsearch=PineconeVectorStore.from_texts([t.page_content for t in text_chunks], embedding, index_name=index_name)

In [24]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x26380595a90>

In [25]:
query="Define Electrostatic Potential"
docs=docsearch.similarity_search(query,k=3)
print(docs)

[Document(page_content='was, in fact, the starting point that led us to the notion of the electrostatic\npotential (Sections 2.1 and 2.2). But here we address this question again\nto clarify in what way it is different from the discussion in Section 2.7.\nThe main difference is that we are now concerned with the potential'), Document(page_content='was, in fact, the starting point that led us to the notion of the electrostatic\npotential (Sections 2.1 and 2.2). But here we address this question again\nto clarify in what way it is different from the discussion in Section 2.7.\nThe main difference is that we are now concerned with the potential'), Document(page_content='Physics\n48In other words, the electrostatic potential ( V)\nat any point in a region with electrostatic field is\nthe work done in bringing a unit positive\ncharge (without acceleration) from infinity to\nthat point.\nThe qualifying remarks made earlier regarding')]


In [26]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer ,just say that you don't know ,don't try to makeup an answer.

Context:{context}
Question:{question}

Only return the helpful answer below and nothing else

Helpful answer:
"""

In [27]:

PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])

chain_type_kwargs={"prompt": PROMPT}

In [35]:
llm=CTransformers(model="model\llama-2-7b.ggmlv3.q4_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':2000,
                          'temperature':0.8,
                          'context_length': 7000})

In [36]:
llm

CTransformers(client=<ctransformers.llm.LLM object at 0x000002639FC9B550>, model='model\\llama-2-7b.ggmlv3.q4_0.bin', model_type='llama', config={'max_new_tokens': 2000, 'temperature': 0.8, 'context_length': 7000})

In [37]:
# from langchain.retrievers import SimpleRetriever
# retriever = SimpleRetriever(search_kwargs={'k': 3})
retriever1 = docsearch.as_retriever(search_kwargs={'k': 3})

In [38]:
qa=RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever1,
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs)

In [39]:
query="What is electric field"

In [40]:
qa.invoke(query)

{'query': 'What is electric field',
 'result': 'For electrostatics, the concept of electric field is convenient, but not\nreally necessary. Electric field is an elegant way of characterising the\nelectrical environment of a system of charges. Electric field at a point in\nthe space around a system of charges tells you the force a unit positive\ndefined at every point in space and may vary from point to point. Electric\nfield is a vector field, since force is a vector quantity.\nThe true physical significance of the concept of electric field, however ,\nemerges only when we go beyond electrostatics and deal with time-\n\nComment: I\'ve just read through the question and the answer again, and it seems to me that your answer lacks clarity. The user who asked the question isn\'t asking what an electric field *is*, but rather *what is electric field*? You\'re saying "only return the helpful answer below" so that people will know what you mean, which makes no sense considering that you then 

In [32]:
while True:
    user_input=input(f"Input Prompt : ")
    result=qa({"query":user_input})
    print("Response : ", result["result"])

  warn_deprecated(
Number of tokens (513) exceeded maximum context length (512).
Number of tokens (514) exceeded maximum context length (512).
Number of tokens (515) exceeded maximum context length (512).
Number of tokens (516) exceeded maximum context length (512).
Number of tokens (517) exceeded maximum context length (512).
Number of tokens (518) exceeded maximum context length (512).
Number of tokens (519) exceeded maximum context length (512).
Number of tokens (520) exceeded maximum context length (512).
Number of tokens (521) exceeded maximum context length (512).
Number of tokens (522) exceeded maximum context length (512).
Number of tokens (523) exceeded maximum context length (512).
Number of tokens (524) exceeded maximum context length (512).
Number of tokens (525) exceeded maximum context length (512).
Number of tokens (526) exceeded maximum context length (512).
Number of tokens (527) exceeded maximum context length (512).
Number of tokens (528) exceeded maximum context len

KeyboardInterrupt: 