In [1]:
# Prints "OK" - presumably a quick check that the kernel is executing cells.
print("OK")

OK


In [14]:
import pinecone
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain.document_loaders import PyMuPDFLoader, DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

In [17]:
# Read every PDF in a directory into LangChain documents
def load_pdf(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``; only
            entries matching the ``*.pdf`` glob are picked up.

    Returns:
        The result of ``DirectoryLoader.load()`` - the documents produced by
        ``PyPDFLoader`` for the matched files.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()
    

In [22]:
# Load the PDFs from the relative "data" directory via load_pdf.
extracted_data = load_pdf("data")

In [21]:
# Create text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping chunks.

    Args:
        extracted_data: Documents to split (for example the value returned
            by ``load_pdf``).
        chunk_size: Chunk size forwarded to ``RecursiveCharacterTextSplitter``.
            Defaults to 500, the value this notebook originally hard-coded.
        chunk_overlap: Overlap between consecutive chunks, forwarded to the
            splitter. Defaults to 20, the original hard-coded value.

    Returns:
        The chunked documents returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)
    

In [23]:
# Chunk the loaded documents using text_split's splitter settings.
text_chunks = text_split(extracted_data)

In [24]:
def download_hugginface_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create the HuggingFace embedding model used for indexing and search.

    Args:
        model_name: Name of the model passed to ``HuggingFaceEmbeddings``.
            Defaults to ``sentence-transformers/all-MiniLM-L6-v2``, the model
            this notebook originally hard-coded.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [58]:
# Instantiate the embedding model once; the vector store cell reuses this object.
embeddings = download_hugginface_embeddings()

In [59]:
# Bare expression so the notebook displays the embeddings object's repr.
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [65]:
import os
import time
from getpass import getpass

from langchain.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
import pinecone
# Aliased so the client class does not rebind (or get confused with) the
# LangChain `Pinecone` vector store imported at the top of the notebook.
from pinecone import Pinecone as PineconeClient, ServerlessSpec

# Initialize Pinecone
# The API key is read from the PINECONE_API_KEY environment variable, with an
# interactive prompt as fallback, so no secret is stored in the notebook.
# NOTE(review): the key that used to be hard-coded in this cell must be
# treated as leaked and rotated in the Pinecone console.
PINECONE_API = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
pc = PineconeClient(api_key=PINECONE_API)
index_name = "medical-chatbot"

# The index dimension must equal the embedding size. The embeddings used in
# this notebook (sentence-transformers/all-MiniLM-L6-v2) are 384-dimensional.
EMBEDDING_DIMENSION = 384

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll once per second until the new index reports that it is ready.
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)


In [66]:
from langchain_pinecone import PineconeVectorStore
# Bind the Pinecone `index` to the `embeddings` model through LangChain's
# vector-store wrapper.
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [73]:
from uuid import uuid4
from langchain_core.documents import Document

# Re-wrap each chunk keeping only its text (chunk metadata is not carried over).
documents = [
    Document(page_content=chunk.page_content)
    for chunk in text_chunks
]
# One freshly generated random UUID string per document, used as its vector id.
uuids = [str(uuid4()) for _ in documents]

# Upload the documents; the value returned by add_documents is shown as the cell output.
vector_store.add_documents(documents=documents, ids=uuids)


['e256ae01-2dc1-43ff-8ec5-9a89861f2b57',
 '380bfd7f-051b-4e2b-9522-5cde174b3d62',
 '167d43f2-4a87-4b19-b298-25f2a0d757a4',
 '4aad372c-47e8-415a-9834-9e94fdea6ce6',
 '8a9a97ff-a042-4a93-b240-68fa035de185',
 '3974fd08-463e-4b1f-b975-edd305b94d15',
 '3a779741-fba6-4740-8e58-e1d5930e8d19',
 '0c718b2a-81ae-4627-8ec9-09c43b4e7ec8',
 'ce4e32cb-b113-4500-8768-f901c258f50d',
 'e12ed1f8-25d8-4f8f-96ca-4f4185e38619',
 'e23d91da-c327-4c6d-a218-22fba0483131',
 '05918c11-683a-49c7-819a-a41e7a652235',
 'a73bc06b-ba2c-4a36-9aed-cbadf0f21b0b',
 '61c9af77-b04c-418e-8103-f8b94fcd32ef',
 'c8f7e61f-45dd-49df-ae6b-9e07acf3c5ef',
 'd779730c-5aea-4ef2-9c77-f65dba55fe71',
 '0838e31b-5b97-4bde-9e5b-c9f17a8527ad',
 '09fef84c-f7b3-43d5-b97e-4262e83b7bcc',
 '4b12b84d-20c8-4ac7-9de2-ecfe69029ade',
 '846b34ad-ed67-4018-922c-f5fab42b0df3',
 '3ba3fa71-74ed-4021-9351-0908a8236596',
 'c73e2735-5f56-4a58-9e8c-928a6880b013',
 '511f3511-026f-453f-b1b8-019e5d563295',
 'fd64f2f1-3316-4380-aae2-535fb5134aaa',
 '9b7e8e1d-c0ff-

In [None]:
# Ask the vector store for the three closest chunks to a sample question.
results = vector_store.similarity_search_with_score(
    "I am having cold what could be the reason and which medicine i should take?",
    k=3,
)
# Each result is a (document, score) pair; only the document text is printed.
for res, score in results:
    print(res.page_content)

In [81]:
# Retriever that filters by relevance score: configured to keep at most one
# hit (k=1), and only when its score reaches the 0.5 threshold.
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 1, "score_threshold": 0.5},
)
# Same sample question as the similarity-search cell; the returned documents
# are displayed as the cell output.
retriever.invoke("I am having cold what could be the reason and which medicine i should take?")

[Document(page_content='medications, ( corticosteroids ,chemotherapy drugs)—\nshould consult their doctor if they get a cold. People with\nthese health problems are more likely to get a secondary\ninfection.\nDiagnosis\nColds are diagnosed by observing a person’s symp-\ntoms. There are no laboratory tests readily available to\ndetect the cold virus. However, a doctor may do a throat\nculture or blood test to rule out a secondary infection.\nInfluenza is sometimes confused with a cold, but flu')]

In [82]:
# Prompt text for the question-answering step. `{context}` and `{question}`
# are the two placeholders to be filled in when the prompt is formatted.
# The string literal must start on the assignment line: a bare
# `prompt_template=` followed by a newline is a SyntaxError.
prompt_template = """
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [84]:
# `input_variables` (plural) is the PromptTemplate field that names the
# template placeholders; the previous `input_variable` keyword was a typo.
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)
# Keyword arguments carrying the custom prompt - presumably handed to the
# RetrievalQA chain in a later cell (not visible here).
chain_type_kwargs = {"prompt": PROMPT}

In [87]:
# Local Llama model loaded through CTransformers (GGML weights, per the file name).
# NOTE(review): the model path is absolute and user-specific; it will not
# resolve on another machine - consider making it configurable.
llm = CTransformers(
    model="/Users/kumarrohit/Downloads/llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type="llama",
    config={
        "max_new_tokens": 512,
        "temperature": 0.8,
    },
)

: 

: 